In [13]:
!pip install nltk scikit-learn -q 

# NLP Pipeline with NLTK 

In [14]:
import nltk
import string
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer

# Fetch the NLTK data packages the pipeline needs:
# the 'punkt' / 'punkt_tab' tokenizer data and the 'stopwords' corpus.
for _resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(_resource)

# Extended sample text
# Four short paragraphs about NLP, used as the input to the normalization step below.
# The triple-quoted literal is taken verbatim: it begins and ends with a newline,
# paragraphs are separated by blank lines, and some lines carry trailing spaces --
# all of that whitespace is part of the value that gets printed and processed.
text = """
Natural Language Processing (NLP) is a fascinating field of artificial intelligence that focuses on the interaction between computers and humans through natural language. The ultimate objective of NLP is to enable computers to understand, interpret, and generate human language in a way that is both valuable and meaningful. 

NLP combines computational linguistics with machine learning and deep learning techniques to process and analyze large amounts of natural language data. Applications of NLP include sentiment analysis, language translation, chatbots, and information extraction. 

As technology continues to evolve, the capabilities of NLP systems are improving rapidly. For instance, recent advancements in transformer models, such as BERT and GPT, have significantly enhanced the performance of NLP tasks. These models are capable of understanding context and nuances in language, making them highly effective for various applications.

Despite the progress, challenges remain in the field of NLP. Ambiguities in language, cultural differences, and the vast diversity of dialects can complicate the development of robust NLP systems. Researchers are continually working to address these challenges and improve the accuracy and reliability of NLP technologies.
"""

# Show the untouched input, followed by a separator line between stages.
print('Text:', text)
print('--------------------------------------')

# Step 1: Sentence Tokenization
# Split the ORIGINAL text first. The sentence tokenizer needs the sentence-ending
# punctuation to find boundaries; stripping punctuation beforehand leaves it with
# nothing to split on, so every later per-sentence structure has a single entry.
raw_sentences = sent_tokenize(text)

# Step 2: Normalization (lowercase and remove punctuation), one sentence at a time
# Only alphanumeric and whitespace characters are kept.
sentences = [
    ''.join(char for char in raw_sentence.lower() if char.isalnum() or char.isspace())
    for raw_sentence in raw_sentences
]
# Drop sentences that contained nothing but punctuation/whitespace.
sentences = [sentence for sentence in sentences if sentence.strip()]
# Whole normalized text, one sentence per line (kept for display and later cells).
normalized_text = '\n'.join(sentences)

# Print the normalized result (not the raw input) under the normalization label.
print('Remove punctuation:', normalized_text)
print('--------------------------------------')

print("Sentences:", sentences)
print('--------------------------------------')

# Step 3: Word Tokenization for each sentence
# `tokens` is a list with one list of word tokens per sentence.
tokens = [word_tokenize(sentence) for sentence in sentences]
print("Tokens:", tokens)
# Count tokens across ALL sentences; this is also safe when there are no sentences.
print("# of Tokens:", sum(len(token_list) for token_list in tokens))
print('--------------------------------------')

# Step 4: Remove stopwords for each list of tokens
# Build the stopword set once so each membership test is a constant-time lookup.
stop_words = set(stopwords.words('english'))
# Compare case-insensitively so the filter still works if a token is not already
# lowercased; for already-lowercased tokens this changes nothing.
filtered_tokens = [
    [word for word in token_list if word.lower() not in stop_words]
    for token_list in tokens
]
print("Filtered Tokens:", filtered_tokens)
# Report the total over every sentence, not just the first one; summing also
# avoids an IndexError when there are no sentences at all.
print("# of Filtered Tokens:", sum(len(filtered_list) for filtered_list in filtered_tokens))
print('--------------------------------------')

# Step 5: Stemming for each list of filtered tokens
# One PorterStemmer instance is reused for every token; the per-sentence nesting
# of `filtered_tokens` is preserved in `stemmed_tokens`.
ps = PorterStemmer()
stemmed_tokens = [[ps.stem(word) for word in filtered_list] for filtered_list in filtered_tokens]
print("Stemmed Tokens:", stemmed_tokens)
# Report the total over every sentence, not just the first one; summing also
# avoids an IndexError when there are no sentences at all.
print("# of Stemmed Tokens:", sum(len(stemmed_list) for stemmed_list in stemmed_tokens))
print('--------------------------------------')

# Step 6: Feature Extraction using CountVectorizer
# CountVectorizer takes one string per document, so each sentence's stemmed
# tokens are joined back into a single space-separated string.
flattened_stemmed_tokens = list(map(' '.join, stemmed_tokens))

# Fit the vocabulary and build the count matrix in one call.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(flattened_stemmed_tokens)

# Show the learned feature names, then the matrix converted to a dense array.
print("Feature Names:", vectorizer.get_feature_names_out())
print("Feature Matrix:\n", X.toarray())


Text: 
Natural Language Processing (NLP) is a fascinating field of artificial intelligence that focuses on the interaction between computers and humans through natural language. The ultimate objective of NLP is to enable computers to understand, interpret, and generate human language in a way that is both valuable and meaningful. 

NLP combines computational linguistics with machine learning and deep learning techniques to process and analyze large amounts of natural language data. Applications of NLP include sentiment analysis, language translation, chatbots, and information extraction. 

As technology continues to evolve, the capabilities of NLP systems are improving rapidly. For instance, recent advancements in transformer models, such as BERT and GPT, have significantly enhanced the performance of NLP tasks. These models are capable of understanding context and nuances in language, making them highly effective for various applications.

Despite the progress, challenges remain in th

[nltk_data] Downloading package punkt to C:\Users\Motaz
[nltk_data]     PC\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to C:\Users\Motaz
[nltk_data]     PC\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Motaz
[nltk_data]     PC\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
