In [8]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# Download required NLTK resources
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')

# Input text
text = 'Real madrid is set to win the UCL for the season. Benzema might win Balon dor. Salah might be the runner up'

# ----------------------------
# Tokenization
# ----------------------------
# Sentence Tokenization
tokens_sents = sent_tokenize(text)
print("Sentence Tokens:\n", tokens_sents)

# Word Tokenization
tokens_words = word_tokenize(text)
print("\nWord Tokens:\n", tokens_words)

# ----------------------------
# Stemming
# ----------------------------
ps = PorterStemmer()
stem = [ps.stem(word) for word in tokens_words]
print("\nStemmed Words:\n", stem)

# ----------------------------
# Lemmatization
# ----------------------------
lemmatizer = WordNetLemmatizer()
leme = [lemmatizer.lemmatize(word) for word in stem]
print("\nLemmatized Words:\n", leme)

# ----------------------------
# POS Tagging
# ----------------------------
pos_tags = nltk.pos_tag(tokens_words)
print("\nParts of Speech:\n", pos_tags)

# ----------------------------
# Stop Word Removal
# ----------------------------
sw_nltk = stopwords.words('english')
filtered_words = [word for word in tokens_words if word.lower() not in sw_nltk]
filtered_text = " ".join(filtered_words)
print("\nText without Stopwords:\n", filtered_text)

# ----------------------------
# TF-IDF Vectorization
# ----------------------------
corpus = [filtered_text]

tfidf_vectorizer = TfidfVectorizer()
X_tfidf = tfidf_vectorizer.fit_transform(corpus)

print("\nTF-IDF Features:")
for word, score in zip(tfidf_vectorizer.get_feature_names_out(), X_tfidf.toarray()[0]):
    print(f"{word}: {score:.4f}")

# ----------------------------
# Bag of Words (Count Vectorization)
# ----------------------------
count_vectorizer = CountVectorizer()
X_bow = count_vectorizer.fit_transform(corpus)

print("\nBag of Words (Word Counts):")
for word, count in zip(count_vectorizer.get_feature_names_out(), X_bow.toarray()[0]):
    print(f"{word}: {count}")

Sentence Tokens:
 ['Real madrid is set to win the UCL for the season.', 'Benzema might win Balon dor.', 'Salah might be the runner up']

Word Tokens:
 ['Real', 'madrid', 'is', 'set', 'to', 'win', 'the', 'UCL', 'for', 'the', 'season', '.', 'Benzema', 'might', 'win', 'Balon', 'dor', '.', 'Salah', 'might', 'be', 'the', 'runner', 'up']

Stemmed Words:
 ['real', 'madrid', 'is', 'set', 'to', 'win', 'the', 'ucl', 'for', 'the', 'season', '.', 'benzema', 'might', 'win', 'balon', 'dor', '.', 'salah', 'might', 'be', 'the', 'runner', 'up']

Lemmatized Words:
 ['real', 'madrid', 'is', 'set', 'to', 'win', 'the', 'ucl', 'for', 'the', 'season', '.', 'benzema', 'might', 'win', 'balon', 'dor', '.', 'salah', 'might', 'be', 'the', 'runner', 'up']

Parts of Speech:
 [('Real', 'JJ'), ('madrid', 'NN'), ('is', 'VBZ'), ('set', 'VBN'), ('to', 'TO'), ('win', 'VB'), ('the', 'DT'), ('UCL', 'NNP'), ('for', 'IN'), ('the', 'DT'), ('season', 'NN'), ('.', '.'), ('Benzema', 'NNP'), ('might', 'MD'), ('win', 'VB'), ('Balo

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
