In [1]:
!pip install nltk



In [2]:
import nltk

nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [3]:
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk import pos_tag

In [4]:
text = """
There is often a fierce rivalry between the two strongest teams in a national league, and this is particularly the case in La Liga, where the game between Real Madrid and Barcelona is known as "The Classic" (El Clásico). From the start of national competitions the clubs were seen as representatives of two rival regions in Spain: Castile and Catalonia, as well as of the two cities. The rivalry reflects what many regard as the political and cultural tensions felt between the Castilians and Catalans, seen by one author as a re-enactment of the Spanish Civil War.[254] Over the years, the record for Real Madrid and Barcelona is 105 victories for Madrid, 104 victories for Barcelona, and 52 draws as of June 2025.
"""


# Sentence Tokenizer

In [5]:
sentences = sent_tokenize(text)
print("Sentences:")
for s in sentences:
    print("-", s)

Sentences:
- 
There is often a fierce rivalry between the two strongest teams in a national league, and this is particularly the case in La Liga, where the game between Real Madrid and Barcelona is known as "The Classic" (El Clásico).
- From the start of national competitions the clubs were seen as representatives of two rival regions in Spain: Castile and Catalonia, as well as of the two cities.
- The rivalry reflects what many regard as the political and cultural tensions felt between the Castilians and Catalans, seen by one author as a re-enactment of the Spanish Civil War.
- [254] Over the years, the record for Real Madrid and Barcelona is 105 victories for Madrid, 104 victories for Barcelona, and 52 draws as of June 2025.


# Word Tokenizer

In [6]:
words = word_tokenize(text)
print("Words:")
print(words)

Words:
['There', 'is', 'often', 'a', 'fierce', 'rivalry', 'between', 'the', 'two', 'strongest', 'teams', 'in', 'a', 'national', 'league', ',', 'and', 'this', 'is', 'particularly', 'the', 'case', 'in', 'La', 'Liga', ',', 'where', 'the', 'game', 'between', 'Real', 'Madrid', 'and', 'Barcelona', 'is', 'known', 'as', '``', 'The', 'Classic', "''", '(', 'El', 'Clásico', ')', '.', 'From', 'the', 'start', 'of', 'national', 'competitions', 'the', 'clubs', 'were', 'seen', 'as', 'representatives', 'of', 'two', 'rival', 'regions', 'in', 'Spain', ':', 'Castile', 'and', 'Catalonia', ',', 'as', 'well', 'as', 'of', 'the', 'two', 'cities', '.', 'The', 'rivalry', 'reflects', 'what', 'many', 'regard', 'as', 'the', 'political', 'and', 'cultural', 'tensions', 'felt', 'between', 'the', 'Castilians', 'and', 'Catalans', ',', 'seen', 'by', 'one', 'author', 'as', 'a', 're-enactment', 'of', 'the', 'Spanish', 'Civil', 'War', '.', '[', '254', ']', 'Over', 'the', 'years', ',', 'the', 'record', 'for', 'Real', 'Madrid

# Stopword Removal

In [7]:
stop_words = set(stopwords.words('english'))

filtered_words = [w for w in words if w.isalpha() and w.lower() not in stop_words]
print("After Stopword Removal:")
print(filtered_words)

After Stopword Removal:
['often', 'fierce', 'rivalry', 'two', 'strongest', 'teams', 'national', 'league', 'particularly', 'case', 'La', 'Liga', 'game', 'Real', 'Madrid', 'Barcelona', 'known', 'Classic', 'El', 'Clásico', 'start', 'national', 'competitions', 'clubs', 'seen', 'representatives', 'two', 'rival', 'regions', 'Spain', 'Castile', 'Catalonia', 'well', 'two', 'cities', 'rivalry', 'reflects', 'many', 'regard', 'political', 'cultural', 'tensions', 'felt', 'Castilians', 'Catalans', 'seen', 'one', 'author', 'Spanish', 'Civil', 'War', 'years', 'record', 'Real', 'Madrid', 'Barcelona', 'victories', 'Madrid', 'victories', 'Barcelona', 'draws', 'June']


# Stemming

In [8]:
stemmer = PorterStemmer()

stemmed_words = [stemmer.stem(w) for w in filtered_words]
print("Stemmed Words:")
print(stemmed_words)

Stemmed Words:
['often', 'fierc', 'rivalri', 'two', 'strongest', 'team', 'nation', 'leagu', 'particularli', 'case', 'la', 'liga', 'game', 'real', 'madrid', 'barcelona', 'known', 'classic', 'el', 'clásico', 'start', 'nation', 'competit', 'club', 'seen', 'repres', 'two', 'rival', 'region', 'spain', 'castil', 'catalonia', 'well', 'two', 'citi', 'rivalri', 'reflect', 'mani', 'regard', 'polit', 'cultur', 'tension', 'felt', 'castilian', 'catalan', 'seen', 'one', 'author', 'spanish', 'civil', 'war', 'year', 'record', 'real', 'madrid', 'barcelona', 'victori', 'madrid', 'victori', 'barcelona', 'draw', 'june']


# Lemmatization

In [9]:
lemmatizer = WordNetLemmatizer()

lemmatized_words = [lemmatizer.lemmatize(w) for w in filtered_words]
print("Lemmatized Words:")
print(lemmatized_words)

Lemmatized Words:
['often', 'fierce', 'rivalry', 'two', 'strongest', 'team', 'national', 'league', 'particularly', 'case', 'La', 'Liga', 'game', 'Real', 'Madrid', 'Barcelona', 'known', 'Classic', 'El', 'Clásico', 'start', 'national', 'competition', 'club', 'seen', 'representative', 'two', 'rival', 'region', 'Spain', 'Castile', 'Catalonia', 'well', 'two', 'city', 'rivalry', 'reflects', 'many', 'regard', 'political', 'cultural', 'tension', 'felt', 'Castilians', 'Catalans', 'seen', 'one', 'author', 'Spanish', 'Civil', 'War', 'year', 'record', 'Real', 'Madrid', 'Barcelona', 'victory', 'Madrid', 'victory', 'Barcelona', 'draw', 'June']


# POS Tagging

In [10]:
import nltk
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


True

In [11]:
pos_tags = pos_tag(filtered_words)

print("POS Tagged Words:")
for word, tag in pos_tags:
    print(f"{word} → {tag}")

POS Tagged Words:
often → RB
fierce → JJ
rivalry → NN
two → CD
strongest → JJS
teams → NNS
national → JJ
league → NN
particularly → RB
case → NN
La → NNP
Liga → NNP
game → NN
Real → NNP
Madrid → NNP
Barcelona → NNP
known → VBN
Classic → NNP
El → NNP
Clásico → NNP
start → VBP
national → JJ
competitions → NNS
clubs → VBP
seen → VBN
representatives → NNS
two → CD
rival → JJ
regions → NNS
Spain → NNP
Castile → NNP
Catalonia → NNP
well → RB
two → CD
cities → NNS
rivalry → VBP
reflects → VBZ
many → JJ
regard → RB
political → JJ
cultural → JJ
tensions → NNS
felt → VBD
Castilians → NNPS
Catalans → NNPS
seen → VBN
one → CD
author → NN
Spanish → JJ
Civil → NNP
War → NNP
years → NNS
record → NN
Real → NNP
Madrid → NNP
Barcelona → NNP
victories → NNS
Madrid → NNP
victories → NNS
Barcelona → NNP
draws → VBZ
June → NNP


# Word Frequency

In [12]:
from collections import Counter

word_freq = Counter(lemmatized_words)
print("Word Frequency:")
print(word_freq)

Word Frequency:
Counter({'two': 3, 'Madrid': 3, 'Barcelona': 3, 'rivalry': 2, 'national': 2, 'Real': 2, 'seen': 2, 'victory': 2, 'often': 1, 'fierce': 1, 'strongest': 1, 'team': 1, 'league': 1, 'particularly': 1, 'case': 1, 'La': 1, 'Liga': 1, 'game': 1, 'known': 1, 'Classic': 1, 'El': 1, 'Clásico': 1, 'start': 1, 'competition': 1, 'club': 1, 'representative': 1, 'rival': 1, 'region': 1, 'Spain': 1, 'Castile': 1, 'Catalonia': 1, 'well': 1, 'city': 1, 'reflects': 1, 'many': 1, 'regard': 1, 'political': 1, 'cultural': 1, 'tension': 1, 'felt': 1, 'Castilians': 1, 'Catalans': 1, 'one': 1, 'author': 1, 'Spanish': 1, 'Civil': 1, 'War': 1, 'year': 1, 'record': 1, 'draw': 1, 'June': 1})


# Named NLP Pipeline Function

In [13]:
def nlp_pipeline(text):
    words = word_tokenize(text.lower())
    words = [w for w in words if w.isalpha()]
    
    words = [w for w in words if w not in stop_words]
    lemmas = [lemmatizer.lemmatize(w) for w in words]
    pos = pos_tag(lemmas)
    
    return {
        "tokens": words,
        "lemmatized": lemmas,
        "pos_tags": pos,
        "frequency": Counter(lemmas)
    }

In [14]:
result = nlp_pipeline(text)

print("Tokens:", result["tokens"])
print("\nPOS Tags:", result["pos_tags"])
print("\nWord Frequency:", result["frequency"])

Tokens: ['often', 'fierce', 'rivalry', 'two', 'strongest', 'teams', 'national', 'league', 'particularly', 'case', 'la', 'liga', 'game', 'real', 'madrid', 'barcelona', 'known', 'classic', 'el', 'clásico', 'start', 'national', 'competitions', 'clubs', 'seen', 'representatives', 'two', 'rival', 'regions', 'spain', 'castile', 'catalonia', 'well', 'two', 'cities', 'rivalry', 'reflects', 'many', 'regard', 'political', 'cultural', 'tensions', 'felt', 'castilians', 'catalans', 'seen', 'one', 'author', 'spanish', 'civil', 'war', 'years', 'record', 'real', 'madrid', 'barcelona', 'victories', 'madrid', 'victories', 'barcelona', 'draws', 'june']

POS Tags: [('often', 'RB'), ('fierce', 'JJ'), ('rivalry', 'NN'), ('two', 'CD'), ('strongest', 'JJS'), ('team', 'NN'), ('national', 'JJ'), ('league', 'NN'), ('particularly', 'RB'), ('case', 'NN'), ('la', 'FW'), ('liga', 'FW'), ('game', 'NN'), ('real', 'JJ'), ('madrid', 'JJ'), ('barcelona', 'NN'), ('known', 'VBN'), ('classic', 'JJ'), ('el', 'NN'), ('clásico