<a href="https://colab.research.google.com/github/paridhi-pandey/natural-language-processing/blob/main/NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Natural Language Processing

## Vocabulary in NLP

In [None]:
# A vocabulary is the collection of distinct tokens found in a corpus or model.
tokens = [
    'I', 'love', 'NLP',
    'I', 'love', 'Python',
]

# Unpacking into a set literal collapses the repeated 'I' and 'love'.
vocab = {*tokens}
print(vocab)

{'love', 'NLP', 'I', 'Python'}


## Converting a given corpus to a Sparse Matrix

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Three tiny documents to encode as a bag-of-words count matrix.
corpus = [
    "I love NLP",
    "NLP loves Python",
    "Python is great",
]

# Learn the vocabulary first, then encode the same documents as token counts.
# With default settings CountVectorizer lower-cases the text and keeps only
# tokens of two or more characters, so "I" does not become a feature.
vectorizer = CountVectorizer()
vectorizer.fit(corpus)
X = vectorizer.transform(corpus)

# vocabulary_ maps each term to its column index; X is sparse, so convert it
# to a dense array purely for display.
print("Vocabulary:", vectorizer.vocabulary_)
print("Feature Matrix:\n", X.toarray())


Vocabulary: {'love': 2, 'nlp': 4, 'loves': 3, 'python': 5, 'is': 1, 'great': 0}
Feature Matrix:
 [[0 0 1 0 1 0]
 [0 0 0 1 1 1]
 [1 1 0 0 0 1]]


## NLTK Library

Importing the NLTK library

In [None]:
import nltk


In [None]:
# Data packages used by the NLTK cells in this notebook.
#   punkt / punkt_tab                  -> word_tokenize (newer NLTK releases read
#                                         'punkt_tab', older ones read 'punkt')
#   stopwords                          -> stopwords.words('english')
#   wordnet                            -> WordNetLemmatizer
#   averaged_perceptron_tagger(_eng)   -> pos_tag (newer releases read the
#                                         '_eng' package, older ones the plain one)
# Fetching both variants keeps the notebook runnable on either NLTK generation
# without relying on a full nltk.download('all').
NLTK_RESOURCES = (
    'punkt',
    'punkt_tab',
    'stopwords',
    'wordnet',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
)

for resource in NLTK_RESOURCES:
    # Packages that are already present are reported as up-to-date.
    nltk.download(resource)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

Downloading all NLTK data packages (corpora, tokenizers, taggers, ...)

In [None]:
# Fetch every package in the NLTK data collection. This is far more than the
# cells in this notebook use (tokenizer, stopwords, WordNet, POS tagger).
nltk.download("all")

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/abc.zip.
[nltk_data]    | Downloading package alpino to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/alpino.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_eng to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers/averaged_perceptron_tagger_eng.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers/averaged_perceptron_tagger_ru.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_rus to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |  

True

## 1) Tokenization

(Using NLTK)

In [None]:
import nltk
from nltk.tokenize import word_tokenize
# word_tokenize relies on the Punkt sentence-tokenizer data. Newer NLTK
# releases load it from the 'punkt_tab' package while older ones load 'punkt',
# so fetch both to keep this cell self-contained on either version.
for resource in ('punkt', 'punkt_tab'):
    nltk.download(resource)

text = "NLTK is a powerful toolkit."

# Split the sentence into word-level tokens; the trailing '.' becomes its own token.
tokens = word_tokenize(text)
print(tokens)

['NLTK', 'is', 'a', 'powerful', 'toolkit', '.']


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


(Using spaCy)

In [None]:
import spacy
# Load spaCy's small English pipeline.
nlp = spacy.load("en_core_web_sm")

text = "Dr. Smith loves NLP. She's teaching it at MIT!"
doc = nlp(text)

# Word-level view: iterating the Doc yields one Token per word or punctuation mark.
word_tokens = [tok.text for tok in doc]
print("Word Tokens:", word_tokens)

# Sentence-level view: doc.sents yields one span per detected sentence.
sentences = [span.text for span in doc.sents]
print("Sentences:", sentences)


Word Tokens: ['Dr.', 'Smith', 'loves', 'NLP', '.', 'She', "'s", 'teaching', 'it', 'at', 'MIT', '!']
Sentences: ['Dr. Smith loves NLP.', "She's teaching it at MIT!"]


## 2) Lemmatization, Stopword Removal and POS Tagging

(Using NLTK)

In [None]:
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize
from nltk import pos_tag

# Shared helpers for the pipeline below: the English stopword list (held as a
# set for fast membership tests) and the WordNet-backed lemmatizer.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Function to convert NLTK POS tags to WordNet POS tags
def get_wordnet_pos(tag):
    """Translate a POS tag string into the matching WordNet POS constant.

    Args:
        tag: POS tag as produced by ``pos_tag`` (e.g. ``'JJ'``, ``'VBG'``).

    Returns:
        ``wordnet.ADJ`` for tags starting with ``'J'``, ``wordnet.VERB`` for
        ``'V'``, ``wordnet.ADV`` for ``'R'``, and ``wordnet.NOUN`` for noun
        tags and every other tag.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('R', wordnet.ADV),
    )
    for prefix, pos in prefix_to_pos:
        if tag.startswith(prefix):
            return pos
    # Noun tags ('N...') and unrecognised tags share the same result.
    return wordnet.NOUN

# Sentence to normalise.
text = "The students are studying harder than ever and running to classes."

# Tokenize, then tag every token with its part of speech.
tokens = word_tokenize(text)
pos_tags = pos_tag(tokens)

# Lower-case each token, keep only alphabetic non-stopwords, and lemmatize
# every survivor using the WordNet POS derived from its tag.
lowered = [(word.lower(), tag) for word, tag in pos_tags]
filtered = [
    lemmatizer.lemmatize(word, pos=get_wordnet_pos(tag))
    for word, tag in lowered
    if word not in stop_words and word.isalpha()
]

print("Lemmas after stopword removal and POS tagging:", filtered)


Lemmas after stopword removal and POS tagging: ['student', 'study', 'hard', 'ever', 'run', 'class']


(Using spaCy)

In [None]:
import spacy

# spaCy's small English pipeline handles tokenization, tagging and lemmatization.
nlp = spacy.load("en_core_web_sm")

# Sentence to normalise.
text = "The students are studying harder than ever and running to classes."

# Run the full pipeline over the sentence.
doc = nlp(text)

# Collect the lemma of every token that is alphabetic and not a stopword.
filtered = []
for token in doc:
    if token.is_stop or not token.is_alpha:
        continue  # skip stopwords, punctuation and other non-alphabetic tokens
    filtered.append(token.lemma_)

print("Lemmas after stopword removal and POS tagging:", filtered)


## Loading Stopword Lists

(Using NLTK)

In [None]:
import nltk
from nltk.corpus import stopwords

# Requires the NLTK 'stopwords' corpus; run nltk.download('stopwords') once
# if it has not been fetched yet.

# English stopwords, collected into a set.
nltk_stopwords = set(stopwords.words('english'))

# Show ten entries (set order carries no meaning) followed by the total count.
nltk_sample = list(nltk_stopwords)[:10]
nltk_total = len(nltk_stopwords)
print("NLTK Stopwords (sample):", nltk_sample)
print("Total NLTK stopwords:", nltk_total)


NLTK Stopwords (sample): ['just', 'very', 'when', 'ma', 'your', 'o', 'up', "wasn't", "shouldn't", 'each']
Total NLTK stopwords: 198


(Using spaCy)

In [None]:
import spacy

# The stopword list is exposed through the loaded pipeline's language defaults.
nlp = spacy.load("en_core_web_sm")
spacy_stopwords = nlp.Defaults.stop_words

# Show ten entries (in whatever order the collection yields them) and the total count.
spacy_sample = list(spacy_stopwords)[:10]
spacy_total = len(spacy_stopwords)
print("spaCy Stopwords (sample):", spacy_sample)
print("Total spaCy stopwords:", spacy_total)

spaCy Stopwords (sample): ['’d', 'however', 'us', 'from', 'made', 'hundred', 'never', 're', 'ten', "'ll"]
Total spaCy stopwords: 326


## MINI PROJECT TASK

(Using spaCy)

In [None]:
import spacy

# Load spaCy's small English pipeline.
nlp = spacy.load("en_core_web_sm")

# Stopword list from the English language defaults; used below to re-check
# each token after it has been lemmatized.
spacy_stopwords = nlp.Defaults.stop_words

# Sample tweets
tweets = [
    "I'm loving the new iPhone! 😍 #Apple",
    "So tired of this rain... ☔ #mood",
    "Can't wait for the weekend. Gonna relax! 💆‍♀️ #TGIF"
]

print("Processed Tweets:\n")

for tweet in tweets:
    doc = nlp(tweet)

    cleaned_tokens = []
    for token in doc:
        # Drop stopwords, punctuation, emoji and other non-alphabetic tokens.
        if token.is_stop or not token.is_alpha:
            continue

        lemma = token.lemma_.lower()

        # is_stop looks at the surface form only. A fragment such as the "na"
        # of "Gonna" passes that check, yet its lemma is the stopword "to",
        # so the lemma has to be filtered as well.
        if lemma in spacy_stopwords:
            continue

        cleaned_tokens.append(lemma)

    print(cleaned_tokens)

Processed Tweets:

['love', 'new', 'iphone', 'apple']
['tired', 'rain', 'mood']
['wait', 'weekend', 'go', 'to', 'relax', 'tgif']


(Using NLTK)

In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords, wordnet
from nltk import pos_tag

# Helpers for the tweet pipeline: the English stopword list (held as a set for
# fast membership tests) and the WordNet-backed lemmatizer.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# POS tag mapper
def get_wordnet_pos(tag):
    """Translate a POS tag string into the matching WordNet POS constant.

    Args:
        tag: POS tag as produced by ``pos_tag`` (e.g. ``'JJ'``, ``'VBG'``).

    Returns:
        ``wordnet.ADJ`` for tags starting with ``'J'``, ``wordnet.VERB`` for
        ``'V'``, ``wordnet.ADV`` for ``'R'``, and ``wordnet.NOUN`` for noun
        tags and every other tag.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('R', wordnet.ADV),
    )
    for prefix, pos in prefix_to_pos:
        if tag.startswith(prefix):
            return pos
    # Noun tags ('N...') and unrecognised tags share the same result.
    return wordnet.NOUN

# Sample tweets to clean.
tweets = [
    "I'm loving the new iPhone! 😍 #Apple",
    "So tired of this rain... ☔ #mood",
    "Can't wait for the weekend. Gonna relax! 💆‍♀️ #TGIF"
]

print("Processed Tweets:\n")

for tweet in tweets:
    # Tokenize the tweet and tag every token with its part of speech.
    tokens = word_tokenize(tweet)
    pos_tags = pos_tag(tokens)

    # Lower-case each token, keep alphabetic non-stopwords only, and lemmatize
    # the survivors using the WordNet POS derived from their tags.
    lowered = [(word.lower(), tag) for word, tag in pos_tags]
    cleaned_tokens = [
        lemmatizer.lemmatize(word, get_wordnet_pos(tag))
        for word, tag in lowered
        if word.isalpha() and word not in stop_words
    ]

    print(cleaned_tokens)


Processed Tweets:

['love', 'new', 'iphone', 'apple']
['tired', 'rain', 'mood']
['ca', 'wait', 'weekend', 'gon', 'na', 'relax', 'tgif']
