#**Remove Stop Words**

In [None]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk

# One-time resource downloads: the stop-word lists and the tokenizer tables.
nltk.download('stopwords')
nltk.download('punkt_tab')

text = 'This is a sample Sentence'
words = word_tokenize(text)
print(words)

# Load the English stop-word list once and keep it as a set. Calling
# stopwords.words('english') inside the comprehension would rebuild the list
# for every token and scan it linearly on each membership test.
english_stopwords = set(stopwords.words('english'))

# Keep only the tokens that are not stop words. Tokens are lowercased for the
# comparison so that capitalised words such as 'This' are filtered too.
filtered = [w for w in words if w.lower() not in english_stopwords]
print(filtered)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


['This', 'is', 'a', 'sample', 'Sentence']
['sample', 'Sentence']


#**Tokenization**

In [None]:
from nltk.tokenize import word_tokenize

# Split the sentence into word-level tokens, then show the resulting list.
sentence_tokens = word_tokenize('This is a sentence')
print(sentence_tokens)


['This', 'is', 'a', 'sentence']


#**Stemming**

In [None]:
from nltk.stem import PorterStemmer

# Reduce a word to its stem with the Porter algorithm and print the result.
stemmer = PorterStemmer()
stemmed_word = stemmer.stem('Running')
print(stemmed_word)

run


#**Lemmatization**

In [None]:
from nltk.stem import WordNetLemmatizer

# The lemmatizer looks words up in WordNet, so fetch that corpus first.
# NOTE: `nltk` is not imported in this cell; it comes from an earlier cell.
nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()

# pos='v' asks for the word to be lemmatized as a verb.
lemma = lemmatizer.lemmatize('running', pos='v')
print(lemma)

[nltk_data] Downloading package wordnet to /root/nltk_data...


run


'v' means verb

'n' means noun (this is the default if you don’t specify pos)

'a' means adjective


#**Lemmatization vs. Stemming**

In [None]:
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
import nltk

# WordNet is needed by the lemmatizer.
nltk.download('wordnet')

stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

word1 = 'running'
word2 = 'better'
word3 = 'studies'

# Each sample word is paired with the part-of-speech tag used when
# lemmatizing it: verb, adjective and noun respectively.
examples = [(word1, 'v'), (word2, 'a'), (word3, 'n')]

print('Stemming:')
for word, _ in examples:
    print(stemmer.stem(word))

print('\nLemmatization')
for word, pos in examples:
    print(lemmatizer.lemmatize(word, pos=pos))



Stemming:
run
better
studi

Lemmatization
run
good
study


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


#**POS (Part-of-Speech) Tagging**

In [None]:
import nltk
from nltk.tokenize import word_tokenize

# Download the POS tagger model.
nltk.download('averaged_perceptron_tagger_eng')
# word_tokenize needs the 'punkt_tab' tokenizer tables (the same resource the
# other cells in this notebook download). Fetching it here keeps this cell
# runnable on its own instead of relying on a download made by another cell.
nltk.download('punkt_tab')

# Tokenize the sentence.
tokens = word_tokenize('This is a sample sentence')

# Get the POS tags: a list of (token, tag) pairs.
print(nltk.pos_tag(tokens))

[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


[('This', 'DT'), ('is', 'VBZ'), ('a', 'DT'), ('sample', 'JJ'), ('sentence', 'NN')]


#**Named Entity Recognition**

In [1]:
import spacy

# Load the small English pipeline and run it over the sentence.
nlp = spacy.load('en_core_web_sm')
doc = nlp('Barack Obama was born in Hawai')

# Collect every detected entity as a (text, label) pair.
entities = []
for ent in doc.ents:
    entities.append((ent.text, ent.label_))
print(entities)


[('Barack Obama', 'PERSON'), ('Hawai', 'GPE')]


In [6]:
import spacy  # spaCy provides ready-to-use NLP pipelines

# Run the small English pipeline on the sentence.
nlp = spacy.load("en_core_web_sm")
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")

# Pair each recognised entity's text with its label and print the list.
entity_pairs = [(entity.text, entity.label_) for entity in doc.ents]
print(entity_pairs)

[('Apple', 'ORG'), ('U.K.', 'GPE'), ('$1 billion', 'MONEY')]


#**Chunking**

In [5]:
import nltk

# Resources for tokenizing and POS tagging.
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')

# Noun-phrase rule: an optional determiner (DT), any number of adjectives
# (JJ), then a single noun (NN).
chunk_grammar = "NP: {<DT>?<JJ>*<NN>}"
cp = nltk.RegexpParser(chunk_grammar)

text = "The quick brown fox jumps over the lazy dog"
tokens = nltk.word_tokenize(text)
tags = nltk.pos_tag(tokens)

# Group the tagged tokens into NP chunks and draw the tree as ASCII art.
tree = cp.parse(tags)
tree.pretty_print()


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


                                     S                                 
     ________________________________|______________________            
    |        |              NP               NP             NP         
    |        |       _______|________        |       _______|______     
jumps/VBZ over/IN The/DT quick/JJ brown/NN fox/NN the/DT lazy/JJ dog/NN



#**Word Embedding or Vectorization**

In [8]:
from sklearn.feature_extraction.text import CountVectorizer

# Two tiny documents to turn into a document-term count matrix.
documents = ['This is a sample', 'This is another example']

cv = CountVectorizer()
# Learn the vocabulary and count each word per document in one step.
X = cv.fit_transform(documents)

# X is sparse; convert it to a dense array so the counts are readable.
print(X.toarray())


[[0 0 1 1 1]
 [1 1 1 0 1]]


- Index:   0    1      2      3        4
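- Words: ['another', 'example', 'is', 'sample', 'this']. Each row of the output is one sentence, and each column is the count of one vocabulary word in that sentence. In the first sentence, 'another' and 'example' are absent, so their counts are 0, while 'is', 'sample' and 'this' each appear once, so their counts are 1. The second row is read the same way. ('a' is missing from the vocabulary because CountVectorizer's default tokenizer ignores single-character tokens.)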

In [10]:
# Vocabulary learned by the fitted `cv` (the "bag of words").
vocabulary = cv.get_feature_names_out()
print(vocabulary)

['another' 'example' 'is' 'sample' 'this']


#**TF-IDF**

- TF = Term Frequency → How often a word appears in a document.

- IDF = Inverse Document Frequency → Measures how important a word is across all documents.

 - Words that appear in many documents get down-weighted.

- TF-IDF = TF × IDF → Higher value means the word is important for that document but rare across others.

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Same two documents as in the CountVectorizer example.
corpus = ["This is a sample", "This is another example"]

tf = TfidfVectorizer()
X = tf.fit_transform(corpus)

# First the learned vocabulary, then the TF-IDF weight matrix as a dense array.
print(tf.get_feature_names_out())
print(X.toarray())

['another' 'example' 'is' 'sample' 'this']
[[0.         0.         0.50154891 0.70490949 0.50154891]
 [0.57615236 0.57615236 0.40993715 0.         0.40993715]]


This is a sample

- another: 0
- example: 0
- is: ≈ 0.50 (TF-IDF score)
- sample: ≈ 0.70 (appears only in this document, so it is more distinctive and its TF-IDF score is higher)
- this: ≈ 0.50

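This is another example
- another: ≈ 0.58 (appears only in this document, so it scores higher than the shared words)
- example: ≈ 0.58 (also unique to this document, like 'sample' in the first one; it is lower than 0.70 only because each row is normalised to unit length and this document has two unique words)
- is: ≈ 0.41 (TF-IDF score)
- sample: 0
- this: ≈ 0.41 (like 'is', it appears in both documents, so both get the same, lower score)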

#**n-grams range with Countvectorizer**

In [5]:
from sklearn.feature_extraction.text import CountVectorizer

text = ["I love NLP"]

# ngram_range=(1, 3) asks for single words, bigrams and trigrams.
# NOTE: the vocabulary printed below contains no entry for "I" (and so no
# trigram). CountVectorizer's default token pattern skips single-character
# tokens, leaving only 'love' and 'nlp' to build n-grams from.
ngram_bounds = (1, 3)
cv = CountVectorizer(ngram_range=ngram_bounds)
print(cv)

# Sparse count matrix, followed by the n-grams that make up its columns.
X = cv.fit_transform(text)
print(X)
print(cv.get_feature_names_out())

CountVectorizer(ngram_range=(1, 3))
<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 3 stored elements and shape (1, 3)>
  Coords	Values
  (0, 0)	1
  (0, 2)	1
  (0, 1)	1
['love' 'love nlp' 'nlp']


- (1, 1) (default)	Only single words (unigrams)
- (1, 2)	Unigrams and bigrams
- (2, 2)	Only bigrams
- (1, 3)	Unigrams, bigrams, and trigrams

#**Sentiment Analysis**

In [6]:
from textblob import TextBlob

# The TextBlob library is used here for sentiment analysis: print the
# sentiment (polarity, subjectivity) of a short review.
blob = TextBlob("I love this product")
print(blob.sentiment)

Sentiment(polarity=0.5, subjectivity=0.6)


| Term             | Simple meaning                     |
| ---------------- | ---------------------------------- |
| **Polarity**     | How negative or positive it is     |
| **Subjectivity** | How much of it is opinion vs. fact |


 <b>Polarity: 0.5 </b> <br> <hr>
Range: from -1.0 (most negative) to +1.0 (most positive) <br>
Meaning: A value of 0.5 indicates a moderately positive sentiment. <br>
Negative: < 0 <br>
Neutral: 0 <br>
Positive: > 0 <br>

 <b>Subjectivity: 0.6 </b> <br> <hr>
Range: from 0.0 (objective) to 1.0 (subjective)<br>
Meaning: A value of 0.6 suggests the text is fairly subjective, meaning it's based more on personal opinion or feelings than on factual information.



| Term             | Range                      | Meaning                                      |
| ---------------- | -------------------------- | -------------------------------------------- |
| **Polarity**     | `-1.0` to `+1.0`           | How **negative** or **positive** the text is |
|                  | `-1.0` → Very negative     | Example: *“I hate this movie.”*              |
|                  | `0.0` → Neutral            | Example: *“It is a movie.”*                  |
|                  | `+1.0` → Very positive     | Example: *“I love this movie!”*              |
|                  |                            |                                              |
| **Subjectivity** | `0.0` to `1.0`             | How much is **opinion** vs. **fact**         |
|                  | `0.0` → Completely factual | Example: *“The sun rises in the east.”*      |
|                  | `1.0` → Fully opinionated  | Example: *“I think this is the best ever!”*  |


#**Text Classification with pipeline**

In [7]:
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Example text data (spam-detection style), kept as (message, label) pairs so
# each message sits next to its label.
samples = [
    ("Buy now", "spam"),
    ("Limited offer", "spam"),
    ("Call me later", "ham"),
    ("Meeting at noon", "ham"),
    ("Free coupons", "spam"),
    ("Let’s have lunch", "ham"),
    ("Win a prize now", "spam"),
    ("Project discussion tomorrow", "ham"),
]
texts = [message for message, _ in samples]
labels = [label for _, label in samples]

# Hold out 25% of the samples for testing; the fixed seed makes the split
# reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    texts,
    labels,
    test_size=0.25,
    random_state=42,
)

# Pipeline: bag-of-words counts feeding a multinomial Naive Bayes classifier.
steps = [
    ('vectorizer', CountVectorizer()),
    ('classifier', MultinomialNB()),
]
pipe = Pipeline(steps)

# Train on the training split, then predict the held-out messages.
pipe.fit(x_train, y_train)
preds = pipe.predict(x_test)

# Evaluate the model on the held-out messages.
accuracy = accuracy_score(y_test, preds)
print("Test predictions:", preds)
print("Actual labels:   ", y_test)
print("Accuracy:", accuracy)

# Try a new, unseen example.
new_text = ["Congratulations, you won a free ticket!"]
new_pred = pipe.predict(new_text)
print("\nNew text:", new_text[0])
print("Predicted label:", new_pred[0])


Test predictions: ['ham' 'ham']
Actual labels:    ['spam', 'ham']
Accuracy: 0.5

New text: Congratulations, you won a free ticket!
Predicted label: spam
