# Stemming in NLP

In [26]:
#pip install nltk

In [27]:
import nltk
import warnings
warnings.filterwarnings('ignore')

In [28]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [29]:
word=['change','changing','changes','changed']

In [30]:
word

['change', 'changing', 'changes', 'changed']

In [31]:
from nltk.stem import PorterStemmer

In [32]:
p=PorterStemmer()

In [33]:
for w in word:
    print(p.stem(w))

chang
chang
chang
chang


In [34]:
for w in word:
    print(w,p.stem(w))

change chang
changing chang
changes chang
changed chang


In [35]:
sen='The constant flux of life necessitates embraching change,whether its adapting to the changes around us or actively changing ourselves to meet new challenges. '

In [36]:
sen

'The constant flux of life necessitates embraching change,whether its adapting to the changes around us or actively changing ourselves to meet new challenges. '

In [37]:
from nltk.tokenize import word_tokenize

In [38]:
token=word_tokenize(sen)

In [39]:
token

['The',
 'constant',
 'flux',
 'of',
 'life',
 'necessitates',
 'embraching',
 'change',
 ',',
 'whether',
 'its',
 'adapting',
 'to',
 'the',
 'changes',
 'around',
 'us',
 'or',
 'actively',
 'changing',
 'ourselves',
 'to',
 'meet',
 'new',
 'challenges',
 '.']

In [40]:
#sen.split()

In [41]:
# Stem every token of the sample sentence. As the output shows, the stems are
# lower-cased ('The' -> 'the') and need not be dictionary words
# ('necessitates' -> 'necessit', 'ourselves' -> 'ourselv').
for w in token:
    print(p.stem(w))

the
constant
flux
of
life
necessit
embrach
chang
,
whether
it
adapt
to
the
chang
around
us
or
activ
chang
ourselv
to
meet
new
challeng
.


# Lemmatization in NLP

In [42]:
from nltk.stem import WordNetLemmatizer

In [43]:
le=WordNetLemmatizer()

In [44]:
token

['The',
 'constant',
 'flux',
 'of',
 'life',
 'necessitates',
 'embraching',
 'change',
 ',',
 'whether',
 'its',
 'adapting',
 'to',
 'the',
 'changes',
 'around',
 'us',
 'or',
 'actively',
 'changing',
 'ourselves',
 'to',
 'meet',
 'new',
 'challenges',
 '.']

In [45]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [46]:
# lemmatize() is called without a `pos` argument, so NLTK falls back to its
# default part of speech (noun). That is why plural nouns are reduced
# ('changes' -> 'change') while verb forms such as 'changing' stay unchanged,
# and why 'us' is turned into 'u' in the output.
for w in token:
    print(le.lemmatize(w))

The
constant
flux
of
life
necessitates
embraching
change
,
whether
it
adapting
to
the
change
around
u
or
actively
changing
ourselves
to
meet
new
challenge
.


# Tokenization in NLP

In [47]:
from nltk.tokenize import word_tokenize,sent_tokenize
# NOTE: the sample text has no space after each full stop, so the tokenizers do
# not detect sentence boundaries here: 'bangladesh.i' and 'Nlp.it' come out as
# single word tokens and sent_tokenize returns the whole string as one sentence.
sentence= 'i am from bangladesh.i am learning Nlp.it is fascinating'
word_tokens=word_tokenize(sentence)  # word-level tokens
sentence_tokens=sent_tokenize(sentence)  # sentence-level tokens
print(word_tokens)
print(sentence_tokens)

['i', 'am', 'from', 'bangladesh.i', 'am', 'learning', 'Nlp.it', 'is', 'fascinating']
['i am from bangladesh.i am learning Nlp.it is fascinating']


# spaCy

spaCy is fast and efficient at runtime, making it a good choice for building production-level NLP applications. 

In [48]:
#!pip install spacy
# python -m spacy download en_core_web_sm   < installed in conda

In [49]:
import spacy

spc = spacy.load('en_core_web_sm')  # Load the English language model

sentence = "I'm from aiQuest Intelligence. I am learning NLP. It is fascinating!"
doc = spc(sentence)  # run the loaded pipeline on the raw string

# Collect the text of every token in the processed document; as the output
# shows, punctuation and the contraction "'m" become separate tokens.
word_tokens = [token.text for token in doc]

print(word_tokens)

['I', "'m", 'from', 'aiQuest', 'Intelligence', '.', 'I', 'am', 'learning', 'NLP', '.', 'It', 'is', 'fascinating', '!']


# Transformers

Transformers is a library built by Hugging Face that provides state-of-the-art pre-trained models for NLP. It offers various functionalities, including tokenization. To install it, run `pip install transformers`. Here is an example of tokenization using Transformers:

In [50]:
from transformers import AutoTokenizer

# Load the tokenizer that belongs to the 'bert-base-uncased' checkpoint.
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

sentence = "I'm from aiQuest Intelligence. I am learning NLP. It is fascinating!"
# tokenize() returns sub-word pieces. In the output below the text is
# lower-cased and words are split into pieces, with '##' marking a
# continuation of the previous piece (e.g. 'aiQuest' -> 'ai', '##quest').
tokens = tokenizer.tokenize(sentence)

print(tokens)

['i', "'", 'm', 'from', 'ai', '##quest', 'intelligence', '.', 'i', 'am', 'learning', 'nl', '##p', '.', 'it', 'is', 'fascinating', '!']


# Named Entity Tokenization using NLTK

NLTK (Natural Language Toolkit) provides named entity recognition (NER), which can be used to pull named-entity tokens out of text. Here is an example of how to extract named-entity tokens from a sentence using NLTK:

In [51]:
import nltk
from nltk import word_tokenize, pos_tag, ne_chunk

# Make sure every model this cell relies on is available *before* it is used,
# so the cell also works on a fresh kernel (Restart & Run All).
# quiet=True keeps the download log out of the cell output.
for resource in ('punkt', 'averaged_perceptron_tagger', 'maxent_ne_chunker', 'words'):
    nltk.download(resource, quiet=True)

sentence = "I'm from aiQuest Intelligence. I am learning NLP. It is fascinating!, Hasan vai, my name is Joe"

tokens = word_tokenize(sentence)  # Tokenize the sentence into words

pos_tags = pos_tag(tokens)  # Perform part-of-speech tagging

ner_tags = ne_chunk(pos_tags)  # Perform named entity recognition

# Iterating over the chunker result yields either plain (word, tag) tuples or
# labelled sub-trees; only the labelled ones are named entities. For each of
# those, join the words of its leaves back into a single string.
named_entity_tokens = [
    ' '.join(leaf[0] for leaf in chunk)
    for chunk in ner_tags
    if hasattr(chunk, 'label')
]

print(named_entity_tokens)

['aiQuest Intelligence', 'NLP', 'Hasan', 'Joe']


In [52]:
# Fetch the NLTK data packages used in this notebook: wordnet for the
# lemmatizer, and the tagger / chunker / word list for pos_tag and ne_chunk.
# NOTE(review): the NER cell above already calls pos_tag and ne_chunk, so in a
# top-to-bottom run this cell logically belongs ahead of it.
nltk.download('wordnet')
nltk.download('maxent_ne_chunker')  # Download the required resource (NER models)
nltk.download('words')  # Download the required resource (word corpus) 
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [53]:
# Tokenize and POS-tag a second sample sentence. This rebinds the notebook-level
# `tokens` and `pos_tags`; the resulting tags are not displayed by this cell.
sentence2 = "Shakil Lives in Germany"
tokens = word_tokenize(sentence2)
pos_tags = pos_tag(tokens)

# Text Vectorizer

In [54]:
import pandas as pd
df = pd.read_excel('data.xlsx')

In [55]:
df

Unnamed: 0,text,class
0,"Hey, I love Bangladesh;",1
1,"Good afternoon, I am happy!",1
2,I live in Germany,1
3,Nice to meet you man-,1
4,You won an iPhone,0


# Text Processing

In [56]:
from nltk.corpus import stopwords

# The stopword corpus must be on disk before stopwords.words() is called.
# Downloading it here (quietly, so the cell output is unchanged) lets this
# cell run on a fresh kernel instead of depending on a later download cell.
nltk.download('stopwords', quiet=True)

# Keep the English stopwords in a set for fast membership tests.
en_stopwords = set(stopwords.words('english'))
en_stopwords

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

In [57]:
nltk.download('stopwords')
  

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [58]:
stopwords.fileids()

['arabic',
 'azerbaijani',
 'basque',
 'bengali',
 'catalan',
 'chinese',
 'danish',
 'dutch',
 'english',
 'finnish',
 'french',
 'german',
 'greek',
 'hebrew',
 'hinglish',
 'hungarian',
 'indonesian',
 'italian',
 'kazakh',
 'nepali',
 'norwegian',
 'portuguese',
 'romanian',
 'russian',
 'slovene',
 'spanish',
 'swedish',
 'tajik',
 'turkish']

In [59]:
import string
string.punctuation 

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [60]:
len(string.punctuation)

32

In [61]:
li = [1,2,3,4,54]
[l for l in li ]

[1, 2, 3, 4, 54]

In [62]:
[l for l in li if l%2==0]

[2, 4, 54]

In [63]:
df

Unnamed: 0,text,class
0,"Hey, I love Bangladesh;",1
1,"Good afternoon, I am happy!",1
2,I live in Germany,1
3,Nice to meet you man-,1
4,You won an iPhone,0


In [64]:
def preprocess_text(text, stop_words=None):
    """Strip punctuation from ``text`` and drop stopwords.

    The text is split on whitespace. Every ``string.punctuation`` character
    is removed from each token, and tokens that end up empty are skipped.
    A token is dropped when it is a stopword (case-insensitive); the
    surviving tokens keep their original casing.

    Contractions such as "don't" are checked against the stopword list
    *before* their apostrophe is removed. Comparing only the stripped form
    ("dont") would never match the "don't" entry in the list.

    Parameters
    ----------
    text : str
        Raw input text.
    stop_words : collection of str, optional
        Lower-case stopwords to remove. Defaults to the notebook-level
        ``en_stopwords`` set.

    Returns
    -------
    list of str
        Cleaned tokens in their original order.
    """
    if stop_words is None:
        stop_words = en_stopwords  # stopword = stopwords.words('english')

    # Translation table that deletes every punctuation character.
    drop_punctuation = str.maketrans('', '', string.punctuation)

    kept = []
    for raw_token in text.split():
        cleaned = raw_token.translate(drop_punctuation)
        if not cleaned:
            # The token consisted of punctuation only (e.g. a lone '-').
            continue
        # Only surrounding punctuation removed, so "don't," matches "don't".
        if raw_token.strip(string.punctuation).lower() in stop_words:
            continue
        # All punctuation removed, so "I," or "(the)" match "i" / "the".
        if cleaned.lower() in stop_words:
            continue
        kept.append(cleaned)
    return kept

In [65]:
# Replace each raw string in df['text'] with its cleaned token list.
# NOTE(review): the column is overwritten in place, so re-running this cell
# would hand the token lists (not the raw strings) back to preprocess_text.
df['text'] = df['text'].apply(preprocess_text) 

In [66]:
df['text']

0     [Hey, love, Bangladesh]
1    [Good, afternoon, happy]
2             [live, Germany]
3           [Nice, meet, man]
4                    [iPhone]
Name: text, dtype: object

In [67]:
lemmatizer = WordNetLemmatizer()

def lemmatize_text(text):
    """Lemmatize every item of ``text`` and return them as one string.

    ``text`` is iterated item by item (in this notebook, a list of word
    tokens). Each item is passed to the notebook-level ``lemmatizer`` and
    the resulting lemmas are joined with single spaces.
    """
    lemmas = []
    for token in text:
        lemmas.append(lemmatizer.lemmatize(token))
    return ' '.join(lemmas)

In [68]:
# Turn each token list back into one space-joined string of lemmas.
# NOTE(review): df['text'] is overwritten again; re-running this cell would
# iterate over the characters of the already-joined strings.
df['text'] = df['text'].apply(lemmatize_text)
df.head()

Unnamed: 0,text,class
0,Hey love Bangladesh,1
1,Good afternoon happy,1
2,live Germany,1
3,Nice meet man,1
4,iPhone,0


# CountVectorizer

In [69]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [70]:
cv = CountVectorizer()

In [71]:
cv_x = cv.fit_transform(df['text'])
cv_x

<5x12 sparse matrix of type '<class 'numpy.int64'>'
	with 12 stored elements in Compressed Sparse Row format>

In [72]:
cv_x.toarray()

array([[0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0],
       [1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1],
       [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0]], dtype=int64)

In [73]:
cv_df = pd.DataFrame(cv_x.toarray())
cv_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,0,1,0,0,0,1,0,0,1,0,0,0
1,1,0,0,1,1,0,0,0,0,0,0,0
2,0,0,1,0,0,0,0,1,0,0,0,0
3,0,0,0,0,0,0,0,0,0,1,1,1
4,0,0,0,0,0,0,1,0,0,0,0,0


In [74]:
cv.get_feature_names_out()

array(['afternoon', 'bangladesh', 'germany', 'good', 'happy', 'hey',
       'iphone', 'live', 'love', 'man', 'meet', 'nice'], dtype=object)

In [75]:
cv_df = pd.DataFrame(cv_x.toarray(), index=df['text'], columns=cv.get_feature_names_out())

In [76]:
cv_df

Unnamed: 0_level_0,afternoon,bangladesh,germany,good,happy,hey,iphone,live,love,man,meet,nice
text,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Hey love Bangladesh,0,1,0,0,0,1,0,0,1,0,0,0
Good afternoon happy,1,0,0,1,1,0,0,0,0,0,0,0
live Germany,0,0,1,0,0,0,0,1,0,0,0,0
Nice meet man,0,0,0,0,0,0,0,0,0,1,1,1
iPhone,0,0,0,0,0,0,1,0,0,0,0,0


# TfidfVectorizer

In [77]:
tf = TfidfVectorizer()

In [78]:
tf_z = tf.fit_transform(df['text'])

In [79]:
tf_z

<5x12 sparse matrix of type '<class 'numpy.float64'>'
	with 12 stored elements in Compressed Sparse Row format>

In [80]:
# Label the TF-IDF matrix with the cleaned text as index and the vocabulary as columns.
# NOTE(review): this rebinds `cv_df`, which previously held the CountVectorizer
# table; a separate name (e.g. `tfidf_df`) would keep both tables available.
cv_df = pd.DataFrame(tf_z.toarray(), index=df['text'], columns=tf.get_feature_names_out())

In [81]:
cv_df

Unnamed: 0_level_0,afternoon,bangladesh,germany,good,happy,hey,iphone,live,love,man,meet,nice
text,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Hey love Bangladesh,0.0,0.57735,0.0,0.0,0.0,0.57735,0.0,0.0,0.57735,0.0,0.0,0.0
Good afternoon happy,0.57735,0.0,0.0,0.57735,0.57735,0.0,0.0,0.0,0.0,0.0,0.0,0.0
live Germany,0.0,0.0,0.707107,0.0,0.0,0.0,0.0,0.707107,0.0,0.0,0.0,0.0
Nice meet man,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.57735,0.57735,0.57735
iPhone,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


# Word2Vec

In [84]:
#!pip install gensim

In [85]:
from gensim.models import Word2Vec, KeyedVectors

In [86]:
# Tokenize every cleaned document, giving one list of words per row of df['text'].
text_vector = list(map(nltk.word_tokenize, df['text']))
text_vector

[['Hey', 'love', 'Bangladesh'],
 ['Good', 'afternoon', 'happy'],
 ['live', 'Germany'],
 ['Nice', 'meet', 'man'],
 ['iPhone']]

In [87]:
# Train Word2Vec on the tokenized rows; min_count=1 keeps words that occur only once.
# NOTE(review): no `seed` is passed, so the learned vectors (and the similarity
# scores shown below) presumably differ from run to run — confirm against the
# gensim documentation before relying on the exact numbers.
model = Word2Vec(text_vector, min_count=1) #shift+tab

In [88]:
model

<gensim.models.word2vec.Word2Vec at 0x2477481d6d0>

In [89]:
model.wv.most_similar('happy')

[('meet', 0.14595060050487518),
 ('love', 0.05048206076025963),
 ('Nice', 0.04157734662294388),
 ('Germany', 0.03476495295763016),
 ('live', 0.019152265042066574),
 ('iPhone', 0.016134697943925858),
 ('Good', 0.008826171047985554),
 ('afternoon', 0.004842488560825586),
 ('Bangladesh', 0.0019510717829689384),
 ('Hey', -0.08382602781057358)]