# Stemming in NLP

In [1]:
# Install once if needed (run in a notebook cell): %pip install nltk
import nltk

# Tokenizer models used by word_tokenize / sent_tokenize.
# Recent NLTK releases (3.9 and later) look up 'punkt_tab' instead of the
# pickled 'punkt' package, so fetch both ids to work on either version.
nltk.download('punkt')
nltk.download('punkt_tab')

In [2]:
# Inflected forms of "change" used as input for the stemming demo.
word = [
    'change',
    'changing',
    'changes',
    'changed',
]

In [3]:
word 

['change', 'changing', 'changes', 'changed']

In [4]:
# NOTE(review): 'punkt' is already downloaded in the first cell; this call is a repeat.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\rashe\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [5]:
from nltk.stem import PorterStemmer

In [6]:
# Porter stemmer instance reused by the stemming loops in the following cells.
p = PorterStemmer()

In [8]:
# Print each word next to its Porter stem.
for w in word:
    stem = p.stem(w)
    print(w, stem)

change chang
changing chang
changes chang
changed chang


In [9]:
# NOTE(review): same loop as the previous cell; the printed output is identical.
for w in word:
    print(w , p.stem(w))

change chang
changing chang
changes chang
changed chang


In [10]:
# Example sentence mixing several forms of "change" with words that have no inflection.
sen = 'I want to change the world if world changed my career by changing abcd'

In [11]:
sen

'I want to change the world if world changed my career by changing abcd'

In [12]:
from nltk.tokenize import word_tokenize

In [13]:
# Split the sentence into word tokens; `toke` is reused by the stemming and
# lemmatization loops further down.
toke = word_tokenize(sen)

In [14]:
toke

['I',
 'want',
 'to',
 'change',
 'the',
 'world',
 'if',
 'world',
 'changed',
 'my',
 'career',
 'by',
 'changing',
 'abcd']

In [15]:
# For comparison: plain whitespace splitting. The sentence has no punctuation,
# so the result matches the word_tokenize output shown above.
sen.split()

['I',
 'want',
 'to',
 'change',
 'the',
 'world',
 'if',
 'world',
 'changed',
 'my',
 'career',
 'by',
 'changing',
 'abcd']

In [16]:
# Stem every token of the sentence. In the output below the stems are
# lower-cased (e.g. 'I' -> 'i') and all forms of "change" collapse to 'chang'.
for w in toke:
    stemmed = p.stem(w)
    print(w, stemmed)

I i
want want
to to
change chang
the the
world world
if if
world world
changed chang
my my
career career
by by
changing chang
abcd abcd


In [None]:
#nltk.download('punkt')

In [None]:
# WordNet data is required by WordNetLemmatizer, which the lemmatization cells
# below use; download it here so a fresh "Restart & Run All" does not fail.
nltk.download('wordnet')

# Lemmatization in NLP

In [17]:
from nltk.stem import WordNetLemmatizer

In [18]:
# WordNet-based lemmatizer instance reused by the cells below.
le = WordNetLemmatizer()

In [19]:
toke

['I',
 'want',
 'to',
 'change',
 'the',
 'world',
 'if',
 'world',
 'changed',
 'my',
 'career',
 'by',
 'changing',
 'abcd']

In [20]:
# Lemmatize each token. lemmatize() is called without a pos argument, so NLTK
# treats every word as a noun; that is why the verb forms 'changed' and
# 'changing' come back unchanged in the output below.
for w in toke:
    lemma = le.lemmatize(w)
    print(w, lemma)

I I
want want
to to
change change
the the
world world
if if
world world
changed changed
my my
career career
by by
changing changing
abcd abcd


In [21]:
le.lemmatize('changes')

'change'

# Tokenization in NLP

In Python, several libraries and tools are available for tokenization and other NLP tasks. Here are a few examples using popular libraries:

# NLTK

NLTK (Natural Language Toolkit) is a widely used library for NLP tasks. To perform tokenization with NLTK, install it first by running `pip install nltk`. Here's an example of tokenizing a sentence using NLTK:

In [22]:
from nltk.tokenize import sent_tokenize, word_tokenize

# Sample text: three short sentences, the first one containing a contraction.
sentence = "I'm from aiQuest Intelligence. I am learning NLP. It is fascinating!"

# Split the same text two ways: into sentences and into word-level tokens.
sentence_tokens = sent_tokenize(sentence)
word_tokens = word_tokenize(sentence)

# Single print call, one result per line (word tokens first).
print(word_tokens, sentence_tokens, sep='\n')


['I', "'m", 'from', 'aiQuest', 'Intelligence', '.', 'I', 'am', 'learning', 'NLP', '.', 'It', 'is', 'fascinating', '!']
["I'm from aiQuest Intelligence.", 'I am learning NLP.', 'It is fascinating!']


# spaCy

spaCy is another powerful library for NLP. To install spaCy, run `pip install spacy` and then download the appropriate language model (for example, `python -m spacy download en_core_web_sm`). Here's an example of tokenization using spaCy:

In [None]:
#!pip install spacy
# python -m spacy download en_core_web_sm    -> install in conda

In [23]:
import spacy

# Load the small English pipeline (the model has to be downloaded separately).
nlp = spacy.load('en_core_web_sm')

sentence = "I'm from aiQuest Intelligence. I am learning NLP. It is fascinating!"

# Run the pipeline over the text, then keep only the text of each token.
doc = nlp(sentence)
word_tokens = [tok.text for tok in doc]

print(word_tokens)


['I', "'m", 'from', 'aiQuest', 'Intelligence', '.', 'I', 'am', 'learning', 'NLP', '.', 'It', 'is', 'fascinating', '!']


# Transformers

Transformers is a library built by Hugging Face that provides state-of-the-art pre-trained models for NLP. It offers various functionalities, including tokenization. To install Transformers, run `pip install transformers`. Here's an example of tokenization using Transformers:

In [24]:
from transformers import AutoTokenizer

# Load the tokenizer that belongs to the 'bert-base-uncased' checkpoint.
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

sentence = "I'm from aiQuest Intelligence. I am learning NLP. It is fascinating!"
# The result (see output below) is lower-cased sub-word pieces; a leading '##'
# marks a piece that continues the previous one, e.g. 'ai', '##quest'.
tokens = tokenizer.tokenize(sentence)

print(tokens)


['i', "'", 'm', 'from', 'ai', '##quest', 'intelligence', '.', 'i', 'am', 'learning', 'nl', '##p', '.', 'it', 'is', 'fascinating', '!']


# Named Entity Tokenization using NLTK

To extract named entities with NLTK (Natural Language Toolkit), you can use its named entity recognition (NER) functionality. Here's an example of how to extract named entity tokens from a sentence using NLTK:

In [25]:
# POS-tagger model used by pos_tag in the NER cell below. Recent NLTK releases
# (3.9 and later) look up 'averaged_perceptron_tagger_eng'; older releases use
# the original package id, so fetch both.
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\rashe\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [26]:
# NOTE(review): WordNet is already used by the lemmatization cells earlier in
# the notebook, so on a fresh environment this download is needed before them.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\rashe\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [27]:
import nltk
from nltk import word_tokenize, pos_tag, ne_chunk

# Resources needed by ne_chunk. Recent NLTK releases (3.9 and later) look up
# 'maxent_ne_chunker_tab' instead of the pickled 'maxent_ne_chunker', so fetch
# both ids to work on either version.
nltk.download('maxent_ne_chunker')      # NER model (older NLTK releases)
nltk.download('maxent_ne_chunker_tab')  # NER model (NLTK 3.9+)
nltk.download('words')                  # word corpus

sentence = "I'm from aiQuest Intelligence. I am learning NLP. It is fascinating!"

# Pipeline: words -> part-of-speech tags -> named-entity chunks.
tokens = word_tokenize(sentence)
pos_tags = pos_tag(tokens)
ner_tags = ne_chunk(pos_tags)

# Entity chunks are the items that carry a `label` attribute; each one is
# joined back into a single string from the first element (the word) of its
# members. Items without `label` are ordinary tokens and are skipped.
named_entity_tokens = [
    ' '.join(leaf[0] for leaf in chunk)
    for chunk in ner_tags
    if hasattr(chunk, 'label')
]

print(named_entity_tokens)


['aiQuest Intelligence', 'NLP']


[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\rashe\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\rashe\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


# Text Vectorizer

In [28]:
import pandas as pd
# Load the sample data from the working directory. The frame displayed below
# has a 'test' column holding the sentences and a 'class' column.
df = pd.read_csv('data.csv')

In [29]:
df

Unnamed: 0,test,class
0,I love Bangladesh,1
1,Could you give me an iphone?,0
2,Hello how are you?,1
3,I want to talk you.,1


# CountVectorizer

In [30]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [31]:
# Bag-of-words vectorizer with default settings.
cv = CountVectorizer()

In [32]:
# Learn the vocabulary from the 'test' column and build the document-term
# count matrix in one step; the result is a sparse matrix (one row per sentence).
cv_x = cv.fit_transform(df['test'])
cv_x

<4x14 sparse matrix of type '<class 'numpy.int64'>'
	with 16 stored elements in Compressed Sparse Row format>

In [33]:
cv_x.toarray()

array([[0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
       [1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1],
       [0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1]], dtype=int64)

In [34]:
# Vocabulary learned by the vectorizer, in column order of the count matrix.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() returns an ndarray, so .tolist() keeps a plain list.
cv.get_feature_names_out().tolist()



['an',
 'are',
 'bangladesh',
 'could',
 'give',
 'hello',
 'how',
 'iphone',
 'love',
 'me',
 'talk',
 'to',
 'want',
 'you']

In [40]:
# Dense view of the count matrix with one column per vocabulary term.
# get_feature_names_out() replaces get_feature_names(), which was removed in
# scikit-learn 1.2.
cv_df = pd.DataFrame(cv_x.toarray(), columns=cv.get_feature_names_out())
cv_df



Unnamed: 0,an,are,bangladesh,could,give,hello,how,iphone,love,me,talk,to,want,you
0,0,0,1,0,0,0,0,0,1,0,0,0,0,0
1,1,0,0,1,1,0,0,1,0,1,0,0,0,1
2,0,1,0,0,0,1,1,0,0,0,0,0,0,1
3,0,0,0,0,0,0,0,0,0,0,1,1,1,1


In [41]:
# Same count table, now indexed by the original sentence so each row is
# readable. Uses get_feature_names_out(), since get_feature_names() was
# removed in scikit-learn 1.2.
cv_df = pd.DataFrame(cv_x.toarray(), columns=cv.get_feature_names_out(), index=df['test'])



In [42]:
cv_df

Unnamed: 0_level_0,an,are,bangladesh,could,give,hello,how,iphone,love,me,talk,to,want,you
test,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
I love Bangladesh,0,0,1,0,0,0,0,0,1,0,0,0,0,0
Could you give me an iphone?,1,0,0,1,1,0,0,1,0,1,0,0,0,1
Hello how are you?,0,1,0,0,0,1,1,0,0,0,0,0,0,1
I want to talk you.,0,0,0,0,0,0,0,0,0,0,1,1,1,1


# TfidfVectorizer

In [43]:
# TF-IDF vectorizer with default settings.
tf = TfidfVectorizer()

In [45]:
# Learn the vocabulary from the 'test' column and compute the TF-IDF weighted
# document-term matrix (sparse, float-valued per the output below).
tf_z = tf.fit_transform(df['test'])

In [46]:
tf_z

<4x14 sparse matrix of type '<class 'numpy.float64'>'
	with 16 stored elements in Compressed Sparse Row format>

In [47]:
# TF-IDF weights as a table indexed by sentence.
# NOTE(review): this reuses the name `cv_df`, overwriting the CountVectorizer
# table built earlier; the next cell displays `cv_df`, so the name is kept.
# get_feature_names_out() replaces get_feature_names(), which was removed in
# scikit-learn 1.2.
cv_df = pd.DataFrame(tf_z.toarray(), columns=tf.get_feature_names_out(), index=df['test'])



In [48]:
cv_df

Unnamed: 0_level_0,an,are,bangladesh,could,give,hello,how,iphone,love,me,talk,to,want,you
test,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
I love Bangladesh,0.0,0.0,0.707107,0.0,0.0,0.0,0.0,0.0,0.707107,0.0,0.0,0.0,0.0,0.0
Could you give me an iphone?,0.430037,0.0,0.0,0.430037,0.430037,0.0,0.0,0.430037,0.0,0.430037,0.0,0.0,0.0,0.274487
Hello how are you?,0.0,0.541736,0.0,0.0,0.0,0.541736,0.541736,0.0,0.0,0.0,0.0,0.0,0.0,0.345783
I want to talk you.,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.541736,0.541736,0.541736,0.345783


# Word2Vec

In [49]:
!pip install gensim



In [50]:
from gensim.models import Word2Vec, KeyedVectors

In [51]:
# Tokenize every sentence of the 'test' column; the result is a list of token
# lists, which is the input passed to Word2Vec below.
text_vector = list(map(nltk.word_tokenize, df['test']))
text_vector

[['I', 'love', 'Bangladesh'],
 ['Could', 'you', 'give', 'me', 'an', 'iphone', '?'],
 ['Hello', 'how', 'are', 'you', '?'],
 ['I', 'want', 'to', 'talk', 'you', '.']]

In [52]:
# Train a Word2Vec model on the tokenized sentences.
# min_count=1 keeps every token, which is necessary for a corpus this small.
# seed plus workers=1 makes training repeatable between runs; without them the
# similarity scores shown below change on every execution.
# NOTE(review): gensim also requires a fixed PYTHONHASHSEED for full
# reproducibility across interpreter launches.
# Tip: Shift+Tab in Jupyter shows the Word2Vec signature.
model = Word2Vec(text_vector, min_count=1, seed=42, workers=1)

In [53]:
model.wv.most_similar('want')

[('an', 0.1782679259777069),
 ('I', 0.16072483360767365),
 ('give', 0.10560771077871323),
 ('how', 0.09215972572565079),
 ('iphone', 0.04891003668308258),
 ('are', 0.02700834721326828),
 ('Could', 0.007729316595941782),
 ('you', -0.03771639242768288),
 ('.', -0.045522838830947876),
 ('talk', -0.04649210348725319)]