# Stemming in NLP

In [14]:
!pip install nltk

Defaulting to user installation because normal site-packages is not writeable


In [17]:
#nltk.download('punkt')

In [18]:
import nltk 
import warnings
warnings.filterwarnings('ignore')

In [19]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\obaidulhaque.sarker\AppData\Roaming\nltk_data
[nltk_data]     ...
[nltk_data]   Package punkt is already up-to-date!


True

In [20]:
word = ['change','changing','changes','changed'] 

In [21]:
word 

['change', 'changing', 'changes', 'changed']

In [22]:
from nltk.stem import PorterStemmer

In [23]:
p = PorterStemmer()

In [24]:
for w in word:
    print(p.stem(w))

chang
chang
chang
chang


In [25]:
for w in word:
    print(w , p.stem(w))

change chang
changing chang
changes chang
changed chang


In [26]:
sen = 'The constant flux of life necessitates embracing change, whether its adapting to the changes around us or actively changing ourselves to meet new challenges.'

In [27]:
sen

'The constant flux of life necessitates embracing change, whether its adapting to the changes around us or actively changing ourselves to meet new challenges.'

In [28]:
from nltk.tokenize import word_tokenize

In [29]:
token = word_tokenize(sen)

In [30]:
token

['The',
 'constant',
 'flux',
 'of',
 'life',
 'necessitates',
 'embracing',
 'change',
 ',',
 'whether',
 'its',
 'adapting',
 'to',
 'the',
 'changes',
 'around',
 'us',
 'or',
 'actively',
 'changing',
 'ourselves',
 'to',
 'meet',
 'new',
 'challenges',
 '.']

In [31]:
#sen.split()

In [32]:
for w in token:
    print(p.stem(w))

the
constant
flux
of
life
necessit
embrac
chang
,
whether
it
adapt
to
the
chang
around
us
or
activ
chang
ourselv
to
meet
new
challeng
.


In [33]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\obaidulhaque.sarker\AppData\Roaming\nltk_data
[nltk_data]     ...
[nltk_data]   Package wordnet is already up-to-date!


True

# Lemmatization in NLP

In [34]:
from nltk.stem import WordNetLemmatizer

In [35]:
le = WordNetLemmatizer()

In [36]:
token

['The',
 'constant',
 'flux',
 'of',
 'life',
 'necessitates',
 'embracing',
 'change',
 ',',
 'whether',
 'its',
 'adapting',
 'to',
 'the',
 'changes',
 'around',
 'us',
 'or',
 'actively',
 'changing',
 'ourselves',
 'to',
 'meet',
 'new',
 'challenges',
 '.']

In [37]:
for w in token:
    print(le.lemmatize(w))

The
constant
flux
of
life
necessitates
embracing
change
,
whether
it
adapting
to
the
change
around
u
or
actively
changing
ourselves
to
meet
new
challenge
.


# Tokenization in NLP

In Python, there are several libraries and tools available for performing tokenization and other NLP tasks. Here are a few examples using popular libraries

# NLTK

NLTK (Natural Language Toolkit) is a widely used library for NLP tasks. To perform tokenization using NLTK, you need to install it first. You can do so by running pip install nltk. Here's an example of tokenizing a sentence using NLTK

In [38]:
from nltk.tokenize import word_tokenize, sent_tokenize

sentence = "I'm from aiQuest Intelligence. I am learning NLP. It is fascinating!"
word_tokens = word_tokenize(sentence)
sentence_tokens = sent_tokenize(sentence)

print(word_tokens)
print(sentence_tokens)


['I', "'m", 'from', 'aiQuest', 'Intelligence', '.', 'I', 'am', 'learning', 'NLP', '.', 'It', 'is', 'fascinating', '!']
["I'm from aiQuest Intelligence.", 'I am learning NLP.', 'It is fascinating!']


# spaCy

spaCy is another powerful library for NLP. To install spaCy, you can run pip install spacy and then download the appropriate language model. Here's an example of tokenization using spaCy

In [43]:
!pip install spacy
#python -m spacy download en_core_web_sm    -> install in conda

Defaulting to user installation because normal site-packages is not writeable


In [48]:
!python -m spacy download en_core_web_lg 

C:\Python310\python.exe: No module named spacy


In [45]:
import spacy

spc = spacy.load('en_core_web_sm')  # Load the English language model

sentence = "I'm from aiQuest Intelligence. I am learning NLP. It is fascinating!"
doc = spc(sentence)

word_tokens = [token.text for token in doc]

print(word_tokens)


OSError: [E050] Can't find model 'en_core_web_sm'. It doesn't seem to be a Python package or a valid path to a data directory.

# Transformers

Transformers is a library built by Hugging Face that provides state-of-the-art pre-trained models for NLP. It offers various functionalities, including tokenization. To install Transformers, run pip install transformers. Here's an example of tokenization using Transformers

In [41]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

sentence = "I'm from aiQuest Intelligence. I am learning NLP. It is fascinating!"
tokens = tokenizer.tokenize(sentence)

print(tokens)


None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


Downloading tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

['i', "'", 'm', 'from', 'ai', '##quest', 'intelligence', '.', 'i', 'am', 'learning', 'nl', '##p', '.', 'it', 'is', 'fascinating', '!']


# Named Entity Tokenization using NLTK

To perform named entity tokenization using NLTK (Natural Language Toolkit), you can utilize the named entity recognition (NER) functionality provided by NLTK. Here's an example of how to extract named entity tokens from a sentence using NLTK

In [None]:
#nltk.download('wordnet')
#nltk.download('maxent_ne_chunker')  # Download the required resource (NER models)
#nltk.download('words')  # Download the required resource (word corpus) 
#nltk.download('averaged_perceptron_tagger')

In [None]:
import nltk
from nltk import word_tokenize, pos_tag, ne_chunk

sentence = "I'm from aiQuest Intelligence. I am learning NLP. It is fascinating!, Hasan vai, my name is Joe"

tokens = word_tokenize(sentence) # Tokenize the sentence into words

pos_tags = pos_tag(tokens) # Perform part-of-speech tagging

ner_tags = ne_chunk(pos_tags) # Perform named entity recognition

named_entity_tokens = []

for chunk in ner_tags:
    if hasattr(chunk, 'label'):
        named_entity_tokens.append(' '.join(c[0] for c in chunk))

print(named_entity_tokens)

In [None]:
sentence2 = "Shakil Lives in Germany"
tokens = word_tokenize(sentence2)
pos_tags = pos_tag(tokens)

In [None]:
pos_tags

# Text Vectorizer

In [1]:
import pandas as pd
df = pd.read_excel('data/data.xlsx')

In [2]:
df

Unnamed: 0,text,class
0,"Hey, I love Bangladesh;",1
1,"Good afternoon, I am happy!",1
2,I live in Germany,1
3,Nice to meet you man-,1
4,You won an iPhone,0


# Text Processing

In [13]:
from nltk.corpus import stopwords

en_stopwords = set(stopwords.words('english')) 
en_stopwords

LookupError: 
**********************************************************************
  Resource [93mstopwords[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('stopwords')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mcorpora/stopwords[0m

  Searched in:
    - 'C:\\Users\\obaidulhaque.sarker/nltk_data'
    - 'D:\\ProgramData\\anaconda3\\nltk_data'
    - 'D:\\ProgramData\\anaconda3\\share\\nltk_data'
    - 'D:\\ProgramData\\anaconda3\\lib\\nltk_data'
    - 'C:\\Users\\obaidulhaque.sarker\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
**********************************************************************


In [None]:
stopwords.fileids()

In [6]:
import string
string.punctuation 

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [7]:
len(string.punctuation)

32

In [8]:
li = [1,2,3,4,54]
[l for l in li ]

[1, 2, 3, 4, 54]

In [9]:
[l for l in li if l%2==0]

[2, 4, 54]

In [10]:
df

Unnamed: 0,text,class
0,"Hey, I love Bangladesh;",1
1,"Good afternoon, I am happy!",1
2,I live in Germany,1
3,Nice to meet you man-,1
4,You won an iPhone,0


In [11]:
def preprocess_text(text): 
    
    remove_punc = [char for char in text if char not in string.punctuation] # Remove punctuation
    clean_words = ''.join(remove_punc) # char joining
    
    #Remove stopwords
    text = ([word for word in clean_words.split() if word.lower() not in en_stopwords]) # stopword = stopwords.words('english')
    return text

In [12]:
df['text'] = df['text'].apply(preprocess_text) 

NameError: name 'en_stopwords' is not defined

In [None]:
df['text']

In [None]:
lemmatizer = WordNetLemmatizer()

def lemmatize_text(text):
    lemmatized_text = ' '.join([lemmatizer.lemmatize(word) for word in text])
    return lemmatized_text

In [None]:
df['text'] = df['text'].apply(lemmatize_text)
df.head()

# CountVectorizer

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [None]:
cv = CountVectorizer()

In [None]:
cv_x = cv.fit_transform(df['text'])
cv_x

In [None]:
cv_x.toarray()

In [None]:
cv_df = pd.DataFrame(cv_x.toarray())
cv_df

In [None]:
cv.get_feature_names_out()

In [None]:
cv_df = pd.DataFrame(cv_x.toarray(), index=df['text'], columns=cv.get_feature_names_out())

In [None]:
cv_df

# TfidfVectorizer

In [None]:
tf = TfidfVectorizer()

In [None]:
tf_z = tf.fit_transform(df['text'])

In [None]:
tf_z

In [None]:
cv_df = pd.DataFrame(tf_z.toarray(), index=df['text'], columns=tf.get_feature_names_out())

In [None]:
cv_df

# Word2Vec

In [None]:
#!pip install gensim

In [None]:
from gensim.models import Word2Vec, KeyedVectors

In [None]:
text_vector = [nltk.word_tokenize(test) for test in df['text']]
text_vector

In [None]:
model = Word2Vec(text_vector, min_count=1) #shift+tab

In [None]:
model

In [None]:
model.wv.most_similar('happy')