### <center>NLP Data Preprocessing</center>

In [3]:
# Import important libraries

import ssl
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from IPython.display import display, Markdown

In [4]:
# Download the NLTK resources: Punkt tokenizer models, stop-word lists,
# the averaged-perceptron POS tagger and WordNet.

for resource in ('punkt', 'stopwords', 'averaged_perceptron_tagger'):
    nltk.download(resource)

# Kept as the cell's final expression so its return value is displayed.
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/nikhilsingh/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/nikhilsingh/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/nikhilsingh/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/nikhilsingh/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [5]:
# The sample text that the preprocessing steps will operate on

text = """The global economy is facing a challenging period. In April 2023, the International Monetary Fund (IMF) projected global growth to slow to 2.8%,
down from 3.4% in 2022. Rising inflation, geopolitical tensions, and supply chain disruptions have added to the uncertainty. Major tech companies like Google,
Amazon, and Microsoft have responded by cutting jobs and scaling back investments. Meanwhile, emerging markets such as India and Brazil are showing resilience. 
Artificial Intelligence (AI) is becoming a key area of innovation and competition among nations."""

# Render the raw text as Markdown in the cell output
display(Markdown(text))

The global economy is facing a challenging period. In April 2023, the International Monetary Fund (IMF) projected global growth to slow to 2.8%,
down from 3.4% in 2022. Rising inflation, geopolitical tensions, and supply chain disruptions have added to the uncertainty. Major tech companies like Google,
Amazon, and Microsoft have responded by cutting jobs and scaling back investments. Meanwhile, emerging markets such as India and Brazil are showing resilience. 
Artificial Intelligence (AI) is becoming a key area of innovation and competition among nations.

In [6]:
# Preprocessing step 1: case normalisation.
# Note that `text` is overwritten, so the original casing is no longer available after this cell.

text = text.lower() # convert every character to lowercase
display(Markdown(text))

the global economy is facing a challenging period. in april 2023, the international monetary fund (imf) projected global growth to slow to 2.8%,
down from 3.4% in 2022. rising inflation, geopolitical tensions, and supply chain disruptions have added to the uncertainty. major tech companies like google,
amazon, and microsoft have responded by cutting jobs and scaling back investments. meanwhile, emerging markets such as india and brazil are showing resilience. 
artificial intelligence (ai) is becoming a key area of innovation and competition among nations.

***Bag of words***

In [9]:
from sklearn.feature_extraction.text import CountVectorizer

In [8]:
# Small example corpus: each string is treated as one document.
sentences = [
    'This is for testing statement',
    'I love playing cricket',
    'I have played lot of matches',
]

In [11]:
# Bag of words: learn the vocabulary from the example sentences and build the
# document-term matrix in a single step.
vectorizer = CountVectorizer()
x = vectorizer.fit_transform(raw_documents=sentences)

In [14]:
# Print every vocabulary term together with its column index, ordered by
# index so the listing lines up with the columns of x.toarray().
# (A bare `vectorizer.vocabulary_` expression used to sit here; it was not the
# cell's last expression, so it displayed nothing and has been removed.)
sorted_vocab_by_index = sorted(vectorizer.vocabulary_.items(), key=lambda item: item[1])
for word, index in sorted_vocab_by_index:
    print(f"{word}: {index}")

cricket: 0
for: 1
have: 2
is: 3
lot: 4
love: 5
matches: 6
of: 7
played: 8
playing: 9
statement: 10
testing: 11
this: 12


In [12]:
# Dense view of the count matrix: one row per sentence, one column per vocabulary term.
x.toarray()

array([[0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1],
       [1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0],
       [0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0]])

***TF-IDF***

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF: fit on the same example sentences. This rebinds `vectorizer` and `x`,
# replacing the bag-of-words objects created earlier.
vectorizer = TfidfVectorizer()
x = vectorizer.fit_transform(raw_documents=sentences)

# List each vocabulary term next to its column index, in column order.
sorted_vocab_by_index = sorted(
    vectorizer.vocabulary_.items(),
    key=lambda entry: entry[1],
)
for word, index in sorted_vocab_by_index:
    print(f"{word}: {index}")

cricket: 0
for: 1
have: 2
is: 3
lot: 4
love: 5
matches: 6
of: 7
played: 8
playing: 9
statement: 10
testing: 11
this: 12


In [16]:
# Dense view of the TF-IDF matrix: one row per sentence, one column per vocabulary term.
x.toarray()

array([[0.        , 0.4472136 , 0.        , 0.4472136 , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.4472136 , 0.4472136 , 0.4472136 ],
       [0.57735027, 0.        , 0.        , 0.        , 0.        ,
        0.57735027, 0.        , 0.        , 0.        , 0.57735027,
        0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.4472136 , 0.        , 0.4472136 ,
        0.        , 0.4472136 , 0.4472136 , 0.4472136 , 0.        ,
        0.        , 0.        , 0.        ]])

***TF-IDF***

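- TF-IDF stands for Term Frequency–Inverse Document Frequency.
- For each word, we compute its term frequency (TF) within a document and weight it by its inverse document frequency (IDF) across all documents.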

***word to vector***

In [3]:
import re
import gensim
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize

In [18]:
# Example corpus, redefined here for the Word2Vec section.
sentences = [
    'This is for testing statement',
    'I love playing cricket',
    'I have played lot of matches',
]

# Tokenize each sentence: every run of word characters delimited by word
# boundaries becomes one token; the original casing is kept.
word_pattern = re.compile(r'\b\w+\b')
tokenized_sentences = list(map(word_pattern.findall, sentences))

tokenized_sentences

[['This', 'is', 'for', 'testing', 'statement'],
 ['I', 'love', 'playing', 'cricket'],
 ['I', 'have', 'played', 'lot', 'of', 'matches']]

In [21]:
# Train a Word2Vec model on the tokenized sentences.
# sg=1 selects the skip-gram algorithm; min_count=1 keeps every token,
# including those that occur only once.
# NOTE(review): gensim documents that training with more than one worker
# thread is not fully reproducible — confirm whether workers=1 is wanted here.

model = Word2Vec(
    sentences=tokenized_sentences,
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
    sg=1,
)
model

<gensim.models.word2vec.Word2Vec at 0x176163850>

In [22]:
# Look up the learned embedding vector for the token 'cricket'.
model.wv['cricket']

array([ 8.13227147e-03, -4.45733406e-03, -1.06835726e-03,  1.00636482e-03,
       -1.91113955e-04,  1.14817743e-03,  6.11386076e-03, -2.02715401e-05,
       -3.24596534e-03, -1.51072862e-03,  5.89729892e-03,  1.51410222e-03,
       -7.24261976e-04,  9.33324732e-03, -4.92128357e-03, -8.38409644e-04,
        9.17541143e-03,  6.74942741e-03,  1.50285603e-03, -8.88256077e-03,
        1.14874600e-03, -2.28825561e-03,  9.36823711e-03,  1.20992784e-03,
        1.49006362e-03,  2.40640994e-03, -1.83600665e-03, -4.99963388e-03,
        2.32429506e-04, -2.01418041e-03,  6.60093315e-03,  8.94012302e-03,
       -6.74754381e-04,  2.97701475e-03, -6.10765442e-03,  1.69932481e-03,
       -6.92623248e-03, -8.69402662e-03, -5.90020278e-03, -8.95647518e-03,
        7.27759488e-03, -5.77203138e-03,  8.27635173e-03, -7.24354526e-03,
        3.42167495e-03,  9.67499893e-03, -7.78544787e-03, -9.94505733e-03,
       -4.32914635e-03, -2.68313056e-03, -2.71289347e-04, -8.83155130e-03,
       -8.61755759e-03,  