## 1.Introduction to NLP

In [None]:
!pip install spacy



In [None]:
!pip install nltk



In [None]:
import spacy

In [None]:
nlp = spacy.load("en_core_web_sm")
doc = nlp("Dr. Strange loves pav bhaji of mumbai. Hulk loves chat of delhi")

for sentence in doc.sents:
    print(sentence)

Dr. Strange loves pav bhaji of mumbai.
Hulk loves chat of delhi


In [None]:
for sentence in doc.sents:
    for word in sentence:
        print(word)

Dr.
Strange
loves
pav
bhaji
of
mumbai
.
Hulk
loves
chat
of
delhi


In [None]:
import nltk
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [None]:
from nltk.tokenize import sent_tokenize

sent_tokenize("Dr. Strange loves pav bhaji of mumbai. Hulk loves chat of delhi")

['Dr.', 'Strange loves pav bhaji of mumbai.', 'Hulk loves chat of delhi']

In [None]:
from nltk.tokenize import word_tokenize

word_tokenize("Dr. Strange loves pav bhaji of mumbai. Hulk loves chat of delhi")

['Dr',
 '.',
 'Strange',
 'loves',
 'pav',
 'bhaji',
 'of',
 'mumbai',
 '.',
 'Hulk',
 'loves',
 'chat',
 'of',
 'delhi']

## 2.DATA PREPROCESSING STEPS FOR NLP

### Tokenization in Spacy

In [None]:
nlp = spacy.blank("en")

doc = nlp("Dr. Strange loves pav bhaji of mumbai as it costs only 2$ per plate.")
for token in doc:
  print(token)

Dr.
Strange
loves
pav
bhaji
of
mumbai
as
it
costs
only
2
$
per
plate
.


In [None]:
for token in doc:
  print(token,"==>","index",token.i,"is_alpha:",token.is_alpha,
        "is_punct:",token.is_punct,
        "like_num:",token.like_num,
        "is_currency",token.is_currency
        )

Dr. ==> index 0 is_alpha: False is_punct: False like_num: False is_currency False
Strange ==> index 1 is_alpha: True is_punct: False like_num: False is_currency False
loves ==> index 2 is_alpha: True is_punct: False like_num: False is_currency False
pav ==> index 3 is_alpha: True is_punct: False like_num: False is_currency False
bhaji ==> index 4 is_alpha: True is_punct: False like_num: False is_currency False
of ==> index 5 is_alpha: True is_punct: False like_num: False is_currency False
mumbai ==> index 6 is_alpha: True is_punct: False like_num: False is_currency False
as ==> index 7 is_alpha: True is_punct: False like_num: False is_currency False
it ==> index 8 is_alpha: True is_punct: False like_num: False is_currency False
costs ==> index 9 is_alpha: True is_punct: False like_num: False is_currency False
only ==> index 10 is_alpha: True is_punct: False like_num: False is_currency False
2 ==> index 11 is_alpha: False is_punct: False like_num: True is_currency False
$ ==> index 12 i

In [None]:
nlp = spacy.blank("bn")

In [None]:
doc = nlp("আমি কোডিং করছি তুমি কি করছ ? আমার মন ভাল নেই ")

for token in doc:
  print(token, token.is_currency)

আমি False
কোডিং False
করছি False
তুমি False
কি False
করছ False
? False
আমার False
মন False
ভাল False
নেই False


In [None]:
# customize token rule
doc = nlp("gimme double cheese extra large healthy pizza")
tokens = [token.text for token in doc]
tokens

['gimme', 'double', 'cheese', 'extra', 'large', 'healthy', 'pizza']

In [None]:
nlp = spacy.blank("en")
from spacy.symbols import ORTH

nlp.tokenizer.add_special_case("gimme", [
    {ORTH: "gim"},
    {ORTH: "me"},
])
doc = nlp("gimme double cheese extra large healthy pizza")
tokens = [token.text for token in doc]
token

নেই

In [None]:
doc = nlp("Dr. Strange loves pav bhaji of mumbai. Hulk loves chat of delhi")
for sentence in doc.sents:
    print(sentence)

ValueError: [E030] Sentence boundaries unset. You can add the 'sentencizer' component to the pipeline with: `nlp.add_pipe('sentencizer')`. Alternatively, add the dependency parser or sentence recognizer, or set sentence boundaries by setting `doc[i].is_sent_start`.

In [None]:
nlp.add_pipe('sentencizer')	 # adding pipeline manually

<spacy.pipeline.sentencizer.Sentencizer at 0x794f103cff50>

In [None]:
doc = nlp("Dr. Strange loves pav bhaji of mumbai. Hulk loves chat of delhi")
for sentence in doc.sents:
    print(sentence)

Dr. Strange loves pav bhaji of mumbai.
Hulk loves chat of delhi


### Language Processing Pipeline in Spacy

In [None]:
import spacy

In [None]:
nlp = spacy.blank("en")

doc = nlp("Captain america ate 100$ of samosa. Then he said I can do this all day.")

for token in doc:
    print(token)

Captain
america
ate
100
$
of
samosa
.
Then
he
said
I
can
do
this
all
day
.


In [None]:
nlp.pipe_names

[]

In [None]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m33.8 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
import spacy
nlp = spacy.load("en_core_web_sm")

In [None]:
doc = nlp("Captain america ate 100$ of samosa. Then he said I can do this all day.")

for token in doc:
  print(token,"|",token.pos_,"|",token.lemma_)

Captain | PROPN | Captain
america | PROPN | america
ate | VERB | eat
100 | NUM | 100
$ | NUM | $
of | ADP | of
samosa | PROPN | samosa
. | PUNCT | .
Then | ADV | then
he | PRON | he
said | VERB | say
I | PRON | I
can | AUX | can
do | VERB | do
this | PRON | this
all | DET | all
day | NOUN | day
. | PUNCT | .


In [None]:
doc = nlp("Tesla Inc is going to acquire twitter for $45 billion")
for ent in doc.ents:
    print(ent.text,"|",ent.label_,"|",spacy.explain(ent.label_))

Tesla Inc | ORG | Companies, agencies, institutions, etc.
$45 billion | MONEY | Monetary values, including unit


In [None]:
from spacy import displacy

displacy.render(doc, style="ent")

### Stemming and Lemmatization

In [None]:
import nltk
import spacy

In [None]:
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()

In [None]:
words = ["eating", "eats", "eat", "ate", "adjustable", "rafting", "ability", "meeting"]

for word in words:
    print(word,"|",stemmer.stem(word))

eating | eat
eats | eat
eat | eat
ate | ate
adjustable | adjust
rafting | raft
ability | abil
meeting | meet


In [None]:
nlp = spacy.load("en_core_web_sm")

doc = nlp("eating eats eat ate adjustable rafting ability meeting better")
for token in doc:
    print(token,"|",token.lemma_,"|",token.lemma)

eating | eat | 9837207709914848172
eats | eat | 9837207709914848172
eat | eat | 9837207709914848172
ate | eat | 9837207709914848172
adjustable | adjustable | 6033511944150694480
rafting | raft | 7154368781129989833
ability | ability | 11565809527369121409
meeting | meet | 6880656908171229526
better | well | 4525988469032889948


In [None]:
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [None]:
ar = nlp.get_pipe('attribute_ruler')

ar.add([[{"TEXT":"Bro"}],[{"TEXT":"Brah"}]],{"LEMMA":"Brother"})

doc = nlp("Bro, you wanna go? Brah, don't say no! I am exhausted")
for token in doc:
    print(token,"|",token.lemma_)

Bro | Brother
, | ,
you | you
wanna | wanna
go | go
? | ?
Brah | Brother
, | ,
do | do
n't | not
say | say
no | no
! | !
I | I
am | be
exhausted | exhaust


### Part Of Speech POS Tagging

In [None]:
for token in doc:
    print(token,"|",token.pos_,"|",spacy.explain(token.pos_),"|",token.tag_,"|",spacy.explain(token.tag_))

Bro | PROPN | proper noun | NNP | noun, proper singular
, | PUNCT | punctuation | , | punctuation mark, comma
you | PRON | pronoun | PRP | pronoun, personal
wanna | VERB | verb | VBP | verb, non-3rd person singular present
go | VERB | verb | VB | verb, base form
? | PUNCT | punctuation | . | punctuation mark, sentence closer
Brah | PROPN | proper noun | NNP | noun, proper singular
, | PUNCT | punctuation | , | punctuation mark, comma
do | AUX | auxiliary | VBP | verb, non-3rd person singular present
n't | PART | particle | RB | adverb
say | VERB | verb | VB | verb, base form
no | INTJ | interjection | UH | interjection
! | PUNCT | punctuation | . | punctuation mark, sentence closer
I | PRON | pronoun | PRP | pronoun, personal
am | AUX | auxiliary | VBP | verb, non-3rd person singular present
exhausted | VERB | verb | VBN | verb, past participle


In [None]:
doc = nlp("He quits the job")

doc[1].text, doc[1].tag_, doc[1].pos_, spacy.explain(doc[1].tag_)

('quits', 'VBZ', 'VERB', 'verb, 3rd person singular present')

In [None]:
doc = nlp("He quit the job")

doc[1].text, doc[1].tag_, doc[1].pos_, spacy.explain(doc[1].tag_)

('quit', 'VBD', 'VERB', 'verb, past tense')

### Named Entity Recognition (NER)

In [None]:
import spacy

nlp = spacy.load("en_core_web_sm")

In [None]:
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [None]:
doc = nlp("Tesla Inc is going to acquire Twitter Inc for $45 billon")

for ent in doc.ents:
  print(ent.text,"|", ent.label_,"|",spacy.explain(ent.label_))

Tesla Inc | ORG | Companies, agencies, institutions, etc.
Twitter Inc | PERSON | People, including fictional
45 | MONEY | Monetary values, including unit


In [None]:
from spacy import displacy

displacy.render(doc,style="ent")

In [None]:
nlp.pipe_labels['ner']

['CARDINAL',
 'DATE',
 'EVENT',
 'FAC',
 'GPE',
 'LANGUAGE',
 'LAW',
 'LOC',
 'MONEY',
 'NORP',
 'ORDINAL',
 'ORG',
 'PERCENT',
 'PERSON',
 'PRODUCT',
 'QUANTITY',
 'TIME',
 'WORK_OF_ART']

In [None]:
doc = nlp("Michael Bloomberg founded Bloomberg in 1982")

for ent in doc.ents:
    print(ent.text,"|", ent.label_, spacy.explain(ent.label_))

from spacy import displacy

displacy.render(doc, style="ent")

Michael Bloomberg | PERSON People, including fictional
Bloomberg | GPE Countries, cities, states
1982 | DATE Absolute or relative dates or periods


## 3.Text Representation in NLP

### Using Bag Of Words (BOW)

In [None]:
import pandas as pd
import numpy as np

In [None]:
df = pd.read_csv('spam.csv')
df.head()

Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1


In [None]:
df.shape

(5728, 2)

In [None]:
df.spam.value_counts()

Unnamed: 0_level_0,count
spam,Unnamed: 1_level_1
0,4360
1,1368


In [None]:
from re import X
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df.text, df.spam, test_size=0.2)


In [None]:
X_train[:4]

Unnamed: 0,text
1273,"Subject: localized software , all languages av..."
4715,"Subject: re : contingencies steve , i shall ..."
5272,Subject: re : uk portfolios and books setup in...
2811,Subject: request submitted : access request fo...


In [None]:
y_train[:4]

Unnamed: 0,spam
1273,1
4715,0
5272,0
2811,0


In [None]:
from sklearn.feature_extraction.text import CountVectorizer

v = CountVectorizer()

X_train_cv = v.fit_transform(X_train.values)
X_train_cv

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 570759 stored elements and shape (4582, 33880)>

In [None]:
X_train_np = X_train_cv.toarray()
X_train_np[:4]

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [None]:
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()
model.fit(X_train_cv, y_train)

In [None]:
X_test_cv = v.transform(X_test)

In [None]:
from sklearn.metrics import classification_report

y_pred = model.predict(X_test_cv)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      0.99      0.99       885
           1       0.97      0.98      0.98       261

    accuracy                           0.99      1146
   macro avg       0.98      0.99      0.99      1146
weighted avg       0.99      0.99      0.99      1146



In [None]:
emails = [
    "You've won a lottery! Provide your bank details to claim your prize.",
    "Hey, can we get together to watch footbal game tomorrow?"
]

emails_count = v.transform(emails)
model.predict(emails_count)

array([1, 0])

In [None]:
# using sklearn pipeline
from sklearn.pipeline import Pipeline

clf = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('nb', MultinomialNB())
])

In [None]:
clf.fit(X_train, y_train)

In [None]:
print(classification_report(y_test, clf.predict(X_test)))

              precision    recall  f1-score   support

           0       1.00      0.99      0.99       885
           1       0.97      0.98      0.98       261

    accuracy                           0.99      1146
   macro avg       0.98      0.99      0.99      1146
weighted avg       0.99      0.99      0.99      1146



### Stop Words

In [None]:
import spacy

from spacy.lang.en.stop_words import STOP_WORDS

len(STOP_WORDS)

326

In [None]:
nlp = spacy.load("en_core_web_sm")

doc = nlp("We just opened our wings, the flying part is coming soon")

for token in doc:
    if token.is_stop:
        print(token)

We
just
our
the
part
is


In [None]:
def preprocess(text):
    doc = nlp(text)

    no_stop_words = [token.text for token in doc if not token.is_stop and not token.is_punct]
    return (no_stop_words)

In [None]:
preprocess("Musk wants time to prepare for a trial over his ")

['Musk', 'wants', 'time', 'prepare', 'trial']

In [None]:
import pandas as pd
df = pd.read_json("Doj_press.json", lines=True)
df.shape

(13087, 6)

In [None]:
df.head(5)

Unnamed: 0,id,title,contents,date,topics,components
0,,Convicted Bomb Plotter Sentenced to 30 Years,"PORTLAND, Oregon. – Mohamed Osman Mohamud, 23,...",2014-10-01T00:00:00-04:00,[],[National Security Division (NSD)]
1,12-919,$1 Million in Restitution Payments Announced t...,WASHINGTON – North Carolina’s Waccamaw River...,2012-07-25T00:00:00-04:00,[],[Environment and Natural Resources Division]
2,11-1002,$1 Million Settlement Reached for Natural Reso...,BOSTON– A $1-million settlement has been...,2011-08-03T00:00:00-04:00,[],[Environment and Natural Resources Division]
3,10-015,10 Las Vegas Men Indicted \r\nfor Falsifying V...,WASHINGTON—A federal grand jury in Las Vegas...,2010-01-08T00:00:00-05:00,[],[Environment and Natural Resources Division]
4,18-898,$100 Million Settlement Will Speed Cleanup Wor...,"The U.S. Department of Justice, the U.S. Envir...",2018-07-09T00:00:00-04:00,[Environment],[Environment and Natural Resources Division]


In [None]:
df = df[df["topics"].str.len()!=0]
df.head(5)

Unnamed: 0,id,title,contents,date,topics,components
4,18-898,$100 Million Settlement Will Speed Cleanup Wor...,"The U.S. Department of Justice, the U.S. Envir...",2018-07-09T00:00:00-04:00,[Environment],[Environment and Natural Resources Division]
7,14-1412,14 Indicted in Connection with New England Com...,A 131-count criminal indictment was unsealed t...,2014-12-17T00:00:00-05:00,[Consumer Protection],[Civil Division]
19,17-1419,2017 Southeast Regional Animal Cruelty Prosecu...,The United States Attorney’s Office for the Mi...,2017-12-14T00:00:00-05:00,[Environment],"[Environment and Natural Resources Division, U..."
22,15-1562,21st Century Oncology to Pay $19.75 Million to...,"21st Century Oncology LLC, has agreed to pay $...",2015-12-18T00:00:00-05:00,"[False Claims Act, Health Care Fraud]",[Civil Division]
23,17-1404,21st Century Oncology to Pay $26 Million to Se...,21st Century Oncology Inc. and certain of its ...,2017-12-12T00:00:00-05:00,"[Health Care Fraud, False Claims Act]","[Civil Division, USAO - Florida, Middle]"


In [None]:
df.shape

(4688, 6)

In [None]:
len(df["contents"].iloc[4])

5504

In [None]:
df['contents_new'] = df['contents'].apply(preprocess)
df.head(5)

KeyboardInterrupt: 

### Using Bag Of n-grams

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
v = CountVectorizer(ngram_range=(2,2))
v.fit(["Piyal Chakraborty is looking for a job"])
v.vocabulary_

{'piyal chakraborty': 4,
 'chakraborty is': 0,
 'is looking': 2,
 'looking for': 3,
 'for job': 1}

In [None]:
corpus = [
    "Thor ate pizza",
    "Loki is tall",
    "Loki is eating pizza"
]


In [None]:
import spacy
nlp = spacy.load("en_core_web_sm")

def preprocess(text):
    doc = nlp(text)
    filtered_tokens = []
    for token in doc:
        if token.is_stop or token.is_punct:
            continue
        filtered_tokens.append(token.lemma_)
    return " ".join(filtered_tokens)

preprocess("Loki is eating pizza")

'Loki eat pizza'

In [None]:
corpus_processed = [preprocess(text) for text in corpus]
v = CountVectorizer(ngram_range=(1,2))
v.fit(corpus_processed)
v.vocabulary_

{'thor': 7,
 'eat': 0,
 'pizza': 5,
 'thor eat': 8,
 'eat pizza': 1,
 'loki': 2,
 'tall': 6,
 'loki tall': 4,
 'loki eat': 3}

In [None]:
v.transform(["Thor eat pizza"]).toarray()

array([[1, 1, 0, 0, 0, 1, 0, 1, 1]])

In [None]:
v.transform(["Hulk eat pizza"]).toarray()

array([[1, 1, 0, 0, 0, 1, 0, 0, 0]])

### Using TF-IDF

In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer

corpus = [
    "Thor eating pizza, Loki eating pizza, Ironman ate pizza already",
    "Apple is announcing new iphone tomorrow",
    "Tesla is announcing new model-3 tomorrow",
    "Google is announcing new pixel-6 tomorrow",
    "Microsoft is announcing new surface tomorrow",
    "Amazon is announcing new eco-dot tomorrow",
    "I am eating biryani and you are eating grapes",
    "something is amazing"
]


In [5]:
v = TfidfVectorizer()
v.fit(corpus)
transform_output = v.transform(corpus)
print(v.vocabulary_)

{'thor': 27, 'eating': 11, 'pizza': 23, 'loki': 18, 'ironman': 16, 'ate': 8, 'already': 0, 'apple': 6, 'is': 17, 'announcing': 5, 'new': 21, 'iphone': 15, 'tomorrow': 28, 'tesla': 26, 'model': 20, 'google': 13, 'pixel': 22, 'microsoft': 19, 'surface': 25, 'amazon': 3, 'eco': 12, 'dot': 10, 'am': 1, 'biryani': 9, 'and': 4, 'you': 29, 'are': 7, 'grapes': 14, 'something': 24, 'amazing': 2}


In [6]:
all_feature_names = v.get_feature_names_out()

for word in all_feature_names:
    indx = v.vocabulary_.get(word)
    print(f"{word} {v.idf_[indx]}")

already 2.504077396776274
am 2.504077396776274
amazing 2.504077396776274
amazon 2.504077396776274
and 2.504077396776274
announcing 1.4054651081081644
apple 2.504077396776274
are 2.504077396776274
ate 2.504077396776274
biryani 2.504077396776274
dot 2.504077396776274
eating 2.09861228866811
eco 2.504077396776274
google 2.504077396776274
grapes 2.504077396776274
iphone 2.504077396776274
ironman 2.504077396776274
is 1.251314428280906
loki 2.504077396776274
microsoft 2.504077396776274
model 2.504077396776274
new 1.4054651081081644
pixel 2.504077396776274
pizza 2.504077396776274
something 2.504077396776274
surface 2.504077396776274
tesla 2.504077396776274
thor 2.504077396776274
tomorrow 1.4054651081081644
you 2.504077396776274


In [7]:
corpus[:2]

['Thor eating pizza, Loki eating pizza, Ironman ate pizza already',
 'Apple is announcing new iphone tomorrow']

In [10]:
transform_output.toarray()[:2]

array([[0.24390607, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.24390607, 0.        ,
        0.        , 0.40882465, 0.        , 0.        , 0.        ,
        0.        , 0.24390607, 0.        , 0.24390607, 0.        ,
        0.        , 0.        , 0.        , 0.73171822, 0.        ,
        0.        , 0.        , 0.24390607, 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.3140158 , 0.55947306, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.55947306, 0.        , 0.27957471, 0.        , 0.        ,
        0.        , 0.3140158 , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.3140158 , 0.        ]])

In [12]:
import pandas as pd

df = pd.read_csv("ecommerceDataset.csv")
print(df.shape)
df.head()

(50424, 2)


Unnamed: 0,Household,"Paper Plane Design Framed Wall Hanging Motivational Office Decor Art Prints (8.7 X 8.7 inch) - Set of 4 Painting made up in synthetic frame with uv textured print which gives multi effects and attracts towards it. This is an special series of paintings which makes your wall very beautiful and gives a royal touch. This painting is ready to hang, you would be proud to possess this unique painting that is a niche apart. We use only the most modern and efficient printing technology on our prints, with only the and inks and precision epson, roland and hp printers. This innovative hd printing technique results in durable and spectacular looking prints of the highest that last a lifetime. We print solely with top-notch 100% inks, to achieve brilliant and true colours. Due to their high level of uv resistance, our prints retain their beautiful colours for many years. Add colour and style to your living space with this digitally printed painting. Some are for pleasure and some for eternal bliss.so bring home this elegant print that is lushed with rich colors that makes it nothing but sheer elegance to be to your friends and family.it would be treasured forever by whoever your lucky recipient is. Liven up your place with these intriguing paintings that are high definition hd graphic digital prints for home, office or any room."
0,Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ..."
1,Household,SAF 'UV Textured Modern Art Print Framed' Pain...
2,Household,"SAF Flower Print Framed Painting (Synthetic, 1..."
3,Household,Incredible Gifts India Wooden Happy Birthday U...
4,Household,Pitaara Box Romantic Venice Canvas Painting 6m...


In [15]:
df.label.value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
Household,19312
Books,11820
Electronics,10621
Clothing & Accessories,8671


In [16]:
df['label_num'] = df.label.map({'Household':0,
                                'Books':1,
                                'Electronics':2,
                                'Clothing & Accessories':3})
df.head()

Unnamed: 0,label,text,label_num
0,Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ...",0
1,Household,SAF 'UV Textured Modern Art Print Framed' Pain...,0
2,Household,"SAF Flower Print Framed Painting (Synthetic, 1...",0
3,Household,Incredible Gifts India Wooden Happy Birthday U...,0
4,Household,Pitaara Box Romantic Venice Canvas Painting 6m...,0


In [17]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df.text,
                                                    df.label_num, test_size=0.2,
                                                    random_state=2022,
                                                    stratify=df.label_num)


In [19]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

clf = Pipeline([
    ('vectorizer_tfidf',TfidfVectorizer()),
    ('KNN', KNeighborsClassifier())
])

# Fill NaN values with empty strings
X_train = X_train.fillna('')
X_test = X_test.fillna('')

clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.95      0.97      0.96      3863
           1       0.97      0.96      0.96      2364
           2       0.97      0.93      0.95      2124
           3       0.96      0.98      0.97      1734

    accuracy                           0.96     10085
   macro avg       0.96      0.96      0.96     10085
weighted avg       0.96      0.96      0.96     10085



In [20]:
X_test[:5]

Unnamed: 0,text
41535,Seagate 4TB Backup Plus Hub USB 3.0 Desktop 3....
9671,Prestige Marvel Glass 3 Burner Gas Stove (Blac...
49629,iVoltaa Next Gen Compact Wired Selfie Stick fo...
33657,TWO DOTS Air Bra for Girls and Women Combo of ...
19316,The Secret of Secrets: The Secrets of the Gold...


In [21]:
y_test[:5]

Unnamed: 0,label_num
41535,2
9671,0
49629,2
33657,3
19316,1


In [22]:
y_pred[:5]

array([2, 0, 2, 3, 1])

In [24]:
from sklearn.naive_bayes import MultinomialNB

clf = Pipeline([
    ('vectorizer_tfidf',TfidfVectorizer()),
    ('Multi NB', MultinomialNB())
])

clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

In [25]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.90      0.98      0.94      3863
           1       0.97      0.92      0.95      2364
           2       0.97      0.90      0.93      2124
           3       0.98      0.93      0.95      1734

    accuracy                           0.94     10085
   macro avg       0.95      0.93      0.94     10085
weighted avg       0.94      0.94      0.94     10085



In [None]:
import spacy
nlp = spacy.load("en_core_web_sm")
def preprocess(text):
    doc = nlp(text)
    filtered_tokens = []
    for token in doc:
        if token.is_stop or token.is_punct:
            continue
        filtered_tokens.append(token.lemma_)
    return " ".join(filtered_tokens)


### Using Word Embeddings

### Word vectors in Gensim overview

In [26]:
!pip install gensim

Collecting gensim
  Downloading gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.1 kB)
Collecting numpy<2.0,>=1.18.5 (from gensim)
  Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting scipy<1.14.0,>=1.7.0 (from gensim)
  Downloading scipy-1.13.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.6/60.6 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
Downloading gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (26.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m26.7/26.7 MB[0m [31m22.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.3 MB)
[2K   [90m━━━━━━━━━━━

In [1]:
import gensim.downloader as api

wv = api.load('word2vec-google-news-300')



In [2]:
wv.similarity(w1="great", w2="good")

0.729151

In [3]:
wv.similarity(w1="great", w2="well")

0.4098271

In [4]:
wv.most_similar("good")

[('great', 0.7291510105133057),
 ('bad', 0.7190051078796387),
 ('terrific', 0.6889115571975708),
 ('decent', 0.6837348341941833),
 ('nice', 0.6836092472076416),
 ('excellent', 0.644292950630188),
 ('fantastic', 0.6407778263092041),
 ('better', 0.6120728850364685),
 ('solid', 0.5806034803390503),
 ('lousy', 0.576420247554779)]

In [10]:
wv.doesnt_match(["facebook", "cat", "google", "microsoft"])

'cat'