# Vectorizer

In [1]:
import pandas as pd
import numpy as np
import warnings 
warnings.filterwarnings('ignore')

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [3]:
# Load the sample sentences and their labels from the Excel workbook
# `data.xlsx`; the relative path is resolved against the working directory.
df = pd.read_excel('data.xlsx')

In [4]:
df.head()

Unnamed: 0,test,class
0,I love Bangladesh,1
1,Could you give me an iphone?,0
2,Hello how are you?,1
3,I want to talk you.,1


# Count Vectorizer

In [5]:
# Bag-of-words vectorizer, constructed with no arguments (library defaults).
cv = CountVectorizer()

In [7]:
# Learn the vocabulary from the `test` column and encode every row as a vector
# of per-term counts. The result is a SciPy sparse matrix with one row per
# sentence and one column per vocabulary term.
x = cv.fit_transform(df['test'])

In [8]:
x

<4x14 sparse matrix of type '<class 'numpy.int64'>'
	with 16 stored elements in Compressed Sparse Row format>

In [9]:
x.toarray()

array([[0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
       [1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1],
       [0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1]], dtype=int64)

In [10]:
df.head()

Unnamed: 0,test,class
0,I love Bangladesh,1
1,Could you give me an iphone?,0
2,Hello how are you?,1
3,I want to talk you.,1


In [11]:
# Separate copy of the input frame; the TF-IDF cells further down read their
# sentences from this copy rather than from `df`.
df3 = df.copy()

In [17]:
# Tabulate the count matrix: one row per sentence (indexed by its text), one
# column per vocabulary term.
# The counts are re-derived from the already-fitted `cv` instead of being read
# from `x`, because `x` is rebound to the TF-IDF matrix later in the notebook;
# re-running this cell after that point would otherwise label TF-IDF weights
# with the count vocabulary.
df2 = pd.DataFrame(
    cv.transform(df['test']).toarray(),
    index=df['test'],
    columns=cv.get_feature_names_out(),
)

In [18]:
df2.head()

Unnamed: 0_level_0,an,are,bangladesh,could,give,hello,how,iphone,love,me,talk,to,want,you
test,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
I love Bangladesh,0,0,1,0,0,0,0,0,1,0,0,0,0,0
Could you give me an iphone?,1,0,0,1,1,0,0,1,0,1,0,0,0,1
Hello how are you?,0,1,0,0,0,1,1,0,0,0,0,0,0,1
I want to talk you.,0,0,0,0,0,0,0,0,0,0,1,1,1,1


In [19]:
# Vocabulary terms learned by the count vectorizer; the same array is used as
# the column labels of the count table built earlier.
columns = cv.get_feature_names_out()

In [20]:
columns

array(['an', 'are', 'bangladesh', 'could', 'give', 'hello', 'how',
       'iphone', 'love', 'me', 'talk', 'to', 'want', 'you'], dtype=object)

# TF-IDF Vectorizer

In [21]:
# TF-IDF vectorizer, constructed with no arguments (library defaults).
# Note: despite its name, `idf` is the whole vectorizer, not just IDF weights.
idf = TfidfVectorizer()

In [22]:
# Fit the TF-IDF vectorizer on the copied frame's `test` column and encode
# every row as a vector of TF-IDF weights (SciPy sparse matrix).
# NOTE: this rebinds `x`, which until now held the CountVectorizer matrix.
x = idf.fit_transform(df3['test'])

In [23]:
x

<4x14 sparse matrix of type '<class 'numpy.float64'>'
	with 16 stored elements in Compressed Sparse Row format>

In [24]:
x.toarray()

array([[0.        , 0.        , 0.70710678, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.70710678, 0.        ,
        0.        , 0.        , 0.        , 0.        ],
       [0.43003652, 0.        , 0.        , 0.43003652, 0.43003652,
        0.        , 0.        , 0.43003652, 0.        , 0.43003652,
        0.        , 0.        , 0.        , 0.27448674],
       [0.        , 0.5417361 , 0.        , 0.        , 0.        ,
        0.5417361 , 0.5417361 , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.34578314],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.5417361 , 0.5417361 , 0.5417361 , 0.34578314]])

In [27]:
# Tabulate the TF-IDF weights: one row per sentence (indexed by its text), one
# column per vocabulary term.
# The weights are re-derived from the already-fitted `idf` instead of being
# read from `x`, because `x` is shared with the count-vectorizer section; if
# those cells are re-run, `x` would hold raw counts and this table would be
# silently wrong.
df4 = pd.DataFrame(
    idf.transform(df3['test']).toarray(),
    index=df3['test'],
    columns=idf.get_feature_names_out(),
)

In [28]:
df4.head()

Unnamed: 0_level_0,an,are,bangladesh,could,give,hello,how,iphone,love,me,talk,to,want,you
test,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
I love Bangladesh,0.0,0.0,0.707107,0.0,0.0,0.0,0.0,0.0,0.707107,0.0,0.0,0.0,0.0,0.0
Could you give me an iphone?,0.430037,0.0,0.0,0.430037,0.430037,0.0,0.0,0.430037,0.0,0.430037,0.0,0.0,0.0,0.274487
Hello how are you?,0.0,0.541736,0.0,0.0,0.0,0.541736,0.541736,0.0,0.0,0.0,0.0,0.0,0.0,0.345783
I want to talk you.,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.541736,0.541736,0.541736,0.345783


# Word2Vec

In [29]:
!pip install gensim

Collecting gensim
  Downloading gensim-4.3.1-cp311-cp311-win_amd64.whl (23.9 MB)
                                              0.0/23.9 MB ? eta -:--:--
                                              0.0/23.9 MB ? eta -:--:--
                                              0.1/23.9 MB 1.3 MB/s eta 0:00:19
                                              0.2/23.9 MB 1.6 MB/s eta 0:00:15
                                              0.2/23.9 MB 2.1 MB/s eta 0:00:12
                                              0.4/23.9 MB 2.3 MB/s eta 0:00:11
                                              0.5/23.9 MB 2.4 MB/s eta 0:00:10
     -                                        0.7/23.9 MB 3.0 MB/s eta 0:00:08
     -                                        1.1/23.9 MB 4.2 MB/s eta 0:00:06
     --                                       1.3/23.9 MB 4.4 MB/s eta 0:00:06
     --                                       1.3/23.9 MB 4.4 MB/s eta 0:00:06
     --                                       1.3/23.9 MB 4.4 MB

In [37]:
import gensim
from gensim.models import Word2Vec
import nltk

In [38]:
from nltk.tokenize import word_tokenize

In [39]:
# Tokenize every sentence in the `test` column; yields one token list per row.
text_vsc = list(map(nltk.word_tokenize, df['test']))

In [40]:
text_vsc

[['I', 'love', 'Bangladesh'],
 ['Could', 'you', 'give', 'me', 'an', 'iphone', '?'],
 ['Hello', 'how', 'are', 'you', '?'],
 ['I', 'want', 'to', 'talk', 'you', '.']]

In [42]:
# Another way: accumulate the per-sentence token lists with an explicit loop.
text = list()
for test in df['test']:
    text += [nltk.word_tokenize(test)]

In [43]:
text

[['I', 'love', 'Bangladesh'],
 ['Could', 'you', 'give', 'me', 'an', 'iphone', '?'],
 ['Hello', 'how', 'are', 'you', '?'],
 ['I', 'want', 'to', 'talk', 'you', '.']]

In [44]:
# Train a Word2Vec model on the tokenized sentences.
# min_count=1 keeps every token; with so few sentences a higher frequency
# threshold would discard most of the vocabulary.
# seed=1 spells out gensim's default seed, and workers=1 removes the ordering
# jitter of multi-threaded training, so re-running the notebook gives the same
# embeddings. (gensim's docs also mention PYTHONHASHSEED for reproducibility
# across interpreter launches -- verify whether it matters for your version.)
model = Word2Vec(text, min_count=1, seed=1, workers=1)

In [45]:
# List the tokens the trained model ranks closest to 'Hello', each with its
# similarity score. The model was trained on only four short sentences, so
# treat these scores as an API demonstration rather than meaningful semantics.
model.wv.most_similar('Hello')

[('?', 0.17272651195526123),
 ('Bangladesh', 0.16695065796375275),
 ('give', 0.11118055880069733),
 ('talk', 0.10947782546281815),
 ('you', 0.07967711985111237),
 ('an', 0.04130832105875015),
 ('me', 0.037714049220085144),
 ('to', 0.0132435392588377),
 ('I', 0.008316040970385075),
 ('love', -0.005900950636714697)]