## Word2Vec: Training from scratch

In [1]:
import pandas as pd

messages = pd.read_csv('smsspamcollection/SMSSpamCollection', sep='\t',
                           names=["label", "message"])
#smsspamcolletion is csv file which is separated by tab \t.
# it has 2 features. 1st is label (dependant) and 2nd (after tab) is messages(input feature)

In [2]:
messages

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [3]:
messages.shape

(5572, 2)

In [4]:
# to see any one entry: for example 100th row

messages['message'].loc[100]

"Please don't text me anymore. I have nothing else to say."

### Data cleaning

#### 1. Text preprocessing using NLTK
- Remove stop words
- Toeknization
- Stemming
- Lemmatization 

#### 2. Converting Text into Vectors
- Bag of Words
- TF-IDF
- Word2Vec
- Avg Word2Vec

In [5]:
import re
import nltk

In [6]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\vidieme\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [7]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

In [8]:
ps= PorterStemmer()

In [9]:
corpus=[]

In [10]:
for i in range(0, len(messages)):
    review = re.sub('[^a-zA-Z]', ' ', messages['message'][i])
    
    
    
type(review)

str

In [11]:
#review is string. I need to convert into list before removing stop words, lemmatization, stemming etc

In [12]:
#Replace special symbols into spaces
# Change into lower case 

for i in range(0, len(messages)):
    review = re.sub('[^a-zA-Z]', ' ', messages['message'][i])
    review = review.lower()
    review = review.split() # review is in string, change into list. type of corpus is list
    
    #removing stop words
    review = [ps.stem(word) for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)
    corpus.append(review)

In [13]:
corpus

['go jurong point crazi avail bugi n great world la e buffet cine got amor wat',
 'ok lar joke wif u oni',
 'free entri wkli comp win fa cup final tkt st may text fa receiv entri question std txt rate c appli',
 'u dun say earli hor u c alreadi say',
 'nah think goe usf live around though',
 'freemsg hey darl week word back like fun still tb ok xxx std chg send rcv',
 'even brother like speak treat like aid patent',
 'per request mell mell oru minnaminungint nurungu vettam set callertun caller press copi friend callertun',
 'winner valu network custom select receivea prize reward claim call claim code kl valid hour',
 'mobil month u r entitl updat latest colour mobil camera free call mobil updat co free',
 'gonna home soon want talk stuff anymor tonight k cri enough today',
 'six chanc win cash pound txt csh send cost p day day tsandc appli repli hl info',
 'urgent week free membership prize jackpot txt word claim c www dbuk net lccltd pobox ldnw rw',
 'search right word thank breather

#### Converting words into vectors
#### 1. Bag of Words

In [14]:

from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=2500, binary= True)
X = cv.fit_transform(corpus).toarray()

#max_features-> top max occuring features. top 2500

In [15]:
X

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [16]:
X[1]# features wrt sentence 1

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [17]:
X.shape #2500 features as I have mentioned in parameters

(5572, 2500)

#### Label encoding for y 

In [18]:
# Label encoding to y features: as of now it is pam and ham
#changing into 1 and 0 

y=pd.get_dummies(messages['label'])

#get_dummies() It converts categorical data into dummy or indicator variables. 

In [19]:
y

Unnamed: 0,ham,spam
0,1,0
1,1,0
2,0,1
3,1,0
4,1,0
...,...,...
5567,0,1
5568,1,0
5569,1,0
5570,1,0


In [20]:
y=y.iloc[:,1].values

In [21]:
y

array([0, 0, 1, ..., 0, 0, 0], dtype=uint8)

#### Train test split

In [22]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)


In [23]:
X_train, y_train

(array([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=int64),
 array([0, 0, 0, ..., 1, 0, 0], dtype=uint8))

#### Applying Naive Bais Algo
- class sklearn.naive_bayes.MultinomialNB
- The multinomial Naive Bayes classifier is suitable for classification with discrete features (e.g., word counts for text classification). 
- The multinomial distribution normally requires integer feature counts. However, in practice, fractional counts such as tf-idf may also work.

In [24]:
from sklearn.naive_bayes import MultinomialNB

In [25]:
spam_detect_model = MultinomialNB().fit(X_train, y_train)
# I have applied algo 

In [26]:
type(spam_detect_model)

sklearn.naive_bayes.MultinomialNB

#### Prediction after applying model

In [27]:
y_pred=spam_detect_model.predict(X_test)

In [28]:
y_pred

array([0, 1, 0, ..., 0, 1, 0], dtype=uint8)

### Accuracy

In [29]:
from sklearn.metrics import accuracy_score,classification_report

In [30]:
score=accuracy_score(y_test,y_pred)
print(score)

# Score= 98%

0.9856502242152466


In [31]:
# Classification Report

from sklearn.metrics import classification_report
print(classification_report(y_pred,y_test))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99       961
           1       0.93      0.97      0.95       154

    accuracy                           0.99      1115
   macro avg       0.96      0.98      0.97      1115
weighted avg       0.99      0.99      0.99      1115



### Creating TF-IDF model

In [32]:
from sklearn.feature_extraction.text import TfidfVectorizer
tv = TfidfVectorizer(max_features=2500,ngram_range=(1,2))
X = tv.fit_transform(corpus).toarray()

In [33]:
# train test split

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)


In [34]:
#fitting TFIDF data into multinomial Naive Bias
from sklearn.naive_bayes import MultinomialNB
spam_detect_model = MultinomialNB().fit(X_train, y_train)

In [35]:
#prediction
y_pred=spam_detect_model.predict(X_test)

In [36]:
score=accuracy_score(y_test,y_pred)
print(score)

# it should be increased. In this case score is decreased by 1%

0.97847533632287


In [37]:
from sklearn.metrics import classification_report
print(classification_report(y_pred,y_test))

              precision    recall  f1-score   support

           0       1.00      0.98      0.99       979
           1       0.85      1.00      0.92       136

    accuracy                           0.98      1115
   macro avg       0.93      0.99      0.95      1115
weighted avg       0.98      0.98      0.98      1115



In [38]:
#using random forest

from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier()

classifier.fit(X_train, y_train)

RandomForestClassifier()

In [39]:
y_pred= classifier.predict(X_test)

In [40]:
accuracy_score(y_pred, y_test)

0.9847533632286996

In [41]:
print(classification_report(y_pred, y_test))

              precision    recall  f1-score   support

           0       1.00      0.98      0.99       972
           1       0.89      1.00      0.94       143

    accuracy                           0.98      1115
   macro avg       0.95      0.99      0.97      1115
weighted avg       0.99      0.98      0.99      1115



## Average Word2Vec

In [42]:
from nltk import sent_tokenize
from gensim.utils import simple_preprocess


In [43]:
corpus[0]

'go jurong point crazi avail bugi n great world la e buffet cine got amor wat'

In [46]:
#simple_preprocess()
#convert doc into lower case sentence wise rather than word wise as we did before
#or I can give sent.lower()

words=[]
for sent in corpus:
    sent_token= sent_tokenize(sent)
    for sent in sent_token:
        words.append(simple_preprocess(sent))

In [47]:
words
#unique words presnet in every sentences

[['go',
  'jurong',
  'point',
  'crazi',
  'avail',
  'bugi',
  'great',
  'world',
  'la',
  'buffet',
  'cine',
  'got',
  'amor',
  'wat'],
 ['ok', 'lar', 'joke', 'wif', 'oni'],
 ['free',
  'entri',
  'wkli',
  'comp',
  'win',
  'fa',
  'cup',
  'final',
  'tkt',
  'st',
  'may',
  'text',
  'fa',
  'receiv',
  'entri',
  'question',
  'std',
  'txt',
  'rate',
  'appli'],
 ['dun', 'say', 'earli', 'hor', 'alreadi', 'say'],
 ['nah', 'think', 'goe', 'usf', 'live', 'around', 'though'],
 ['freemsg',
  'hey',
  'darl',
  'week',
  'word',
  'back',
  'like',
  'fun',
  'still',
  'tb',
  'ok',
  'xxx',
  'std',
  'chg',
  'send',
  'rcv'],
 ['even', 'brother', 'like', 'speak', 'treat', 'like', 'aid', 'patent'],
 ['per',
  'request',
  'mell',
  'mell',
  'oru',
  'minnaminungint',
  'nurungu',
  'vettam',
  'set',
  'callertun',
  'caller',
  'press',
  'copi',
  'friend',
  'callertun'],
 ['winner',
  'valu',
  'network',
  'custom',
  'select',
  'receivea',
  'prize',
  'reward',
  

### to create word2vec from scratch


In [49]:
import gensim

### Creating Word2Vec

In [80]:


model= gensim.models.Word2Vec(words, window=5, min_count=2, vector_size=100)
# 1st arg is sentences
# by default my dimension is 100 (vector_size arg) in gensim model
# window size is 5, when we use CBOW or skipgram: understand context of lhs and rhs
# min count: ignores all the wprds whose total frequency is this, in this case 2
# that is, words which have occured less than 2 those are ignored. 

# all these are hyper parameters. 

In [81]:
model.wv.index_to_key

# all vocabulary 

['call',
 'go',
 'get',
 'ur',
 'gt',
 'lt',
 'come',
 'day',
 'ok',
 'free',
 'know',
 'love',
 'like',
 'time',
 'want',
 'good',
 'got',
 'text',
 'send',
 'txt',
 'need',
 'one',
 'today',
 'take',
 'stop',
 'see',
 'home',
 'think',
 'repli',
 'lor',
 'sorri',
 'still',
 'tell',
 'mobil',
 'back',
 'da',
 'dont',
 'make',
 'phone',
 'pleas',
 'week',
 'hi',
 'say',
 'work',
 'new',
 'later',
 'pl',
 'hope',
 'miss',
 'ask',
 'co',
 'msg',
 'min',
 'meet',
 'messag',
 'dear',
 'night',
 'happi',
 'wait',
 'well',
 'give',
 'thing',
 'tri',
 'great',
 'much',
 'oh',
 'claim',
 'wat',
 'hey',
 'number',
 'friend',
 'thank',
 'way',
 'ye',
 'www',
 'let',
 'prize',
 'feel',
 'right',
 'even',
 'tomorrow',
 'win',
 'pick',
 'alreadi',
 'tone',
 'care',
 'cash',
 'said',
 'amp',
 'im',
 'leav',
 'yeah',
 'realli',
 'find',
 'babe',
 'life',
 'morn',
 'last',
 'sleep',
 'keep',
 'uk',
 'year',
 'servic',
 'nokia',
 'sure',
 'anyth',
 'com',
 'buy',
 'use',
 'would',
 'start',
 'contact',

In [82]:
model.corpus_count

5564

In [83]:
model.epochs

5

In [84]:
model.wv.similar_by_word('prize')

[('claim', 0.9993270039558411),
 ('show', 0.9990811944007874),
 ('line', 0.9990561604499817),
 ('cash', 0.9990326762199402),
 ('award', 0.9990260004997253),
 ('land', 0.9989732503890991),
 ('call', 0.9989001154899597),
 ('guarante', 0.998888373374939),
 ('contact', 0.9987936615943909),
 ('draw', 0.9987353086471558)]

In [57]:
model.wv.similar_by_word('Whisky') # whisky is not in my vocabulary list

KeyError: "Key 'Whisky' not present"

In [85]:
model.wv.similar_by_word('love')

[('one', 0.9997818470001221),
 ('much', 0.9997800588607788),
 ('amp', 0.9997643232345581),
 ('need', 0.9997639656066895),
 ('know', 0.9997488260269165),
 ('feel', 0.999742865562439),
 ('day', 0.9997408986091614),
 ('see', 0.999736487865448),
 ('great', 0.9997352361679077),
 ('hey', 0.9997348785400391)]

In [86]:
model.wv.similar_by_word('hope')

[('make', 0.9997502565383911),
 ('amp', 0.999727725982666),
 ('much', 0.9997222423553467),
 ('even', 0.9997158050537109),
 ('love', 0.9997060894966125),
 ('one', 0.999704897403717),
 ('well', 0.9996997117996216),
 ('come', 0.9996938705444336),
 ('day', 0.9996908903121948),
 ('realli', 0.9996875524520874)]

### This is how I have created Word2Vec

In [87]:
vec_prize= model.wv["prize"]
vec_prize
#this has 1000D this means, sentence which has prize has 10 words and each has 100 vectors
#hence total 1K

array([-4.02185768e-01,  4.50449735e-01,  1.35718659e-01, -5.13916835e-04,
        5.59742264e-02, -7.42697716e-01,  1.55290395e-01,  8.66821289e-01,
       -3.04508835e-01, -3.15423787e-01, -2.77953327e-01, -5.98715842e-01,
       -1.79450542e-01,  4.74428497e-02,  2.11281419e-01, -2.59484202e-01,
        2.65139174e-02, -5.02135277e-01,  4.04362530e-02, -8.99677038e-01,
        2.90565431e-01,  2.78807104e-01,  3.89328040e-02, -1.33936420e-01,
       -2.34710962e-01,  1.06480151e-01, -4.51771379e-01, -4.23293978e-01,
       -2.70217240e-01,  1.85181141e-01,  4.38133895e-01,  1.16578914e-01,
       -6.33895025e-03, -2.33523220e-01, -2.97728598e-01,  4.70957249e-01,
       -1.04136668e-01, -3.52541775e-01, -6.49156123e-02, -7.25777507e-01,
        1.28246397e-01, -2.69180387e-01, -1.15322493e-01,  6.36564121e-02,
        3.43681335e-01, -1.55260161e-01, -3.21301967e-01, -9.82321650e-02,
        3.33096027e-01,  3.99331033e-01,  4.11060303e-02, -4.01653618e-01,
       -1.30694643e-01, -

### Now applying Average word2Vec

- vector_size argument in Word2Vec() is by default 100. 
- This means each word has 100D vector as shown in last section
- As this is huge and computational expensive, we will try to reduce size by averaging the vectors in each sentence, so that each sentence has 100D vector rather than each word in that sentence. 


In [73]:
import numpy as np

In [88]:
#creating function 
#calculate model.wv[word] and
# calculate mean of that word only if word is present in index_to_key (vocab list)

def avg_word2vec(doc):
    
    return np.mean([model.wv[word] for word in doc if word in model.wv.index_to_key], axis= 0)

#this use CBOW

#it removes out of vocabulary words
# sent= [word for word in doc if word in model.wv.index_to_key]

In [89]:
!pip install tdqm
#tqdm module allows for the generation of progress bars in Python. 
#The name is derived from the Arabic word, “taqaddum,” which translates as “progress.”
# this is the bar which shows progress like 10% increase to 70 90 100.. 




In [90]:
from tqdm import tqdm

In [91]:
words[73]

['perform']

In [92]:
words[90]

['yeah', 'stand', 'close', 'tho', 'catch', 'someth']

In [93]:
type(model.wv.index_to_key)

list

In [94]:
#applying Average word2vec function on each sentences that are present in words. 
words

[['go',
  'jurong',
  'point',
  'crazi',
  'avail',
  'bugi',
  'great',
  'world',
  'la',
  'buffet',
  'cine',
  'got',
  'amor',
  'wat'],
 ['ok', 'lar', 'joke', 'wif', 'oni'],
 ['free',
  'entri',
  'wkli',
  'comp',
  'win',
  'fa',
  'cup',
  'final',
  'tkt',
  'st',
  'may',
  'text',
  'fa',
  'receiv',
  'entri',
  'question',
  'std',
  'txt',
  'rate',
  'appli'],
 ['dun', 'say', 'earli', 'hor', 'alreadi', 'say'],
 ['nah', 'think', 'goe', 'usf', 'live', 'around', 'though'],
 ['freemsg',
  'hey',
  'darl',
  'week',
  'word',
  'back',
  'like',
  'fun',
  'still',
  'tb',
  'ok',
  'xxx',
  'std',
  'chg',
  'send',
  'rcv'],
 ['even', 'brother', 'like', 'speak', 'treat', 'like', 'aid', 'patent'],
 ['per',
  'request',
  'mell',
  'mell',
  'oru',
  'minnaminungint',
  'nurungu',
  'vettam',
  'set',
  'callertun',
  'caller',
  'press',
  'copi',
  'friend',
  'callertun'],
 ['winner',
  'valu',
  'network',
  'custom',
  'select',
  'receivea',
  'prize',
  'reward',
  

In [95]:
X=[] #create empty list 

for i in tqdm(range(len(words))):
    X.append(avg_word2vec(words[i]))

100%|████████████████████████████████████████████████████████████████████████████| 5564/5564 [00:00<00:00, 6038.28it/s]


In [96]:
type(X) # as X is list, Let me convert this to array 

list

In [97]:
#X_new is my input feature

X_new= np.array(X)

  X_new= np.array(X)


In [99]:
X_new[0] #first sentence

array([-0.21813224,  0.24843091,  0.06099826, -0.00165417,  0.04416313,
       -0.42887047,  0.09013149,  0.49663517, -0.18018387, -0.19565046,
       -0.14305048, -0.3539252 , -0.1037832 ,  0.00812788,  0.106863  ,
       -0.15136993,  0.01721031, -0.30284068,  0.02463922, -0.52570397,
        0.18089384,  0.15697713,  0.02861671, -0.10078231, -0.13433093,
        0.05915466, -0.2464479 , -0.23975702, -0.1730606 ,  0.1008155 ,
        0.26567468,  0.07673643,  0.02683468, -0.1477121 , -0.17016141,
        0.27752995, -0.06534693, -0.21420418, -0.06379408, -0.42791203,
        0.08081   , -0.20205802, -0.06639843,  0.05780741,  0.2037528 ,
       -0.10528388, -0.18377785, -0.05834742,  0.19054097,  0.2260568 ,
        0.02889698, -0.2528446 , -0.08151831,  0.00970408, -0.16206008,
        0.1336651 ,  0.14024864, -0.0097893 , -0.30026433,  0.05434879,
        0.07177395,  0.19802295, -0.14401798, -0.06203267, -0.29057965,
        0.15307291,  0.0828931 ,  0.19388537, -0.21267115,  0.27

In [100]:
X_new[0].shape # this has 100 dimension

(100,)

In [102]:
# checking: this is 2nd sentence
words[1]

['ok', 'lar', 'joke', 'wif', 'oni']

In [104]:
#dimensions created fr 2nd sentence 
X_new[1]

array([-0.17380303,  0.19104062,  0.04686403, -0.00448557,  0.04009185,
       -0.33535498,  0.07716765,  0.38629708, -0.13950846, -0.14658195,
       -0.11110125, -0.27871114, -0.07885485,  0.00401136,  0.08900166,
       -0.121962  ,  0.01259315, -0.23792751,  0.01592175, -0.40975326,
        0.14206842,  0.12313727,  0.0251171 , -0.07866804, -0.09935243,
        0.04626891, -0.19556025, -0.18791597, -0.1361802 ,  0.07733121,
        0.21060649,  0.05864396,  0.02277444, -0.11016295, -0.1355041 ,
        0.21490154, -0.05534313, -0.16644113, -0.04369146, -0.3359596 ,
        0.06529504, -0.1571442 , -0.04908994,  0.04053568,  0.16083315,
       -0.08552096, -0.1394999 , -0.042547  ,  0.15029094,  0.17358002,
        0.02610753, -0.19845656, -0.06568725,  0.00810429, -0.1272393 ,
        0.10115224,  0.11074144, -0.01136414, -0.23375168,  0.04294322,
        0.04942609,  0.1509769 , -0.10937177, -0.04786009, -0.22626689,
        0.11696692,  0.06222355,  0.15352443, -0.17081815,  0.21

In [105]:
#to see all my input features
X_new

array([array([-0.21813224,  0.24843091,  0.06099826, -0.00165417,  0.04416313,
              -0.42887047,  0.09013149,  0.49663517, -0.18018387, -0.19565046,
              -0.14305048, -0.3539252 , -0.1037832 ,  0.00812788,  0.106863  ,
              -0.15136993,  0.01721031, -0.30284068,  0.02463922, -0.52570397,
               0.18089384,  0.15697713,  0.02861671, -0.10078231, -0.13433093,
               0.05915466, -0.2464479 , -0.23975702, -0.1730606 ,  0.1008155 ,
               0.26567468,  0.07673643,  0.02683468, -0.1477121 , -0.17016141,
               0.27752995, -0.06534693, -0.21420418, -0.06379408, -0.42791203,
               0.08081   , -0.20205802, -0.06639843,  0.05780741,  0.2037528 ,
              -0.10528388, -0.18377785, -0.05834742,  0.19054097,  0.2260568 ,
               0.02889698, -0.2528446 , -0.08151831,  0.00970408, -0.16206008,
               0.1336651 ,  0.14024864, -0.0097893 , -0.30026433,  0.05434879,
               0.07177395,  0.19802295, -0.14401798,

In [None]:
# Train test split 
# Apply model 

In [106]:
y # ouput feature and X_new is input feature 
#train test split this and apply model

array([0, 0, 1, ..., 0, 0, 0], dtype=uint8)

In [None]:
#assignment : IMDB dataset for 50K movie review kaggle