#### Libaries 📚

In [100]:
import gzip   # compression libary
import gensim
import logging
import pandas as pd
from gensim.utils import simple_preprocess
from gensim.models.fasttext import FastText


logging.basicConfig(format='%(asctime)s : %(levelname)s :')

### *Data*

In [43]:
!git clone https://github.com/naoufal2807/Word-Embeddings.git

fatal: destination path 'Word-Embeddings' already exists and is not an empty directory.


In [44]:
!cd Word-Embeddings/

In [51]:
file_path = 'Word-Embeddings/text.txt'

data = pd.read_csv(file_path, sep=']', header=None)



In [58]:
#drop the tag column

data_file = data.drop(columns=[0]) # column index 0

data_file[1]


0        During the period of falling in love, each ti...
1              When I was involved in a traffic accident.
2        When I was driving home after  several days o...
3        When I lost the person who meant the most to me.
4        The time I knocked a deer down - the sight of...
                              ...                        
7475     Two years back someone invited me to be the t...
7476     I had taken the responsibility to do somethin...
7477     I was at home and I heard a loud sound of spi...
7478     I did not do the homework that the teacher ha...
7479     I had shouted at my younger brother and he wa...
Name: 1, Length: 7480, dtype: object

In [62]:
## Building the corpus


def tokenize_text(text):
    return simple_preprocess(text)

# Applying tokenization to all rows in 'text' column
data_file['tokens'] = data_file[1].apply(tokenize_text)
print("\nDataFrame with Tokens:")





DataFrame with Tokens:


Unnamed: 0,1,tokens
0,"During the period of falling in love, each ti...","[during, the, period, of, falling, in, love, e..."
1,When I was involved in a traffic accident.,"[when, was, involved, in, traffic, accident]"
2,When I was driving home after several days o...,"[when, was, driving, home, after, several, day..."
3,When I lost the person who meant the most to me.,"[when, lost, the, person, who, meant, the, mos..."
4,The time I knocked a deer down - the sight of...,"[the, time, knocked, deer, down, the, sight, o..."
...,...,...
7475,Two years back someone invited me to be the t...,"[two, years, back, someone, invited, me, to, b..."
7476,I had taken the responsibility to do somethin...,"[had, taken, the, responsibility, to, do, some..."
7477,I was at home and I heard a loud sound of spi...,"[was, at, home, and, heard, loud, sound, of, s..."
7478,I did not do the homework that the teacher ha...,"[did, not, do, the, homework, that, the, teach..."


In [67]:
# converting the tokens into list
tokens = data_file['tokens'].to_list()


['during',
 'the',
 'period',
 'of',
 'falling',
 'in',
 'love',
 'each',
 'time',
 'that',
 'we',
 'met',
 'and',
 'especially',
 'when',
 'we',
 'had',
 'not',
 'met',
 'for',
 'long',
 'time']

In [112]:
model_cbow = gensim.models.Word2Vec (tokens, vector_size= 150 , window = 5 , min_count=2 , workers= 15, sg=0) # sg : 0 -> for Bag-of-words
model_cbow.train(tokens,total_examples=len(tokens),epochs=10)



(1019909, 1486170)

### Test 1 : similarity



In [113]:
# looking for words similar to w1
w1 = "class"
model_cbow.wv.most_similar (positive=w1)

[('mates', 0.7163353562355042),
 ('leader', 0.7153098583221436),
 ('college', 0.7054522037506104),
 ('list', 0.7047439217567444),
 ('student', 0.7029987573623657),
 ('term', 0.6980636119842529),
 ('institute', 0.6935988068580627),
 ('paper', 0.6932812333106995),
 ('library', 0.6862901449203491),
 ('english', 0.6850241422653198)]

In [107]:
# similarity between two different words
model_cbow.wv.similarity(w1="term",w2="student")

0.73035157

In [108]:
# looking for words not similar to w2
w2 = "study"
model_cbow.wv.most_similar(negative=w2)


[('saw', 0.30866482853889465),
 ('old', 0.264975905418396),
 ('accident', 0.258465051651001),
 ('by', 0.24145564436912537),
 ('died', 0.18612894415855408),
 ('design', 0.1835113912820816),
 ('repressed', 0.1829172819852829),
 ('suddenly', 0.17492665350437164),
 ('lost', 0.14677149057388306),
 ('fell', 0.13247248530387878)]

In [109]:
# finding the odd word that should be excluded from the list

words = ["old", "class", "study","job", "fire"]

model_cbow.wv.doesnt_match(words)

'old'

### Test 2 :  FastText & Skipgram & Cbow

In [110]:
# training the FastText model
model_subword = FastText(tokens, vector_size=150, window=10, min_count=2, workers=10, min_n=3, max_n=6)  # instantiate
%time model_subword.train(tokens,total_examples=len(tokens),epochs=10)



CPU times: user 31.2 s, sys: 110 ms, total: 31.3 s
Wall time: 23.5 s


(1020092, 1486170)

In [114]:
# training the skipgram model

model_skipgram = gensim.models.Word2Vec (tokens, vector_size= 150 , window = 5 , min_count=2 , workers= 15, sg=1)
model_skipgram.train(tokens,total_examples=len(tokens),epochs=10)



(1019766, 1486170)

In [134]:
# subword model
w4 = 'class'

model_subword.wv.most_similar(positive=w4)

[('classes', 0.954501211643219),
 ('glass', 0.9263764023780823),
 ('tea', 0.880334734916687),
 ('mass', 0.8790790438652039),
 ('teach', 0.8603469729423523),
 ('student', 0.8547466993331909),
 ('classroom', 0.8485536575317383),
 ('students', 0.845920979976654),
 ('classmates', 0.8416107892990112),
 ('teams', 0.8412540555000305)]

In [135]:
# skipgram
w4 = 'class'

model_skipgram.wv.most_similar(positive=w4)

[('mates', 0.7204114198684692),
 ('lesson', 0.6717144846916199),
 ('composition', 0.6553208231925964),
 ('computer', 0.6436668038368225),
 ('mathematics', 0.6328681111335754),
 ('march', 0.6183663606643677),
 ('leader', 0.6178949475288391),
 ('duty', 0.6158401966094971),
 ('anatomy', 0.6066087484359741),
 ('copying', 0.6004791855812073)]