<a href="https://colab.research.google.com/github/mehrn79/npl_wrod2wec/blob/main/NLP_2wec.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Word2wec implementation using skip-grams**

##**Preprocessing**

### **librarys for preprocessing**

In [None]:
pip install hazm

In [8]:
from __future__ import unicode_literals
from hazm import *
from keras.preprocessing import text
from keras.preprocessing.sequence import skipgrams

### **getting our corpes and stop words**

In [69]:
with open('/content/shams.txt') as f:
  lines = f.readlines()

with open('/content/stopwords.txt') as file:
  stopLines = file.readlines()
stopWord = [item.replace('\n',"") for item in stopLines]
  

### **cleaning our corpes by stop words**

In [91]:
words= []
for sent in lines :
   words.append(word_tokenize(sent))

corpes=[]
for wordSent in words :
  cleanSent=[]
  for word in wordSent :
    if word not in stopWord :
      cleanSent.append(word)
  if(len(cleanSent)>1) :
    corpes.append(' '.join(cleanSent))





رضای آدم گریست سیصد خنده وصلش گشاده گشت دهن


### **tokenize our corpes**

In [92]:
tokenizer = text.Tokenizer()
tokenizer.fit_on_texts(corpes)

word2id = tokenizer.word_index
id2word = {v:k for k, v in word2id.items()}

vocab_size = len(word2id) + 1 
embed_size = 100

sentencesID = [[word2id[w] for w in text.text_to_word_sequence(doc)] for doc in corpes]


### **generate skip-grams**

In [93]:
skip_grams = [skipgrams(sentID, vocabulary_size=vocab_size, window_size=2) for sentID in sentencesID]


## **Build the skip-gram model architecture**

### **librarys for skip-gram model architecture**

In [94]:
 from keras.preprocessing.sequence import skipgrams 
 from keras.layers import *
 from keras.layers.core import Dense, Reshape
 from keras.layers.embeddings import Embedding
 from keras.models import Model,Sequential 
 import numpy as np

### **implementation**

In [95]:
targetWord_model = Sequential()
targetWord_model.add(Embedding(vocab_size, embed_size,
                         embeddings_initializer="glorot_uniform",
                         input_length=1))
targetWord_model.add(Reshape((embed_size, )))

contextWord_model = Sequential()
contextWord_model.add(Embedding(vocab_size, embed_size,
                  embeddings_initializer="glorot_uniform",
                  input_length=1))
contextWord_model.add(Reshape((embed_size,)))



merged_output = add([targetWord_model.output, contextWord_model.output]) 
model_combined = Sequential()
model_combined.add(Dense(1, kernel_initializer="glorot_uniform", activation="sigmoid"))
final_model = Model([targetWord_model.input, contextWord_model.input], model_combined(merged_output))
final_model.compile(loss="mean_squared_error", optimizer="Adam")
final_model.summary()

Model: "model_13"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 embedding_30_input (InputLayer  [(None, 1)]         0           []                               
 )                                                                                                
                                                                                                  
 embedding_31_input (InputLayer  [(None, 1)]         0           []                               
 )                                                                                                
                                                                                                  
 embedding_30 (Embedding)       (None, 1, 100)       983400      ['embedding_30_input[0][0]']     
                                                                                           

## **train the model**

In [97]:

for epoch in range(1, 50):
     loss = 0
     for i, elem in enumerate(skip_grams):
         pair_first_elem = np.array(list(zip(*elem[0]))[0], dtype='int32')
         pair_second_elem = np.array(list(zip(*elem[0]))[1], dtype='int32')
         labels = np.array(elem[1], dtype='int32')
         X = [pair_first_elem, pair_second_elem]
         Y = labels
         if i % 10000 == 0:
             print('Processed {} (skip_first, skip_second, relevance) pairs'.format(i))
         loss += final_model.train_on_batch(X,Y)  

     print('Epoch:', epoch, 'Loss:', loss)

Processed 0 (skip_first, skip_second, relevance) pairs
Epoch: 1 Loss: 1047.8232188560069
Processed 0 (skip_first, skip_second, relevance) pairs
Epoch: 2 Loss: 909.7456455137581
Processed 0 (skip_first, skip_second, relevance) pairs
Epoch: 3 Loss: 822.9532449673861
Processed 0 (skip_first, skip_second, relevance) pairs
Epoch: 4 Loss: 756.4855859400705
Processed 0 (skip_first, skip_second, relevance) pairs
Epoch: 5 Loss: 712.5904396534897
Processed 0 (skip_first, skip_second, relevance) pairs
Epoch: 6 Loss: 681.6832656539045
Processed 0 (skip_first, skip_second, relevance) pairs
Epoch: 7 Loss: 659.4706270527095
Processed 0 (skip_first, skip_second, relevance) pairs
Epoch: 8 Loss: 641.9601009683684
Processed 0 (skip_first, skip_second, relevance) pairs
Epoch: 9 Loss: 627.7781069034245
Processed 0 (skip_first, skip_second, relevance) pairs
Epoch: 10 Loss: 616.2313599949703
Processed 0 (skip_first, skip_second, relevance) pairs
Epoch: 11 Loss: 607.2489358967869
Processed 0 (skip_first, skip

KeyboardInterrupt: ignored

## **getting word embeddings**

In [98]:
import pandas as pd                     
from scipy.sparse import csr_matrix     
%matplotlib inline
word_embed_layer = targetWord_model.layers[0]
weights = word_embed_layer.get_weights()[0][1:]
print(weights.shape)
pd.DataFrame(weights, index=id2word.values()).head()




(9833, 100)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
جان,-0.053558,-0.163965,-0.050072,-0.174307,0.020303,0.023441,-0.012687,0.159729,-0.090239,-0.108262,...,-0.187364,-0.010242,0.126306,0.054474,-0.136347,-0.011369,0.072804,0.080109,-0.167982,0.002764
دل,0.077389,-0.12555,0.038674,-0.024768,-0.131657,0.106669,0.147769,-0.071099,0.129209,0.020927,...,-0.058013,0.12352,-0.02046,0.094458,0.080051,0.007871,-0.02251,0.096668,-0.038876,0.233341
عشق,-0.008767,-0.191138,-0.127382,-0.086471,0.141952,0.032412,-0.083442,0.073681,-0.06822,-0.069479,...,-0.277621,0.042796,0.127504,0.021125,0.033599,0.204628,-0.087292,0.067262,-0.211242,-0.015171
جهان,-0.201111,0.043621,0.125963,0.159922,-0.2775,-0.002178,-0.023111,-0.042849,-0.181581,-0.113298,...,-0.001987,0.097173,0.002007,-0.132032,-0.230522,-0.171756,0.196306,-0.051702,-0.193303,0.223304
اندر,0.029862,0.031981,-0.081911,0.060354,0.072531,-0.018876,0.10468,-0.045227,0.00912,-0.139848,...,0.00452,0.301435,0.100419,0.176141,0.126837,0.106736,0.292485,-0.060127,-0.082429,0.010457


In [100]:
from sklearn.metrics.pairwise import euclidean_distances

distance_matrix = euclidean_distances(weights)
print(distance_matrix.shape)

similar_words = {search_term: [id2word[idx] for idx in distance_matrix[word2id[search_term]-1].argsort()[1:15]+1] 
                   for search_term in ['جان','عشق','دل']}

similar_words

(9833, 9833)


{'جان': ['تسلیم',
  'بعث',
  'نالان',
  'مورش',
  'معکم',
  'اصطفای',
  'عشق',
  'ذاتی',
  'پرعسل',
  'قید',
  'معارف',
  'نامدشان',
  'قبیحی',
  'منزلش'],
 'دل': ['المستغاث',
  'بالطفی',
  'دمیده',
  'جراره',
  'نالان',
  'درغژم',
  'رمیدستی',
  'مردفکن',
  'یرکن',
  'افلاکی',
  'القفا',
  'پهنا',
  'بارم',
  'بودم'],
 'عشق': ['نقدر',
  'صبغنا',
  'میذن',
  'عنینه',
  'ملاقات',
  'القمر',
  'آفرید',
  'کربلا',
  'بالاحسان',
  'افزایش',
  'ششه',
  'تمنن',
  'بوالفتوحی',
  'ناسیه']}