<a href="https://colab.research.google.com/github/pickledherring/NLP-group/blob/main/Added_code_nn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [247]:
from google.colab import auth
from googleapiclient.discovery import build
from io import FileIO
from googleapiclient.http import MediaIoBaseDownload

import pandas as pd
import nltk
from sklearn.model_selection import train_test_split

try:
  stop_words = nltk.corpus.stopwords.words("english")
except:
  nltk.download('stopwords')
  stop_words = nltk.corpus.stopwords.words("english")

import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.metrics import CosineSimilarity
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error
import torch
import random
import matplotlib.pyplot as plt
%matplotlib inline

In [248]:
#  Get the files from google drive
auth.authenticate_user()
drive_service = build('drive', 'v3')

# Get english train data file
file_id = '1m3Ax9Z8OHMU-7FqraKc-ddI3YQ7yY_Q6'  # file id on the Google Drive
downloaded = FileIO("en.trial.complete.json", 'w')
request = drive_service.files().get_media(fileId=file_id)
downloader = MediaIoBaseDownload(downloaded, request)
done = False
while done is False:
  status, done = downloader.next_chunk()
  print("Download {}%.".format(int(status.progress() * 100)))


Download 100%.


In [377]:
en_df = pd.read_json("en.trial.complete.json")
en_df.head()

200


In [250]:
en_df.dtypes

id            object
word          object
pos           object
gloss         object
example       object
type          object
counts         int64
f_rnk          int64
concrete       int64
polysemous     int64
sgns          object
char          object
electra       object
dtype: object

In [251]:
en_df[en_df.word.duplicated()]

Unnamed: 0,id,word,pos,gloss,example,type,counts,f_rnk,concrete,polysemous,sgns,char,electra
66,en.trial.67,divorce,verb,To legally dissolve a marriage between two peo...,"A ship captain can marry couples , but can not...",paraphrastic,21171,3773,0,0,"[1.579477787, -1.7040250301, 2.4623465538, -0....","[-0.09191438560000001, -0.0440451615, -0.45280...","[0.0899147466, 0.0271891933, 0.023069024100000..."
188,en.trial.189,forthwith,adverb,Without delay ; immediately .,"Then Proclamation was made , that they that ha...",synonym/antonym-based,1179,31462,0,1,"[0.7841670513, 0.6406752467, -1.0482375622, -0...","[-8.645650000000001e-05, 0.161273554, 0.082766...","[-1.0183763504, -0.3864063621, 0.0048319194, -..."


we have words to disambiguate! forthwith looks to be the same with a different example though. <br>thoughts on disambiguation (feel free to add):<br><br> need to gather contex, but all I can think of is to train the NN to find the part of speech of the word and maybe type. I think we are only given the gloss and from that predict the embedding, so we (maybe) can't use the rest of it except in training. probably a good idea regardless of the disambiguation goal. <br><br> we could also retrain the model on incorrect words that are polysemous, like a boosting method. <br><br> lastly, disambiguation might not be necessary given the gloss and the vector might not be so similar. should look at the distance between these words in the embeddings. glosses look different, but I can see the top few elements of these two polysemous words are similar in all of the embeddings.

In [252]:
en_df[(en_df.word == "divorce") | (en_df.word == "forthwith")]

Unnamed: 0,id,word,pos,gloss,example,type,counts,f_rnk,concrete,polysemous,sgns,char,electra
50,en.trial.51,divorce,verb,To separate something that was connected .,The radical group voted to divorce itself from...,hypernym-based,21171,3773,0,0,"[1.579477787, -1.7040250301, 2.4623465538, -0....","[-0.09191438560000001, -0.0440451615, -0.45280...","[-0.6520454288, -0.1928912848, 0.0298318155, -..."
66,en.trial.67,divorce,verb,To legally dissolve a marriage between two peo...,"A ship captain can marry couples , but can not...",paraphrastic,21171,3773,0,0,"[1.579477787, -1.7040250301, 2.4623465538, -0....","[-0.09191438560000001, -0.0440451615, -0.45280...","[0.0899147466, 0.0271891933, 0.023069024100000..."
160,en.trial.161,forthwith,adverb,Without delay ; immediately .,"Let ther be Light , said God , and forthwith L...",synonym/antonym-based,1179,31462,0,1,"[0.7841670513, 0.6406752467, -1.0482375622, -0...","[-8.645650000000001e-05, 0.161273554, 0.082766...","[-0.9691627026, -0.20675842460000002, 0.184671..."
188,en.trial.189,forthwith,adverb,Without delay ; immediately .,"Then Proclamation was made , that they that ha...",synonym/antonym-based,1179,31462,0,1,"[0.7841670513, 0.6406752467, -1.0482375622, -0...","[-8.645650000000001e-05, 0.161273554, 0.082766...","[-1.0183763504, -0.3864063621, 0.0048319194, -..."


clean glosses of punctuation, stopwords, duplicates, turn into lists.

In [253]:
def clean(gloss):
  tokenizer = nltk.RegexpTokenizer(r"\w+")
  cleaned = tokenizer.tokenize(gloss)
  cleaned = list(set([word.lower() for word in cleaned]))
  cleaned = [word for word in cleaned if not word in stop_words]
  return cleaned

gloss_lists = en_df.gloss.apply(clean)
gloss_lists

0                                      [pleasant, clear]
1                          [things, mixture, substances]
2                             [institution, established]
3                        [domestic, especially, servant]
4                              [find, try, search, look]
                             ...                        
195                        [plant, animal, color, cells]
196                                [seeming, appearance]
197       [vehicle, proceed, travel, permission, person]
198                      [sitting, around, much, moving]
199    [heraldic, word, part, forming, sentence, phra...
Name: gloss, Length: 200, dtype: object

In [254]:
# list of all context words
context_voc = []
for i in range(len(gloss_lists)):
  for j in range(len(gloss_lists[i])):
    if not gloss_lists[i][j] in context_voc:
      context_voc.append(gloss_lists[i][j])

based on an SGNS following this guide: https://medium.com/towards-datascience/word2vec-negative-sampling-made-easy-7a1a647e07a4


In [255]:
# true context words for each defined word (center word)
trues = []
for i in range(len(gloss_lists)):
  for j in range(len(gloss_lists[i])):
    index = context_voc.index(gloss_lists[i][j])
    # append index of center in gloss_lists, index of context in context_voc, and 1 for true
    trues.append([i, index, 1])

In [256]:
trues[:5]

[[0, 0, 1], [0, 1, 1], [1, 2, 1], [1, 3, 1], [1, 4, 1]]

this could be improved by weighting the selection by the count of the word raised to 3/4 over the sum of the counts of all words raised to 3/4.

we can also try resampling with each epoch

In [257]:
falses = []
# create 3 randomly sampled context words for each true context word
# these may be true, but probably not. we'll label them false
for i in range(len(trues)):
  for j in range(3):
    center_index = trues[i][0]
    context_index = random.sample(range(len(context_voc)), 1)[0]
    falses.append([center_index, context_index, 0])

In [258]:
def gen_onehot():
  # combine and shuffle trues and falses
  together = np.concatenate((np.array(trues), np.array(falses)))
  np.random.shuffle(together)
  targets = torch.Tensor(together).long()

  # matrices to one hot encode middle words and target words
  middle_tensor = torch.zeros(targets.shape[0], gloss_lists.shape[0])
  context_tensor = torch.zeros(targets.shape[0], len(context_voc))

  for i in range(middle_tensor.shape[0]):
    middle_tensor[i, targets[i, 0]] = 1
    context_tensor[i, targets[i, 1]] = 1

  labels = targets[:, 2].float()
  return (middle_tensor, context_tensor, labels)

build the model itself below. not sure why bias needs to be false, should try as true.

this would be more elegant as a class or at least a function combined with the forward (dot product production) part included.

In [259]:
# fully connected middle layers for middle and target words
mid_fc = torch.nn.Linear(gloss_lists.shape[0], 256, bias = False)
con_fc = torch.nn.Linear(len(context_voc), 256, bias = False)
torch.nn.init.xavier_uniform_(mid_fc.weight)
torch.nn.init.xavier_uniform_(con_fc.weight)
sig = torch.nn.Sigmoid()
params = list(mid_fc.parameters()) + list(con_fc.parameters())
optim = torch.optim.Adam(params, lr = .001)
loss_fn = torch.nn.BCELoss()

In [260]:
epochs = 100
mid_hot, con_hot, labels = gen_onehot()

for i in range(epochs):
  trans_mid = mid_fc(torch.Tensor(mid_hot))
  trans_con = con_fc(torch.Tensor(con_hot))
  dot_mat = torch.zeros(mid_hot.shape[0], 1)
  # for each row dot a center embedding by a target embedding
  for j in range(len(trans_mid)):
    dot_mat[j, :] = trans_mid[j, :] @ trans_con[j, :]
  # sigmoid transformation, then compute the gradient and backwards propagate
  prob_mat = sig(dot_mat)
  prob_mat.requires_grad = True
  optim.zero_grad()
  loss = loss_fn(prob_mat, torch.Tensor(labels).view(prob_mat.shape[0], 1))
  loss.backward()
  optim.step()

  # print loss every 10 epochs
  if i % 10 == 0:
    print(loss.data)
  mid_hot, con_hot, labels = gen_onehot()

tensor(0.6937)
tensor(0.6937)
tensor(0.6937)
tensor(0.6937)
tensor(0.6937)
tensor(0.6937)
tensor(0.6937)
tensor(0.6937)
tensor(0.6937)
tensor(0.6937)


In [261]:

print(loss.data)

tensor(0.6937)


In [262]:
embeddings = mid_fc.weight.t().detach().numpy()
X_train, X_test, y_train, y_test = train_test_split(embeddings, en_df)

In [263]:
YTestDic = list()
YTrainDic = list()

YTest = list()
YTrain = list()
#YTest = [y_train.id.values, y_train.word.values]


yTeId = y_test.id.index
yTeSGNS = y_test.sgns.values

yTrId = y_train.id.index
yTrSGNS = y_train.sgns.values

for x in range(len(yTeId)):
  YTestDic.append([yTeId[x],yTeSGNS[x]])

for x in range(len(yTrId)):
  YTrainDic.append([yTrId[x],yTrSGNS[x]])
print(YTestDic)

for ytr in y_train.sgns.values:
  YTrain.append(ytr)

for yte in y_test.sgns.values:
  YTest.append(yte)

[[86, [0.0169046223, -0.0773857832, -0.014066747400000001, 0.18011866510000002, 0.0573918708, 0.0077605061, 0.0164936706, 0.016554134, -0.1174012125, -0.0614891611, 0.0555207916, 0.035025320900000004, 0.11364445840000001, -0.08243993670000001, -0.14729364220000002, -0.1147165447, 0.1256827861, 0.0220087599, -0.0953138173, -0.0060451701, 0.014500550000000001, 0.0689249784, 0.06437454370000001, -0.0595811009, 0.11599371580000001, 0.1296711266, 0.1021747887, 0.0118577434, 0.0489990376, -0.0406023301, 0.1502872109, -0.0869918913, -0.0976809114, 0.10266772660000001, -0.1651863009, 0.0917757899, 0.1381960362, -0.14646032450000002, -0.0022827014, 0.1610870063, -0.09532671420000001, 0.0175175276, 0.021946031600000002, 0.0130878985, 0.1969512105, 0.1258774102, 0.0647103935, 0.0271613151, -0.0470400341, -0.1435290277, 0.0641253665, 0.09878072140000001, 0.1358776987, 0.1700888872, 0.0744355693, -0.00987346, 0.0003292956, -0.0498144701, -0.1967547536, 0.12639316920000002, 0.14031110700000002, 0.04

In [264]:
print(X_train)

[[-0.06115916 -0.03611128 -0.11310474 ... -0.00308185  0.0881393
  -0.00895664]
 [-0.08259233 -0.08396223 -0.05387131 ... -0.04167905 -0.11219437
   0.03978181]
 [ 0.03718065  0.11357757  0.04570163 ... -0.05301177  0.05869182
  -0.04206215]
 ...
 [ 0.04626404  0.04935453 -0.04714448 ...  0.06137358  0.04925727
   0.03527595]
 [ 0.04971303 -0.0694185   0.05395081 ... -0.00652741  0.04347246
  -0.00566811]
 [ 0.04050392  0.09930833  0.0103789  ... -0.0396491   0.11322645
  -0.08119117]]


In [265]:
print(X_test.shape, "\n", y_test.shape)

(50, 256) 
 (50, 13)


In [266]:

X_train = np.asarray( X_train, dtype = np.float32 )
X_test = np.asarray( X_test, dtype = np.float32 )

YTrain = np.asarray(YTrain, dtype = np.float32 )
YTest = np.asarray( YTest, dtype = np.float32 )

Nueral Network

In [457]:
inputs = tf.keras.Input(shape=(256,), dtype="float32")

x = layers.Dropout(0.1)(inputs)

x = layers.Dense(128, activation="tanh")(x)
x = layers.Dropout(0.05)(x)

x = layers.Dense(55, activation="tanh")(x)
#x = layers.Dropout(0.1)(x)

x = layers.Dense(55, activation="tanh")(x)
#x = layers.Dropout(0.1)(x)

x = layers.Dense(50, activation="tanh")(x)
#x = layers.Dropout(0.1)(x)

predictions = layers.Dense(256, activation="tanh", name="predictions")(x)

model = tf.keras.Model(inputs, predictions)

model.compile(loss="binary_crossentropy", optimizer="adam", metrics=[CosineSimilarity(axis=1)])

Training

In [458]:
epochs = 35

# Fit the model using the train and test datasets.
model.fit(X_train, YTrain, epochs=epochs)

Epoch 1/35
Epoch 2/35
Epoch 3/35
Epoch 4/35
Epoch 5/35
Epoch 6/35
Epoch 7/35
Epoch 8/35
Epoch 9/35
Epoch 10/35
Epoch 11/35
Epoch 12/35
Epoch 13/35
Epoch 14/35
Epoch 15/35
Epoch 16/35
Epoch 17/35
Epoch 18/35
Epoch 19/35
Epoch 20/35
Epoch 21/35
Epoch 22/35
Epoch 23/35
Epoch 24/35
Epoch 25/35
Epoch 26/35
Epoch 27/35
Epoch 28/35
Epoch 29/35
Epoch 30/35
Epoch 31/35
Epoch 32/35
Epoch 33/35
Epoch 34/35
Epoch 35/35


<keras.callbacks.History at 0x7f61f9aac950>

In [459]:
 #Compare Model Prediction To All Embeddings And See Which Are Most Similar
def Cosine_Similarity( x_instance, y_instance ):
    dot_product_value  = np.dot( x_instance, y_instance )
    x_instance_l2_norm = np.linalg.norm( x_instance, ord = 2 )
    y_instance_l2_norm = np.linalg.norm( y_instance, ord = 2 )
    cross_product      = x_instance_l2_norm * y_instance_l2_norm
    return dot_product_value / cross_product   

Cosine Similarity

In [460]:
pw_idx = []
pred_word_emb = model.predict(X_test, verbose =0)

for def_emb in pred_word_emb:
  h_cossim_value = -1
  h_cossim_index = -1
  for idx, true_embedding in YTestDic:
    cossim_value = Cosine_Similarity(def_emb, true_embedding )
    if(cossim_value > h_cossim_value):
      h_cossim_value = cossim_value
      h_cossim_index = idx
  pw_idx.append(h_cossim_index)

num_correct, total_inst = 0,0

for x in range(len(pw_idx)):
  if pw_idx[x] == YTestDic[x][0] :
    num_correct += 1
  total_inst += 1
print("Cosine Similarity: ", ((num_correct/total_inst)*100), "%")

Cosine Similarity:  4.0 %


K-Cosine Similarity

In [465]:
pw_idx = []
K_cs = []
K_list = []
k = 17
pred_word_emb = model.predict(X_test, verbose =0)

for def_emb in pred_word_emb:
  for idx, true_embedding in YTestDic:
    K_cs.append([Cosine_Similarity(def_emb, true_embedding ), idx])
  K_cs.sort()
  K_cs = K_cs[len(K_cs)-k:len(K_cs)].copy()
  
  for x in K_cs:
    K_list.append(x[1])

  K_cs.clear()

  pw_idx.append(K_list.copy())
  K_list.clear()

num_correct, total_inst = 0,0

for x in range(len(pw_idx)):
  if YTestDic[x][0] in pw_idx[x]:
    num_correct += 1
  total_inst += 1
print("Cosine Similarity(",k,"): ", ((num_correct/total_inst)*100), "%")

Cosine Similarity( 17 ):  34.0 %


Square Mean Error

In [None]:
pred_word_emb = model.predict(X_test, verbose =0)

for def_emb in pred_word_emb:
  h_mse_value = -1
  h_mse_index = -1
  