In [None]:
import nltk
# Fetch the NLTK data packages 'wordnet' and 'omw-1.4'; a later cell queries
# WordNet through nltk.corpus.wordnet.
nltk.download('wordnet')
# Last expression of the cell, so its return value is what the cell displays.
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [None]:
import random
import numpy as np

def sample(defs_count, embs, seed=None):
    """Build a shuffled (X, y) dataset from embeddings and per-word targets.

    Only words that appear in both mappings are kept.

    Args:
        defs_count: Mapping word -> target value (the caller in this notebook
            passes the number of WordNet synsets of the word).
        embs: Mapping word -> embedding vector.
        seed: Optional seed for the shuffle. ``None`` (the default) shuffles
            with the global ``random`` state, exactly as before. Any other
            value shuffles with a private ``random.Random(seed)``, so the row
            order is reproducible and the global state is left untouched.

    Returns:
        Tuple ``(X, y)`` of NumPy arrays where row ``i`` of ``X`` is the
        embedding of the word whose target is ``y[i]``.
    """
    # Iterating the dict directly follows the insertion order of `embs`.
    words = [word for word in embs if word in defs_count]

    if seed is None:
        random.shuffle(words)
    else:
        random.Random(seed).shuffle(words)

    # Both arrays are built from the same word order, so rows stay aligned.
    X = np.array([embs[w] for w in words])
    y = np.array([defs_count[w] for w in words])

    return X, y

In [None]:
def accuracy_1(y_true, y_predicted):
  """Fraction of predictions that round to exactly the true value.

  Args:
    y_true: Indexable sequence of true target values.
    y_predicted: Indexable sequence of predictions, at least as long as
      ``y_true``; only the first ``len(y_true)`` entries are read.

  Returns:
    Share of positions ``i`` with ``y_true[i] == round(y_predicted[i])``,
    or ``0.0`` when ``y_true`` is empty (previously a ZeroDivisionError).
  """
  total = len(y_true)
  if total == 0:
    # Nothing to score: avoid dividing by zero.
    return 0.0
  # Built-in round() rounds halves to the nearest even integer (2.5 -> 2).
  hits = sum(1 for i in range(total) if y_true[i] == round(y_predicted[i]))
  return hits / total

In [None]:
def accuracy_2(y_true, y_predicted, eps):
  """Fraction of predictions within ``eps`` of the true value.

  Args:
    y_true: Indexable sequence of true target values.
    y_predicted: Indexable sequence of predictions, at least as long as
      ``y_true``; only the first ``len(y_true)`` entries are read.
    eps: Largest absolute error (inclusive) that still counts as a hit.

  Returns:
    Share of positions ``i`` with ``abs(y_true[i] - y_predicted[i]) <= eps``,
    or ``0.0`` when ``y_true`` is empty (previously a ZeroDivisionError).
  """
  total = len(y_true)
  if total == 0:
    # Nothing to score: avoid dividing by zero.
    return 0.0
  hits = sum(1 for i in range(total) if abs(y_true[i] - y_predicted[i]) <= eps)
  return hits / total

In [None]:
from os import XATTR_REPLACE
import pandas as pd
from nltk.corpus import wordnet as wn
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np

def evaluate(ds_filename, n, test_size=0.2, eps=1, seed=None):
    """Score a linear regression that predicts WordNet synset counts from embeddings.

    The embeddings file is read as a header-less tab-separated table whose
    first column is the word and whose remaining columns are the embedding.
    Words that are also WordNet lemma names form the dataset; the target is
    the number of synsets of the word. The model is fitted and scored on
    ``n`` independent random train/test splits and the metrics are averaged.

    Args:
        ds_filename: Path of the embeddings table.
        n: Number of random train/test splits to average over.
        test_size: Passed to ``train_test_split``; defaults to the previously
            hard-coded 0.2.
        eps: Tolerance handed to ``accuracy_2``; defaults to the previously
            hard-coded 1.
        seed: Optional seed. ``None`` (default) keeps the old behaviour and
            draws from the global ``random`` / NumPy state. Any other value
            makes both the dataset shuffle and the sequence of splits
            reproducible.

    Returns:
        Tuple ``(mean accuracy_1, mean accuracy_2, mean MSE, mean MAE,
        dataset size)``.
    """
    # NOTE(review): pandas' default NA parsing may turn words such as "nan" or
    # "null" into NaN keys, which then never match a lemma - confirm whether
    # keep_default_na=False is wanted for the word column.
    df = pd.read_table(ds_filename, header=None)

    # key - word, value - embedding (the remaining columns of the row)
    embeddings = {row[0]: row[1:] for row in df.values}

    # key - lemma, value - number of synsets
    definitions_num = {
        lemma: len(wn.synsets(lemma)) for lemma in wn.all_lemma_names()
    }

    if seed is None:
        X, y = sample(definitions_num, embeddings)
        split_rng = None
    else:
        import random

        # sample() shuffles through the global `random` module, so seed it
        # just for this call and restore the caller's state afterwards.
        saved_state = random.getstate()
        random.seed(seed)
        try:
            X, y = sample(definitions_num, embeddings)
        finally:
            random.setstate(saved_state)
        # One generator shared by all iterations: every split differs from
        # the previous one, yet the whole sequence is reproducible.
        split_rng = np.random.RandomState(seed)

    mse_runs = np.zeros(n)
    mae_runs = np.zeros(n)
    acc_1_runs = np.zeros(n)
    acc_2_runs = np.zeros(n)

    for i in range(n):
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=split_rng
        )
        lin_reg = LinearRegression().fit(X_train, y_train)
        y_pred = lin_reg.predict(X_test)

        mse_runs[i] = mean_squared_error(y_test, y_pred)
        mae_runs[i] = mean_absolute_error(y_test, y_pred)
        acc_1_runs[i] = accuracy_1(y_test, y_pred)
        acc_2_runs[i] = accuracy_2(y_test, y_pred, eps)

    return (
        acc_1_runs.mean(),
        acc_2_runs.mean(),
        mse_runs.mean(),
        mae_runs.mean(),
        len(X),
    )


dim = 10

In [1]:
# Embeddings file labelled "dim = 10" in the notebook heading. evaluate()
# averages over 100 random train/test splits and returns, in order:
# mean accuracy_1, mean accuracy_2, mean MSE, mean MAE, dataset size.
acc_1, acc_2, MSE, MAE, ds_size = evaluate("/content/wordnet_embeddings_1.tsv", 100)

print(acc_1, acc_2, MSE, MAE, ds_size)

NameError: ignored

dim = 2

In [None]:
# Embeddings file labelled "dim = 2" in the notebook heading. evaluate()
# averages over 100 random train/test splits and returns, in order:
# mean accuracy_1, mean accuracy_2, mean MSE, mean MAE, dataset size.
acc_1, acc_2, MSE, MAE, ds_size = evaluate("/content/wordnet_embeddings_2.tsv", 100)

print(acc_1, acc_2, MSE, MAE, ds_size)

0.3200669892072943 0.8775199106810567 5.116235428974448 1.029472259260257 67172


dim = 100

In [None]:
# Embeddings file labelled "dim = 100" in the notebook heading. evaluate()
# averages over 100 random train/test splits and returns, in order:
# mean accuracy_1, mean accuracy_2, mean MSE, mean MAE, dataset size.
acc_1, acc_2, MSE, MAE, ds_size = evaluate("/content/wordnet_embeddings_3.tsv", 100)

print(acc_1, acc_2, MSE, MAE, ds_size)

0.40264532936360253 0.7568492742835877 4.982347716069422 0.9835145702498126 67172


dim = 2, v2.0

In [None]:
# Embeddings file labelled "dim = 2, v2.0" in the notebook heading. evaluate()
# averages over 100 random train/test splits and returns, in order:
# mean accuracy_1, mean accuracy_2, mean MSE, mean MAE, dataset size.
acc_1, acc_2, MSE, MAE, ds_size = evaluate("/content/wordnet_embeddings_7.tsv", 100)

print(acc_1, acc_2, MSE, MAE, ds_size)

0.2201741719389654 0.879510978786751 5.241157914344801 1.0398291439288463 67172


dim = 10, v2.0


In [None]:
# Embeddings file labelled "dim = 10, v2.0" in the notebook heading. evaluate()
# averages over 100 random train/test splits and returns, in order:
# mean accuracy_1, mean accuracy_2, mean MSE, mean MAE, dataset size.
acc_1, acc_2, MSE, MAE, ds_size = evaluate("/content/wordnet_embeddings_6.tsv", 100)

print(acc_1, acc_2, MSE, MAE, ds_size)

0.3523572757722367 0.7726185336806849 5.002628822725369 1.0138703071560842 67172


dim = 5

In [None]:
# Embeddings file labelled "dim = 5" in the notebook heading. evaluate()
# averages over 100 random train/test splits and returns, in order:
# mean accuracy_1, mean accuracy_2, mean MSE, mean MAE, dataset size.
acc_1, acc_2, MSE, MAE, ds_size = evaluate("/content/wordnet_embeddings_8.tsv", 100)

print(acc_1, acc_2, MSE, MAE, ds_size)

0.32740230740602905 0.7687718645329363 5.11674478994841 1.0182813150849672 67172


dim = 2, v3.0

In [None]:
# Embeddings file labelled "dim = 2, v3.0" in the notebook heading. evaluate()
# averages over 100 random train/test splits and returns, in order:
# mean accuracy_1, mean accuracy_2, mean MSE, mean MAE, dataset size.
acc_1, acc_2, MSE, MAE, ds_size = evaluate("/content/wordnet_embeddings_9.tsv", 100)

print(acc_1, acc_2, MSE, MAE, ds_size)

0.28747376256047635 0.8798094529214737 5.118903820448165 1.0273001811967604 67172
