## Library import

In [None]:
from google.colab import drive
drive.mount("/content/drive")

! pip install transformers
! pip install --upgrade gensim
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

Mounted at /content/drive


In [None]:
# Language processing
import string
import nltk
import re

# System
import sys
# Folder on the mounted Drive. It is appended to sys.path so the local
# `preprocessing` module can be imported, and it is also the prefix used for
# the data files read later in the notebook.
path_smt = '/content/drive/MyDrive/dis/'
sys.path.append(path_smt)

# Data preprocessing
# NOTE(review): star import. `preprocess`, `wmdist` and `spatial` are used
# below but never defined or imported by name in this notebook, so they
# presumably come from here — confirm and import them explicitly.
from preprocessing import * 
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

# Statistical tools
import scipy.stats as stat

# Plotting
import matplotlib.pyplot as plt

# Word2vec
import gensim
import gensim.downloader
from gensim.models import Word2Vec
# Pretrained vectors fetched by name through gensim's downloader. The helper
# functions below read (and `subsets` extends) this module-level object.
google_news_vectors = gensim.downloader.load('word2vec-google-news-300')

## Functions

In [None]:
# Vector average
def comp_aver(var, j):
    """Return the mean embedding of the tokens stored at ``var[j]``.

    Parameters
    ----------
    var : indexable collection of token sequences
        ``var[j]`` must yield the tokens of one sentence.
    j : key/position of the sentence inside ``var``.

    Returns
    -------
    numpy.ndarray
        Length-300 vector: the element-wise mean of the vectors that the
        module-level ``google_news_vectors`` returns for each token.
    """
    tokens = var[j]
    n_tokens = len(tokens)
    # One column per token, 300 rows (the width this function assumes for
    # every embedding it looks up).
    embedded = np.zeros((300, n_tokens))
    for col, token in enumerate(tokens):
        embedded[:, col] = google_news_vectors[token]
    # Collapse the token axis and divide by the token count.
    return embedded.sum(axis=1) / n_tokens

# Cosine similarity of the averaged vectors, for every sentence pair
def add_stats(dt1, dt2):
    """Compute the cosine similarity between averaged sentence vectors.

    For each position ``jj`` in ``range(len(dt1))`` the sentences ``dt1[jj]``
    and ``dt2[jj]`` are averaged with ``comp_aver`` and compared.

    Parameters
    ----------
    dt1, dt2 : indexable collections of token sequences
        Must be indexable by ``0 .. len(dt1) - 1``; ``dt2`` needs at least as
        many entries as ``dt1``.

    Returns
    -------
    numpy.ndarray
        One similarity (``1 - cosine distance``) per sentence pair.
    """
    # Imported locally: nothing in this notebook imports `spatial` by name, so
    # the previous `spatial.distance.cosine` call only resolved if the name
    # happened to leak in through `from preprocessing import *`.
    from scipy.spatial.distance import cosine

    sims = np.zeros(len(dt1))
    for jj in range(len(dt1)):
        avg_vec1 = comp_aver(dt1, j=jj)
        avg_vec2 = comp_aver(dt2, j=jj)

        # scipy returns the cosine *distance*; subtract from 1 for similarity
        sims[jj] = 1 - cosine(avg_vec1, avg_vec2)

    return sims

In [None]:
# Compute correlation for a dataset
def subsets(dt, seed=0):
    """Score one dataset with vector averaging and WMD and correlate with gold.

    Parameters
    ----------
    dt : pandas.DataFrame
        Must contain the columns ``sent1``, ``sent2`` and ``sim``. Any index
        is accepted; rows are handled by position.
    seed : int, optional
        Non-negative base seed for the vectors given to out-of-vocabulary
        words. Each word's vector depends only on ``seed`` and the word
        itself, so results do not change between runs or with call order.

    Returns
    -------
    tuple
        ``(cor_aver, cor_wmd)``: Pearson r of the ``aver`` and ``wmd`` scores
        against ``dt["sim"]``.

    Side effects
    ------------
    * Adds the columns ``sent_punct1``, ``sent_punct2``, ``aver`` and ``wmd``
      to ``dt`` in place.
    * Adds a vector to the module-level ``google_news_vectors`` for every
      word that is not listed in ``google_lst``.
    """
    import zlib  # stdlib; gives a hash of the word that is stable across runs

    # Stack both sentence columns so they are preprocessed in one pass
    dt_long = pd.concat([dt[["sent1"]].rename(columns={"sent1": "sent"}),
                         dt[["sent2"]].rename(columns={"sent2": "sent"})],
                        ignore_index=True)
    # Project-local helper; the code below relies on it adding a
    # "sent_punct" column to dt_long in place.
    preprocess("sent", dt_long)

    # Vocabulary of the preprocessed sentences
    all_words = set(' '.join(dt_long['sent_punct']).split())
    # Words without a pretrained vector
    add_google = all_words.difference(google_lst)
    # Use the model's own width when it exposes one; 300 otherwise
    dim = getattr(google_news_vectors, "vector_size", 300)
    for wrd in sorted(add_google):
        # Per-word generator: reproducible, and independent of set ordering
        rng = np.random.default_rng([seed, zlib.crc32(wrd.encode("utf-8"))])
        google_news_vectors[wrd] = rng.random(dim)

    # First half of dt_long holds sent1, second half sent2. Assign by position
    # (plain arrays) so the result does not depend on dt's index labels.
    half = len(dt_long) // 2
    cleaned = dt_long["sent_punct"].to_numpy()
    dt["sent_punct1"] = cleaned[:half]
    dt["sent_punct2"] = cleaned[half:]

    # Tokenising as the last preprocessing step; plain lists keep the
    # positional indexing used by add_stats valid for any dt index.
    sent1 = [nltk.word_tokenize(text) for text in dt["sent_punct1"]]
    sent2 = [nltk.word_tokenize(text) for text in dt["sent_punct2"]]

    # Vector average
    dt["aver"] = add_stats(sent1, sent2)

    # WMD (project-local `wmdist`; presumably a word mover's distance — confirm)
    sims = np.array([wmdist(google_news_vectors, s1, s2)
                     for s1, s2 in zip(sent1, sent2)], dtype=float)
    # Invert the distance so that larger means more similar
    dt["wmd"] = 1 / (1 + sims)

    # Correlation for vector average
    cor_aver = stat.pearsonr(dt["aver"], dt["sim"])[0]
    # Correlation for WMD
    cor_wmd = stat.pearsonr(dt["wmd"], dt["sim"])[0]
    return cor_aver, cor_wmd

## Data import

In [None]:
# Reader options shared by the files that are read without a header row;
# their three columns are named sim, sent1, sent2 on load.
_headerless = dict(sep='\t', encoding='utf-8', header=None,
                   names=['sim', 'sent1', 'sent2'])

par = pd.read_csv(path_smt + "MSRpar.test.tsv", **_headerless)
# MSRvid is read with its own header row; only the three columns used by the
# rest of the notebook are kept.
vid = pd.read_csv(path_smt + "MSRvid.test.tsv", sep='\t', encoding='utf-8')[["sent1", "sent2", "sim"]]
europarl = pd.read_csv(path_smt + "SMTeuroparl.test.tsv", **_headerless)
wn = pd.read_csv(path_smt + "OnWN.test.tsv", **_headerless)
news = pd.read_csv(path_smt + "SMTnews.test.tsv", **_headerless)

## Correlation for each dataset separately

In [None]:
# word2vec keys from the dictionary.
# `subsets` reads this module-level name to decide which words need a new
# vector, so this cell has to run before any `subsets(...)` call.
google_lst = google_news_vectors.index_to_key

In [None]:
# Returns (Pearson r for vector average, Pearson r for WMD). Also adds the
# "aver" and "wmd" columns to `par` in place, which the pooled cell reads.
subsets(par)

(0.16495297174562207, 0.4214197308002308)

In [None]:
# Returns (Pearson r for vector average, Pearson r for WMD). Also adds the
# "aver" and "wmd" columns to `vid` in place, which the pooled cell reads.
subsets(vid)

(0.7560301689154783, 0.6667328176199839)

In [None]:
# Returns (Pearson r for vector average, Pearson r for WMD). Also adds the
# "aver" and "wmd" columns to `europarl` in place, which the pooled cell reads.
subsets(europarl)

(0.49591104375513795, 0.4946393836183523)

In [None]:
# Returns (Pearson r for vector average, Pearson r for WMD). Also adds the
# "aver" and "wmd" columns to `wn` in place, which the pooled cell reads.
subsets(wn)

(0.6376490071848622, 0.6679255783521272)

In [None]:
# Returns (Pearson r for vector average, Pearson r for WMD). Also adds the
# "aver" and "wmd" columns to `news` in place, which the pooled cell reads.
subsets(news)

(0.3906771958717344, 0.45135647464089557)

## Correlation over all 2012 datasets combined

In [None]:
# Pool the five frames row-wise. The "aver" and "wmd" columns used afterwards
# only exist once `subsets(...)` has been run on every frame.
# NOTE(review): no ignore_index here, so each frame keeps its own row labels
# and labels repeat in the pooled frame — avoid label-based lookups on it.
sts2012 = pd.concat([par, vid, europarl, wn, news])

In [None]:
# Pearson r of each score against the gold similarity, over the pooled frame
pooled_gold = sts2012["sim"]
pooled_r_aver = stat.pearsonr(sts2012["aver"], pooled_gold)[0]
pooled_r_wmd = stat.pearsonr(sts2012["wmd"], pooled_gold)[0]
print("Correlation for 2012 average:", pooled_r_aver)
print("Correlation for 2012 wmd:", pooled_r_wmd)

Correlation for 2012 average: 0.5754661193605161
Correlation for 2012 wmd: 0.5502980576648839
