## Library import

In [None]:
from google.colab import drive
drive.mount("/content/drive")

! pip install transformers
! pip install --upgrade gensim
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

Mounted at /content/drive


In [None]:
# Language processing
import string
import nltk
import re

# System
import sys
# Project folder on the mounted drive; added to sys.path so that the local
# `preprocessing` module can be imported, and reused as the data directory.
path_smt = '/content/drive/MyDrive/dis/'
sys.path.append(path_smt)

# Data preprocessing
# NOTE(review): the star import presumably supplies `preprocess` and `wmdist`,
# which are called later but not defined in this notebook — confirm, and
# prefer importing them by name.
from preprocessing import *
import pandas as pd
import numpy as np
# Seed only after numpy has been imported: seeding first works only if the
# star import above happens to leak a name `np`.
np.random.seed(1)
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import StandardScaler

# Statistical tools
import scipy.stats as stat

# Plotting
import matplotlib.pyplot as plt

# Word2vec
import gensim
import gensim.downloader
from gensim.models import Word2Vec
# Pretrained vectors fetched through the gensim downloader (300-dimensional
# according to the model name).
google_news_vectors = gensim.downloader.load('word2vec-google-news-300')

# Glove
# Read the GloVe text file into a {word: vector} dictionary. Each line is split
# on whitespace: the first field is the word, the remaining fields are parsed
# as float32 components.
embeddings_dict = {}
with open(path_smt + "glove.6B.50d.txt", 'r', encoding='utf-8') as glove_file:
    for record in glove_file:
        fields = record.split()
        embeddings_dict[fields[0]] = np.asarray(fields[1:], "float32")

## Data import

In [None]:
# Load the sentence pairs from a header-less, tab-separated file; the three
# columns are named sim (gold similarity score), sent1 and sent2.
data = pd.read_csv(path_smt+"SMTeuroparl.train.tsv", sep='\t', encoding='utf-8', header = None,
                 names = ['sim', 'sent1', 'sent2'])
# Min-max scale the gold score to [0, 1]. Series.min()/.max() are computed once
# each and skip NaN, whereas the builtin min()/max() iterate element by element
# and give order-dependent results when a NaN is present.
sim_min, sim_max = data["sim"].min(), data["sim"].max()
data["sim01"] = (data["sim"] - sim_min) / (sim_max - sim_min)

## Preprocessing

In [None]:
# Run the project's preprocess helper on each sentence column.
# Presumably it adds the sent1_punct / sent2_punct columns read below —
# confirm in preprocessing.py.
preprocess("sent1", data)
preprocess("sent2", data)
# Final preprocessing step: tokenise the *_punct text with NLTK, storing one
# list of tokens per sentence in the *_stop columns.
data["sent1_stop"] = data["sent1_punct"].apply(nltk.word_tokenize)
data["sent2_stop"] = data["sent2_punct"].apply(nltk.word_tokenize)

## Adding vectors for out-of-vocabulary words and numbers

In [None]:
# Obtain the whole vocabulary: every whitespace-separated token that occurs in
# either preprocessed sentence column.
# NOTE(review): the *_stop columns are tokenised with nltk.word_tokenize while
# this uses str.split(); the two are assumed to produce the same tokens.
sent1_set = {token for sentence in data['sent1_punct'] for token in sentence.split()}
sent2_set = {token for sentence in data['sent2_punct'] for token in sentence.split()}
all_words = sent1_set | sent2_set

# Known keys of each embedding, and the corpus words each embedding lacks
google_lst = google_news_vectors.index_to_key
glove_keys = embeddings_dict.keys()
add_google = all_words.difference(google_lst)
add_glove = all_words.difference(glove_keys)

In [None]:
# Add numbers for Glove
# A number that GloVe lacks is decomposed into its last three digits plus one
# place-value component per remaining leading digit
# (e.g. "12345" -> 345, 2000, 10000). Its vector is the mean over one row per
# component; components without an embedding stay as zero rows.
for wrd in add_glove:
    # isdecimal() rather than isnumeric(): isnumeric() also accepts characters
    # such as "½" or "²", which int() below cannot parse.
    if not wrd.isdecimal():
        continue
    # At least one row, so numbers shorter than three digits no longer produce
    # an empty or negative-sized array.
    vec_nb = np.zeros((max(len(wrd) - 2, 1), 50))
    found_any = False
    # Same membership guard as the other components: a missing key must not
    # abort the whole loop.
    tail = str(int(wrd[-3:]))
    if tail in embeddings_dict:
        vec_nb[0, :] = embeddings_dict[tail]
        found_any = True
    for k in range(3, len(wrd)):
        # Value contributed by the digit at position k (counted from the right)
        nb = int(wrd[-(k+1):]) - int(wrd) % 10**(k)
        if str(nb) in embeddings_dict:
            vec_nb[k-2, :] = embeddings_dict[str(nb)]
            found_any = True
    # Only register the number when at least one component had an embedding.
    if found_any:
        embeddings_dict[wrd] = np.average(vec_nb, axis = 0)

# Add numbers for Google
# A number missing from the Word2vec model gets the mean of the vectors of its
# individual digits. New entries are collected and added in one call, because
# every single-key assignment re-stacks the whole embedding matrix.
new_google_keys = []
new_google_vecs = []
for wrd in add_google:
    if wrd.isdecimal():
        vec_nb = np.zeros((len(wrd), 300))
        for k in range(len(wrd)):
            # str(int(...)) maps any Unicode decimal digit to its ASCII form
            vec_nb[k, :] = google_news_vectors[str(int(wrd[k]))]
        new_google_keys.append(wrd)
        new_google_vecs.append(np.average(vec_nb, axis = 0))
if new_google_keys:
    google_news_vectors.add_vectors(new_google_keys, np.vstack(new_google_vecs), replace=True)

# NOTE(review): non-numeric out-of-vocabulary words receive no vector here.
# comp_aver indexes the embedding with diction[wrd], so such a token raises
# KeyError; the disabled fallbacks below would give them a random vector
# instead — confirm which behavior is intended.
# Glove fallback:
#     embeddings_dict[wrd] = np.random.normal(size = 50)
# Google fallback:
#     google_news_vectors[wrd] = np.random.normal(size = 300)

## Vector average

In [None]:
def comp_aver(var,  diction, j, base_emb):
    """Return the mean word vector of the j-th tokenised sentence.

    Parameters
    ----------
    var : indexable collection of token lists; ``var[j]`` is the sentence used.
    diction : mapping from a token to its embedding vector (indexed as
        ``diction[token]``).
    j : index of the sentence inside ``var``.
    base_emb : ``"glove"`` selects 50-dimensional vectors; any other value
        selects 300 dimensions.

    Returns
    -------
    numpy.ndarray of shape (50,) or (300,): the component-wise sum of the
    token vectors divided by the number of tokens.

    Raises
    ------
    KeyError if a token is missing from ``diction`` (when it is a dict).
    """
    tokens = var[j]
    n_tokens = len(tokens)
    # Vector dimension depends on the embedding in use
    dim = 50 if base_emb == "glove" else 300

    # One column per token
    columns = np.zeros((dim, n_tokens))
    for col, token in enumerate(tokens):
        columns[:, col] = diction[token]
    # Average over the token axis
    return columns.sum(axis=1) / n_tokens

def add_stats(dt1, dt2, emb, dicti):
    """Add an averaged-embedding cosine similarity column to ``data``.

    For every position ``jj`` the mean word vectors of ``dt1[jj]`` and
    ``dt2[jj]`` are computed with ``comp_aver`` and compared with cosine
    similarity (1 - cosine distance).

    Parameters
    ----------
    dt1, dt2 : collections of token lists, indexed with 0 .. len(dt1) - 1.
    emb : embedding name, forwarded to ``comp_aver`` as ``base_emb`` and used
        as the prefix of the result column (``emb + "_sim"``).
    dicti : token -> vector mapping forwarded to ``comp_aver``.

    Side effects
    ------------
    Writes the similarities into the module-level DataFrame ``data`` (not
    into an argument); nothing is returned.
    """
    # Imported locally: the notebook never imports `spatial` itself, so the
    # function must not depend on that name being leaked by a star import.
    from scipy.spatial.distance import cosine

    sims = np.zeros(len(dt1))
    for jj in range(len(dt1)):
        avg_vec1 = comp_aver(dt1, base_emb = emb, diction = dicti, j = jj)
        avg_vec2 = comp_aver(dt2, base_emb = emb, diction = dicti, j = jj)

        # Cosine similarity = 1 - cosine distance
        sims[jj] = 1 - cosine(avg_vec1, avg_vec2)

    # Adding the calculated similarity to the dataframe
    data.loc[:, emb + "_sim"] = sims

In [None]:
# Averaged-embedding similarity for each embedding: Word2vec first, then GloVe
for emb_name, emb_vectors in (("google", google_news_vectors), ("glove", embeddings_dict)):
    add_stats(data.sent1_stop, data.sent2_stop, emb = emb_name, dicti = emb_vectors)

## Word Mover's Distance (WMD)

In [None]:
# One distance per sentence pair and embedding
n_pairs = len(data)
sims_glove = np.zeros(n_pairs)
sims_google = np.zeros(n_pairs)
tokens_first = data.sent1_stop
tokens_second = data.sent2_stop
# Calculating WMD with the project's wmdist helper (not defined in this notebook)
for i in range(n_pairs):
    pair = (tokens_first[i], tokens_second[i])
    sims_glove[i] = wmdist(embeddings_dict, *pair)
    sims_google[i] = wmdist(google_news_vectors, *pair)

# Store 1 / (1 + distance), so that a distance of 0 maps to a score of 1
data["wmd_google"] = 1 / (sims_google + 1)
data["wmd_glove"] = 1 / (sims_glove + 1)

## Model evaluation

In [None]:
# Pearson correlation of each averaged-embedding similarity with the gold score
r_avg_word2vec = stat.pearsonr(data["google_sim"], data["sim"])[0]
r_avg_glove = stat.pearsonr(data["glove_sim"], data["sim"])[0]
print("Correlation for average Word2vec:", r_avg_word2vec)
print("Correlation for average Glove:", r_avg_glove)

Correlation for average Word2vec: 0.4194033766042279
Correlation for average Glove: 0.6496622253231086


In [None]:
# Pearson correlation of each WMD-based score with the gold score
r_wmd_word2vec = stat.pearsonr(data["wmd_google"], data["sim"])[0]
r_wmd_glove = stat.pearsonr(data["wmd_glove"], data["sim"])[0]
print("Correlation for wmd Word2vec:", r_wmd_word2vec)
print("Correlation for wmd Glove:", r_wmd_glove)

Correlation for wmd nbs Word2vec: 0.5271859140500685
Correlation for wmd nbs Glove: 0.6002472882634206


### Investigating the contexts in which numbers occur

In [None]:
# Every run of digits in sent1: one row per sentence that contains a number,
# one column per match within that sentence.
numbers_sent1 = data.sent1.str.extractall(r"(\d+)").unstack()
# Lift the row/column display limits only while printing this table
with pd.option_context('display.max_rows', None,
                       'display.max_columns', None,
                       'display.precision', 3,
                       ):
  print(numbers_sent1)

          0                      
match     0     1     2    3    4
11        8   NaN   NaN  NaN  NaN
18       11   NaN   NaN  NaN  NaN
24       24   NaN   NaN  NaN  NaN
30       16  2000   NaN  NaN  NaN
38      299     2   NaN  NaN  NaN
43      299     2   NaN  NaN  NaN
54      150   NaN   NaN  NaN  NaN
59        1   NaN   NaN  NaN  NaN
72       16  2000   NaN  NaN  NaN
77      150   NaN   NaN  NaN  NaN
92      500     3     0   39  924
114    2001   NaN   NaN  NaN  NaN
122       7   NaN   NaN  NaN  NaN
126       8   NaN   NaN  NaN  NaN
128    1996   NaN   NaN  NaN  NaN
134     250   120   NaN  NaN  NaN
148    2001   NaN   NaN  NaN  NaN
149     250   120   NaN  NaN  NaN
162     272   NaN   NaN  NaN  NaN
178       8   NaN   NaN  NaN  NaN
180       5  0794  2000  NaN  NaN
186     272   NaN   NaN  NaN  NaN
202       2    06     3  NaN  NaN
204       6  0886    00  NaN  NaN
205    1996   NaN   NaN  NaN  NaN
233    2001   NaN   NaN  NaN  NaN
243     299     2   NaN  NaN  NaN
249       5  0

In [None]:
# Every run of digits in sent2: one row per sentence that contains a number,
# one column per match within that sentence.
numbers_sent2 = data.sent2.str.extractall(r"(\d+)").unstack()
# Lift the row/column display limits only while printing this table
with pd.option_context('display.max_rows', None,
                       'display.max_columns', None,
                       'display.precision', 3,
                       ):
  print(numbers_sent2)

          0                      
match     0     1     2    3    4
11        8   NaN   NaN  NaN  NaN
18       11   NaN   NaN  NaN  NaN
24       57    24   NaN  NaN  NaN
30       16  2000   NaN  NaN  NaN
38      299     2   NaN  NaN  NaN
43      299     2   NaN  NaN  NaN
54      150   NaN   NaN  NaN  NaN
59        1   NaN   NaN  NaN  NaN
72       16  2000   NaN  NaN  NaN
77      150   NaN   NaN  NaN  NaN
92        6     0    41    1  498
114    2001   NaN   NaN  NaN  NaN
122       7   NaN   NaN  NaN  NaN
126       8   NaN   NaN  NaN  NaN
128    1996   NaN   NaN  NaN  NaN
134     250   120   NaN  NaN  NaN
148    2001   NaN   NaN  NaN  NaN
149     250   120   NaN  NaN  NaN
162     272   NaN   NaN  NaN  NaN
178       8   NaN   NaN  NaN  NaN
180       5  0794  2000  NaN  NaN
186     272   NaN   NaN  NaN  NaN
202      14    06     3  NaN  NaN
204       6  0886    00  NaN  NaN
205    1996   NaN   NaN  NaN  NaN
233    2001   NaN   NaN  NaN  NaN
243     299     2   NaN  NaN  NaN
249       5  0

Sentences with numbers.

In [None]:
# Rows whose first sentence contains at least one digit
has_number = data.sent1.str.contains(r"\d+")
data[has_number]

Unnamed: 0,sim,sent1,sent2,sim01,sent1_punct,sent2_punct,sent1_stop,sent2_stop,google_sim,glove_sim,wmd_google,wmd_glove
11,4.2,He will stand trial on 8 January on charges of...,It passes in lawsuit next on January 8. It is ...,0.84,stand trial 8 january charges attended meeting...,passes lawsuit next january 8 reproached taken...,"[stand, trial, 8, january, charges, attended, ...","[passes, lawsuit, next, january, 8, reproached...",0.923275,0.952771,0.690925,0.797823
18,4.8,"Mr President, the Commission' s attitude to th...","Mr President, we can see the Commission's posi...",0.96,mr president commission attitude right access ...,mr president see commission position terms pub...,"[mr, president, commission, attitude, right, a...","[mr, president, see, commission, position, ter...",0.807435,0.992223,0.744582,0.852462
24,5.0,"Fifty-seven senators, including 24 Republicans...","Of those who signed the letter, 57 are senator...",1.00,fiftyseven senators including 24 republicans s...,signed letter 57 senators including 24 republi...,"[fiftyseven, senators, including, 24, republic...","[signed, letter, 57, senators, including, 24, ...",0.518123,0.946429,0.875752,0.898666
30,4.4,Reiterating the calls made by the European Par...,"As the European Parliament, in its resolution ...",0.88,reiterating calls made european parliament res...,european parliament resolution 16 march 2000 i...,"[reiterating, calls, made, european, parliamen...","[european, parliament, resolution, 16, march, ...",0.966281,0.991883,0.794706,0.897016
38,4.0,"As a matter of urgency, therefore, the staff c...","Therefore, it is urgent that the personnel of ...",0.80,matter urgency therefore staff complement inte...,therefore urgent personnel interservice group ...,"[matter, urgency, therefore, staff, complement...","[therefore, urgent, personnel, interservice, g...",0.604373,0.940241,0.669141,0.747385
...,...,...,...,...,...,...,...,...,...,...,...,...
707,3.8,He will stand trial on 8 January on charges of...,It happening in trial on 8 January. he is accu...,0.76,stand trial 8 january charges attended meeting...,happening trial 8 january accused taking part ...,"[stand, trial, 8, january, charges, attended, ...","[happening, trial, 8, january, accused, taking...",0.949604,0.951461,0.731166,0.833851
709,5.0,Question No 6 by (H-0886/00):,Question No 6 by (H-0886/00):,1.00,question no 6 h088600,question no 6 h088600,"[question, no, 6, h088600]","[question, no, 6, h088600]",1.000000,1.000000,1.000000,1.000000
710,3.8,As long ago as 1996 the European Parliament ca...,"In 1996, the European Parliament is in favour ...",0.76,long ago 1996 european parliament came favor b...,1996 european parliament favor ban use europea...,"[long, ago, 1996, european, parliament, came, ...","[1996, european, parliament, favor, ban, use, ...",0.802647,0.969966,0.756461,0.846127
716,4.4,Reiterating the calls made by the European Par...,As the European Parliament called for in its r...,0.88,reiterating calls made european parliament res...,european parliament called resolution 16 march...,"[reiterating, calls, made, european, parliamen...","[european, parliament, called, resolution, 16,...",0.976658,0.993452,0.816811,0.907602


In [None]:
# First sentence of the row at position 92 (the row with five numbers above)
data["sent1"].iloc[92]

"The broader Standard & Poor's 500 Index .SPX gained 3 points, or 0.39 percent, at 924."

In [None]:
# Second sentence of the row at position 92
data["sent2"].iloc[92]

'The technology-laced Nasdaq Composite Index <.IXIC> rose 6 points, or 0.41 percent, to 1,498.'