# STS Benchmark Datasets

In [35]:
%load_ext autotime
import numpy as np
import logging
import pandas as pd
import pathlib

from gensim.models import Word2Vec, FastText
from gensim.models.keyedvectors import KeyedVectors, FastTextKeyedVectors

from fse.models.average import Average
from fse.models.sif import SIF
from fse.models.usif import uSIF
from fse.models.inputs import IndexedSentence

from re import sub
from scipy.stats import pearsonr

# Configure root logging at INFO level; each record shows timestamp, thread
# name, level and message.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(threadName)s : %(levelname)s : %(message)s',
)

def normalize_text(sentence):
    """Split ``sentence`` on whitespace and reduce each token to lowercase ASCII letters.

    Each whitespace-separated token is lowercased and every character outside
    ``a-z``/``A-Z`` is removed. Tokens without any letters are kept as empty
    strings, so the result has exactly one entry per input token.

    Args:
        sentence: The raw sentence string.

    Returns:
        A list of cleaned tokens, in input order.
    """
    tokens = []
    for raw_token in sentence.split():
        lowered = raw_token.lower()
        tokens.append(sub("[^a-zA-Z]", "", lowered))
    return tokens

def compute_similarities(task_length, model):
    """Return the similarity of each sentence pair held in ``model.sv``.

    Pair ``i`` is made of the sentence at index ``i`` and the sentence at
    index ``i + task_length``; one ``model.sv.similarity`` call is made per pair.

    Args:
        task_length: Number of sentence pairs.
        model: Object exposing ``sv.similarity(i, j)``.

    Returns:
        A list with ``task_length`` similarity values, in pair order.
    """
    return [
        model.sv.similarity(idx, idx + task_length)
        for idx in range(task_length)
    ]

import csv

# Note: I've replaced ",", '"' and ";" with "" in the file for easier parsing.
# The file is read as tab-separated text: column 4 is parsed as the gold
# similarity score, columns 5 and 6 are taken as the two sentences of a pair.
file = "data/stsbenchmark/sts-test.csv"
similarities, sent_a, sent_b = [], [], []
with open(file, "r", encoding="utf-8") as f:
    for raw_line in f:
        if not raw_line.strip():
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            continue
        # Remove stray ";" from the raw text *before* splitting. The earlier
        # version called .replace() on `line` before it was assigned (NameError
        # on a fresh kernel) and then discarded the result anyway.
        line = raw_line.replace(";", "").rstrip().split("\t")
        similarities.append(float(line[4]))
        sent_a.append(line[5])
        sent_b.append(line[6])
similarities = np.array(similarities)
assert len(similarities) == len(sent_a) == len(sent_b)

The autotime extension is already loaded. To reload it, use:
  %reload_ext autotime
time: 14.4 ms


In [37]:
# Parse the GloVe text file into parallel lists of tokens and float vectors.
# Each line is expected to be "<token> <v1> ... <v300>", separated by single spaces.
GLOVE_DIM = 300
words = []
vecs = []
with open("/Users/oliverborchers/Desktop/GSDEV/Models/Static/glove.840B.300d.txt", "r", encoding="utf-8") as f:
    for line_no, l in enumerate(f, start=1):
        # Split from the right so that a token which itself contains spaces
        # stays in one piece; a plain .split() would cut such a token apart and
        # push non-numeric fragments into the vector.
        line = l.rstrip().rsplit(" ", GLOVE_DIM)
        if len(line) != GLOVE_DIM + 1:
            if not l.strip():
                continue  # ignore blank lines
            raise ValueError(
                f"line {line_no}: expected a token plus {GLOVE_DIM} values, got {len(line)} fields"
            )
        words.append(line[0])
        # Convert the textual components to float32 here; storing them as
        # strings would hand a non-numeric array to the vector store later on.
        vecs.append(np.array(line[1:], dtype=np.float32))

time: 22min 8s


In [None]:
# Create an empty 300-dimensional vector store, then bulk-insert the parsed
# GloVe tokens together with their vectors.
glove = KeyedVectors(vector_size=300)
glove.add(words, vecs)

In [None]:
# Write the assembled KeyedVectors to disk (presumably so later sessions can
# load this file instead of re-parsing the raw text). NOTE(review): the path is
# an absolute, machine-specific location.
glove.save("/Users/oliverborchers/Desktop/GSDEV/Models/Static/glove.840B.300d.model")

In [None]:
# Registry of sentence-embedding models to evaluate (key: "<method>-<vectors>")
# and the dict that will collect their scores.
models, results = {}, {}

# The former `for year in tasks:` wrapper was removed: `tasks` is not defined in
# this notebook and the loop body never used `year`.
models["CBOW-Glove"] = Average(glove)
# Previously this entry was built with Average(), which made "SIF-Glove" an
# exact duplicate of "CBOW-Glove".
# NOTE(review): SIF weights words by their vocabulary counts - confirm that
# `glove` carries meaningful frequencies.
models["SIF-Glove"] = SIF(glove)

# Templates for further embeddings (w2v / ft are not loaded in this notebook):
#models[f"CBOW-W2V"] = Average(w2v)#, wv_mapfile_path = "data/vectors/w2v")
#models[f"SIF-W2V"] = SIF(w2v)#, wv_mapfile_path = "data/vectors/w2v")
#models[f"uSIF-W2V-{year}"] = uSIF(w2v, wv_mapfile_path = "data/vectors/w2v")

#models[f"CBOW-FT-{year}"] = Average(ft, wv_mapfile_path = "data/vectors/ft")
#models[f"SIF-FT-{year}"] = SIF(ft, wv_mapfile_path = "data/vectors/ft")
#models[f"uSIF-FT-{year}"] = uSIF(ft, wv_mapfile_path = "data/vectors/ft")

In [None]:
# Evaluate every registered model on the STS pairs read in the first cell.
# This uses `sent_a`, `sent_b` and `similarities` from that cell; the earlier
# version read from `sts_data`, which is never defined in this notebook.
sents_a = list(sent_a)
sents_b = list(sent_b)
assert len(sents_a) == len(sents_b)
task_length = len(sents_a)

# Index layout expected by compute_similarities(): the first `task_length`
# entries are the left-hand sentences, the next `task_length` the right-hand
# ones, so pair i is (i, i + task_length).
# Sentences are tokenized with normalize_text() before being wrapped.
# NOTE(review): assumes IndexedSentence expects a list of tokens - confirm
# against the fse version in use.
sents = [
    IndexedSentence(normalize_text(s), i)
    for i, s in enumerate(sents_a + sents_b)
]

for k, m in models.items():
    m.train(sents)
    # Pearson correlation between gold scores and model similarities.
    r = pearsonr(similarities, compute_similarities(task_length, m))[0].round(4)
    # Keyed by model name only; the old "-{year}" suffix relied on a loop
    # variable that is not reliably defined.
    results[k] = r * 100
    print(k, r)

In [None]:
# Show the scores as a table: one row per key of `results`, in a single
# "Pearson" column.
pd.DataFrame.from_dict(
    results,
    columns=["Pearson"],
    orient="index",
)