# STS Benchmark Datasets

For more information, see the reference implementation at https://github.com/kawine/usif.

In [1]:
%load_ext autotime
import logging
import os
import pathlib
from re import sub

import numpy as np
import pandas as pd
from scipy.stats import pearsonr

from gensim.models import Word2Vec, FastText
from gensim.models import KeyedVectors

from fse.models.average import Average
from fse.models.inputs import IndexedSentence

# Configure the root logger to report INFO-level records, each prefixed with
# a timestamp, the emitting thread's name and the level name.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(threadName)s : %(levelname)s : %(message)s',
)

# CSV file holding the sentence pairs, given relative to the working directory.
file_path = "data/stsbenchmark/sts-test.csv"

def normalize_text(sentence):
    """Lower-case a sentence and reduce it to purely alphabetic tokens.

    The sentence is split on whitespace, each token is lower-cased, and every
    character outside ``a-z`` / ``A-Z`` is removed from it. Tokens that end up
    empty (numbers, stand-alone punctuation, ...) are dropped so that no ``""``
    entries reach the caller.

    Parameters
    ----------
    sentence : str
        Raw sentence text.

    Returns
    -------
    list of str
        The cleaned, non-empty tokens in their original order.
    """
    stripped = (sub("[^a-zA-Z]", "", w.lower()) for w in sentence.split())
    # Discard tokens that consisted solely of non-letters.
    return [w for w in stripped if w]

def compute_similarities(task_length, model):
    """Return the similarity of every sentence pair, in pair order.

    Pair ``k`` is looked up as the sentence vectors at indices ``k`` and
    ``task_length + k``; each pair is scored with ``model.sv.similarity``.

    Parameters
    ----------
    task_length : int
        Number of sentence pairs.
    model : object
        Any object exposing ``sv.similarity(i, j)``.

    Returns
    -------
    list
        One similarity value per pair.
    """
    return [
        model.sv.similarity(first, first + task_length)
        for first in range(task_length)
    ]

# Read the sentence pairs, drop incomplete rows and tokenize both sentence
# columns. `error_bad_lines` is intentionally not passed: it was removed in
# pandas 2.0, and raising on malformed lines (what `True` requested) is the
# default behaviour anyway.
df = (
    pd.read_csv(
        file_path,
        sep=",",
        names=["task", "1", "2", "3", "sim", "sent_a", "sent_b"],
    )
    .dropna()
    .assign(
        sent_a=lambda frame: frame.sent_a.apply(normalize_text),
        sent_b=lambda frame: frame.sent_b.apply(normalize_text),
    )
)

sents_a = df.sent_a.values.tolist()
sents_b = df.sent_b.values.tolist()
assert len(sents_a) == len(sents_b)

# Number of sentence pairs; pair k ends up at indices k and task_length + k.
task_length = len(sents_a)
# NOTE: from here on `sents_a` holds the first sentences followed by the
# second sentences, not just the `sent_a` column.
sents_a.extend(sents_b)
sents = [IndexedSentence(s, i) for i, s in enumerate(sents_a)]

In [2]:
# Location of the pre-trained word vectors. Set the W2V_MODEL_PATH environment
# variable to point at a local copy; without it the original hard-coded
# location is used.
w2v_path = os.environ.get(
    "W2V_MODEL_PATH",
    "/Users/oliverborchers/Desktop/GSDEV/Models/Static/google_news.model",
)
w2v = KeyedVectors.load(w2v_path)

2019-08-18 13:58:02,157 : MainThread : INFO : loading Word2VecKeyedVectors object from /Users/oliverborchers/Desktop/GSDEV/Models/Static/google_news.model
2019-08-18 13:58:07,632 : MainThread : INFO : loading vectors from /Users/oliverborchers/Desktop/GSDEV/Models/Static/google_news.model.vectors.npy with mmap=None
2019-08-18 13:58:10,062 : MainThread : INFO : setting ignored attribute vectors_norm to None
2019-08-18 13:58:10,063 : MainThread : INFO : loaded /Users/oliverborchers/Desktop/GSDEV/Models/Static/google_news.model


time: 7.91 s


In [3]:
# Models to evaluate, keyed by the label shown in the results.
models = {}
models["CBOW-W2V"] = Average(w2v, lang_freq="en")

2019-08-18 13:58:10,072 : MainThread : INFO : no frequency mode: using wordfreq for estimationof frequency for language: en


time: 2.03 s


In [4]:
# Train each model on all sentences, then correlate its pair similarities
# with the `sim` column. Results are stored as Pearson's r scaled by 100.
results = {}
for name in models:
    model = models[name]
    model.train(sents)
    predicted = compute_similarities(task_length, model)
    r = pearsonr(df.sim, predicted)[0].round(3)
    results[name] = r * 100
    print(name, r)

2019-08-18 13:58:12,106 : MainThread : INFO : scanning all indexed sentences and their word counts
2019-08-18 13:58:12,115 : MainThread : INFO : finished scanning 2758 sentences with an average length of 10 and 29895 total words
2019-08-18 13:58:12,989 : MainThread : INFO : estimated memory for 2758 sentences with 300 dimensions and 3000000 vocabulary: 3447 MB (3 GB)
2019-08-18 13:58:12,990 : MainThread : INFO : initializing sentence vectors for 2758 sentences
2019-08-18 13:58:13,044 : MainThread : INFO : begin training
2019-08-18 13:58:13,061 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads
2019-08-18 13:58:13,942 : MainThread : INFO : training on 2758 effective sentences with 22877 effective words took 0s with 154884 sentences/s


CBOW-W2V 0.621
time: 1.87 s


In [5]:
# One row per model, with a single "Pearson" column taken from `results`.
summary = pd.DataFrame.from_dict(
    results,
    orient="index",
    columns=["Pearson"],
)
summary

Unnamed: 0,Pearson
CBOW-W2V,62.1


time: 20.5 ms
