# STS Benchmark Datasets

For more information, see the implementation of https://github.com/kawine/usif

In [1]:
%load_ext autotime
import numpy as np
import logging
import pandas as pd
import pathlib

from gensim.models import Word2Vec, FastText
from gensim.models import KeyedVectors

from fse.models.average import Average
from fse.models.sif import SIF
from fse.models.inputs import IndexedSentence

from re import sub

from scipy.stats import pearsonr

logging.basicConfig(format='%(asctime)s : %(threadName)s : %(levelname)s : %(message)s',
                    level=logging.INFO
                   )

file_path = "data/stsbenchmark/sts-test.csv"

def normalize_text(sentence):
    return [sub("[^a-zA-Z]", "", w.lower()) for w in sentence.split()] 

def compute_similarities(task_length, model):
    sims = []
    for i, j in zip(range(task_length), range(task_length, 2*task_length)):
        sims.append(model.sv.similarity(i,j))
    return sims

df = pd.read_csv(file_path, sep=",", error_bad_lines=True, names=["task","1","2","3","sim","sent_a","sent_b"])
df.dropna(inplace=True)
df.sent_a = df.sent_a.apply(normalize_text)
df.sent_b = df.sent_b.apply(normalize_text)

sents_a = df.sent_a.values.tolist()
sents_b = df.sent_b.values.tolist()
assert len(sents_a) == len(sents_b)

task_length = len(sents_a)
sents_a.extend(sents_b)
sents = [IndexedSentence(s, i) for i,s in enumerate(sents_a)]

w2v = KeyedVectors.load("/Users/oliverborchers/Desktop/GSDEV/Models/Static/google_news.model")
#mft = FastText(dataset, size=20, iter=5, workers=8)

2019-08-18 14:33:24,344 : MainThread : INFO : loading Word2VecKeyedVectors object from /Users/oliverborchers/Desktop/GSDEV/Models/Static/google_news.model
2019-08-18 14:33:30,451 : MainThread : INFO : loading vectors from /Users/oliverborchers/Desktop/GSDEV/Models/Static/google_news.model.vectors.npy with mmap=None
2019-08-18 14:33:32,615 : MainThread : INFO : setting ignored attribute vectors_norm to None
2019-08-18 14:33:32,616 : MainThread : INFO : loaded /Users/oliverborchers/Desktop/GSDEV/Models/Static/google_news.model


In [2]:
models = {
    "CBOW-W2V" : Average(w2v, lang_freq="en"),
    "SIF-1C-W2V" : SIF(w2v),
    "SIF-2C-W2V" : SIF(w2v, components=2),
}

results = {}
for k, m in models.items():
    m.train(sents)
    r = pearsonr(df.sim, compute_similarities(task_length, m))[0].round(4)
    results[k] = r * 100
    print(k, r)

2019-08-18 14:33:32,626 : MainThread : INFO : no frequency mode: using wordfreq for estimationof frequency for language: en
2019-08-18 14:33:34,999 : MainThread : INFO : scanning all indexed sentences and their word counts
2019-08-18 14:33:35,008 : MainThread : INFO : finished scanning 2758 sentences with an average length of 10 and 29895 total words
2019-08-18 14:33:36,041 : MainThread : INFO : estimated memory for 2758 sentences with 300 dimensions and 3000000 vocabulary: 3447 MB (3 GB)
2019-08-18 14:33:36,042 : MainThread : INFO : initializing sentence vectors for 2758 sentences
2019-08-18 14:33:36,096 : MainThread : INFO : begin training
2019-08-18 14:33:36,113 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads
2019-08-18 14:33:37,107 : MainThread : INFO : training on 2758 effective sentences with 22877 effective words took 0s with 157240 sentences/s
2019-08-18 14:33:37,134 : MainThread : INFO : scanning all indexed sentences and their word counts
2019-

CBOW-W2V 0.6207


2019-08-18 14:33:38,170 : MainThread : INFO : estimated memory for 2758 sentences with 300 dimensions and 3000000 vocabulary: 3447 MB (3 GB)
2019-08-18 14:33:38,170 : MainThread : INFO : initializing sentence vectors for 2758 sentences
2019-08-18 14:33:38,179 : MainThread : INFO : pre-computing SIF weights for 3000000 words
2019-08-18 14:33:41,267 : MainThread : INFO : begin training
2019-08-18 14:33:41,283 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads
2019-08-18 14:33:42,208 : MainThread : INFO : computing 1 principal components
2019-08-18 14:33:42,238 : MainThread : INFO : removing 1 principal components
2019-08-18 14:33:42,241 : MainThread : INFO : training on 2758 effective sentences with 22877 effective words took 0s with 166265 sentences/s
2019-08-18 14:33:42,269 : MainThread : INFO : scanning all indexed sentences and their word counts
2019-08-18 14:33:42,278 : MainThread : INFO : finished scanning 2758 sentences with an average length of 10 and

SIF-1C-W2V 0.7026


2019-08-18 14:33:43,293 : MainThread : INFO : estimated memory for 2758 sentences with 300 dimensions and 3000000 vocabulary: 3447 MB (3 GB)
2019-08-18 14:33:43,294 : MainThread : INFO : initializing sentence vectors for 2758 sentences
2019-08-18 14:33:43,300 : MainThread : INFO : pre-computing SIF weights for 3000000 words
2019-08-18 14:33:46,637 : MainThread : INFO : begin training
2019-08-18 14:33:46,658 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads
2019-08-18 14:33:47,525 : MainThread : INFO : computing 2 principal components
2019-08-18 14:33:47,539 : MainThread : INFO : removing 2 principal components
2019-08-18 14:33:47,541 : MainThread : INFO : training on 2758 effective sentences with 22877 effective words took 0s with 125113 sentences/s
2019-08-18 14:33:47,573 : MainThread : INFO : scanning all indexed sentences and their word counts
2019-08-18 14:33:47,582 : MainThread : INFO : finished scanning 2758 sentences with an average length of 10 and

SIF-2C-W2V 0.703


2019-08-18 14:33:48,681 : MainThread : INFO : estimated memory for 2758 sentences with 300 dimensions and 3000000 vocabulary: 3447 MB (3 GB)
2019-08-18 14:33:48,682 : MainThread : INFO : initializing sentence vectors for 2758 sentences
2019-08-18 14:33:48,689 : MainThread : INFO : pre-computing SIF weights for 3000000 words
2019-08-18 14:33:51,911 : MainThread : INFO : begin training
2019-08-18 14:33:51,932 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads
2019-08-18 14:33:52,879 : MainThread : INFO : computing 2 principal components
2019-08-18 14:33:52,895 : MainThread : INFO : removing 2 principal components
2019-08-18 14:33:52,898 : MainThread : INFO : training on 2758 effective sentences with 22877 effective words took 0s with 128119 sentences/s


SIF-2C-1e-4-W2V 0.6812
time: 20.3 s


In [3]:
pd.DataFrame.from_dict(results, orient="index", columns=["Pearson"])

Unnamed: 0,Pearson
CBOW-W2V,62.07
SIF-1C-W2V,70.26
SIF-2C-W2V,70.3
SIF-2C-1e-4-W2V,68.12


time: 22.7 ms
