# STS Benchmark Datasets

For more information, see the uSIF reference implementation at https://github.com/kawine/usif.

In [1]:
%load_ext autotime
import numpy as np
import logging
import pandas as pd
import pathlib

from gensim.models import Word2Vec, FastText
from gensim.models.keyedvectors import KeyedVectors, FastTextKeyedVectors

from fse.models.average import Average
from fse.models.sif import SIF
from fse.models.usif import uSIF
from fse.models.inputs import IndexedSentence

from re import sub

from scipy.stats import pearsonr

# Show INFO-level messages, each prefixed with a timestamp, the name of the
# emitting thread and the severity level.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(threadName)s : %(levelname)s : %(message)s',
)

# CSV file read by the data-loading code below (relative to the working directory).
file_path = "data/stsbenchmark/sts-test.csv"

def normalize_text(sentence):
    """Lower-case a sentence and split it into letter-only tokens.

    The sentence is split on whitespace and every character outside
    ``a-z`` / ``A-Z`` is removed from each token.  Tokens that consist solely
    of such characters (digits, punctuation) are kept as empty strings, so the
    result has exactly one entry per whitespace-separated input token.
    """
    tokens = []
    for word in sentence.split():
        tokens.append(sub("[^a-zA-Z]", "", word.lower()))
    return tokens

def compute_similarities(task_length, model):
    """Score every sentence pair stored in ``model``.

    Sentence ``i`` is paired with sentence ``i + task_length``: the first
    ``task_length`` indices are taken as the first sentences of each pair and
    the following ``task_length`` indices as their counterparts.  Each pair is
    scored with ``model.sv.similarity`` and the scores are returned as a list
    in pair order.
    """
    return [
        model.sv.similarity(first, first + task_length)
        for first in range(task_length)
    ]

# Read the benchmark file. Column names are supplied explicitly, so the first
# line of the file is treated as data; only "task", "sim", "sent_a" and
# "sent_b" are used further down.
# `error_bad_lines=True` is intentionally not passed: raising on malformed rows
# is the pandas default, and the keyword itself was removed in pandas 2.0.
sts_data = pd.read_csv(file_path, sep=",", names=["cpt","1","task","3","sim","sent_a","sent_b"])
# Discard rows with a missing value in any column.
sts_data.dropna(inplace=True)
# Turn both sentence columns into lists of normalised tokens.
sts_data.sent_a = sts_data.sent_a.apply(normalize_text)
sts_data.sent_b = sts_data.sent_b.apply(normalize_text)
# Fold the "2012test" label into plain "2012" and relabel "2012train" rows as
# "None" so that the filter below removes them.
sts_data.task = sts_data.task.str.replace("2012test", "2012")
sts_data.task = sts_data.task.str.replace("2012train", "None")
sts_data = sts_data[sts_data.task != "None"]

# Task labels (years) that are evaluated one by one below.
tasks = sorted(["2012", "2013", "2014", "2015", "2016", "2017"])

In [2]:
# Directory holding the pre-trained models. It is derived from the current
# user's home directory rather than a hard-coded user name; adjust this single
# constant if the models live somewhere else.
MODEL_DIR = pathlib.Path.home() / "Desktop" / "GSDEV" / "Models" / "Static"

# Word vectors shared by all sentence-embedding models built below.
w2v = KeyedVectors.load(str(MODEL_DIR / "google_news.model"))
ft = FastTextKeyedVectors.load(str(MODEL_DIR / "ft_crawl_300d_2m.model"))

2019-08-19 23:48:27,145 : MainThread : INFO : loading Word2VecKeyedVectors object from /Users/oliverborchers/Desktop/GSDEV/Models/Static/google_news.model
2019-08-19 23:48:33,053 : MainThread : INFO : loading vectors from /Users/oliverborchers/Desktop/GSDEV/Models/Static/google_news.model.vectors.npy with mmap=None
2019-08-19 23:48:35,252 : MainThread : INFO : setting ignored attribute vectors_norm to None
2019-08-19 23:48:35,253 : MainThread : INFO : loaded /Users/oliverborchers/Desktop/GSDEV/Models/Static/google_news.model
2019-08-19 23:48:35,254 : MainThread : INFO : loading FastTextKeyedVectors object from /Users/oliverborchers/Desktop/GSDEV/Models/Static/ft_crawl_300d_2m.model
2019-08-19 23:48:39,011 : MainThread : INFO : loading vectors from /Users/oliverborchers/Desktop/GSDEV/Models/Static/ft_crawl_300d_2m.model.vectors.npy with mmap=None
2019-08-19 23:48:41,748 : MainThread : INFO : loading vectors_vocab from /Users/oliverborchers/Desktop/GSDEV/Models/Static/ft_crawl_300d_2m.mo

time: 20.2 s


In [3]:
# Build one averaging model per embedding up front, passing lang_freq="en"
# and the same wv_mapfile_path prefixes that the per-year models use later.
# NOTE(review): presumably this is what prepares the mapped vector files and
# the word-frequency estimates on `w2v` / `ft` — confirm against fse.
w2v_avg = Average(
    w2v,
    wv_mapfile_path="data/vectors/w2v",
    lang_freq="en",
)
ft_avg = Average(
    ft,
    wv_mapfile_path="data/vectors/ft",
    lang_freq="en",
)

  "C extension not loaded, training/inferring will be slow. "
2019-08-19 23:48:47,439 : MainThread : INFO : loading pre-existing wv from /Users/oliverborchers/Library/Mobile Documents/com~apple~CloudDocs/Diss/Medium/Fast_Sentence_Embeddings/fse/data/vectors/w2v_wv.vectors
2019-08-19 23:48:48,006 : MainThread : INFO : no frequency mode: using wordfreq for estimation of frequency for language: en
2019-08-19 23:48:51,999 : MainThread : INFO : loading pre-existing wv from /Users/oliverborchers/Library/Mobile Documents/com~apple~CloudDocs/Diss/Medium/Fast_Sentence_Embeddings/fse/data/vectors/ft_wv.vectors
2019-08-19 23:48:52,293 : MainThread : INFO : loading pre-existing vocab from data/vectors/ft_vocab.vectors
2019-08-19 23:48:52,653 : MainThread : INFO : loading pre-existing ngrams from data/vectors/ft_ngrams.vectors
2019-08-19 23:48:52,959 : MainThread : INFO : no frequency mode: using wordfreq for estimation of frequency for language: en


time: 8.06 s


In [4]:
# Registry of sentence-embedding models keyed by "<model>-<embedding>-<year>".
# Every task year gets its own fresh instance of each model/embedding pair.
embeddings = (
    ("W2V", w2v, "data/vectors/w2v"),
    ("FT", ft, "data/vectors/ft"),
)
model_types = (
    ("CBOW", Average),
    ("SIF", SIF),
    ("uSIF", uSIF),
)

models = {}
for year in tasks:
    for emb_name, vectors, mapfile in embeddings:
        for model_name, model_cls in model_types:
            models[f"{model_name}-{emb_name}-{year}"] = model_cls(vectors, wv_mapfile_path=mapfile)

# Every run starts with a placeholder score of 0 until it has been evaluated.
results = {name: 0 for name in models}

2019-08-19 23:48:55,531 : MainThread : INFO : loading pre-existing wv from /Users/oliverborchers/Library/Mobile Documents/com~apple~CloudDocs/Diss/Medium/Fast_Sentence_Embeddings/fse/data/vectors/w2v_wv.vectors
2019-08-19 23:48:55,550 : MainThread : INFO : loading pre-existing wv from /Users/oliverborchers/Library/Mobile Documents/com~apple~CloudDocs/Diss/Medium/Fast_Sentence_Embeddings/fse/data/vectors/w2v_wv.vectors
2019-08-19 23:48:55,569 : MainThread : INFO : loading pre-existing wv from /Users/oliverborchers/Library/Mobile Documents/com~apple~CloudDocs/Diss/Medium/Fast_Sentence_Embeddings/fse/data/vectors/w2v_wv.vectors
2019-08-19 23:48:55,593 : MainThread : INFO : loading pre-existing wv from /Users/oliverborchers/Library/Mobile Documents/com~apple~CloudDocs/Diss/Medium/Fast_Sentence_Embeddings/fse/data/vectors/ft_wv.vectors
2019-08-19 23:48:55,607 : MainThread : INFO : loading pre-existing vocab from data/vectors/ft_vocab.vectors
2019-08-19 23:48:55,625 : MainThread : INFO : loa

time: 1.25 s


In [5]:
for year in tasks:
    # Sentence pairs (and their gold scores in `df.sim`) for this task year.
    df = sts_data[sts_data.task == year]
    sents_a, sents_b = df.sent_a.values.tolist(), df.sent_b.values.tolist()
    assert len(sents_a) == len(sents_b)
    task_length = len(sents_a)

    # Lay out all first sentences followed by all second sentences, so that
    # pair i occupies indices i and i + task_length.
    sents_a.extend(sents_b)
    sents = [IndexedSentence(tokens, idx) for idx, tokens in enumerate(sents_a)]

    for k, m in models.items():
        parts = k.split("-")
        # Only the models registered for the current year are trained here.
        if parts[-1] == year:
            m_type, emb_type = parts[0], parts[1]

            m.train(sents)
            predicted = compute_similarities(task_length, m)
            r = pearsonr(df.sim, predicted)[0].round(4)

            # Stored as a percentage; the printed value stays on the 0-1 scale.
            results[f"{m_type}-{emb_type}-{year}"] = r * 100
            print(k, r)

2019-08-19 23:48:56,752 : MainThread : INFO : scanning all indexed sentences and their word counts
2019-08-19 23:48:56,754 : MainThread : INFO : finished scanning 504 sentences with an average length of 13 and 6848 total words
2019-08-19 23:48:57,763 : MainThread : INFO : initializing sentence vectors for 504 sentences
2019-08-19 23:48:57,813 : MainThread : INFO : begin training
2019-08-19 23:48:58,150 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads
2019-08-19 23:48:58,151 : MainThread : INFO : training on 504 effective sentences with 5060 effective words took 0s with 1493 sentences/s
2019-08-19 23:48:58,182 : MainThread : INFO : scanning all indexed sentences and their word counts
2019-08-19 23:48:58,185 : MainThread : INFO : finished scanning 504 sentences with an average length of 13 and 6848 total words


CBOW-W2V-2012 0.7467


2019-08-19 23:48:59,138 : MainThread : INFO : initializing sentence vectors for 504 sentences
2019-08-19 23:48:59,141 : MainThread : INFO : pre-computing SIF weights for 3000000 words
2019-08-19 23:49:02,114 : MainThread : INFO : begin training
2019-08-19 23:49:02,136 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads
2019-08-19 23:49:02,137 : MainThread : INFO : computing 1 principal components
2019-08-19 23:49:02,187 : MainThread : INFO : removing 1 principal components
2019-08-19 23:49:02,189 : MainThread : INFO : training on 504 effective sentences with 5060 effective words took 0s with 21632 sentences/s
2019-08-19 23:49:02,196 : MainThread : INFO : scanning all indexed sentences and their word counts
2019-08-19 23:49:02,199 : MainThread : INFO : finished scanning 504 sentences with an average length of 13 and 6848 total words


SIF-W2V-2012 0.7874


2019-08-19 23:49:03,162 : MainThread : INFO : initializing sentence vectors for 504 sentences
2019-08-19 23:49:03,164 : MainThread : INFO : pre-computing uSIF weights for 3000000 words
2019-08-19 23:49:16,608 : MainThread : INFO : begin training
2019-08-19 23:49:16,632 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads
2019-08-19 23:49:16,633 : MainThread : INFO : computing 5 principal components
2019-08-19 23:49:16,648 : MainThread : INFO : training on 504 effective sentences with 5060 effective words took 0s with 20181 sentences/s
2019-08-19 23:49:16,654 : MainThread : INFO : scanning all indexed sentences and their word counts
2019-08-19 23:49:16,657 : MainThread : INFO : finished scanning 504 sentences with an average length of 13 and 6848 total words


uSIF-W2V-2012 0.7774


2019-08-19 23:49:17,407 : MainThread : INFO : initializing sentence vectors for 504 sentences
2019-08-19 23:49:17,441 : MainThread : INFO : begin training
2019-08-19 23:49:17,938 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads
2019-08-19 23:49:17,939 : MainThread : INFO : training on 504 effective sentences with 6848 effective words took 0s with 1011 sentences/s
2019-08-19 23:49:17,945 : MainThread : INFO : scanning all indexed sentences and their word counts
2019-08-19 23:49:17,947 : MainThread : INFO : finished scanning 504 sentences with an average length of 13 and 6848 total words


CBOW-FT-2012 0.5099


2019-08-19 23:49:18,625 : MainThread : INFO : initializing sentence vectors for 504 sentences
2019-08-19 23:49:18,627 : MainThread : INFO : pre-computing SIF weights for 2000000 words
2019-08-19 23:49:20,864 : MainThread : INFO : begin training
2019-08-19 23:49:20,936 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads
2019-08-19 23:49:20,937 : MainThread : INFO : computing 1 principal components
2019-08-19 23:49:20,942 : MainThread : INFO : removing 1 principal components
2019-08-19 23:49:20,943 : MainThread : INFO : training on 504 effective sentences with 6848 effective words took 0s with 6892 sentences/s
2019-08-19 23:49:20,951 : MainThread : INFO : scanning all indexed sentences and their word counts
2019-08-19 23:49:20,953 : MainThread : INFO : finished scanning 504 sentences with an average length of 13 and 6848 total words


SIF-FT-2012 0.791


2019-08-19 23:49:21,730 : MainThread : INFO : initializing sentence vectors for 504 sentences
2019-08-19 23:49:21,732 : MainThread : INFO : pre-computing uSIF weights for 2000000 words
2019-08-19 23:49:30,935 : MainThread : INFO : begin training
2019-08-19 23:49:31,006 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads
2019-08-19 23:49:31,007 : MainThread : INFO : computing 5 principal components
2019-08-19 23:49:31,016 : MainThread : INFO : training on 504 effective sentences with 6848 effective words took 0s with 7017 sentences/s
2019-08-19 23:49:31,024 : MainThread : INFO : scanning all indexed sentences and their word counts
2019-08-19 23:49:31,026 : MainThread : INFO : finished scanning 144 sentences with an average length of 7 and 1101 total words


uSIF-FT-2012 0.7862


2019-08-19 23:49:31,999 : MainThread : INFO : initializing sentence vectors for 144 sentences
2019-08-19 23:49:32,046 : MainThread : INFO : begin training
2019-08-19 23:49:32,098 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads
2019-08-19 23:49:32,099 : MainThread : INFO : training on 144 effective sentences with 944 effective words took 0s with 2744 sentences/s
2019-08-19 23:49:32,102 : MainThread : INFO : scanning all indexed sentences and their word counts
2019-08-19 23:49:32,103 : MainThread : INFO : finished scanning 144 sentences with an average length of 7 and 1101 total words


CBOW-W2V-2013 0.6474


2019-08-19 23:49:33,037 : MainThread : INFO : initializing sentence vectors for 144 sentences
2019-08-19 23:49:33,039 : MainThread : INFO : pre-computing SIF weights for 3000000 words
2019-08-19 23:49:36,241 : MainThread : INFO : begin training
2019-08-19 23:49:36,248 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads
2019-08-19 23:49:36,249 : MainThread : INFO : computing 1 principal components
2019-08-19 23:49:36,253 : MainThread : INFO : removing 1 principal components
2019-08-19 23:49:36,254 : MainThread : INFO : training on 144 effective sentences with 944 effective words took 0s with 18290 sentences/s
2019-08-19 23:49:36,258 : MainThread : INFO : scanning all indexed sentences and their word counts
2019-08-19 23:49:36,259 : MainThread : INFO : finished scanning 144 sentences with an average length of 7 and 1101 total words


SIF-W2V-2013 0.6741


2019-08-19 23:49:37,249 : MainThread : INFO : initializing sentence vectors for 144 sentences
2019-08-19 23:49:37,251 : MainThread : INFO : pre-computing uSIF weights for 3000000 words
2019-08-19 23:49:50,817 : MainThread : INFO : begin training
2019-08-19 23:49:50,825 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads
2019-08-19 23:49:50,825 : MainThread : INFO : computing 5 principal components
2019-08-19 23:49:50,832 : MainThread : INFO : training on 144 effective sentences with 944 effective words took 0s with 18195 sentences/s
2019-08-19 23:49:50,836 : MainThread : INFO : scanning all indexed sentences and their word counts
2019-08-19 23:49:50,837 : MainThread : INFO : finished scanning 144 sentences with an average length of 7 and 1101 total words


uSIF-W2V-2013 0.6972


2019-08-19 23:49:51,592 : MainThread : INFO : initializing sentence vectors for 144 sentences
2019-08-19 23:49:51,625 : MainThread : INFO : begin training
2019-08-19 23:49:51,754 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads
2019-08-19 23:49:51,755 : MainThread : INFO : training on 144 effective sentences with 1101 effective words took 0s with 1115 sentences/s
2019-08-19 23:49:51,757 : MainThread : INFO : scanning all indexed sentences and their word counts
2019-08-19 23:49:51,758 : MainThread : INFO : finished scanning 144 sentences with an average length of 7 and 1101 total words


CBOW-FT-2013 0.6506


2019-08-19 23:49:52,446 : MainThread : INFO : initializing sentence vectors for 144 sentences
2019-08-19 23:49:52,447 : MainThread : INFO : pre-computing SIF weights for 2000000 words
2019-08-19 23:49:54,444 : MainThread : INFO : begin training
2019-08-19 23:49:54,458 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads
2019-08-19 23:49:54,459 : MainThread : INFO : computing 1 principal components
2019-08-19 23:49:54,463 : MainThread : INFO : removing 1 principal components
2019-08-19 23:49:54,464 : MainThread : INFO : training on 144 effective sentences with 1101 effective words took 0s with 9384 sentences/s
2019-08-19 23:49:54,466 : MainThread : INFO : scanning all indexed sentences and their word counts
2019-08-19 23:49:54,467 : MainThread : INFO : finished scanning 144 sentences with an average length of 7 and 1101 total words


SIF-FT-2013 0.7039


2019-08-19 23:49:55,216 : MainThread : INFO : initializing sentence vectors for 144 sentences
2019-08-19 23:49:55,217 : MainThread : INFO : pre-computing uSIF weights for 2000000 words
2019-08-19 23:50:04,689 : MainThread : INFO : begin training
2019-08-19 23:50:04,704 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads
2019-08-19 23:50:04,705 : MainThread : INFO : computing 5 principal components
2019-08-19 23:50:04,710 : MainThread : INFO : training on 144 effective sentences with 1101 effective words took 0s with 9144 sentences/s
2019-08-19 23:50:04,715 : MainThread : INFO : scanning all indexed sentences and their word counts
2019-08-19 23:50:04,718 : MainThread : INFO : finished scanning 404 sentences with an average length of 9 and 3763 total words


uSIF-FT-2013 0.7381


2019-08-19 23:50:05,704 : MainThread : INFO : initializing sentence vectors for 404 sentences
2019-08-19 23:50:05,752 : MainThread : INFO : begin training
2019-08-19 23:50:05,842 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads
2019-08-19 23:50:05,843 : MainThread : INFO : training on 404 effective sentences with 2833 effective words took 0s with 4453 sentences/s
2019-08-19 23:50:05,847 : MainThread : INFO : scanning all indexed sentences and their word counts
2019-08-19 23:50:05,849 : MainThread : INFO : finished scanning 404 sentences with an average length of 9 and 3763 total words


CBOW-W2V-2014 0.7105


2019-08-19 23:50:06,818 : MainThread : INFO : initializing sentence vectors for 404 sentences
2019-08-19 23:50:06,820 : MainThread : INFO : pre-computing SIF weights for 3000000 words
2019-08-19 23:50:10,026 : MainThread : INFO : begin training
2019-08-19 23:50:10,042 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads
2019-08-19 23:50:10,043 : MainThread : INFO : computing 1 principal components
2019-08-19 23:50:10,047 : MainThread : INFO : removing 1 principal components
2019-08-19 23:50:10,048 : MainThread : INFO : training on 404 effective sentences with 2833 effective words took 0s with 23799 sentences/s
2019-08-19 23:50:10,055 : MainThread : INFO : scanning all indexed sentences and their word counts
2019-08-19 23:50:10,058 : MainThread : INFO : finished scanning 404 sentences with an average length of 9 and 3763 total words


SIF-W2V-2014 0.755


2019-08-19 23:50:11,052 : MainThread : INFO : initializing sentence vectors for 404 sentences
2019-08-19 23:50:11,054 : MainThread : INFO : pre-computing uSIF weights for 3000000 words
2019-08-19 23:50:24,213 : MainThread : INFO : begin training
2019-08-19 23:50:24,231 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads
2019-08-19 23:50:24,232 : MainThread : INFO : computing 5 principal components
2019-08-19 23:50:24,239 : MainThread : INFO : training on 404 effective sentences with 2833 effective words took 0s with 21046 sentences/s
2019-08-19 23:50:24,246 : MainThread : INFO : scanning all indexed sentences and their word counts
2019-08-19 23:50:24,248 : MainThread : INFO : finished scanning 404 sentences with an average length of 9 and 3763 total words


uSIF-W2V-2014 0.6802


2019-08-19 23:50:25,005 : MainThread : INFO : initializing sentence vectors for 404 sentences
2019-08-19 23:50:25,042 : MainThread : INFO : begin training
2019-08-19 23:50:25,190 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads
2019-08-19 23:50:25,191 : MainThread : INFO : training on 404 effective sentences with 3763 effective words took 0s with 2708 sentences/s
2019-08-19 23:50:25,196 : MainThread : INFO : scanning all indexed sentences and their word counts
2019-08-19 23:50:25,197 : MainThread : INFO : finished scanning 404 sentences with an average length of 9 and 3763 total words


CBOW-FT-2014 0.4882


2019-08-19 23:50:25,993 : MainThread : INFO : initializing sentence vectors for 404 sentences
2019-08-19 23:50:25,995 : MainThread : INFO : pre-computing SIF weights for 2000000 words
2019-08-19 23:50:28,191 : MainThread : INFO : begin training
2019-08-19 23:50:28,233 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads
2019-08-19 23:50:28,233 : MainThread : INFO : computing 1 principal components
2019-08-19 23:50:28,237 : MainThread : INFO : removing 1 principal components
2019-08-19 23:50:28,239 : MainThread : INFO : training on 404 effective sentences with 3763 effective words took 0s with 9462 sentences/s
2019-08-19 23:50:28,244 : MainThread : INFO : scanning all indexed sentences and their word counts
2019-08-19 23:50:28,246 : MainThread : INFO : finished scanning 404 sentences with an average length of 9 and 3763 total words


SIF-FT-2014 0.7789


2019-08-19 23:50:29,017 : MainThread : INFO : initializing sentence vectors for 404 sentences
2019-08-19 23:50:29,019 : MainThread : INFO : pre-computing uSIF weights for 2000000 words
2019-08-19 23:50:38,563 : MainThread : INFO : begin training
2019-08-19 23:50:38,605 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads
2019-08-19 23:50:38,606 : MainThread : INFO : computing 5 principal components
2019-08-19 23:50:38,612 : MainThread : INFO : training on 404 effective sentences with 3763 effective words took 0s with 9470 sentences/s
2019-08-19 23:50:38,620 : MainThread : INFO : scanning all indexed sentences and their word counts
2019-08-19 23:50:38,622 : MainThread : INFO : finished scanning 392 sentences with an average length of 9 and 3640 total words


uSIF-FT-2014 0.7361


2019-08-19 23:50:39,583 : MainThread : INFO : initializing sentence vectors for 392 sentences
2019-08-19 23:50:39,632 : MainThread : INFO : begin training
2019-08-19 23:50:39,722 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads
2019-08-19 23:50:39,722 : MainThread : INFO : training on 392 effective sentences with 2849 effective words took 0s with 4333 sentences/s
2019-08-19 23:50:39,728 : MainThread : INFO : scanning all indexed sentences and their word counts
2019-08-19 23:50:39,730 : MainThread : INFO : finished scanning 392 sentences with an average length of 9 and 3640 total words


CBOW-W2V-2015 0.732


2019-08-19 23:50:40,691 : MainThread : INFO : initializing sentence vectors for 392 sentences
2019-08-19 23:50:40,693 : MainThread : INFO : pre-computing SIF weights for 3000000 words
2019-08-19 23:50:43,835 : MainThread : INFO : begin training
2019-08-19 23:50:43,851 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads
2019-08-19 23:50:43,851 : MainThread : INFO : computing 1 principal components
2019-08-19 23:50:43,856 : MainThread : INFO : removing 1 principal components
2019-08-19 23:50:43,857 : MainThread : INFO : training on 392 effective sentences with 2849 effective words took 0s with 23391 sentences/s
2019-08-19 23:50:43,863 : MainThread : INFO : scanning all indexed sentences and their word counts
2019-08-19 23:50:43,865 : MainThread : INFO : finished scanning 392 sentences with an average length of 9 and 3640 total words


SIF-W2V-2015 0.783


2019-08-19 23:50:44,859 : MainThread : INFO : initializing sentence vectors for 392 sentences
2019-08-19 23:50:44,861 : MainThread : INFO : pre-computing uSIF weights for 3000000 words
2019-08-19 23:50:58,097 : MainThread : INFO : begin training
2019-08-19 23:50:58,113 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads
2019-08-19 23:50:58,114 : MainThread : INFO : computing 5 principal components
2019-08-19 23:50:58,120 : MainThread : INFO : training on 392 effective sentences with 2849 effective words took 0s with 22909 sentences/s
2019-08-19 23:50:58,126 : MainThread : INFO : scanning all indexed sentences and their word counts
2019-08-19 23:50:58,127 : MainThread : INFO : finished scanning 392 sentences with an average length of 9 and 3640 total words


uSIF-W2V-2015 0.7548


2019-08-19 23:50:58,777 : MainThread : INFO : initializing sentence vectors for 392 sentences
2019-08-19 23:50:58,806 : MainThread : INFO : begin training
2019-08-19 23:50:58,962 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads
2019-08-19 23:50:58,963 : MainThread : INFO : training on 392 effective sentences with 3640 effective words took 0s with 2500 sentences/s
2019-08-19 23:50:58,967 : MainThread : INFO : scanning all indexed sentences and their word counts
2019-08-19 23:50:58,969 : MainThread : INFO : finished scanning 392 sentences with an average length of 9 and 3640 total words


CBOW-FT-2015 0.579


2019-08-19 23:50:59,642 : MainThread : INFO : initializing sentence vectors for 392 sentences
2019-08-19 23:50:59,643 : MainThread : INFO : pre-computing SIF weights for 2000000 words
2019-08-19 23:51:01,947 : MainThread : INFO : begin training
2019-08-19 23:51:01,986 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads
2019-08-19 23:51:01,987 : MainThread : INFO : computing 1 principal components
2019-08-19 23:51:01,991 : MainThread : INFO : removing 1 principal components
2019-08-19 23:51:01,993 : MainThread : INFO : training on 392 effective sentences with 3640 effective words took 0s with 9774 sentences/s
2019-08-19 23:51:01,998 : MainThread : INFO : scanning all indexed sentences and their word counts
2019-08-19 23:51:02,000 : MainThread : INFO : finished scanning 392 sentences with an average length of 9 and 3640 total words


SIF-FT-2015 0.7961


2019-08-19 23:51:02,777 : MainThread : INFO : initializing sentence vectors for 392 sentences
2019-08-19 23:51:02,779 : MainThread : INFO : pre-computing uSIF weights for 2000000 words
2019-08-19 23:51:12,100 : MainThread : INFO : begin training
2019-08-19 23:51:12,142 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads
2019-08-19 23:51:12,143 : MainThread : INFO : computing 5 principal components
2019-08-19 23:51:12,149 : MainThread : INFO : training on 392 effective sentences with 3640 effective words took 0s with 9202 sentences/s
2019-08-19 23:51:12,158 : MainThread : INFO : scanning all indexed sentences and their word counts
2019-08-19 23:51:12,160 : MainThread : INFO : finished scanning 568 sentences with an average length of 9 and 5582 total words


uSIF-FT-2015 0.7795


2019-08-19 23:51:13,128 : MainThread : INFO : initializing sentence vectors for 568 sentences
2019-08-19 23:51:13,176 : MainThread : INFO : begin training
2019-08-19 23:51:13,246 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads
2019-08-19 23:51:13,247 : MainThread : INFO : training on 568 effective sentences with 4534 effective words took 0s with 8006 sentences/s
2019-08-19 23:51:13,254 : MainThread : INFO : scanning all indexed sentences and their word counts
2019-08-19 23:51:13,256 : MainThread : INFO : finished scanning 568 sentences with an average length of 9 and 5582 total words


CBOW-W2V-2016 0.4194


2019-08-19 23:51:14,119 : MainThread : INFO : initializing sentence vectors for 568 sentences
2019-08-19 23:51:14,122 : MainThread : INFO : pre-computing SIF weights for 3000000 words
2019-08-19 23:51:17,084 : MainThread : INFO : begin training
2019-08-19 23:51:17,107 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads
2019-08-19 23:51:17,107 : MainThread : INFO : computing 1 principal components
2019-08-19 23:51:17,112 : MainThread : INFO : removing 1 principal components
2019-08-19 23:51:17,114 : MainThread : INFO : training on 568 effective sentences with 4534 effective words took 0s with 24544 sentences/s
2019-08-19 23:51:17,121 : MainThread : INFO : scanning all indexed sentences and their word counts
2019-08-19 23:51:17,124 : MainThread : INFO : finished scanning 568 sentences with an average length of 9 and 5582 total words


SIF-W2V-2016 0.5533


2019-08-19 23:51:18,112 : MainThread : INFO : initializing sentence vectors for 568 sentences
2019-08-19 23:51:18,114 : MainThread : INFO : pre-computing uSIF weights for 3000000 words
2019-08-19 23:51:31,563 : MainThread : INFO : begin training
2019-08-19 23:51:31,585 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads
2019-08-19 23:51:31,585 : MainThread : INFO : computing 5 principal components
2019-08-19 23:51:31,599 : MainThread : INFO : training on 568 effective sentences with 4534 effective words took 0s with 25277 sentences/s
2019-08-19 23:51:31,606 : MainThread : INFO : scanning all indexed sentences and their word counts
2019-08-19 23:51:31,608 : MainThread : INFO : finished scanning 568 sentences with an average length of 9 and 5582 total words


uSIF-W2V-2016 0.5026


2019-08-19 23:51:32,334 : MainThread : INFO : initializing sentence vectors for 568 sentences
2019-08-19 23:51:32,368 : MainThread : INFO : begin training
2019-08-19 23:51:32,497 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads
2019-08-19 23:51:32,498 : MainThread : INFO : training on 568 effective sentences with 5582 effective words took 0s with 4360 sentences/s
2019-08-19 23:51:32,504 : MainThread : INFO : scanning all indexed sentences and their word counts
2019-08-19 23:51:32,507 : MainThread : INFO : finished scanning 568 sentences with an average length of 9 and 5582 total words


CBOW-FT-2016 0.3626


2019-08-19 23:51:33,174 : MainThread : INFO : initializing sentence vectors for 568 sentences
2019-08-19 23:51:33,177 : MainThread : INFO : pre-computing SIF weights for 2000000 words
2019-08-19 23:51:35,262 : MainThread : INFO : begin training
2019-08-19 23:51:35,314 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads
2019-08-19 23:51:35,315 : MainThread : INFO : computing 1 principal components
2019-08-19 23:51:35,320 : MainThread : INFO : removing 1 principal components
2019-08-19 23:51:35,321 : MainThread : INFO : training on 568 effective sentences with 5582 effective words took 0s with 10774 sentences/s
2019-08-19 23:51:35,329 : MainThread : INFO : scanning all indexed sentences and their word counts
2019-08-19 23:51:35,331 : MainThread : INFO : finished scanning 568 sentences with an average length of 9 and 5582 total words


SIF-FT-2016 0.5369


2019-08-19 23:51:36,004 : MainThread : INFO : initializing sentence vectors for 568 sentences
2019-08-19 23:51:36,007 : MainThread : INFO : pre-computing uSIF weights for 2000000 words
2019-08-19 23:51:45,439 : MainThread : INFO : begin training
2019-08-19 23:51:45,497 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads
2019-08-19 23:51:45,498 : MainThread : INFO : computing 5 principal components
2019-08-19 23:51:45,505 : MainThread : INFO : training on 568 effective sentences with 5582 effective words took 0s with 9763 sentences/s
2019-08-19 23:51:45,515 : MainThread : INFO : scanning all indexed sentences and their word counts
2019-08-19 23:51:45,516 : MainThread : INFO : finished scanning 250 sentences with an average length of 9 and 2319 total words


uSIF-FT-2016 0.5164


2019-08-19 23:51:46,497 : MainThread : INFO : initializing sentence vectors for 250 sentences
2019-08-19 23:51:46,544 : MainThread : INFO : begin training
2019-08-19 23:51:46,579 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads
2019-08-19 23:51:46,579 : MainThread : INFO : training on 250 effective sentences with 1696 effective words took 0s with 7152 sentences/s
2019-08-19 23:51:46,584 : MainThread : INFO : scanning all indexed sentences and their word counts
2019-08-19 23:51:46,585 : MainThread : INFO : finished scanning 250 sentences with an average length of 9 and 2319 total words


CBOW-W2V-2017 0.679


2019-08-19 23:51:47,463 : MainThread : INFO : initializing sentence vectors for 250 sentences
2019-08-19 23:51:47,465 : MainThread : INFO : pre-computing SIF weights for 3000000 words
2019-08-19 23:51:50,182 : MainThread : INFO : begin training
2019-08-19 23:51:50,194 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads
2019-08-19 23:51:50,194 : MainThread : INFO : computing 1 principal components
2019-08-19 23:51:50,198 : MainThread : INFO : removing 1 principal components
2019-08-19 23:51:50,199 : MainThread : INFO : training on 250 effective sentences with 1696 effective words took 0s with 20528 sentences/s
2019-08-19 23:51:50,203 : MainThread : INFO : scanning all indexed sentences and their word counts
2019-08-19 23:51:50,204 : MainThread : INFO : finished scanning 250 sentences with an average length of 9 and 2319 total words


SIF-W2V-2017 0.7708


2019-08-19 23:51:51,092 : MainThread : INFO : initializing sentence vectors for 250 sentences
2019-08-19 23:51:51,094 : MainThread : INFO : pre-computing uSIF weights for 3000000 words
2019-08-19 23:52:04,577 : MainThread : INFO : begin training
2019-08-19 23:52:04,589 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads
2019-08-19 23:52:04,590 : MainThread : INFO : computing 5 principal components
2019-08-19 23:52:04,595 : MainThread : INFO : training on 250 effective sentences with 1696 effective words took 0s with 20288 sentences/s
2019-08-19 23:52:04,600 : MainThread : INFO : scanning all indexed sentences and their word counts
2019-08-19 23:52:04,601 : MainThread : INFO : finished scanning 250 sentences with an average length of 9 and 2319 total words


uSIF-W2V-2017 0.6978


2019-08-19 23:52:05,345 : MainThread : INFO : initializing sentence vectors for 250 sentences
2019-08-19 23:52:05,378 : MainThread : INFO : begin training
2019-08-19 23:52:05,418 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads
2019-08-19 23:52:05,419 : MainThread : INFO : training on 250 effective sentences with 2319 effective words took 0s with 6147 sentences/s
2019-08-19 23:52:05,422 : MainThread : INFO : scanning all indexed sentences and their word counts
2019-08-19 23:52:05,424 : MainThread : INFO : finished scanning 250 sentences with an average length of 9 and 2319 total words


CBOW-FT-2017 0.5091


2019-08-19 23:52:06,203 : MainThread : INFO : initializing sentence vectors for 250 sentences
2019-08-19 23:52:06,205 : MainThread : INFO : pre-computing SIF weights for 2000000 words
2019-08-19 23:52:08,512 : MainThread : INFO : begin training
2019-08-19 23:52:08,538 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads
2019-08-19 23:52:08,538 : MainThread : INFO : computing 1 principal components
2019-08-19 23:52:08,542 : MainThread : INFO : removing 1 principal components
2019-08-19 23:52:08,543 : MainThread : INFO : training on 250 effective sentences with 2319 effective words took 0s with 9417 sentences/s
2019-08-19 23:52:08,548 : MainThread : INFO : scanning all indexed sentences and their word counts
2019-08-19 23:52:08,549 : MainThread : INFO : finished scanning 250 sentences with an average length of 9 and 2319 total words


SIF-FT-2017 0.7487


2019-08-19 23:52:09,303 : MainThread : INFO : initializing sentence vectors for 250 sentences
2019-08-19 23:52:09,305 : MainThread : INFO : pre-computing uSIF weights for 2000000 words
2019-08-19 23:52:18,651 : MainThread : INFO : begin training
2019-08-19 23:52:18,678 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads
2019-08-19 23:52:18,678 : MainThread : INFO : computing 5 principal components
2019-08-19 23:52:18,684 : MainThread : INFO : training on 250 effective sentences with 2319 effective words took 0s with 8998 sentences/s


uSIF-FT-2017 0.6865
time: 3min 21s


In [6]:
# One row per "<model>-<embedding>-<year>" key of `results`, with the stored
# score in a single "Pearson" column.
results_df = pd.DataFrame.from_dict(
    results,
    orient="index",
    columns=["Pearson"],
)
results_df

Unnamed: 0,Pearson
CBOW-W2V-2012,74.67
SIF-W2V-2012,78.74
uSIF-W2V-2012,77.74
CBOW-FT-2012,50.99
SIF-FT-2012,79.1
uSIF-FT-2012,78.62
CBOW-W2V-2013,64.74
SIF-W2V-2013,67.41
uSIF-W2V-2013,69.72
CBOW-FT-2013,65.06


time: 34.4 ms
