# STS Benchmark Datasets

For more information, see the implementation of https://github.com/kawine/usif

In [1]:
%load_ext autotime
import numpy as np
import logging
import pandas as pd
import pathlib

from gensim.models import Word2Vec, FastText
from gensim.models.keyedvectors import KeyedVectors, FastTextKeyedVectors

from fse.models.average import Average
from fse.models.sif import SIF
from fse.models.inputs import IndexedSentence

from re import sub

from scipy.stats import pearsonr

logging.basicConfig(format='%(asctime)s : %(threadName)s : %(levelname)s : %(message)s',
                    level=logging.INFO
                   )

file_path = "data/stsbenchmark/sts-test.csv"

def normalize_text(sentence):
    return [sub("[^a-zA-Z]", "", w.lower()) for w in sentence.split()] 

def compute_similarities(task_length, model):
    sims = []
    for i, j in zip(range(task_length), range(task_length, 2*task_length)):
        sims.append(model.sv.similarity(i,j))
    return sims

sts_data = pd.read_csv(file_path, sep=",", error_bad_lines=True, names=["cpt","1","task","3","sim","sent_a","sent_b"])
sts_data.dropna(inplace=True)
sts_data.sent_a = sts_data.sent_a.apply(normalize_text)
sts_data.sent_b = sts_data.sent_b.apply(normalize_text)
sts_data.task = sts_data.task.str.replace("2012test", "2012")
sts_data.task = sts_data.task.str.replace("2012train", "None")
sts_data = sts_data[sts_data.task != "None"]

w2v = KeyedVectors.load("/Users/oliverborchers/Desktop/GSDEV/Models/Static/google_news.model")
ft = FastTextKeyedVectors.load("/Users/oliverborchers/Desktop/GSDEV/Models/Static/ft_crawl_300d_2m.model")

tasks = sorted(["2012", "2013", "2014", "2015", "2016", "2017"])

2019-08-19 18:21:47,341 : MainThread : INFO : loading Word2VecKeyedVectors object from /Users/oliverborchers/Desktop/GSDEV/Models/Static/google_news.model
2019-08-19 18:21:53,096 : MainThread : INFO : loading vectors from /Users/oliverborchers/Desktop/GSDEV/Models/Static/google_news.model.vectors.npy with mmap=None
2019-08-19 18:21:57,559 : MainThread : INFO : setting ignored attribute vectors_norm to None
2019-08-19 18:21:57,560 : MainThread : INFO : loaded /Users/oliverborchers/Desktop/GSDEV/Models/Static/google_news.model
2019-08-19 18:21:57,561 : MainThread : INFO : loading FastTextKeyedVectors object from /Users/oliverborchers/Desktop/GSDEV/Models/Static/ft_crawl_300d_2m.model
2019-08-19 18:22:03,575 : MainThread : INFO : loading vectors from /Users/oliverborchers/Desktop/GSDEV/Models/Static/ft_crawl_300d_2m.model.vectors.npy with mmap=None
2019-08-19 18:22:06,281 : MainThread : INFO : loading vectors_vocab from /Users/oliverborchers/Desktop/GSDEV/Models/Static/ft_crawl_300d_2m.mo

In [2]:
w2v_avg = Average(w2v, wv_mapfile_path = "data/vectors/w2v", lang_freq="en")
ft_avg = Average(ft, wv_mapfile_path = "data/vectors/ft", lang_freq="en")

  "C extension not loaded, training/inferring will be slow. "
2019-08-19 18:22:13,539 : MainThread : INFO : loading pre-existing wv from data/vectors/w2v
2019-08-19 18:22:13,890 : MainThread : INFO : no frequency mode: using wordfreq for estimation of frequency for language: en
2019-08-19 18:22:18,615 : MainThread : INFO : loading pre-existing wv from data/vectors/ft
2019-08-19 18:22:19,035 : MainThread : INFO : loading pre-existing ngrams from data/vectors/ft
2019-08-19 18:22:19,427 : MainThread : INFO : writing vocab to data/vectors/ft
2019-08-19 18:22:25,126 : MainThread : INFO : no frequency mode: using wordfreq for estimation of frequency for language: en


time: 14 s


In [6]:
models, results = {}, {}

for year in tasks:
    models[f"CBOW-W2V-{year}"] = Average(w2v, wv_mapfile_path = "data/vectors/w2v")
    models[f"SIF-W2V-{year}"] = SIF(w2v, wv_mapfile_path = "data/vectors/w2v")
    
    models[f"CBOW-FT-{year}"] = Average(ft, wv_mapfile_path = "data/vectors/ft")
    models[f"SIF-FT-{year}"] = SIF(ft, wv_mapfile_path = "data/vectors/ft")
    
for k in models:
    results[k] = 0

  "C extension not loaded, training/inferring will be slow. "
2019-08-19 18:25:01,749 : MainThread : INFO : loading pre-existing wv from data/vectors/w2v
2019-08-19 18:25:01,767 : MainThread : INFO : loading pre-existing wv from data/vectors/w2v
2019-08-19 18:25:01,787 : MainThread : INFO : loading pre-existing wv from data/vectors/ft
2019-08-19 18:25:01,803 : MainThread : INFO : loading pre-existing ngrams from data/vectors/ft
2019-08-19 18:25:01,821 : MainThread : INFO : loading pre-existing vocab from data/vectors/ft
2019-08-19 18:25:01,840 : MainThread : INFO : loading pre-existing wv from data/vectors/ft
2019-08-19 18:25:01,854 : MainThread : INFO : loading pre-existing ngrams from data/vectors/ft
2019-08-19 18:25:01,871 : MainThread : INFO : loading pre-existing vocab from data/vectors/ft
2019-08-19 18:25:01,887 : MainThread : INFO : loading pre-existing wv from data/vectors/w2v
2019-08-19 18:25:01,904 : MainThread : INFO : loading pre-existing wv from data/vectors/w2v
2019-08-19

time: 840 ms


In [7]:
for year in tasks:
    df = sts_data[sts_data.task == year]
    sents_a = df.sent_a.values.tolist()
    sents_b = df.sent_b.values.tolist()
    assert len(sents_a) == len(sents_b)

    task_length = len(sents_a)
    sents_a.extend(sents_b)
    
    sents = [IndexedSentence(s, i) for i,s in enumerate(sents_a)]
    
    for k, m in models.items():
        if k.split("-")[-1] != year:
            continue
        
        m_type  = k.split("-")[0]
        emb_type = k.split("-")[1]
        
        m.train(sents)
        r = pearsonr(df.sim, compute_similarities(task_length, m))[0].round(4)
        
        results[f"{m_type}-{emb_type}-{year}"] = r * 100
        print(k, r)

2019-08-19 18:25:03,726 : MainThread : INFO : scanning all indexed sentences and their word counts
2019-08-19 18:25:03,730 : MainThread : INFO : finished scanning 504 sentences with an average length of 13 and 6848 total words
2019-08-19 18:25:04,920 : MainThread : INFO : estimated memory for 504 sentences with 300 dimensions and 3000000 vocabulary: 3445 MB (3 GB)
2019-08-19 18:25:04,921 : MainThread : INFO : initializing sentence vectors for 504 sentences
2019-08-19 18:25:04,985 : MainThread : INFO : begin training
2019-08-19 18:25:05,035 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads
2019-08-19 18:25:05,036 : MainThread : INFO : training on 504 effective sentences with 5060 effective words took 0s with 9805 sentences/s
2019-08-19 18:25:05,046 : MainThread : INFO : scanning all indexed sentences and their word counts
2019-08-19 18:25:05,052 : MainThread : INFO : finished scanning 504 sentences with an average length of 13 and 6848 total words


CBOW-W2V-2012 0.7467


2019-08-19 18:25:06,426 : MainThread : INFO : estimated memory for 504 sentences with 300 dimensions and 3000000 vocabulary: 3445 MB (3 GB)
2019-08-19 18:25:06,427 : MainThread : INFO : initializing sentence vectors for 504 sentences
2019-08-19 18:25:06,431 : MainThread : INFO : pre-computing SIF weights for 3000000 words
2019-08-19 18:25:11,856 : MainThread : INFO : begin training
2019-08-19 18:25:11,904 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads
2019-08-19 18:25:11,905 : MainThread : INFO : computing 1 principal components
2019-08-19 18:25:11,913 : MainThread : INFO : removing 1 principal components
2019-08-19 18:25:11,915 : MainThread : INFO : training on 504 effective sentences with 5060 effective words took 0s with 10203 sentences/s
2019-08-19 18:25:11,930 : MainThread : INFO : scanning all indexed sentences and their word counts
2019-08-19 18:25:11,935 : MainThread : INFO : finished scanning 504 sentences with an average length of 13 and 6848 

SIF-W2V-2012 0.7874


2019-08-19 18:25:12,896 : MainThread : INFO : estimated memory for 504 sentences with 300 dimensions and 2000000 vocabulary: 2297 MB (2 GB)
2019-08-19 18:25:12,897 : MainThread : INFO : initializing sentence vectors for 504 sentences
2019-08-19 18:25:12,941 : MainThread : INFO : begin training
2019-08-19 18:25:13,050 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads
2019-08-19 18:25:13,051 : MainThread : INFO : training on 504 effective sentences with 6848 effective words took 0s with 4607 sentences/s
2019-08-19 18:25:13,060 : MainThread : INFO : scanning all indexed sentences and their word counts
2019-08-19 18:25:13,063 : MainThread : INFO : finished scanning 504 sentences with an average length of 13 and 6848 total words


CBOW-FT-2012 0.5099


2019-08-19 18:25:13,973 : MainThread : INFO : estimated memory for 504 sentences with 300 dimensions and 2000000 vocabulary: 2297 MB (2 GB)
2019-08-19 18:25:13,973 : MainThread : INFO : initializing sentence vectors for 504 sentences
2019-08-19 18:25:13,976 : MainThread : INFO : pre-computing SIF weights for 2000000 words
2019-08-19 18:25:17,022 : MainThread : INFO : begin training
2019-08-19 18:25:17,123 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads
2019-08-19 18:25:17,124 : MainThread : INFO : computing 1 principal components
2019-08-19 18:25:17,130 : MainThread : INFO : removing 1 principal components
2019-08-19 18:25:17,132 : MainThread : INFO : training on 504 effective sentences with 6848 effective words took 0s with 4971 sentences/s
2019-08-19 18:25:17,144 : MainThread : INFO : scanning all indexed sentences and their word counts
2019-08-19 18:25:17,145 : MainThread : INFO : finished scanning 144 sentences with an average length of 7 and 1101 to

SIF-FT-2012 0.791


2019-08-19 18:25:18,343 : MainThread : INFO : estimated memory for 144 sentences with 300 dimensions and 3000000 vocabulary: 3444 MB (3 GB)
2019-08-19 18:25:18,344 : MainThread : INFO : initializing sentence vectors for 144 sentences
2019-08-19 18:25:18,404 : MainThread : INFO : begin training
2019-08-19 18:25:18,414 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads
2019-08-19 18:25:18,415 : MainThread : INFO : training on 144 effective sentences with 944 effective words took 0s with 13250 sentences/s
2019-08-19 18:25:18,418 : MainThread : INFO : scanning all indexed sentences and their word counts
2019-08-19 18:25:18,419 : MainThread : INFO : finished scanning 144 sentences with an average length of 7 and 1101 total words


CBOW-W2V-2013 0.6474


2019-08-19 18:25:19,567 : MainThread : INFO : estimated memory for 144 sentences with 300 dimensions and 3000000 vocabulary: 3444 MB (3 GB)
2019-08-19 18:25:19,568 : MainThread : INFO : initializing sentence vectors for 144 sentences
2019-08-19 18:25:19,569 : MainThread : INFO : pre-computing SIF weights for 3000000 words
2019-08-19 18:25:23,006 : MainThread : INFO : begin training
2019-08-19 18:25:23,018 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads
2019-08-19 18:25:23,019 : MainThread : INFO : computing 1 principal components
2019-08-19 18:25:23,022 : MainThread : INFO : removing 1 principal components
2019-08-19 18:25:23,024 : MainThread : INFO : training on 144 effective sentences with 944 effective words took 0s with 11007 sentences/s
2019-08-19 18:25:23,029 : MainThread : INFO : scanning all indexed sentences and their word counts
2019-08-19 18:25:23,031 : MainThread : INFO : finished scanning 144 sentences with an average length of 7 and 1101 to

SIF-W2V-2013 0.6741


2019-08-19 18:25:24,408 : MainThread : INFO : estimated memory for 144 sentences with 300 dimensions and 2000000 vocabulary: 2296 MB (2 GB)
2019-08-19 18:25:24,408 : MainThread : INFO : initializing sentence vectors for 144 sentences
2019-08-19 18:25:24,448 : MainThread : INFO : begin training
2019-08-19 18:25:24,474 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads
2019-08-19 18:25:24,476 : MainThread : INFO : training on 144 effective sentences with 1101 effective words took 0s with 5123 sentences/s
2019-08-19 18:25:24,499 : MainThread : INFO : scanning all indexed sentences and their word counts
2019-08-19 18:25:24,506 : MainThread : INFO : finished scanning 144 sentences with an average length of 7 and 1101 total words


CBOW-FT-2013 0.6506


2019-08-19 18:25:25,652 : MainThread : INFO : estimated memory for 144 sentences with 300 dimensions and 2000000 vocabulary: 2296 MB (2 GB)
2019-08-19 18:25:25,653 : MainThread : INFO : initializing sentence vectors for 144 sentences
2019-08-19 18:25:25,655 : MainThread : INFO : pre-computing SIF weights for 2000000 words
2019-08-19 18:25:28,152 : MainThread : INFO : begin training
2019-08-19 18:25:28,171 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads
2019-08-19 18:25:28,172 : MainThread : INFO : computing 1 principal components
2019-08-19 18:25:28,180 : MainThread : INFO : removing 1 principal components
2019-08-19 18:25:28,181 : MainThread : INFO : training on 144 effective sentences with 1101 effective words took 0s with 7181 sentences/s
2019-08-19 18:25:28,190 : MainThread : INFO : scanning all indexed sentences and their word counts
2019-08-19 18:25:28,193 : MainThread : INFO : finished scanning 404 sentences with an average length of 9 and 3763 to

SIF-FT-2013 0.7039


2019-08-19 18:25:29,384 : MainThread : INFO : estimated memory for 404 sentences with 300 dimensions and 3000000 vocabulary: 3445 MB (3 GB)
2019-08-19 18:25:29,385 : MainThread : INFO : initializing sentence vectors for 404 sentences
2019-08-19 18:25:29,438 : MainThread : INFO : begin training
2019-08-19 18:25:29,461 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads
2019-08-19 18:25:29,462 : MainThread : INFO : training on 404 effective sentences with 2833 effective words took 0s with 17112 sentences/s
2019-08-19 18:25:29,469 : MainThread : INFO : scanning all indexed sentences and their word counts
2019-08-19 18:25:29,471 : MainThread : INFO : finished scanning 404 sentences with an average length of 9 and 3763 total words


CBOW-W2V-2014 0.7105


2019-08-19 18:25:31,326 : MainThread : INFO : estimated memory for 404 sentences with 300 dimensions and 3000000 vocabulary: 3445 MB (3 GB)
2019-08-19 18:25:31,329 : MainThread : INFO : initializing sentence vectors for 404 sentences
2019-08-19 18:25:31,340 : MainThread : INFO : pre-computing SIF weights for 3000000 words
2019-08-19 18:25:39,172 : MainThread : INFO : begin training
2019-08-19 18:25:39,202 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads
2019-08-19 18:25:39,204 : MainThread : INFO : computing 1 principal components
2019-08-19 18:25:39,213 : MainThread : INFO : removing 1 principal components
2019-08-19 18:25:39,215 : MainThread : INFO : training on 404 effective sentences with 2833 effective words took 0s with 12860 sentences/s
2019-08-19 18:25:39,230 : MainThread : INFO : scanning all indexed sentences and their word counts
2019-08-19 18:25:39,235 : MainThread : INFO : finished scanning 404 sentences with an average length of 9 and 3763 t

SIF-W2V-2014 0.755


2019-08-19 18:25:40,264 : MainThread : INFO : estimated memory for 404 sentences with 300 dimensions and 2000000 vocabulary: 2296 MB (2 GB)
2019-08-19 18:25:40,265 : MainThread : INFO : initializing sentence vectors for 404 sentences
2019-08-19 18:25:40,310 : MainThread : INFO : begin training
2019-08-19 18:25:40,372 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads
2019-08-19 18:25:40,373 : MainThread : INFO : training on 404 effective sentences with 3763 effective words took 0s with 6386 sentences/s
2019-08-19 18:25:40,381 : MainThread : INFO : scanning all indexed sentences and their word counts
2019-08-19 18:25:40,383 : MainThread : INFO : finished scanning 404 sentences with an average length of 9 and 3763 total words


CBOW-FT-2014 0.4882


2019-08-19 18:25:41,337 : MainThread : INFO : estimated memory for 404 sentences with 300 dimensions and 2000000 vocabulary: 2296 MB (2 GB)
2019-08-19 18:25:41,338 : MainThread : INFO : initializing sentence vectors for 404 sentences
2019-08-19 18:25:41,341 : MainThread : INFO : pre-computing SIF weights for 2000000 words
2019-08-19 18:25:43,867 : MainThread : INFO : begin training
2019-08-19 18:25:43,920 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads
2019-08-19 18:25:43,920 : MainThread : INFO : computing 1 principal components
2019-08-19 18:25:43,925 : MainThread : INFO : removing 1 principal components
2019-08-19 18:25:43,927 : MainThread : INFO : training on 404 effective sentences with 3763 effective words took 0s with 7591 sentences/s
2019-08-19 18:25:43,938 : MainThread : INFO : scanning all indexed sentences and their word counts
2019-08-19 18:25:43,941 : MainThread : INFO : finished scanning 392 sentences with an average length of 9 and 3640 to

SIF-FT-2014 0.7789


2019-08-19 18:25:45,125 : MainThread : INFO : estimated memory for 392 sentences with 300 dimensions and 3000000 vocabulary: 3445 MB (3 GB)
2019-08-19 18:25:45,126 : MainThread : INFO : initializing sentence vectors for 392 sentences
2019-08-19 18:25:45,179 : MainThread : INFO : begin training
2019-08-19 18:25:45,202 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads
2019-08-19 18:25:45,203 : MainThread : INFO : training on 392 effective sentences with 2849 effective words took 0s with 16532 sentences/s
2019-08-19 18:25:45,209 : MainThread : INFO : scanning all indexed sentences and their word counts
2019-08-19 18:25:45,211 : MainThread : INFO : finished scanning 392 sentences with an average length of 9 and 3640 total words


CBOW-W2V-2015 0.732


2019-08-19 18:25:46,355 : MainThread : INFO : estimated memory for 392 sentences with 300 dimensions and 3000000 vocabulary: 3445 MB (3 GB)
2019-08-19 18:25:46,356 : MainThread : INFO : initializing sentence vectors for 392 sentences
2019-08-19 18:25:46,359 : MainThread : INFO : pre-computing SIF weights for 3000000 words
2019-08-19 18:25:49,771 : MainThread : INFO : begin training
2019-08-19 18:25:49,793 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads
2019-08-19 18:25:49,794 : MainThread : INFO : computing 1 principal components
2019-08-19 18:25:49,801 : MainThread : INFO : removing 1 principal components
2019-08-19 18:25:49,802 : MainThread : INFO : training on 392 effective sentences with 2849 effective words took 0s with 17383 sentences/s
2019-08-19 18:25:49,811 : MainThread : INFO : scanning all indexed sentences and their word counts
2019-08-19 18:25:49,813 : MainThread : INFO : finished scanning 392 sentences with an average length of 9 and 3640 t

SIF-W2V-2015 0.783


2019-08-19 18:25:50,840 : MainThread : INFO : estimated memory for 392 sentences with 300 dimensions and 2000000 vocabulary: 2296 MB (2 GB)
2019-08-19 18:25:50,841 : MainThread : INFO : initializing sentence vectors for 392 sentences
2019-08-19 18:25:50,877 : MainThread : INFO : begin training
2019-08-19 18:25:50,926 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads
2019-08-19 18:25:50,926 : MainThread : INFO : training on 392 effective sentences with 3640 effective words took 0s with 7947 sentences/s
2019-08-19 18:25:50,931 : MainThread : INFO : scanning all indexed sentences and their word counts
2019-08-19 18:25:50,933 : MainThread : INFO : finished scanning 392 sentences with an average length of 9 and 3640 total words


CBOW-FT-2015 0.579


2019-08-19 18:25:51,783 : MainThread : INFO : estimated memory for 392 sentences with 300 dimensions and 2000000 vocabulary: 2296 MB (2 GB)
2019-08-19 18:25:51,784 : MainThread : INFO : initializing sentence vectors for 392 sentences
2019-08-19 18:25:51,786 : MainThread : INFO : pre-computing SIF weights for 2000000 words
2019-08-19 18:25:54,007 : MainThread : INFO : begin training
2019-08-19 18:25:54,056 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads
2019-08-19 18:25:54,057 : MainThread : INFO : computing 1 principal components
2019-08-19 18:25:54,062 : MainThread : INFO : removing 1 principal components
2019-08-19 18:25:54,063 : MainThread : INFO : training on 392 effective sentences with 3640 effective words took 0s with 7836 sentences/s
2019-08-19 18:25:54,072 : MainThread : INFO : scanning all indexed sentences and their word counts
2019-08-19 18:25:54,074 : MainThread : INFO : finished scanning 568 sentences with an average length of 9 and 5582 to

SIF-FT-2015 0.7961


2019-08-19 18:25:55,071 : MainThread : INFO : estimated memory for 568 sentences with 300 dimensions and 3000000 vocabulary: 3445 MB (3 GB)
2019-08-19 18:25:55,071 : MainThread : INFO : initializing sentence vectors for 568 sentences
2019-08-19 18:25:55,121 : MainThread : INFO : begin training
2019-08-19 18:25:55,146 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads
2019-08-19 18:25:55,146 : MainThread : INFO : training on 568 effective sentences with 4534 effective words took 0s with 22511 sentences/s
2019-08-19 18:25:55,153 : MainThread : INFO : scanning all indexed sentences and their word counts
2019-08-19 18:25:55,155 : MainThread : INFO : finished scanning 568 sentences with an average length of 9 and 5582 total words


CBOW-W2V-2016 0.4194


2019-08-19 18:25:56,171 : MainThread : INFO : estimated memory for 568 sentences with 300 dimensions and 3000000 vocabulary: 3445 MB (3 GB)
2019-08-19 18:25:56,172 : MainThread : INFO : initializing sentence vectors for 568 sentences
2019-08-19 18:25:56,174 : MainThread : INFO : pre-computing SIF weights for 3000000 words
2019-08-19 18:25:59,181 : MainThread : INFO : begin training
2019-08-19 18:25:59,205 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads
2019-08-19 18:25:59,205 : MainThread : INFO : computing 1 principal components
2019-08-19 18:25:59,210 : MainThread : INFO : removing 1 principal components
2019-08-19 18:25:59,212 : MainThread : INFO : training on 568 effective sentences with 4534 effective words took 0s with 23186 sentences/s
2019-08-19 18:25:59,222 : MainThread : INFO : scanning all indexed sentences and their word counts
2019-08-19 18:25:59,224 : MainThread : INFO : finished scanning 568 sentences with an average length of 9 and 5582 t

SIF-W2V-2016 0.5533


2019-08-19 18:25:59,981 : MainThread : INFO : estimated memory for 568 sentences with 300 dimensions and 2000000 vocabulary: 2297 MB (2 GB)
2019-08-19 18:25:59,982 : MainThread : INFO : initializing sentence vectors for 568 sentences
2019-08-19 18:26:00,017 : MainThread : INFO : begin training
2019-08-19 18:26:00,075 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads
2019-08-19 18:26:00,076 : MainThread : INFO : training on 568 effective sentences with 5582 effective words took 0s with 9511 sentences/s
2019-08-19 18:26:00,082 : MainThread : INFO : scanning all indexed sentences and their word counts
2019-08-19 18:26:00,085 : MainThread : INFO : finished scanning 568 sentences with an average length of 9 and 5582 total words


CBOW-FT-2016 0.3626


2019-08-19 18:26:00,873 : MainThread : INFO : estimated memory for 568 sentences with 300 dimensions and 2000000 vocabulary: 2297 MB (2 GB)
2019-08-19 18:26:00,873 : MainThread : INFO : initializing sentence vectors for 568 sentences
2019-08-19 18:26:00,875 : MainThread : INFO : pre-computing SIF weights for 2000000 words
2019-08-19 18:26:03,054 : MainThread : INFO : begin training
2019-08-19 18:26:03,117 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads
2019-08-19 18:26:03,118 : MainThread : INFO : computing 1 principal components
2019-08-19 18:26:03,123 : MainThread : INFO : removing 1 principal components
2019-08-19 18:26:03,125 : MainThread : INFO : training on 568 effective sentences with 5582 effective words took 0s with 8903 sentences/s
2019-08-19 18:26:03,136 : MainThread : INFO : scanning all indexed sentences and their word counts
2019-08-19 18:26:03,138 : MainThread : INFO : finished scanning 250 sentences with an average length of 9 and 2319 to

SIF-FT-2016 0.5369


2019-08-19 18:26:04,139 : MainThread : INFO : estimated memory for 250 sentences with 300 dimensions and 3000000 vocabulary: 3444 MB (3 GB)
2019-08-19 18:26:04,140 : MainThread : INFO : initializing sentence vectors for 250 sentences
2019-08-19 18:26:04,188 : MainThread : INFO : begin training
2019-08-19 18:26:04,199 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads
2019-08-19 18:26:04,200 : MainThread : INFO : training on 250 effective sentences with 1696 effective words took 0s with 20826 sentences/s
2019-08-19 18:26:04,204 : MainThread : INFO : scanning all indexed sentences and their word counts
2019-08-19 18:26:04,205 : MainThread : INFO : finished scanning 250 sentences with an average length of 9 and 2319 total words


CBOW-W2V-2017 0.679


2019-08-19 18:26:05,222 : MainThread : INFO : estimated memory for 250 sentences with 300 dimensions and 3000000 vocabulary: 3444 MB (3 GB)
2019-08-19 18:26:05,223 : MainThread : INFO : initializing sentence vectors for 250 sentences
2019-08-19 18:26:05,225 : MainThread : INFO : pre-computing SIF weights for 3000000 words
2019-08-19 18:26:08,185 : MainThread : INFO : begin training
2019-08-19 18:26:08,196 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads
2019-08-19 18:26:08,196 : MainThread : INFO : computing 1 principal components
2019-08-19 18:26:08,200 : MainThread : INFO : removing 1 principal components
2019-08-19 18:26:08,201 : MainThread : INFO : training on 250 effective sentences with 1696 effective words took 0s with 21574 sentences/s
2019-08-19 18:26:08,205 : MainThread : INFO : scanning all indexed sentences and their word counts
2019-08-19 18:26:08,206 : MainThread : INFO : finished scanning 250 sentences with an average length of 9 and 2319 t

SIF-W2V-2017 0.7708


2019-08-19 18:26:09,012 : MainThread : INFO : estimated memory for 250 sentences with 300 dimensions and 2000000 vocabulary: 2296 MB (2 GB)
2019-08-19 18:26:09,013 : MainThread : INFO : initializing sentence vectors for 250 sentences
2019-08-19 18:26:09,046 : MainThread : INFO : begin training
2019-08-19 18:26:09,073 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads
2019-08-19 18:26:09,073 : MainThread : INFO : training on 250 effective sentences with 2319 effective words took 0s with 9339 sentences/s
2019-08-19 18:26:09,077 : MainThread : INFO : scanning all indexed sentences and their word counts
2019-08-19 18:26:09,078 : MainThread : INFO : finished scanning 250 sentences with an average length of 9 and 2319 total words


CBOW-FT-2017 0.5091


2019-08-19 18:26:09,870 : MainThread : INFO : estimated memory for 250 sentences with 300 dimensions and 2000000 vocabulary: 2296 MB (2 GB)
2019-08-19 18:26:09,870 : MainThread : INFO : initializing sentence vectors for 250 sentences
2019-08-19 18:26:09,872 : MainThread : INFO : pre-computing SIF weights for 2000000 words
2019-08-19 18:26:12,149 : MainThread : INFO : begin training
2019-08-19 18:26:12,175 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads
2019-08-19 18:26:12,176 : MainThread : INFO : computing 1 principal components
2019-08-19 18:26:12,179 : MainThread : INFO : removing 1 principal components
2019-08-19 18:26:12,180 : MainThread : INFO : training on 250 effective sentences with 2319 effective words took 0s with 9215 sentences/s


SIF-FT-2017 0.7487
time: 1min 8s


In [8]:
pd.DataFrame.from_dict(results, orient="index", columns=["Pearson"])

Unnamed: 0,Pearson
CBOW-W2V-2012,74.67
SIF-W2V-2012,78.74
CBOW-FT-2012,50.99
SIF-FT-2012,79.1
CBOW-W2V-2013,64.74
SIF-W2V-2013,67.41
CBOW-FT-2013,65.06
SIF-FT-2013,70.39
CBOW-W2V-2014,71.05
SIF-W2V-2014,75.5


time: 7.5 ms
