# STS Benchmark Datasets

For more information, see the implementation of https://github.com/kawine/usif

In [19]:
%load_ext autotime
import numpy as np
import logging
import pandas as pd
import pathlib

from gensim.models import Word2Vec, FastText
from gensim.models.keyedvectors import KeyedVectors, FastTextKeyedVectors

from fse.models.average import Average
from fse.models.sif import SIF
from fse.models.inputs import IndexedSentence

from re import sub

from scipy.stats import pearsonr

logging.basicConfig(format='%(asctime)s : %(threadName)s : %(levelname)s : %(message)s',
                    level=logging.INFO
                   )

file_path = "data/stsbenchmark/sts-test.csv"

def normalize_text(sentence):
    return [sub("[^a-zA-Z]", "", w.lower()) for w in sentence.split()] 

def compute_similarities(task_length, model):
    sims = []
    for i, j in zip(range(task_length), range(task_length, 2*task_length)):
        sims.append(model.sv.similarity(i,j))
    return sims

sts_data = pd.read_csv(file_path, sep=",", error_bad_lines=True, names=["cpt","1","task","3","sim","sent_a","sent_b"])
sts_data.dropna(inplace=True)
sts_data.sent_a = sts_data.sent_a.apply(normalize_text)
sts_data.sent_b = sts_data.sent_b.apply(normalize_text)
sts_data.task = sts_data.task.str.replace("2012test", "2012")
sts_data.task = sts_data.task.str.replace("2012train", "None")
sts_data = sts_data[sts_data.task != "None"]

tasks = sorted(["2012", "2013", "2014", "2015", "2016", "2017"])

The autotime extension is already loaded. To reload it, use:
  %reload_ext autotime
time: 88.4 ms


In [20]:
w2v = KeyedVectors.load("/Users/oliverborchers/Desktop/GSDEV/Models/Static/google_news.model")
ft = FastTextKeyedVectors.load("/Users/oliverborchers/Desktop/GSDEV/Models/Static/ft_crawl_300d_2m.model")

2019-08-19 20:46:41,037 : MainThread : INFO : loading Word2VecKeyedVectors object from /Users/oliverborchers/Desktop/GSDEV/Models/Static/google_news.model
2019-08-19 20:46:47,952 : MainThread : INFO : loading vectors from /Users/oliverborchers/Desktop/GSDEV/Models/Static/google_news.model.vectors.npy with mmap=None
2019-08-19 20:46:52,990 : MainThread : INFO : setting ignored attribute vectors_norm to None
2019-08-19 20:46:53,000 : MainThread : INFO : loaded /Users/oliverborchers/Desktop/GSDEV/Models/Static/google_news.model
2019-08-19 20:46:53,002 : MainThread : INFO : loading FastTextKeyedVectors object from /Users/oliverborchers/Desktop/GSDEV/Models/Static/ft_crawl_300d_2m.model
2019-08-19 20:47:01,238 : MainThread : INFO : loading vectors from /Users/oliverborchers/Desktop/GSDEV/Models/Static/ft_crawl_300d_2m.model.vectors.npy with mmap=None
2019-08-19 20:47:04,638 : MainThread : INFO : loading vectors_vocab from /Users/oliverborchers/Desktop/GSDEV/Models/Static/ft_crawl_300d_2m.mo

time: 31.9 s


In [21]:
w2v_avg = Average(w2v, wv_mapfile_path = "data/vectors/w2v", lang_freq="en")
ft_avg = Average(ft, wv_mapfile_path = "data/vectors/ft", lang_freq="en")

  "C extension not loaded, training/inferring will be slow. "
2019-08-19 20:47:13,036 : MainThread : INFO : loading pre-existing wv from /Users/oliverborchers/Library/Mobile Documents/com~apple~CloudDocs/Diss/Medium/Fast_Sentence_Embeddings/fse/data/vectors/w2v
2019-08-19 20:47:14,178 : MainThread : INFO : no frequency mode: using wordfreq for estimation of frequency for language: en
2019-08-19 20:47:20,020 : MainThread : INFO : loading pre-existing wv from /Users/oliverborchers/Library/Mobile Documents/com~apple~CloudDocs/Diss/Medium/Fast_Sentence_Embeddings/fse/data/vectors/ft
2019-08-19 20:47:20,543 : MainThread : INFO : loading pre-existing vocab from data/vectors/ft
2019-08-19 20:47:21,062 : MainThread : INFO : loading pre-existing ngrams from data/vectors/ft
2019-08-19 20:47:21,568 : MainThread : INFO : no frequency mode: using wordfreq for estimation of frequency for language: en


time: 13.1 s


In [22]:
models, results = {}, {}

for year in tasks:
    models[f"CBOW-W2V-{year}"] = Average(w2v, wv_mapfile_path = "data/vectors/w2v")
    models[f"SIF-W2V-{year}"] = SIF(w2v, wv_mapfile_path = "data/vectors/w2v")
    
    models[f"CBOW-FT-{year}"] = Average(ft, wv_mapfile_path = "data/vectors/ft")
    models[f"SIF-FT-{year}"] = SIF(ft, wv_mapfile_path = "data/vectors/ft")
    
for k in models:
    results[k] = 0

2019-08-19 20:47:33,785 : MainThread : INFO : loading pre-existing wv from /Users/oliverborchers/Library/Mobile Documents/com~apple~CloudDocs/Diss/Medium/Fast_Sentence_Embeddings/fse/data/vectors/w2v
2019-08-19 20:47:33,807 : MainThread : INFO : loading pre-existing wv from /Users/oliverborchers/Library/Mobile Documents/com~apple~CloudDocs/Diss/Medium/Fast_Sentence_Embeddings/fse/data/vectors/w2v
2019-08-19 20:47:33,854 : MainThread : INFO : loading pre-existing wv from /Users/oliverborchers/Library/Mobile Documents/com~apple~CloudDocs/Diss/Medium/Fast_Sentence_Embeddings/fse/data/vectors/ft
2019-08-19 20:47:33,874 : MainThread : INFO : loading pre-existing vocab from data/vectors/ft
2019-08-19 20:47:33,893 : MainThread : INFO : loading pre-existing ngrams from data/vectors/ft
2019-08-19 20:47:33,927 : MainThread : INFO : loading pre-existing wv from /Users/oliverborchers/Library/Mobile Documents/com~apple~CloudDocs/Diss/Medium/Fast_Sentence_Embeddings/fse/data/vectors/ft
2019-08-19 20

time: 1.16 s


In [23]:
for year in tasks:
    df = sts_data[sts_data.task == year]
    sents_a = df.sent_a.values.tolist()
    sents_b = df.sent_b.values.tolist()
    assert len(sents_a) == len(sents_b)

    task_length = len(sents_a)
    sents_a.extend(sents_b)
    
    sents = [IndexedSentence(s, i) for i,s in enumerate(sents_a)]
    
    for k, m in models.items():
        if k.split("-")[-1] != year:
            continue
        
        m_type  = k.split("-")[0]
        emb_type = k.split("-")[1]
        
        m.train(sents)
        r = pearsonr(df.sim, compute_similarities(task_length, m))[0].round(4)
        
        results[f"{m_type}-{emb_type}-{year}"] = r * 100
        print(k, r)

2019-08-19 20:49:04,859 : MainThread : INFO : scanning all indexed sentences and their word counts
2019-08-19 20:49:04,863 : MainThread : INFO : finished scanning 504 sentences with an average length of 13 and 6848 total words
2019-08-19 20:49:06,113 : MainThread : INFO : estimated memory for 504 sentences with 300 dimensions and 3000000 vocabulary: 3445 MB (3 GB)
2019-08-19 20:49:06,114 : MainThread : INFO : initializing sentence vectors for 504 sentences
2019-08-19 20:49:06,174 : MainThread : INFO : begin training
2019-08-19 20:49:06,397 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads
2019-08-19 20:49:06,399 : MainThread : INFO : training on 504 effective sentences with 5060 effective words took 0s with 2236 sentences/s
2019-08-19 20:49:06,446 : MainThread : INFO : scanning all indexed sentences and their word counts
2019-08-19 20:49:06,448 : MainThread : INFO : finished scanning 504 sentences with an average length of 13 and 6848 total words


CBOW-W2V-2012 0.7467


2019-08-19 20:49:07,700 : MainThread : INFO : estimated memory for 504 sentences with 300 dimensions and 3000000 vocabulary: 3445 MB (3 GB)
2019-08-19 20:49:07,701 : MainThread : INFO : initializing sentence vectors for 504 sentences
2019-08-19 20:49:07,703 : MainThread : INFO : pre-computing SIF weights for 3000000 words
2019-08-19 20:49:11,334 : MainThread : INFO : begin training
2019-08-19 20:49:11,368 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads
2019-08-19 20:49:11,369 : MainThread : INFO : computing 1 principal components
2019-08-19 20:49:11,406 : MainThread : INFO : removing 1 principal components
2019-08-19 20:49:11,408 : MainThread : INFO : training on 504 effective sentences with 5060 effective words took 0s with 14249 sentences/s
2019-08-19 20:49:11,416 : MainThread : INFO : scanning all indexed sentences and their word counts
2019-08-19 20:49:11,419 : MainThread : INFO : finished scanning 504 sentences with an average length of 13 and 6848 

SIF-W2V-2012 0.7874


2019-08-19 20:49:12,187 : MainThread : INFO : estimated memory for 504 sentences with 300 dimensions and 2000000 vocabulary: 2297 MB (2 GB)
2019-08-19 20:49:12,187 : MainThread : INFO : initializing sentence vectors for 504 sentences
2019-08-19 20:49:12,228 : MainThread : INFO : begin training
2019-08-19 20:49:12,830 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads
2019-08-19 20:49:12,831 : MainThread : INFO : training on 504 effective sentences with 6848 effective words took 0s with 836 sentences/s
2019-08-19 20:49:12,838 : MainThread : INFO : scanning all indexed sentences and their word counts
2019-08-19 20:49:12,840 : MainThread : INFO : finished scanning 504 sentences with an average length of 13 and 6848 total words


CBOW-FT-2012 0.5099


2019-08-19 20:49:13,683 : MainThread : INFO : estimated memory for 504 sentences with 300 dimensions and 2000000 vocabulary: 2297 MB (2 GB)
2019-08-19 20:49:13,683 : MainThread : INFO : initializing sentence vectors for 504 sentences
2019-08-19 20:49:13,686 : MainThread : INFO : pre-computing SIF weights for 2000000 words
2019-08-19 20:49:16,126 : MainThread : INFO : begin training
2019-08-19 20:49:16,209 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads
2019-08-19 20:49:16,210 : MainThread : INFO : computing 1 principal components
2019-08-19 20:49:16,222 : MainThread : INFO : removing 1 principal components
2019-08-19 20:49:16,223 : MainThread : INFO : training on 504 effective sentences with 6848 effective words took 0s with 6039 sentences/s
2019-08-19 20:49:16,235 : MainThread : INFO : scanning all indexed sentences and their word counts
2019-08-19 20:49:16,236 : MainThread : INFO : finished scanning 144 sentences with an average length of 7 and 1101 to

SIF-FT-2012 0.791


2019-08-19 20:49:17,500 : MainThread : INFO : estimated memory for 144 sentences with 300 dimensions and 3000000 vocabulary: 3444 MB (3 GB)
2019-08-19 20:49:17,501 : MainThread : INFO : initializing sentence vectors for 144 sentences
2019-08-19 20:49:17,560 : MainThread : INFO : begin training
2019-08-19 20:49:17,622 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads
2019-08-19 20:49:17,623 : MainThread : INFO : training on 144 effective sentences with 944 effective words took 0s with 2291 sentences/s
2019-08-19 20:49:17,627 : MainThread : INFO : scanning all indexed sentences and their word counts
2019-08-19 20:49:17,628 : MainThread : INFO : finished scanning 144 sentences with an average length of 7 and 1101 total words


CBOW-W2V-2013 0.6474


2019-08-19 20:49:18,913 : MainThread : INFO : estimated memory for 144 sentences with 300 dimensions and 3000000 vocabulary: 3444 MB (3 GB)
2019-08-19 20:49:18,914 : MainThread : INFO : initializing sentence vectors for 144 sentences
2019-08-19 20:49:18,916 : MainThread : INFO : pre-computing SIF weights for 3000000 words
2019-08-19 20:49:22,562 : MainThread : INFO : begin training
2019-08-19 20:49:22,571 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads
2019-08-19 20:49:22,572 : MainThread : INFO : computing 1 principal components
2019-08-19 20:49:22,576 : MainThread : INFO : removing 1 principal components
2019-08-19 20:49:22,577 : MainThread : INFO : training on 144 effective sentences with 944 effective words took 0s with 15062 sentences/s
2019-08-19 20:49:22,581 : MainThread : INFO : scanning all indexed sentences and their word counts
2019-08-19 20:49:22,583 : MainThread : INFO : finished scanning 144 sentences with an average length of 7 and 1101 to

SIF-W2V-2013 0.6741


2019-08-19 20:49:23,425 : MainThread : INFO : estimated memory for 144 sentences with 300 dimensions and 2000000 vocabulary: 2296 MB (2 GB)
2019-08-19 20:49:23,426 : MainThread : INFO : initializing sentence vectors for 144 sentences
2019-08-19 20:49:23,463 : MainThread : INFO : begin training
2019-08-19 20:49:23,575 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads
2019-08-19 20:49:23,576 : MainThread : INFO : training on 144 effective sentences with 1101 effective words took 0s with 1267 sentences/s
2019-08-19 20:49:23,579 : MainThread : INFO : scanning all indexed sentences and their word counts
2019-08-19 20:49:23,580 : MainThread : INFO : finished scanning 144 sentences with an average length of 7 and 1101 total words


CBOW-FT-2013 0.6506


2019-08-19 20:49:24,400 : MainThread : INFO : estimated memory for 144 sentences with 300 dimensions and 2000000 vocabulary: 2296 MB (2 GB)
2019-08-19 20:49:24,401 : MainThread : INFO : initializing sentence vectors for 144 sentences
2019-08-19 20:49:24,402 : MainThread : INFO : pre-computing SIF weights for 2000000 words
2019-08-19 20:49:26,894 : MainThread : INFO : begin training
2019-08-19 20:49:26,917 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads
2019-08-19 20:49:26,918 : MainThread : INFO : computing 1 principal components
2019-08-19 20:49:26,925 : MainThread : INFO : removing 1 principal components
2019-08-19 20:49:26,927 : MainThread : INFO : training on 144 effective sentences with 1101 effective words took 0s with 6051 sentences/s
2019-08-19 20:49:26,934 : MainThread : INFO : scanning all indexed sentences and their word counts
2019-08-19 20:49:26,937 : MainThread : INFO : finished scanning 404 sentences with an average length of 9 and 3763 to

SIF-FT-2013 0.7039


2019-08-19 20:49:28,220 : MainThread : INFO : estimated memory for 404 sentences with 300 dimensions and 3000000 vocabulary: 3445 MB (3 GB)
2019-08-19 20:49:28,221 : MainThread : INFO : initializing sentence vectors for 404 sentences
2019-08-19 20:49:28,275 : MainThread : INFO : begin training
2019-08-19 20:49:28,356 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads
2019-08-19 20:49:28,357 : MainThread : INFO : training on 404 effective sentences with 2833 effective words took 0s with 4958 sentences/s
2019-08-19 20:49:28,365 : MainThread : INFO : scanning all indexed sentences and their word counts
2019-08-19 20:49:28,367 : MainThread : INFO : finished scanning 404 sentences with an average length of 9 and 3763 total words


CBOW-W2V-2014 0.7105


2019-08-19 20:49:29,659 : MainThread : INFO : estimated memory for 404 sentences with 300 dimensions and 3000000 vocabulary: 3445 MB (3 GB)
2019-08-19 20:49:29,660 : MainThread : INFO : initializing sentence vectors for 404 sentences
2019-08-19 20:49:29,662 : MainThread : INFO : pre-computing SIF weights for 3000000 words
2019-08-19 20:49:33,314 : MainThread : INFO : begin training
2019-08-19 20:49:33,335 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads
2019-08-19 20:49:33,336 : MainThread : INFO : computing 1 principal components
2019-08-19 20:49:33,344 : MainThread : INFO : removing 1 principal components
2019-08-19 20:49:33,345 : MainThread : INFO : training on 404 effective sentences with 2833 effective words took 0s with 18419 sentences/s
2019-08-19 20:49:33,352 : MainThread : INFO : scanning all indexed sentences and their word counts
2019-08-19 20:49:33,354 : MainThread : INFO : finished scanning 404 sentences with an average length of 9 and 3763 t

SIF-W2V-2014 0.755


2019-08-19 20:49:34,201 : MainThread : INFO : estimated memory for 404 sentences with 300 dimensions and 2000000 vocabulary: 2296 MB (2 GB)
2019-08-19 20:49:34,202 : MainThread : INFO : initializing sentence vectors for 404 sentences
2019-08-19 20:49:34,240 : MainThread : INFO : begin training
2019-08-19 20:49:34,370 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads
2019-08-19 20:49:34,371 : MainThread : INFO : training on 404 effective sentences with 3763 effective words took 0s with 3064 sentences/s
2019-08-19 20:49:34,382 : MainThread : INFO : scanning all indexed sentences and their word counts
2019-08-19 20:49:34,384 : MainThread : INFO : finished scanning 404 sentences with an average length of 9 and 3763 total words


CBOW-FT-2014 0.4882


2019-08-19 20:49:35,266 : MainThread : INFO : estimated memory for 404 sentences with 300 dimensions and 2000000 vocabulary: 2296 MB (2 GB)
2019-08-19 20:49:35,267 : MainThread : INFO : initializing sentence vectors for 404 sentences
2019-08-19 20:49:35,269 : MainThread : INFO : pre-computing SIF weights for 2000000 words
2019-08-19 20:49:37,707 : MainThread : INFO : begin training
2019-08-19 20:49:37,755 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads
2019-08-19 20:49:37,756 : MainThread : INFO : computing 1 principal components
2019-08-19 20:49:37,761 : MainThread : INFO : removing 1 principal components
2019-08-19 20:49:37,763 : MainThread : INFO : training on 404 effective sentences with 3763 effective words took 0s with 8224 sentences/s
2019-08-19 20:49:37,772 : MainThread : INFO : scanning all indexed sentences and their word counts
2019-08-19 20:49:37,774 : MainThread : INFO : finished scanning 392 sentences with an average length of 9 and 3640 to

SIF-FT-2014 0.7789


2019-08-19 20:49:39,041 : MainThread : INFO : estimated memory for 392 sentences with 300 dimensions and 3000000 vocabulary: 3445 MB (3 GB)
2019-08-19 20:49:39,042 : MainThread : INFO : initializing sentence vectors for 392 sentences
2019-08-19 20:49:39,098 : MainThread : INFO : begin training
2019-08-19 20:49:39,171 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads
2019-08-19 20:49:39,172 : MainThread : INFO : training on 392 effective sentences with 2849 effective words took 0s with 5254 sentences/s
2019-08-19 20:49:39,179 : MainThread : INFO : scanning all indexed sentences and their word counts
2019-08-19 20:49:39,181 : MainThread : INFO : finished scanning 392 sentences with an average length of 9 and 3640 total words


CBOW-W2V-2015 0.732


2019-08-19 20:49:40,420 : MainThread : INFO : estimated memory for 392 sentences with 300 dimensions and 3000000 vocabulary: 3445 MB (3 GB)
2019-08-19 20:49:40,421 : MainThread : INFO : initializing sentence vectors for 392 sentences
2019-08-19 20:49:40,423 : MainThread : INFO : pre-computing SIF weights for 3000000 words
2019-08-19 20:49:44,068 : MainThread : INFO : begin training
2019-08-19 20:49:44,088 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads
2019-08-19 20:49:44,089 : MainThread : INFO : computing 1 principal components
2019-08-19 20:49:44,094 : MainThread : INFO : removing 1 principal components
2019-08-19 20:49:44,095 : MainThread : INFO : training on 392 effective sentences with 2849 effective words took 0s with 19355 sentences/s
2019-08-19 20:49:44,103 : MainThread : INFO : scanning all indexed sentences and their word counts
2019-08-19 20:49:44,105 : MainThread : INFO : finished scanning 392 sentences with an average length of 9 and 3640 t

SIF-W2V-2015 0.783


2019-08-19 20:49:44,930 : MainThread : INFO : estimated memory for 392 sentences with 300 dimensions and 2000000 vocabulary: 2296 MB (2 GB)
2019-08-19 20:49:44,931 : MainThread : INFO : initializing sentence vectors for 392 sentences
2019-08-19 20:49:44,967 : MainThread : INFO : begin training
2019-08-19 20:49:45,115 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads
2019-08-19 20:49:45,116 : MainThread : INFO : training on 392 effective sentences with 3640 effective words took 0s with 2625 sentences/s
2019-08-19 20:49:45,121 : MainThread : INFO : scanning all indexed sentences and their word counts
2019-08-19 20:49:45,123 : MainThread : INFO : finished scanning 392 sentences with an average length of 9 and 3640 total words


CBOW-FT-2015 0.579


2019-08-19 20:49:45,974 : MainThread : INFO : estimated memory for 392 sentences with 300 dimensions and 2000000 vocabulary: 2296 MB (2 GB)
2019-08-19 20:49:45,975 : MainThread : INFO : initializing sentence vectors for 392 sentences
2019-08-19 20:49:45,977 : MainThread : INFO : pre-computing SIF weights for 2000000 words
2019-08-19 20:49:48,415 : MainThread : INFO : begin training
2019-08-19 20:49:48,461 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads
2019-08-19 20:49:48,461 : MainThread : INFO : computing 1 principal components
2019-08-19 20:49:48,467 : MainThread : INFO : removing 1 principal components
2019-08-19 20:49:48,468 : MainThread : INFO : training on 392 effective sentences with 3640 effective words took 0s with 8446 sentences/s
2019-08-19 20:49:48,477 : MainThread : INFO : scanning all indexed sentences and their word counts
2019-08-19 20:49:48,481 : MainThread : INFO : finished scanning 568 sentences with an average length of 9 and 5582 to

SIF-FT-2015 0.7961


2019-08-19 20:49:49,726 : MainThread : INFO : estimated memory for 568 sentences with 300 dimensions and 3000000 vocabulary: 3445 MB (3 GB)
2019-08-19 20:49:49,726 : MainThread : INFO : initializing sentence vectors for 568 sentences
2019-08-19 20:49:49,779 : MainThread : INFO : begin training
2019-08-19 20:49:49,838 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads
2019-08-19 20:49:49,839 : MainThread : INFO : training on 568 effective sentences with 4534 effective words took 0s with 9577 sentences/s
2019-08-19 20:49:49,846 : MainThread : INFO : scanning all indexed sentences and their word counts
2019-08-19 20:49:49,849 : MainThread : INFO : finished scanning 568 sentences with an average length of 9 and 5582 total words


CBOW-W2V-2016 0.4194


2019-08-19 20:49:51,122 : MainThread : INFO : estimated memory for 568 sentences with 300 dimensions and 3000000 vocabulary: 3445 MB (3 GB)
2019-08-19 20:49:51,123 : MainThread : INFO : initializing sentence vectors for 568 sentences
2019-08-19 20:49:51,125 : MainThread : INFO : pre-computing SIF weights for 3000000 words
2019-08-19 20:49:54,947 : MainThread : INFO : begin training
2019-08-19 20:49:54,974 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads
2019-08-19 20:49:54,975 : MainThread : INFO : computing 1 principal components
2019-08-19 20:49:54,980 : MainThread : INFO : removing 1 principal components
2019-08-19 20:49:54,982 : MainThread : INFO : training on 568 effective sentences with 4534 effective words took 0s with 20367 sentences/s
2019-08-19 20:49:54,992 : MainThread : INFO : scanning all indexed sentences and their word counts
2019-08-19 20:49:54,996 : MainThread : INFO : finished scanning 568 sentences with an average length of 9 and 5582 t

SIF-W2V-2016 0.5533


2019-08-19 20:49:55,841 : MainThread : INFO : estimated memory for 568 sentences with 300 dimensions and 2000000 vocabulary: 2297 MB (2 GB)
2019-08-19 20:49:55,842 : MainThread : INFO : initializing sentence vectors for 568 sentences
2019-08-19 20:49:55,882 : MainThread : INFO : begin training
2019-08-19 20:49:56,002 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads
2019-08-19 20:49:56,003 : MainThread : INFO : training on 568 effective sentences with 5582 effective words took 0s with 4692 sentences/s
2019-08-19 20:49:56,012 : MainThread : INFO : scanning all indexed sentences and their word counts
2019-08-19 20:49:56,014 : MainThread : INFO : finished scanning 568 sentences with an average length of 9 and 5582 total words


CBOW-FT-2016 0.3626


2019-08-19 20:49:56,847 : MainThread : INFO : estimated memory for 568 sentences with 300 dimensions and 2000000 vocabulary: 2297 MB (2 GB)
2019-08-19 20:49:56,848 : MainThread : INFO : initializing sentence vectors for 568 sentences
2019-08-19 20:49:56,851 : MainThread : INFO : pre-computing SIF weights for 2000000 words
2019-08-19 20:49:59,281 : MainThread : INFO : begin training
2019-08-19 20:49:59,348 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads
2019-08-19 20:49:59,349 : MainThread : INFO : computing 1 principal components
2019-08-19 20:49:59,355 : MainThread : INFO : removing 1 principal components
2019-08-19 20:49:59,356 : MainThread : INFO : training on 568 effective sentences with 5582 effective words took 0s with 8317 sentences/s
2019-08-19 20:49:59,370 : MainThread : INFO : scanning all indexed sentences and their word counts
2019-08-19 20:49:59,372 : MainThread : INFO : finished scanning 250 sentences with an average length of 9 and 2319 to

SIF-FT-2016 0.5369


2019-08-19 20:50:00,620 : MainThread : INFO : estimated memory for 250 sentences with 300 dimensions and 3000000 vocabulary: 3444 MB (3 GB)
2019-08-19 20:50:00,621 : MainThread : INFO : initializing sentence vectors for 250 sentences
2019-08-19 20:50:00,674 : MainThread : INFO : begin training
2019-08-19 20:50:00,707 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads
2019-08-19 20:50:00,708 : MainThread : INFO : training on 250 effective sentences with 1696 effective words took 0s with 7436 sentences/s
2019-08-19 20:50:00,715 : MainThread : INFO : scanning all indexed sentences and their word counts
2019-08-19 20:50:00,716 : MainThread : INFO : finished scanning 250 sentences with an average length of 9 and 2319 total words


CBOW-W2V-2017 0.679


2019-08-19 20:50:01,997 : MainThread : INFO : estimated memory for 250 sentences with 300 dimensions and 3000000 vocabulary: 3444 MB (3 GB)
2019-08-19 20:50:01,997 : MainThread : INFO : initializing sentence vectors for 250 sentences
2019-08-19 20:50:01,999 : MainThread : INFO : pre-computing SIF weights for 3000000 words
2019-08-19 20:50:05,621 : MainThread : INFO : begin training
2019-08-19 20:50:05,635 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads
2019-08-19 20:50:05,636 : MainThread : INFO : computing 1 principal components
2019-08-19 20:50:05,640 : MainThread : INFO : removing 1 principal components
2019-08-19 20:50:05,642 : MainThread : INFO : training on 250 effective sentences with 1696 effective words took 0s with 17379 sentences/s
2019-08-19 20:50:05,648 : MainThread : INFO : scanning all indexed sentences and their word counts
2019-08-19 20:50:05,650 : MainThread : INFO : finished scanning 250 sentences with an average length of 9 and 2319 t

SIF-W2V-2017 0.7708


2019-08-19 20:50:06,491 : MainThread : INFO : estimated memory for 250 sentences with 300 dimensions and 2000000 vocabulary: 2296 MB (2 GB)
2019-08-19 20:50:06,492 : MainThread : INFO : initializing sentence vectors for 250 sentences
2019-08-19 20:50:06,528 : MainThread : INFO : begin training
2019-08-19 20:50:06,571 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads
2019-08-19 20:50:06,572 : MainThread : INFO : training on 250 effective sentences with 2319 effective words took 0s with 5679 sentences/s
2019-08-19 20:50:06,576 : MainThread : INFO : scanning all indexed sentences and their word counts
2019-08-19 20:50:06,577 : MainThread : INFO : finished scanning 250 sentences with an average length of 9 and 2319 total words


CBOW-FT-2017 0.5091


2019-08-19 20:50:07,438 : MainThread : INFO : estimated memory for 250 sentences with 300 dimensions and 2000000 vocabulary: 2296 MB (2 GB)
2019-08-19 20:50:07,438 : MainThread : INFO : initializing sentence vectors for 250 sentences
2019-08-19 20:50:07,440 : MainThread : INFO : pre-computing SIF weights for 2000000 words
2019-08-19 20:50:09,896 : MainThread : INFO : begin training
2019-08-19 20:50:09,926 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads
2019-08-19 20:50:09,927 : MainThread : INFO : computing 1 principal components
2019-08-19 20:50:09,932 : MainThread : INFO : removing 1 principal components
2019-08-19 20:50:09,933 : MainThread : INFO : training on 250 effective sentences with 2319 effective words took 0s with 8113 sentences/s


SIF-FT-2017 0.7487
time: 1min 5s


In [24]:
pd.DataFrame.from_dict(results, orient="index", columns=["Pearson"])

Unnamed: 0,Pearson
CBOW-W2V-2012,74.67
SIF-W2V-2012,78.74
CBOW-FT-2012,50.99
SIF-FT-2012,79.1
CBOW-W2V-2013,64.74
SIF-W2V-2013,67.41
CBOW-FT-2013,65.06
SIF-FT-2013,70.39
CBOW-W2V-2014,71.05
SIF-W2V-2014,75.5


time: 22.3 ms
