# Compute & Compare Sentence Embeddings

In [2]:
%load_ext autotime
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from gensim.models import KeyedVectors
from fse.models import Sentence2Vec
import pathlib
import numpy as np
from re import sub
import pandas as pd
from wordfreq import get_frequency_dict

import logging

# Show INFO-level (and above) log records, each prefixed with a timestamp and its level name
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

The autotime extension is already loaded. To reload it, use:
  %reload_ext autotime
time: 2.23 ms


Download a pre-trained embedding that is compatible with the Gensim models and load it, for example the original Word2Vec embedding (GoogleNews vectors, 300 dimensions).

In [3]:
# Read the pre-trained word2vec vectors from their binary file.
# Expect this to be slow: the recorded run took close to two minutes.
model = KeyedVectors.load_word2vec_format(
    'data/GoogleNews-vectors-negative300.bin',
    binary=True,
)

2019-06-07 17:09:09,434 : INFO : loading projection weights from data/GoogleNews-vectors-negative300.bin
2019-06-07 17:11:03,080 : INFO : loaded (3000000, 300) matrix from data/GoogleNews-vectors-negative300.bin


time: 1min 53s


In [4]:
data_path = "data/reddit/"

p = pathlib.Path(data_path)

if not p.exists():
    raise FileNotFoundError("Directory does not exist.")

# Sort the listing: iterdir() yields entries in arbitrary order, and the
# integer label given to each file below must be the same on every run.
file_list = sorted(f for f in p.iterdir() if f.is_file())

if not file_list:
    # Fail early with a clear message instead of an obscure error further down
    raise FileNotFoundError("No data files found in " + data_path)

# Read each file once, tag its rows with the file's position in file_list,
# and concatenate a single time (avoids re-copying a growing frame per file).
frames = []
for i, f in enumerate(file_list):
    df_tmp = pd.read_csv(f)
    df_tmp["label"] = i
    frames.append(df_tmp[["title", "label"]])
data = pd.concat(frames)

# Size of the smallest class; every class is down-sampled to this many rows
labels, label_counts = np.unique(data.label.values, return_counts=True)
min_data = np.min(label_counts)

data_balanced = pd.concat(
    [data[data["label"] == i].sample(n=min_data, random_state=42) for i in labels]
)

# Shuffle with a fixed seed so the row order, and therefore the later
# train/test split, is reproducible after a kernel restart.
data_balanced = data_balanced.sample(frac=1, random_state=42)
y = np.array(data_balanced.label.values.tolist())

time: 95.6 ms


In [5]:
def normalize_text(sentence):
    """Split a sentence into lower-cased tokens containing only ASCII letters.

    The sentence is split on whitespace; each piece is lower-cased and every
    character outside a-z is removed. Pieces that contain no letters at all
    (numbers, punctuation, dashes) are dropped rather than being returned as
    empty-string tokens.

    Parameters
    ----------
    sentence : str
        Raw text to tokenise.

    Returns
    -------
    list of str
        The non-empty, cleaned tokens in their original order.
    """
    cleaned = (sub("[^a-zA-Z]", "", w.lower()) for w in sentence.split())
    # An empty string is what remains of a piece that had no letters in it
    return [w for w in cleaned if w]

# Tokenise every title and keep the token lists next to the raw text
data_balanced["title_processed"] = data_balanced["title"].apply(normalize_text)

labels = data_balanced["label"].tolist()

# Keep only the tokens that are present in the embedding model's vocabulary
corpus = [
    [token for token in sentence if token in model.vocab]
    for sentence in data_balanced["title_processed"].tolist()
]

time: 58.6 ms


In [6]:
# The vectorizer works on strings, so glue each token list back together with spaces
documents = [" ".join(tokens) for tokens in corpus]

# Raw term counts per document
count_vect = CountVectorizer()
x_bow = count_vect.fit_transform(documents)

# Re-weight the counts with TF-IDF
tfidf = TfidfTransformer(use_idf=True)
x_tfidf = tfidf.fit_transform(x_bow)

time: 53.8 ms


In [7]:
# Baseline sentence model built on the loaded word vectors.
# NOTE(review): alpha=0 and components=0 presumably disable the SIF weighting and
# the principal-component removal, leaving a plain average — confirm against fse.
cbow_model = Sentence2Vec(
    model,
    alpha=0,
    components=0,
    no_frequency=True,
)

2019-06-07 17:18:59,198 : INFO : pre-computing SIF weights
2019-06-07 17:18:59,199 : INFO : no frequency mode: using wordfreq for estimation (lang=en)


time: 2.13 s


In [8]:
# Compute one sentence embedding per tokenised title in `corpus` with the baseline model
x_cbow = cbow_model.train(corpus)

2019-06-07 17:19:05,326 : INFO : estimated required memory for 2460 sentences and 300 dimensions: 2 MB (0 GB)
2019-06-07 17:19:05,393 : INFO : finished computing sentence embeddings of 2451 effective sentences with 24746 effective words


time: 67.9 ms


In [9]:
# SIF sentence model on the same word vectors: alpha=1e-3 with one principal
# component (the training log reports computing and removing that component).
sif_model = Sentence2Vec(
    model,
    alpha=1e-3,
    components=1,
    no_frequency=True,
)

2019-06-07 17:19:10,267 : INFO : pre-computing SIF weights
2019-06-07 17:19:10,269 : INFO : no frequency mode: using wordfreq for estimation (lang=en)


time: 3.97 s


In [10]:
# Compute one sentence embedding per tokenised title in `corpus` with the SIF model
x_sif = sif_model.train(corpus)

2019-06-07 17:19:15,852 : INFO : estimated required memory for 2460 sentences and 300 dimensions: 2 MB (0 GB)
2019-06-07 17:19:15,888 : INFO : finished computing sentence embeddings of 2451 effective sentences with 24746 effective words
2019-06-07 17:19:15,890 : INFO : computing 1 principal components
2019-06-07 17:19:15,925 : INFO : removing 1 principal components


time: 76.3 ms


In [11]:
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from datetime import datetime
import pathlib

# Feature matrices to compare; each key doubles as the sheet name in the report
mds = {
    "BOW": x_bow,
    "TFIDF": x_tfidf,
    "CBOW": x_cbow,
    "SIF": x_sif,
}

# Timestamp the report so repeated runs do not overwrite each other
now = datetime.now()
date_time = now.strftime("%m-%d-%Y_%H-%M-%S")

p = pathlib.Path("excel")
p.mkdir(exist_ok=True)

report_file = "excel/pcomp_" + date_time + ".xlsx"
with pd.ExcelWriter(report_file) as writer:
    for name, features in mds.items():
        # Same 50/50 split seed for every representation, so the scores are comparable
        x_train, x_test, y_train, y_test = train_test_split(
            features, labels, test_size=0.5, random_state=42
        )
        clf = LogisticRegression(random_state=0, solver='lbfgs', multi_class='multinomial')
        clf.fit(x_train, y_train)
        y_pred = clf.predict(x_test)

        # One sheet per representation, one row per class / summary entry
        report = metrics.classification_report(y_test, y_pred, output_dict=True)
        pd.DataFrame(report).T.to_excel(writer, sheet_name=name)

time: 946 ms


# STS Benchmark

Download the STS Benchmark Dataset from: http://ixa2.si.ehu.es/stswiki/index.php/STSbenchmark.
Some lines of the file contain more tab-separated fields than expected; these lines are skipped (with a warning) when the file is loaded.

In [12]:
file_path = "data/stsbenchmark/sts-dev.csv"

p = pathlib.Path(file_path)

if not p.exists():
    # file_path names a file, not a directory, so the message says so
    raise FileNotFoundError("STS benchmark file does not exist: " + file_path)

# Lines with more tab-separated fields than expected are skipped with a warning.
# pandas 1.3 introduced on_bad_lines for this and pandas 2.0 removed
# error_bad_lines; older releases reject the new keyword with a TypeError,
# in which case the legacy flag is used instead.
try:
    sts_raw = pd.read_csv(file_path, sep="\t", on_bad_lines="warn", header=None)
except TypeError:
    sts_raw = pd.read_csv(file_path, sep="\t", error_bad_lines=False, header=None)

# Column positions 5 and 6 hold the two sentences, position 4 is stored as `sim`.
# Copy the subset so the assignments below do not write into a view of sts_raw.
sts_data = sts_raw[[5, 6, 4]].copy()
sts_data.columns = ["A", "B", "sim"]
sts_data = sts_data.dropna()
sts_data["A"] = sts_data["A"].apply(normalize_text)
sts_data["B"] = sts_data["B"].apply(normalize_text)

sents_a = sts_data["A"].tolist()
sents_b = sts_data["B"].tolist()
assert len(sents_a) == len(sents_b)

b'Skipping line 1041: expected 7 fields, saw 8\nSkipping line 1065: expected 7 fields, saw 8\nSkipping line 1082: expected 7 fields, saw 8\nSkipping line 1136: expected 7 fields, saw 8\nSkipping line 1149: expected 7 fields, saw 8\nSkipping line 1449: expected 7 fields, saw 9\nSkipping line 1450: expected 7 fields, saw 9\nSkipping line 1451: expected 7 fields, saw 9\nSkipping line 1452: expected 7 fields, saw 9\nSkipping line 1453: expected 7 fields, saw 9\nSkipping line 1454: expected 7 fields, saw 9\nSkipping line 1455: expected 7 fields, saw 9\nSkipping line 1456: expected 7 fields, saw 9\nSkipping line 1457: expected 7 fields, saw 9\nSkipping line 1458: expected 7 fields, saw 9\nSkipping line 1459: expected 7 fields, saw 9\nSkipping line 1460: expected 7 fields, saw 9\nSkipping line 1461: expected 7 fields, saw 9\nSkipping line 1462: expected 7 fields, saw 9\nSkipping line 1463: expected 7 fields, saw 9\nSkipping line 1464: expected 7 fields, saw 9\nSkipping line 1465: expected 7 f

time: 444 ms


In [13]:
# Embed both sides of every sentence pair with the baseline model.
# NOTE(review): normalize()'s return value is never used, so it presumably
# rescales the passed matrix in place — confirm against fse.
cbow_vecs_a, cbow_vecs_b = [cbow_model.train(side) for side in (sents_a, sents_b)]
for vecs in (cbow_vecs_a, cbow_vecs_b):
    cbow_model.normalize(vecs)

# Same procedure with the SIF model
sif_vecs_a, sif_vecs_b = [sif_model.train(side) for side in (sents_a, sents_b)]
for vecs in (sif_vecs_a, sif_vecs_b):
    sif_model.normalize(vecs)

# Start the results table from the `sim` column of the STS data
results = pd.DataFrame({"STS": sts_data.sim})

def pearson_correlation(mat_a, mat_b):
    """Return the row-wise dot products of two equally shaped matrices.

    Despite its name, this function does not compute a Pearson correlation:
    element ``i`` of the result is ``mat_a[i].dot(mat_b[i])``.

    Parameters
    ----------
    mat_a, mat_b :
        Matrices with identical ``shape``; rows are paired by position.

    Returns
    -------
    list
        One dot product per row pair, in row order.

    Raises
    ------
    AssertionError
        If the two shapes differ.
    """
    assert mat_a.shape == mat_b.shape
    return [row_a.dot(row_b) for row_a, row_b in zip(mat_a, mat_b)]

# One column per model: the dot product of each sentence pair's two embeddings
for column, (vecs_a, vecs_b) in (
    ("CBOW", (cbow_vecs_a, cbow_vecs_b)),
    ("SIF", (sif_vecs_a, sif_vecs_b)),
):
    results[column] = pearson_correlation(vecs_a, vecs_b)

2019-06-07 17:19:44,721 : INFO : estimated required memory for 1441 sentences and 300 dimensions: 1 MB (0 GB)
2019-06-07 17:19:45,213 : INFO : finished computing sentence embeddings of 1441 effective sentences with 13876 effective words
2019-06-07 17:19:45,214 : INFO : estimated required memory for 1441 sentences and 300 dimensions: 1 MB (0 GB)
2019-06-07 17:19:45,390 : INFO : finished computing sentence embeddings of 1441 effective sentences with 13681 effective words
2019-06-07 17:19:45,391 : INFO : computing L2-norms of sentence embeddings
2019-06-07 17:19:45,407 : INFO : computing L2-norms of sentence embeddings
2019-06-07 17:19:45,421 : INFO : estimated required memory for 1441 sentences and 300 dimensions: 1 MB (0 GB)
2019-06-07 17:19:45,440 : INFO : finished computing sentence embeddings of 1441 effective sentences with 13876 effective words
2019-06-07 17:19:45,441 : INFO : computing 1 principal components
2019-06-07 17:19:45,453 : INFO : removing 1 principal components
2019-06-

time: 808 ms


In [14]:
# Replace the per-pair scores with their pairwise correlation matrix.
# Re-running this cell alone would correlate the matrix with itself.
results = results.corr()

# Timestamped file name so earlier exports are kept
now = datetime.now()
date_time = now.strftime("%m-%d-%Y_%H-%M-%S")
sts_report_file = "excel/STScomp_" + date_time + ".xlsx"
results.to_excel(sts_report_file)

time: 21.7 ms


In [15]:
# Display the correlation matrix between the STS scores and the two models' similarities
results

Unnamed: 0,STS,CBOW,SIF
STS,1.0,0.722721,0.775961
CBOW,0.722721,1.0,0.918188
SIF,0.775961,0.918188,1.0


time: 24.7 ms
