# Compute & Compare Sentence Embeddings

In [1]:
%load_ext autotime
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from gensim.models import KeyedVectors
from fse.models import Sentence2Vec
import pathlib
import numpy as np
from re import sub
import pandas as pd
from wordfreq import get_frequency_dict

import logging

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

Download a pre-trained embedding that is compatible with any of the Gensim models and load it. For example, the original Word2Vec embedding

In [2]:
# Load if not on disk
p = pathlib.Path("data/GoogleNews-vectors-negative300.bin")
if not p.exists():
    !wget https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz -P data/
    !gunzip data/GoogleNews-vectors-negative300.bin.gz

time: 1.35 ms


In [3]:
# Load the pre-trained word2vec model
model = KeyedVectors.load_word2vec_format('data/GoogleNews-vectors-negative300.bin', binary=True)  

2019-06-17 09:05:30,571 : INFO : loading projection weights from data/GoogleNews-vectors-negative300.bin
  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL
2019-06-17 09:06:23,187 : INFO : loaded (3000000, 300) matrix from data/GoogleNews-vectors-negative300.bin


time: 52.6 s


In [4]:
data_path = "data/reddit/"

p = pathlib.Path(data_path)

if not p.exists():
    raise FileNotFoundError("Directory does not exist.")

file_list=[]
for f in p.iterdir():
    if f.is_file():
        file_list.append(f)

data = pd.DataFrame()
        
for i, f in enumerate(file_list):
    df_tmp = pd.read_csv(f)
    df_tmp["label"] = i
    df_tmp = df_tmp[["title", "label"]]
    data = pd.concat([data, df_tmp])
    
min_data = np.min(np.unique(data.label.values, return_counts=True)[1])
labels = np.unique(data.label.values)

data_balanced = pd.DataFrame()

for i in labels:
    data_balanced = pd.concat([data_balanced, data[data["label"] == i].sample(n=min_data, random_state=42)])
    
data_balanced = data_balanced.sample(frac=1)
y = np.array(data_balanced.label.values.tolist())

time: 86 ms


In [5]:
def normalize_text(sentence):
    return [sub("[^a-zA-Z]", "", w.lower()) for w in sentence.split()] 

data_balanced["title_processed"] = (data_balanced['title'].apply(normalize_text))

corpus = data_balanced["title_processed"].values.tolist()
labels = data_balanced.label.values.tolist()

corpus = [[w for w in s if w in model.vocab] for s in corpus]

time: 54.6 ms


In [6]:
print(len(corpus))

2460
time: 582 µs


In [7]:
count_vect = CountVectorizer()
x_bow = count_vect.fit_transform([" ".join(s) for s in corpus])
x_tfidf = TfidfTransformer(use_idf=True).fit_transform(x_bow)

time: 45.6 ms


In [8]:
cbow_model = Sentence2Vec(model, alpha=0, components=0, no_frequency=True)

2019-06-17 09:08:50,392 : INFO : pre-computing SIF weights
2019-06-17 09:08:50,394 : INFO : no frequency mode: using wordfreq for estimation (lang=en)


time: 2.06 s


In [9]:
x_cbow = cbow_model.train(corpus)

2019-06-17 09:08:52,455 : INFO : estimated required memory for 2460 sentences and 300 dimensions: 2 MB (0 GB)
2019-06-17 09:08:52,520 : INFO : finished computing sentence embeddings of 2451 effective sentences with 24746 effective words


time: 66.4 ms


In [10]:
sif_model = Sentence2Vec(model, alpha=1e-3, components=1, no_frequency=True)

2019-06-17 09:08:52,528 : INFO : pre-computing SIF weights
2019-06-17 09:08:52,530 : INFO : no frequency mode: using wordfreq for estimation (lang=en)


time: 3.86 s


In [11]:
x_sif = sif_model.train(corpus)

2019-06-17 09:08:56,390 : INFO : estimated required memory for 2460 sentences and 300 dimensions: 2 MB (0 GB)
2019-06-17 09:08:56,422 : INFO : finished computing sentence embeddings of 2451 effective sentences with 24746 effective words
2019-06-17 09:08:56,422 : INFO : computing 1 principal components
2019-06-17 09:08:56,453 : INFO : removing 1 principal components


time: 67.8 ms


Comparision for timing purposes

In [12]:
from fse.exp.sif_variants import sif_embeddings_1
logging.disable(logging.INFO)

time: 8.13 ms


In [13]:
%%timeit
# This function does not remove the principal component
sif_embeddings_1(corpus, model)

983 ms ± 27.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
time: 7.95 s


In [14]:
%%timeit
cbow_model.train(corpus)

24.1 ms ± 94.4 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
time: 1.95 s


In [15]:
logging.disable(logging.NOTSET)

time: 560 µs


In [16]:
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from datetime import datetime
import pathlib

mds = dict()

mds["BOW"] = x_bow
mds["TFIDF"] = x_tfidf
mds["CBOW"] = x_cbow
mds["SIF"] = x_sif

now = datetime.now()
date_time = now.strftime("%m-%d-%Y_%H-%M-%S")

p = pathlib.Path("excel")
p.mkdir(exist_ok=True)

with pd.ExcelWriter("excel/pcomp_"+date_time+".xlsx") as writer:
    for k in mds.keys():
        x_train, x_test, y_train, y_test = train_test_split(mds[k], labels, test_size=0.5, random_state=42)
        clf = LogisticRegression(random_state=0, solver='lbfgs', multi_class='multinomial').fit(x_train, y_train)
        y_pred = clf.predict(x_test)

        df = pd.DataFrame(metrics.classification_report(y_test, y_pred, output_dict=True)).T
        df.to_excel(writer, sheet_name=k)

time: 484 ms


# STS Benchmark

Download the STS Benchmark Dataset from: http://ixa2.si.ehu.es/stswiki/index.php/STSbenchmark.
Some of the lines may be skipped due to errors.

In [17]:
file_path = "data/stsbenchmark/sts-dev.csv"

p = pathlib.Path(file_path)

if not p.exists():
    raise FileNotFoundError("Directory does not exist.")

sts_data = pd.read_csv(file_path, sep="\t", error_bad_lines=False, header=None)
sts_data = sts_data[[5,6,4]]
sts_data.columns = ["A", "B", "sim"]
sts_data.dropna(inplace=True)
sts_data.A = (sts_data.A.apply(normalize_text))
sts_data.B = (sts_data.B.apply(normalize_text))

sents_a = sts_data.A.values.tolist()
sents_b = sts_data.B.values.tolist()
assert len(sents_a) == len(sents_b)

b'Skipping line 1041: expected 7 fields, saw 8\nSkipping line 1065: expected 7 fields, saw 8\nSkipping line 1082: expected 7 fields, saw 8\nSkipping line 1136: expected 7 fields, saw 8\nSkipping line 1149: expected 7 fields, saw 8\nSkipping line 1449: expected 7 fields, saw 9\nSkipping line 1450: expected 7 fields, saw 9\nSkipping line 1451: expected 7 fields, saw 9\nSkipping line 1452: expected 7 fields, saw 9\nSkipping line 1453: expected 7 fields, saw 9\nSkipping line 1454: expected 7 fields, saw 9\nSkipping line 1455: expected 7 fields, saw 9\nSkipping line 1456: expected 7 fields, saw 9\nSkipping line 1457: expected 7 fields, saw 9\nSkipping line 1458: expected 7 fields, saw 9\nSkipping line 1459: expected 7 fields, saw 9\nSkipping line 1460: expected 7 fields, saw 9\nSkipping line 1461: expected 7 fields, saw 9\nSkipping line 1462: expected 7 fields, saw 9\nSkipping line 1463: expected 7 fields, saw 9\nSkipping line 1464: expected 7 fields, saw 9\nSkipping line 1465: expected 7 f

time: 364 ms


In [18]:
cbow_vecs_a = cbow_model.train(sents_a)
cbow_vecs_b = cbow_model.train(sents_b)
cbow_model.normalize(cbow_vecs_a)
cbow_model.normalize(cbow_vecs_b)

sif_vecs_a = sif_model.train(sents_a)
sif_vecs_b = sif_model.train(sents_b)
sif_model.normalize(sif_vecs_a)
sif_model.normalize(sif_vecs_b)

results = pd.DataFrame()
results["STS"] = sts_data.sim

def pearson_correlation(mat_a, mat_b):
    assert mat_a.shape == mat_b.shape
    results = []
    for i in range(len(mat_a)):
        results.append(mat_a[i].dot(mat_b[i]))
    return results

results["CBOW"] = pearson_correlation(cbow_vecs_a, cbow_vecs_b)
results["SIF"] = pearson_correlation(sif_vecs_a, sif_vecs_b)

2019-06-17 09:09:08,994 : INFO : estimated required memory for 1441 sentences and 300 dimensions: 1 MB (0 GB)
2019-06-17 09:09:09,022 : INFO : finished computing sentence embeddings of 1441 effective sentences with 13876 effective words
2019-06-17 09:09:09,023 : INFO : estimated required memory for 1441 sentences and 300 dimensions: 1 MB (0 GB)
2019-06-17 09:09:09,045 : INFO : finished computing sentence embeddings of 1441 effective sentences with 13681 effective words
2019-06-17 09:09:09,046 : INFO : computing L2-norms of sentence embeddings
2019-06-17 09:09:09,062 : INFO : computing L2-norms of sentence embeddings
2019-06-17 09:09:09,077 : INFO : estimated required memory for 1441 sentences and 300 dimensions: 1 MB (0 GB)
2019-06-17 09:09:09,092 : INFO : finished computing sentence embeddings of 1441 effective sentences with 13876 effective words
2019-06-17 09:09:09,093 : INFO : computing 1 principal components
2019-06-17 09:09:09,101 : INFO : removing 1 principal components
2019-06-

time: 187 ms


In [19]:
results = results.corr()
now = datetime.now()
date_time = now.strftime("%m-%d-%Y_%H-%M-%S")
results.to_excel("excel/STScomp_"+date_time+".xlsx")

time: 12.2 ms


In [20]:
results

Unnamed: 0,STS,CBOW,SIF
STS,1.0,0.722721,0.775961
CBOW,0.722721,1.0,0.918188
SIF,0.775961,0.918188,1.0


time: 13.1 ms


# RBF Network

In [21]:
def load_sts(file_path):
    p = pathlib.Path(file_path)

    if not p.exists():
        raise FileNotFoundError("Directory does not exist.")
        
    sts_data = pd.read_csv(file_path, sep="\t", error_bad_lines=False, header=None)
    sts_data = sts_data[[5,6,4]]
    sts_data.columns = ["A", "B", "sim"]
    sts_data.dropna(inplace=True)
    sts_data.A = (sts_data.A.apply(normalize_text))
    sts_data.B = (sts_data.B.apply(normalize_text))
    
    sents_a = sts_data.A.values.tolist()
    sents_b = sts_data.B.values.tolist()
    assert len(sents_a) == len(sents_b)
    
    sims = sts_data.sim.values.tolist()
    sims /= np.max(sims)
    return sents_a, sents_b, sims

time: 2.42 ms


In [60]:
train_a, train_b, y_train = load_sts("data/stsbenchmark/sts-train.csv")
val_a, val_b, y_val = load_sts("data/stsbenchmark/sts-dev.csv")
test_a, test_b, y_test = load_sts("data/stsbenchmark/sts-test.csv")

b'Skipping line 2508: expected 7 fields, saw 8\nSkipping line 2649: expected 7 fields, saw 8\nSkipping line 2726: expected 7 fields, saw 8\nSkipping line 3070: expected 7 fields, saw 8\nSkipping line 3392: expected 7 fields, saw 8\nSkipping line 5515: expected 7 fields, saw 9\nSkipping line 5516: expected 7 fields, saw 9\nSkipping line 5517: expected 7 fields, saw 9\nSkipping line 5518: expected 7 fields, saw 9\nSkipping line 5519: expected 7 fields, saw 9\nSkipping line 5520: expected 7 fields, saw 9\nSkipping line 5521: expected 7 fields, saw 9\nSkipping line 5522: expected 7 fields, saw 9\nSkipping line 5523: expected 7 fields, saw 9\nSkipping line 5524: expected 7 fields, saw 9\nSkipping line 5525: expected 7 fields, saw 9\nSkipping line 5526: expected 7 fields, saw 9\nSkipping line 5527: expected 7 fields, saw 9\nSkipping line 5528: expected 7 fields, saw 9\nSkipping line 5529: expected 7 fields, saw 9\nSkipping line 5530: expected 7 fields, saw 9\nSkipping line 5531: expected 7 f

ParserError: Error tokenizing data. C error: EOF inside string starting at row 1118

time: 1.32 s


In [115]:
cbow_model = Sentence2Vec(model, alpha=1e-3, components=4, no_frequency=True)

I0617 09:18:51.062963 4781397440 sentence2vec.py:239] pre-computing SIF weights
I0617 09:18:51.064175 4781397440 sentence2vec.py:242] no frequency mode: using wordfreq for estimation (lang=en)


time: 4.03 s


In [116]:
def sqd(a,b):
    # Elementwise squared difference
    return np.power(a-b,2)

def compute_diff_features(x1, x2, model):
    x1_v = model.train(x1)
    x2_v = model.train(x2)
    
    model.normalize(x1_v)
    model.normalize(x2_v)
    
    output = np.zeros_like(x1_v)
    
    for i in range(len(x1_v)):
        output[i] = sqd(x1_v[i],x2_v[i])
    
    return output

time: 1.5 ms


In [117]:
x_train = compute_diff_features(train_a, train_b, cbow_model)
x_val = compute_diff_features(val_a, val_b, cbow_model)
#x_test = compute_diff_features(test_a, test_b, cbow_model)

I0617 09:18:55.110586 4781397440 sentence2vec.py:300] estimated required memory for 5506 sentences and 300 dimensions: 6 MB (0 GB)
I0617 09:18:55.168913 4781397440 sentence2vec.py:365] finished computing sentence embeddings of 5506 effective sentences with 45980 effective words
I0617 09:18:55.169651 4781397440 sentence2vec.py:181] computing 4 principal components
I0617 09:18:55.199650 4781397440 sentence2vec.py:208] removing 4 principal components
I0617 09:18:55.206314 4781397440 sentence2vec.py:300] estimated required memory for 5506 sentences and 300 dimensions: 6 MB (0 GB)
I0617 09:18:55.269970 4781397440 sentence2vec.py:365] finished computing sentence embeddings of 5506 effective sentences with 45856 effective words
I0617 09:18:55.270796 4781397440 sentence2vec.py:181] computing 4 principal components
I0617 09:18:55.297655 4781397440 sentence2vec.py:208] removing 4 principal components
I0617 09:18:55.304008 4781397440 sentence2vec.py:326] computing L2-norms of sentence embeddings


time: 542 ms


In [118]:
df_test= pd.DataFrame()

df_test["t"] = y_train
df_test["sqd"] = -x_train.sum(axis=-1)

df_test.corr()

Unnamed: 0,t,sqd
t,1.0,0.738595
sqd,0.738595,1.0


time: 10.7 ms


In [119]:
from sklearn.linear_model import LinearRegression
from scipy.stats import pearsonr as prs
reg = LinearRegression().fit(x_train, y_train)
y_lr_pred = reg.predict(x_val)
prs(y_lr_pred, y_val)

(0.7719858648235562, 1.8016361057569245e-285)

time: 41.4 ms


In [120]:
epochs= 1000
neurons = 100
batch_size = 128

time: 488 µs


In [121]:
import tensorflow as tf
from tensorflow.keras import backend as K

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Activation, BatchNormalization
from tensorflow.keras.layers import Lambda, Input, Reshape, Multiply
from tensorflow.keras.models import Model
from tensorflow.keras import optimizers
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.constraints import non_neg
from tensorflow.keras import regularizers


def get_model():
    ann_pred = Sequential()
    ann_pred.add(Dense(units=neurons, input_dim = x_train.shape[1]))
    ann_pred.add(BatchNormalization())
    ann_pred.add(Activation('relu'))
    ann_pred.add(Dropout(0.5))

    ann_pred.add(Dense(units=neurons))
    ann_pred.add(BatchNormalization())
    ann_pred.add(Activation('relu'))
    ann_pred.add(Dropout(0.5))

    ann_pred.add(Dense(units=1, activation='sigmoid'))
    
    opt = optimizers.Adagrad(lr=0.001)
    
    ann_pred.compile(loss='binary_crossentropy',optimizer=opt, metrics=['binary_crossentropy'])
    return ann_pred

ann = get_model()

mon = EarlyStopping(monitor='val_loss', min_delta=0, patience=10, verbose=0, mode='auto') #val_acc

time: 260 ms


In [122]:
%%time
ann.fit(x=x_train, y=y_train,epochs=epochs, validation_data=(x_val, y_val), callbacks=[mon], batch_size=batch_size, verbose=0)

CPU times: user 7.82 s, sys: 786 ms, total: 8.61 s
Wall time: 6.05 s


<tensorflow.python.keras.callbacks.History at 0x1b68683b70>

time: 6.05 s


In [123]:
y_ann_pred = ann.predict(x_val, batch_size=128).reshape(-1,)

time: 194 ms


In [124]:
prs(y_ann_pred, y_val)

(0.7582680257227848, 1.231366100178887e-269)

time: 2.28 ms


In [125]:
epsilon = 1e-8

def elu1p(x):
    return (K.elu(x) + 1)+epsilon

inputs = Input(shape=(x_train.shape[1],))

weighted_input=False
sigma_regularizer= 1e-2 #Eta
weight_sparsity = 1e-2 #Gamma

numerator = Dense(1, use_bias=False, weights=[np.ones(x_train.shape[1]).reshape(-1,1)],
                  trainable=weighted_input, 
                  #kernel_constraint=non_neg(), 
                  #kernel_regularizer=regularizers.l1(weight_sparsity), 
                  name="numerator")(inputs)

h1 = Dense(neurons)(inputs) 
h1act = Activation("relu")(h1)

sigma = Dense(1, activation=elu1p, name="sigma")(h1act)

denominator = Lambda(lambda x: (2*(x ** 2))**-1)(sigma)
div = Multiply()([numerator, denominator])

out = Lambda(lambda x: K.exp(x*-1))(div)

rbfo_pred = Model(inputs=inputs, outputs=out)

opt = optimizers.Adagrad(lr=0.001)

#opt = optimizers.RMSprop(lr=0.001, rho=0.9, decay=0.0, clipvalue=0.5)

rbfo_pred.compile(loss=['binary_crossentropy'],optimizer=opt,metrics=['binary_crossentropy'])

#rbfo_sigma = Model(inputs=rbfo_pred.input, outputs=rbfo_pred.get_layer("sigma").output)

#mon = EarlyStopping(monitor='val_loss', min_delta=0, patience=10, verbose=0, mode='auto') #val_acc

#if weighted_input:
#    rbfo_name = "rbfo_pred_"+str(x_train.shape[1])+"_"+str(sigma_regularizer)+"sig_"+str(weight_sparsity)+"_wg.h5"
#else:
#    rbfo_name = "rbfo_pred_"+str(x_train.shape[1])+"_"+str(sigma_regularizer)+"sig.h5"

time: 105 ms


In [126]:
%%time
rbfo_pred.fit(x=x_train, y=y_train,epochs=epochs, validation_data=(x_val, y_val), callbacks=[mon], batch_size=batch_size, verbose=0)

CPU times: user 4.56 s, sys: 456 ms, total: 5.01 s
Wall time: 3.26 s


<tensorflow.python.keras.callbacks.History at 0x1b627c9ef0>

time: 3.26 s


In [127]:
y_rbfo_pred = rbfo_pred.predict(x_val, batch_size=128).reshape(-1,)
prs(y_rbfo_pred, y_val)

(0.7628704359711005, 7.905953869296321e-275)

time: 56.1 ms
