# Compute Sentence Embeddings Fast

In [1]:
%load_ext autotime
import sys
sys.path.append("../")

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from gensim.models import KeyedVectors
from models.sif import SIF
import pathlib
import numpy as np
from re import sub

import pandas as pd
from wordfreq import get_frequency_dict

In [2]:
# Load the pre-trained word2vec model
model = KeyedVectors.load_word2vec_format('../data/GoogleNews-vectors-negative300.bin', binary=True)

# Word -> frequency mapping from wordfreq for English.
freq_dict = get_frequency_dict("en", wordlist='best')

# Overwrite the count of every vocabulary entry with a pseudo-count derived
# from the wordfreq frequency (scaled by 2**24). Words that are missing from
# the frequency dictionary get the minimum count of 1.
# NOTE(review): presumably these counts feed the SIF word weighting below --
# confirm against models/sif.py.
for w in model.vocab:
    freq = freq_dict.get(w)
    if freq is None:
        model.vocab[w].count = 1
    else:
        # int() truncates, so any frequency below 2**-24 (~6e-8) would become
        # 0 and rank a known word *below* an unknown one; clamp to 1.
        model.vocab[w].count = max(int(freq * 2**24), 1)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


time: 1min 35s


In [3]:
# Build a class-balanced, shuffled DataFrame of titles.
# Every regular file in `data_path` is read as one CSV and becomes one class;
# the class label is the file's position in the sorted file list.
data_path = "../data/reddit/"

df_output = pd.DataFrame()  # not used in this cell; kept for later cells

p = pathlib.Path(data_path)

if not p.exists():
    raise FileNotFoundError("Directory does not exist.")

# iterdir() yields entries in arbitrary order; sorting keeps the
# file -> label mapping identical across runs and machines.
file_list = sorted(f for f in p.iterdir() if f.is_file())

if not file_list:
    raise FileNotFoundError("Directory contains no data files.")

# Collect the per-file frames and concatenate once instead of growing a
# DataFrame inside the loop.
frames = []
for i, f in enumerate(file_list):
    df_tmp = pd.read_csv(f)
    df_tmp["label"] = i
    df_tmp = df_tmp[["title", "label"]]
    frames.append(df_tmp)

data = pd.concat(frames)

# Size of the smallest class: every class is down-sampled to this size.
labels, label_counts = np.unique(data.label.values, return_counts=True)
min_data = np.min(label_counts)

data_balanced = pd.concat(
    [data[data["label"] == i].sample(n=min_data, random_state=42) for i in labels]
)

# Shuffle the rows; seeded like the other random steps so that the notebook
# is reproducible end to end.
data_balanced = data_balanced.sample(frac=1, random_state=42)
y = np.array(data_balanced.label.values.tolist())

time: 75.9 ms


In [4]:
def normalize_text(sentence):
    """Split *sentence* on whitespace and reduce each token to lowercase ASCII letters.

    Every character outside ``a-z`` / ``A-Z`` is removed from each token.
    Tokens that end up empty (e.g. ``"123"`` or ``"?"``) are dropped instead of
    being returned as empty strings.

    Args:
        sentence: The raw text. Non-string values (such as the NaN pandas uses
            for a missing cell) are treated as empty text.

    Returns:
        A list of cleaned, non-empty, lowercase tokens.
    """
    if not isinstance(sentence, str):
        return []
    cleaned = (sub("[^a-zA-Z]", "", w.lower()) for w in sentence.split())
    return [w for w in cleaned if w]
# Tokenize every title and keep the token lists next to the raw text.
data_balanced["title_processed"] = data_balanced["title"].apply(normalize_text)

corpus = data_balanced["title_processed"].values.tolist()
labels = data_balanced.label.values.tolist()

# Keep only tokens that have a pre-trained vector. `model` is a KeyedVectors
# instance, so its vocabulary is read directly via `model.vocab` rather than
# through the deprecated `.wv` alias.
corpus = [[w for w in s if w in model.vocab] for s in corpus]

time: 124 ms


  if sys.path[0] == '':


In [5]:
# Baseline representations: raw term counts and their TF-IDF re-weighting.
# The vectorizer expects strings, so each token list is re-joined with spaces.
count_vect = CountVectorizer()
x_bow = count_vect.fit_transform(list(map(" ".join, corpus)))
x_tfidf = TfidfTransformer(use_idf=True).fit(x_bow).transform(x_bow)

time: 46 ms


In [None]:
# Sentence embeddings from the project's SIF model, trained on the filtered
# corpus with the pre-trained word vectors.
# NOTE(review): `alpha` and `components` are presumably the SIF smoothing
# constant and the number of principal components removed -- confirm in
# models/sif.py.
sif_model = SIF(
    alpha=1e-3,
    components=1,
)
x_sif = sif_model.train(model, corpus)

In [None]:
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# Evaluate each sentence representation with the same classifier and the same
# seeded train/test split, so the reports are directly comparable.
# The SIF embeddings computed above are stored in `x_sif`; the previously
# referenced `x_cbow` is never defined and raised a NameError.
x_data = [x_bow, x_tfidf, x_sif]

for d in x_data:
    x_train, x_test, y_train, y_test = train_test_split(d, labels, test_size=0.33, random_state=42)
    clf = LogisticRegression(random_state=0, solver='lbfgs', multi_class='multinomial').fit(x_train, y_train)
    y_pred = clf.predict(x_test)
    # One column per class plus the aggregate rows of the report.
    print(pd.DataFrame(metrics.classification_report(y_test, y_pred, output_dict=True)))

In [None]:
# Show the report for the most recent evaluation as a rich table: `y_test`
# and `y_pred` still hold the values from the last loop iteration.
pd.DataFrame(
    metrics.classification_report(y_test, y_pred, output_dict=True)
)