In [1]:
import numpy as np

In [2]:
# Load the two pre-cleaned review arrays from disk.
X_train_transformed = np.load('../data/cleanedTrainData.npy')
X_extra_train_transformed = np.load('../data/cleanedExtraTrainData.npy')
# Stack them into one corpus: the rows of X_train_transformed come first,
# followed by the rows of X_extra_train_transformed.
X_train = np.concatenate([X_train_transformed, X_extra_train_transformed])

# Total number of documents in the combined corpus.
len(X_train)

74998

In [3]:
import re

def clean(corpus):
    """Remove backslashes, periods and double quotes and drop short tokens.

    Every document has the characters ``\\``, ``.`` and ``"`` deleted, is then
    split on single spaces, and only tokens longer than two characters are
    kept and re-joined with single spaces.

    Returns a new list holding one cleaned string per input document.
    """
    # One translation table deletes all three unwanted characters in a single pass.
    strip_table = str.maketrans('', '', '\\."')
    return [
        ' '.join(
            token
            for token in document.translate(strip_table).split(' ')
            if len(token) > 2
        )
        for document in corpus
    ]

In [4]:
# Inspect one raw document before the extra cleanup step.
X_train[1]

'\\the classic war worlds\\ timothy hines entertaining film obviously go great effort length faithfully recreate h. g. wells classic book mr. hines succeed watch film appreciate fact standard predictable hollywood fare come year e.g. spielberg version tom cruise slight resemblance book obviously look different thing movie envision amateur \\"critics\\ look criticize rate movie important basis like entertain people agree \\"critics\\ enjoy effort mr. hines faithful h.g. wells classic novel find entertaining easy overlook \\"critics\\ perceive shortcoming'

In [5]:
# Same document after clean(): backslashes, periods, quotes and tokens of <= 2 characters are gone.
clean([X_train[1]])

['the classic war worlds timothy hines entertaining film obviously great effort length faithfully recreate wells classic book hines succeed watch film appreciate fact standard predictable hollywood fare come year spielberg version tom cruise slight resemblance book obviously look different thing movie envision amateur critics look criticize rate movie important basis like entertain people agree critics enjoy effort hines faithful wells classic novel find entertaining easy overlook critics perceive shortcoming']

In [6]:
# Apply the cleanup to the whole corpus.
# Note: clean() returns a Python list, so X_train is a list (not an ndarray) from here on.
X_train = clean(X_train)
# Cleaning never drops documents, so the count is unchanged.
len(X_train)

74998

In [7]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import pyLDAvis
import pyLDAvis.sklearn

# Bag-of-words counts: ignore terms that appear in fewer than 3 documents and
# cap the vocabulary at 5000 features.
vectorizer = CountVectorizer(min_df=3, max_features=5000)
data_vectorized = vectorizer.fit_transform(X_train)

# Online LDA with 10 topics. The fit is stochastic, so random_state is pinned
# to make the topics reproducible across Restart & Run All.
lda_model = LatentDirichletAllocation(
    n_components=10,
    learning_method='online',
    random_state=42,
    n_jobs=-1,
)
lda_output = lda_model.fit_transform(data_vectorized)

# Interactive topic visualisation rendered inline in the notebook.
pyLDAvis.enable_notebook()
pyLDAvis.sklearn.prepare(lda_model, data_vectorized, vectorizer, mds='tsne')

In [8]:
def split_sentences(corpus):
    """Tokenize every document of the corpus on whitespace.

    Returns a list containing one list of tokens per document, in corpus order.
    """
    return [document.split() for document in corpus]

In [9]:
# Tokenize each cleaned document into a list of words.
sentences = split_sentences(X_train)
# Number of tokens in the first document.
len(sentences[0])

170

In [10]:
# Hyper-parameters handed to Word2Vec in the training cell.
# Passed as `size`.
dim_size = 300
# Passed as `window`.
max_distance = 10
# Passed as `min_count`.
min_frequency = 40
# Passed as `sample`.
down_sampling = 1e-3

In [11]:
from gensim.models import Word2Vec

# Train word embeddings on the tokenized reviews with the hyper-parameters
# defined in the previous cell, using 4 worker threads.
# NOTE(review): the `size` keyword and `init_sims` look like the pre-4.0 gensim
# API — confirm the pinned gensim version before upgrading.
word2vec = Word2Vec(
    sentences=sentences,
    size=dim_size,
    window=max_distance,
    min_count=min_frequency,
    sample=down_sampling,
    workers=4
)

# NOTE(review): presumably normalizes the vectors in place and discards the
# raw ones (replace=True) — verify against the gensim documentation.
word2vec.init_sims(replace=True)

  from collections import Mapping, defaultdict
scipy.sparse.sparsetools is a private module for scipy.sparse, and should not be used.
  _deprecated()
  from collections import namedtuple, defaultdict, Iterable


In [12]:
# Shape of the learned embedding matrix (rows = words kept, columns = dim_size).
word2vec.wv.vectors.shape

(13354, 300)

In [13]:
# Sanity check of the embeddings: nearest neighbours of 'hopkins'.
word2vec.wv.most_similar('hopkins')

[('anthony', 0.7974323630332947),
 ('gosling', 0.7752270698547363),
 ('quinn', 0.7109999060630798),
 ('gooding', 0.6796231269836426),
 ('miriam', 0.6753872036933899),
 ('hackman', 0.653174877166748),
 ('sloane', 0.6465252041816711),
 ('everett', 0.6416574716567993),
 ('morgan', 0.6399545669555664),
 ('coleman', 0.6371748447418213)]

In [14]:
def doc_to_vec(document):
    """Look up the embedding of every in-vocabulary word of a document.

    The document is split on whitespace; words that are not in the
    module-level ``word2vec`` vocabulary are skipped.

    Returns a 2-D array with one row per known word, in document order.
    When no word is known the result has shape ``(0, embedding_dim)`` instead
    of ``(0,)``, so ``np.sum(..., axis=0)`` still yields a vector of the
    embedding dimension (all zeros) rather than a scalar.
    """
    keyed_vectors = word2vec.wv
    vectors = [keyed_vectors[word] for word in document.split() if word in keyed_vectors]
    if not vectors:
        # Keep the column count and dtype of the embedding matrix so callers
        # always receive a consistently shaped 2-D array.
        embedding_matrix = keyed_vectors.vectors
        return np.empty((0, embedding_matrix.shape[1]), dtype=embedding_matrix.dtype)
    return np.array(vectors)

In [15]:
# Pick a random review from the first (cleanedTrainData) part of the corpus.
# np.random.randint excludes `high`, so `high` must be the length itself;
# using length - 1 would make the last review unreachable.
random_idx = np.random.randint(low=0, high=len(X_train_transformed))
random_review = X_train[random_idx]
random_review

'sammo hung number film pedicab driver consider masterpiece agree extent film great part get incredible fantastic hong kong film combination pretty good write drama interesting sympathetic non sympathetic character genuinely funny humor truly hyper kung guarantee jaw drop familiar hong kong cinema watch film experienced enthusiast unique industry sammo max mok siu chung play pedicab driver live hong kong middle century think drive cab desperate love sammo interested local baker girl ping nina chi max day meet mysterious beautiful fennie yuen kit ying fall madly love character introduce remain pretty clear time viewer concentrate film plot film confusing bad especially hong kong soon know instance ruthless gangster family terrorize neighborhood course violent act affect protagonist premise incredibly choreograph kung fight sequence create nice drama film great cast lead good restrained painfully awful act humor sammo great eastern condors number example thank carefully write screenplay 

In [16]:
from numpy.linalg import norm

def cos_sim(v, w):
    """Cosine similarity between the vectors ``v`` and ``w``.

    Returns ``dot(v, w) / (|v| * |w|)``. When either vector has zero norm the
    similarity is undefined; 0.0 is returned instead of dividing by zero,
    which would produce nan/inf and a RuntimeWarning.
    """
    norm_product = norm(v) * norm(w)
    if norm_product == 0:
        return 0.0
    return np.dot(v, w) / norm_product

In [21]:
def classify_review(review, debug=False):
    """Label a review by comparing it with the 'good' and 'bad' word vectors.

    The review is represented by the sum of the embeddings of its known words
    (see ``doc_to_vec``). Its cosine similarity to the embedding of 'good' is
    compared with its cosine similarity to the embedding of 'bad'.

    Parameters:
        review: the review text as a single string.
        debug: when truthy, print both similarity values.

    Returns:
        1 if the review is closer to 'good' than to 'bad', otherwise 0.
    """
    review_vec = np.sum(doc_to_vec(review), axis=0)

    pos_corr = cos_sim(word2vec.wv['good'], review_vec)
    neg_corr = cos_sim(word2vec.wv['bad'], review_vec)

    if debug:
        print('Positive = {:.3f} / Negative = {:.3f}'.format(pos_corr, neg_corr))

    # Ties (and nan comparisons) fall through to the negative class.
    return int(pos_corr > neg_corr)

In [22]:
# Show a smiley for a positive prediction and a flat face otherwise.
is_positive = classify_review(random_review, debug=True) == 1
print(':)' if is_positive else ':/')

Positive = 0.443 / Negative = 0.250
:)


In [19]:
import pandas as pd
# Labeled training set (tab-separated); its 'sentiment' column supplies the labels.
train_csv = pd.read_csv('../data/labeledTrainData.tsv', sep='\t')

In [20]:
# Sentiment labels as integers. The builtin `int` replaces the `np.int` alias,
# which is deprecated since NumPy 1.20 and removed in NumPy 1.24.
y_train = train_csv['sentiment'].astype(int)
len(y_train)

25000

In [35]:
# Summed word vectors of the first 25000 reviews, computed once and reused for
# both anchor words instead of re-embedding every document twice.
review_vecs = [np.sum(doc_to_vec(x), axis=0) for x in X_train[:25000]]
good_vec = word2vec.wv['good']
bad_vec = word2vec.wv['bad']

pos = [cos_sim(review_vec, good_vec) for review_vec in review_vecs]
neg = [cos_sim(review_vec, bad_vec) for review_vec in review_vecs]

# Two features per review: similarity to 'good' and similarity to 'bad'.
X_train_cos = np.c_[pos, neg]
X_train_cos.shape

(25000, 2)

In [36]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

# 5-fold cross-validated score of a default logistic regression on the two cosine features.
cv_estimator = LogisticRegression()
scores = cross_val_score(cv_estimator, X_train_cos, y_train, cv=5)
scores

array([0.7938, 0.7942, 0.795 , 0.7962, 0.7918])

In [42]:
# Fit on all cosine features, then predict those same rows, so anything
# computed from y_pred is a training-set figure rather than held-out.
log_reg = LogisticRegression().fit(X_train_cos, y_train)
y_pred = log_reg.predict(X_train_cos[:25000])

len(y_pred)

25000

In [43]:
# (true label, predicted label) for the randomly chosen review.
y_train[random_idx], y_pred[random_idx]

(1, 1)

In [44]:
from sklearn.metrics import accuracy_score

# Accuracy on the training rows themselves (y_pred was predicted from the same
# X_train_cos that log_reg was fit on): only ~79%, worse than plain bag of words.
accuracy_score(y_train, y_pred)

0.79432

In [45]:
from sklearn.metrics import classification_report
# Per-class precision / recall / F1 for the same training-set predictions.
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.80      0.79      0.79     12500
           1       0.79      0.80      0.79     12500

    accuracy                           0.79     25000
   macro avg       0.79      0.79      0.79     25000
weighted avg       0.79      0.79      0.79     25000



In [46]:
# Boolean mask of the reviews whose predicted label disagrees with the true label.
y_mask = y_train != y_pred

# Keep only the misclassified reviews out of the first 25000 documents.
X_train_errors = np.array(X_train[:25000])[y_mask]

len(X_train_errors)

5142

In [47]:
# Look at one misclassified review.
X_train_errors[10]

'rent movie fall genre romance western grand canyon scenery throw expect typical wholesome romantic western forget movie pure trash romance young girl puberty middle aged man child lust leer man sicken peter fonda portray virtuous try resist attraction brooke shields character pursue relationship try shoo eventually give drive happy love couple revolt movie appeal pedophile'

In [48]:
# Run the plain good-vs-bad similarity rule on that review, printing both similarities.
classify_review(X_train_errors[10], debug=True)

Positive = 0.109 / Negative = -0.036


1