In [1]:
import numpy as np

In [2]:
X_train_transformed = np.load('../data/cleanedTrainData.npy')
X_extra_train_transformed = np.load('../data/cleanedExtraTrainData.npy')
X_train = np.concatenate([X_train_transformed, X_extra_train_transformed])

len(X_train)

74998

In [3]:
import re

def clean(corpus):
    transformed = []
    for document in corpus:
        document = re.sub('\\\|\.|"', '', document)
        document = ' '.join([x for x in document.split(' ') if len(x) > 2])
        transformed.append(document)
    return transformed

In [4]:
X_train[1]

'\\the classic war worlds\\ timothy hines entertaining film obviously go great effort length faithfully recreate h. g. wells classic book mr. hines succeed watch film appreciate fact standard predictable hollywood fare come year e.g. spielberg version tom cruise slight resemblance book obviously look different thing movie envision amateur \\"critics\\ look criticize rate movie important basis like entertain people agree \\"critics\\ enjoy effort mr. hines faithful h.g. wells classic novel find entertaining easy overlook \\"critics\\ perceive shortcoming'

In [5]:
clean([X_train[1]])

['the classic war worlds timothy hines entertaining film obviously great effort length faithfully recreate wells classic book hines succeed watch film appreciate fact standard predictable hollywood fare come year spielberg version tom cruise slight resemblance book obviously look different thing movie envision amateur critics look criticize rate movie important basis like entertain people agree critics enjoy effort hines faithful wells classic novel find entertaining easy overlook critics perceive shortcoming']

In [6]:
X_train = clean(X_train)
len(X_train)

74998

In [7]:
def split_sentences(corpus):
    sentences = []
    for document in corpus:
        words = document.split()
        sentences.append(words)
    return sentences

In [8]:
sentences = split_sentences(X_train)
len(sentences[0])

170

In [9]:
dim_size = 300
max_distance = 10
min_frequency = 40
down_sampling = 1e-3

In [10]:
from gensim.models import Word2Vec

word2vec = Word2Vec(
    sentences=sentences,
    size=dim_size,
    window=max_distance,
    min_count=min_frequency,
    sample=down_sampling,
    workers=4
)

word2vec.init_sims(replace=True)

In [11]:
word2vec.wv.vectors.shape

(13354, 300)

In [12]:
word2vec.wv.most_similar('hopkins')

[('anthony', 0.800419270992279),
 ('gosling', 0.7770460247993469),
 ('miriam', 0.7453286647796631),
 ('quinn', 0.7100683450698853),
 ('perkins', 0.6624230742454529),
 ('mcdormand', 0.6492488384246826),
 ('raymond', 0.6442375779151917),
 ('alan', 0.640479326248169),
 ('gielgud', 0.6371688842773438),
 ('forsythe', 0.6353374719619751)]

In [13]:
def doc_to_vec(document):
    vectors = []
    words = document.split()
    for word in words:
        if word in word2vec.wv:
            vectors.append(word2vec.wv[word])
    return np.array(vectors)

In [14]:
random_idx = np.random.randint(low=0, high=len(X_train_transformed) - 1)
random_review = X_train[random_idx]
random_review

'film disappoint acting atrocious unbelievable actor story incredibly obvious group independent actor stage passion play turn start live life character play watch lot movie lately thank netflix watch way long time feel need end know end story modernization gospels sacrilegious enlightening speak raise christian church say raise live increasingly christian culture curious jesus modernization religion well bad see mel gibson version guess like like conservative wish well film lot review praise arcand direction especially cinematography like film prudish preachy believe character personae involve screen life play play dated think carry weight real time production point need work strong development character plot support philosophical theological question film like musical choice obvious unoriginal example come easily mind firstly reenactment parable jesus drive money lender temple lead actor fall woman play magdalene model dancer enrage debase audition commercial wicked producer plenty pa

In [15]:
def classify_review(review, debug=False):
    review_vec = np.sum(doc_to_vec(review), axis=0) / len(review)

    pos_corr = np.dot(word2vec.wv['good'], review_vec)
    neg_corr = np.dot(word2vec.wv['bad'],  review_vec)
    
    if debug == True:
        print('Positive = {:.3f} / Negative = {:.3f}'.format(pos_corr, neg_corr))
    
    if pos_corr > neg_corr:
        return 1
    else:
        return 0

In [16]:
if classify_review(random_review, debug=True) == 1:
    print(':)')
else:
    print(':/')

Positive = 0.009 / Negative = 0.010
:/


In [17]:
import pandas as pd
train_csv = pd.read_csv('../data/labeledTrainData.tsv', sep='\t')

In [18]:
y_train = train_csv['sentiment'].astype(np.int)
len(y_train)

25000

In [19]:
y_pred = [classify_review(review) for review in X_train[:25000]]
len(y_pred)

25000

In [20]:
y_train[random_idx], y_pred[random_idx]

(0, 0)

In [21]:
from sklearn.metrics import accuracy_score

# gives only 78%, worse than plain bag of words
accuracy_score(y_train, y_pred)

0.77468

In [22]:
from sklearn.metrics import classification_report
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.83      0.69      0.75     12500
           1       0.73      0.86      0.79     12500

    accuracy                           0.77     25000
   macro avg       0.78      0.77      0.77     25000
weighted avg       0.78      0.77      0.77     25000



In [23]:
y_mask = y_train != y_pred

X_train_errors = np.array(X_train[:25000])
X_train_errors = X_train_errors[y_mask == True]

len(X_train_errors)

5633

In [24]:
X_train_errors[10]

"charming boy mother middle town cat death soon follow sum i'll admit little freak cat see movie seriousness spite numerous thing wrong film believe plenty overall enjoyable viewing experience character like caricature basis instinct rely fear greed pride lust anger motivate people argue fail actuality serve telling story supernatural premise fact stephen king screenplay(not specific king nicely support interesting work makeup suitable music absolute gem film doubt alice krige play mary brady otherworldly mother king manage simple story outsider people little different(okay lot case try fit twist campy little horror gem collection horror fan"

In [25]:
classify_review(X_train_errors[10], debug=True)

Positive = 0.010 / Negative = 0.005


1