In [1]:
import numpy as np
import pandas as pd

# Load the labelled training reviews, using the first CSV column as the index,
# and preview the first rows.
df = pd.read_csv(filepath_or_buffer="word2vec/train.csv", index_col=0)
df.head(n=5)

Unnamed: 0,review,label
0,I think they really let the quality of the DVD...,0
1,I'm sorry but this is just awful. I have told ...,0
2,"The Japenese sense of pacing, editing and musi...",0
3,"In the '60's/'70's, David Jason was renowned f...",1
4,"""Hail The Woman"" is one of the most moving fil...",1


In [2]:
import re
import nltk
import ssl

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from itertools import chain
from collections import defaultdict
from gensim.models import Phrases

# If this Python build exposes an unverified-context factory, install it as the
# default HTTPS context factory (presumably so the NLTK downloads below work on
# machines with broken certificate stores).
# NOTE(review): this disables certificate verification for every HTTPS request
# made through the default context in this kernel, not only for NLTK.
if hasattr(ssl, '_create_unverified_context'):
    _create_unverified_https_context = ssl._create_unverified_context
    ssl._create_default_https_context = _create_unverified_https_context

# Fetch the NLTK resources needed by the tokenizer, the stop-word list and the
# WordNet lemmatizer.
for _resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(_resource)

en_stop = list(stopwords.words('english'))
# Hashed copy of the stop words: membership is tested once per token of every
# review, and a set lookup avoids scanning the whole list each time.
_en_stop_set = frozenset(en_stop)
lemmatizer = WordNetLemmatizer()

# Matches tokens that consist solely of word characters other than digits.
_NON_DIGIT_WORD_RE = re.compile(r'[^\W\d]*$')


def tokenize(text):
    """Turn one raw review string into a list of lemmatized tokens.

    The text is lower-cased and split with ``nltk.word_tokenize``. A token is
    kept only when it is longer than two characters, is not an English stop
    word, and contains nothing but non-digit word characters. The surviving
    tokens are then passed through the WordNet lemmatizer.

    Args:
        text: The review text as a ``str``.

    Returns:
        The filtered, lemmatized tokens, in their original order.
    """
    kept = [
        tok for tok in nltk.word_tokenize(text.lower())
        # Cheap checks first; all three conditions are side-effect free.
        if len(tok) > 2 and tok not in _en_stop_set and _NON_DIGIT_WORD_RE.match(tok)
    ]
    return [lemmatizer.lemmatize(tok) for tok in kept]


# One token list per review, aligned with the rows of df.
tokens = df['review'].apply(tokenize)

[nltk_data] Downloading package punkt to /Users/ilya/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/ilya/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/ilya/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
# Fit a phrase (collocation) model on the tokenized reviews; indexing it with a
# corpus later yields the same sentences with detected phrases merged.
bigram_transformer = Phrases(sentences=tokens)

In [4]:
# Apply the phrase model to the corpus once and reuse the result: indexing
# bigram_transformer with the corpus re-runs the transformation on every
# iteration, and this cell needs the sentences twice.
_bigram_sents = list(bigram_transformer[tokens])

# Vocabulary after phrase merging.
all_words = set(chain.from_iterable(_bigram_sents))

# Per word: number of reviews containing it, and the sum of those reviews'
# labels (with 0/1 labels, as in the preview of df, that is the number of
# positive reviews containing the word).
word_count = defaultdict(int)
word_pos_count = defaultdict(int)

for sent, label in zip(_bigram_sents, df.label):
    # set(): a word counts at most once per review.
    for word in set(sent):
        word_count[word] += 1
        word_pos_count[word] += label

# Share of the reviews containing the word whose label is 1.
word_prob = {word: word_pos_count[word] / count for word, count in word_count.items()}

# Binary entropy (natural log) of that share. The formula is undefined at the
# end points, so words seen with only one label get entropy 0 explicitly.
# NOTE(review): these statistics use the labels of every row of df, so any
# hold-out split taken from the same rows later is not independent of them.
word_entropy = {}
for word, p in word_prob.items():
    if 0 < p < 1:
        word_entropy[word] = -(1 - p) * np.log(1 - p) - p * np.log(p)
    elif p == 0 or p == 1:
        word_entropy[word] = 0

In [5]:
from gensim.models import Word2Vec

# Train word embeddings on the phrase-merged reviews.
model = Word2Vec(
    sentences=bigram_transformer[tokens],
    sg=1,         # skip-gram training algorithm
    size=200,     # dimensionality of each word vector
    window=4,     # context window size
    min_count=5,  # ignore tokens with a total frequency below 5
    iter=20,      # number of passes over the corpus
)

In [6]:
def encode(list_of_tokens):
    """Encode one tokenized document as a fixed-length feature vector.

    Every token that is present both in the Word2Vec vocabulary and in
    ``word_entropy`` contributes its embedding scaled by
    ``0.7 - word_entropy[token]``. Tokens with lower label entropy therefore
    weigh more. The scaled vectors are pooled with element-wise mean, min and
    max, and the three results are concatenated.

    Args:
        list_of_tokens: Iterable of string tokens for one document.

    Returns:
        A 1-D array of length ``3 * model.wv.vector_size``. It is all zeros when
        no token of the document is usable, so every document yields a vector
        of the same length and the results can be stacked into a 2-D matrix.
    """
    dim = model.wv.vector_size
    weighted = [
        model.wv[t] * (.7 - word_entropy[t])
        for t in list_of_tokens
        if t in model.wv.vocab and t in word_entropy
    ]
    if not weighted:
        # Must match the mean|min|max layout below (3 * dim), and must not
        # depend on any particular probe word being in the vocabulary.
        return np.zeros(3 * dim)
    x = np.array(weighted)
    return np.hstack((np.mean(x, axis=0), np.min(x, axis=0), np.max(x, axis=0)))

    
# Feature matrix: one encoded row per review, in the order of `tokens`.
fts = np.array(list(map(encode, bigram_transformer[tokens])))
fts.shape

(40000, 600)

In [7]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation. random_state pins the shuffle so the
# split, and the metrics computed from it, are identical on every
# Restart & Run All.
# NOTE(review): fts is weighted by word_entropy, which was computed from the
# labels of all rows of df, including the ones held out here. The hold-out
# score is therefore likely optimistic.
X_train, X_test, y_train, y_test = train_test_split(
    fts, df.label.values,
    test_size=0.2, shuffle=True, random_state=42,
)

In [8]:
from sklearn.linear_model import LogisticRegression

# Fit the classifier on the training split only.
clf = (
    LogisticRegression(max_iter=2000, C=50, verbose=1, n_jobs=4)
    .fit(X_train, y_train)
)

[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   1 out of   1 | elapsed:  4.3min finished


In [9]:
from sklearn.metrics import classification_report

# Print the per-class precision/recall/F1 report for both splits through a
# single code path: training split first, then the held-out split.
for _header, _X_split, _y_split in (('Train\n', X_train, y_train),
                                    ('Test\n', X_test, y_test)):
    predicts = clf.predict(_X_split).round()
    print(_header, classification_report(_y_split, predicts, digits=5))

Train
               precision    recall  f1-score   support

           0    0.91435   0.90628   0.91030     16091
           1    0.90605   0.91414   0.91008     15909

    accuracy                        0.91019     32000
   macro avg    0.91020   0.91021   0.91019     32000
weighted avg    0.91022   0.91019   0.91019     32000

Test
               precision    recall  f1-score   support

           0    0.91071   0.90040   0.90553      3976
           1    0.90268   0.91277   0.90770      4024

    accuracy                        0.90663      8000
   macro avg    0.90669   0.90659   0.90661      8000
weighted avg    0.90667   0.90663   0.90662      8000



In [None]:
# Refit the classifier on every labelled row before scoring the test file.
clf = (
    LogisticRegression(max_iter=2000, C=50, verbose=1, n_jobs=4)
    .fit(fts, df.label.values)
)

# Push the test reviews through the same tokenize -> phrase -> encode steps
# that produced the training features.
test = pd.read_csv('word2vec/test.csv', index_col=0)
test_tokens = test['review'].apply(tokenize)
test_fts = np.array(list(map(encode, bigram_transformer[test_tokens])))

predicted = clf.predict(test_fts)

# Write a single 'Predicted' column; the positional row index is saved under
# the 'Id' header.
_submission = pd.DataFrame(predicted, columns=['Predicted'])
_submission.to_csv('solution.csv', index_label='Id')