In [0]:
import numpy as np
import pandas as pd

df = pd.read_csv('train.csv', index_col=0)
df.head()

Unnamed: 0,review,label
0,I think they really let the quality of the DVD...,0
1,I'm sorry but this is just awful. I have told ...,0
2,"The Japenese sense of pacing, editing and musi...",0
3,"In the '60's/'70's, David Jason was renowned f...",1
4,"""Hail The Woman"" is one of the most moving fil...",1


In [0]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

en_stop = list(stopwords.words('english'))
lemmatizer = WordNetLemmatizer() 

def tokenize(text):
    tokens = nltk.word_tokenize(text.lower())
    tokens = [t for t in tokens if
              re.match(r'[^\W\d]*$', t) and (len(t) > 2) and (t not in en_stop)]
    tokens = [lemmatizer.lemmatize(t) for t in tokens]
    return tokens

tokens = df['review'].apply(tokenize)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [0]:
tokens

0        [think, really, let, quality, dvd, production,...
1        [sorry, awful, told, people, film, bad, acting...
2        [japenese, sense, pacing, editing, musical, sc...
3        [david, jason, renowned, many, supporting, rol...
4        [hail, woman, one, moving, film, ever, seen, e...
                               ...                        
39995    [come, across, gem, movie, like, realize, grea...
39996    [often, way, write, comment, warn, anyone, mig...
39997    [extremely, silly, little, seen, film, slavery...
39998    [saw, movie, scary, thing, people, talking, mo...
39999    [though, film, seems, trying, market, horror, ...
Name: review, Length: 40000, dtype: object

## **Обучение модели Word2Vec**
Корпус текстов IMDB, разбитый на токены, используем для обучения Word2Vec модели. Размерность скрытого предстваления выбрана 64, ширина контекста в каждую сторону равна 3:


In [0]:
from gensim.models import Word2Vec
from gensim.models.phrases import Phrases, Phraser

In [0]:
bigrams = Phrases(sentences=tokens)
trigrams = Phrases(sentences=bigrams[tokens])



In [0]:
bigrams = Phraser(bigrams)
trigrams = Phraser(trigrams)

In [0]:
model = Word2Vec(tokens, size=300, window=6, min_count=4, iter=100, sg=0, sample=1e-5, workers=4)

## **Векторное представление текста**
Мы получили векторные представления для отдельных слов (токенов). Их можно по-разному складывать в векторные представления текста целиком для решения задачи классификации (а можно и не складывать и рассматривать последовательности). В нашем примере берется средний вектор:



In [0]:
def encode(list_of_tokens):
    x = np.array([model.wv[t] for t in list_of_tokens if t in model.wv.vocab])

    return np.concatenate((np.mean(x, axis=0), np.median(x, axis=0)))

fts = np.array([encode(t) for t in tokens])
fts.shape

(40000, 600)

Итак, мы получили набор фичей (64 штуки) для каждого текта, можно переходить к моделям классификации!

**Разделение датасета**

In [0]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(fts, df.label.values,
                                                    test_size=0.2, shuffle=True)

**Модель классификации** <br>

Для примера возьмем логистическую регрессию:

In [0]:
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(solver='lbfgs', max_iter=3000).fit(X_train, y_train)

Посмотрим метрики:

In [0]:
from sklearn.metrics import classification_report

predicts = clf.predict(X_train)
print('Train\n', classification_report(y_train, predicts, digits=4))

predicts = clf.predict(X_test)
print('Test\n', classification_report(y_test, predicts, digits=4))

Train
               precision    recall  f1-score   support

           0     0.8945    0.8921    0.8933     16057
           1     0.8917    0.8941    0.8929     15943

    accuracy                         0.8931     32000
   macro avg     0.8931    0.8931    0.8931     32000
weighted avg     0.8931    0.8931    0.8931     32000

Test
               precision    recall  f1-score   support

           0     0.8796    0.8820    0.8808      4010
           1     0.8811    0.8787    0.8799      3990

    accuracy                         0.8804      8000
   macro avg     0.8804    0.8804    0.8804      8000
weighted avg     0.8804    0.8804    0.8804      8000



In [0]:
from sklearn.svm import SVC

clf = SVC().fit(fts, df.label.values)

In [0]:
from sklearn.metrics import classification_report

predicts = clf.predict(X_train)
print('Train\n', classification_report(y_train, predicts, digits=4))

predicts = clf.predict(X_test)
print('Test\n', classification_report(y_test, predicts, digits=4))

Train
               precision    recall  f1-score   support

           0     0.9219    0.9168    0.9194     16048
           1     0.9168    0.9219    0.9193     15952

    accuracy                         0.9193     32000
   macro avg     0.9194    0.9194    0.9193     32000
weighted avg     0.9194    0.9193    0.9193     32000

Test
               precision    recall  f1-score   support

           0     0.8995    0.8910    0.8952      4019
           1     0.8910    0.8995    0.8952      3981

    accuracy                         0.8952      8000
   macro avg     0.8953    0.8953    0.8952      8000
weighted avg     0.8953    0.8952    0.8952      8000



In [0]:
test = pd.read_csv('test.csv', index_col=0)

In [0]:


tok = test['review'].apply(tokenize)
mahmax = np.array([encode(t) for t in tok])
predicted = clf.predict(mahmax)
pd.DataFrame({'Predicted': predicted}).to_csv('/content/drive/My Drive/Colab Notebooks/solution.csv', index_label='Id')

In [0]:
import gensim.downloader as api

model_pre = api.load("glove-wiki-gigaword-300")  # load glove vectors



  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [0]:
def encode1(list_of_tokens):
    x = np.array([model_pre.wv[t] for t in list_of_tokens if t in model_pre.wv.vocab])

    return np.concatenate((np.mean(x, axis=0), np.max(x, axis=0), np.median(x, axis=0)))

fts_pre = np.array([encode1(t) for t in tokens])
fts_pre.shape

  


(40000, 900)

In [0]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(fts_pre, df.label.values,
                                                    test_size=0.2, shuffle=True)

In [0]:
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(solver='lbfgs', max_iter=1500).fit(X_train, y_train)

In [0]:
from sklearn.metrics import classification_report

predicts = clf.predict(X_train)
print('Train\n', classification_report(y_train, predicts, digits=4))

predicts = clf.predict(X_test)
print('Test\n', classification_report(y_test, predicts, digits=4))

Train
               precision    recall  f1-score   support

           0     0.8577    0.8500    0.8538     16085
           1     0.8498    0.8575    0.8536     15915

    accuracy                         0.8537     32000
   macro avg     0.8537    0.8537    0.8537     32000
weighted avg     0.8538    0.8537    0.8537     32000

Test
               precision    recall  f1-score   support

           0     0.8361    0.8478    0.8419      3982
           1     0.8470    0.8352    0.8411      4018

    accuracy                         0.8415      8000
   macro avg     0.8416    0.8415    0.8415      8000
weighted avg     0.8416    0.8415    0.8415      8000



In [0]:
print(list(tokens)[0][:10])

['think', 'really', 'let', 'quality', 'dvd', 'production', 'get', 'away', 'rented', 'dvd']
