## Импорт библиотек

In [None]:
import pandas as pd
import numpy as np
import random
from bs4 import BeautifulSoup
import re

from gensim import utils
from gensim.models.doc2vec import LabeledSentence
from gensim.models import Doc2Vec


## Скачивание данных

In [None]:
# Mount Google Drive into the Colab runtime; every path used later in this
# notebook lives under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load the IMDB reviews CSV from the mounted Drive folder.
#
# `error_bad_lines` was deprecated in pandas 1.3 and removed in 2.0; its
# replacement is `on_bad_lines`.  "warn" matches what error_bad_lines=False
# did by default: malformed rows are reported and dropped instead of aborting
# the load.  Older pandas releases reject the new keyword with a TypeError,
# so fall back to the legacy spelling there.
csv_path = "/content/drive/MyDrive/IMDB_Dataset.csv"
try:
    data = pd.read_csv(csv_path, engine="python", on_bad_lines="warn")
except TypeError:
    data = pd.read_csv(csv_path, engine="python", error_bad_lines=False)
data.head(5)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


## Очистка данных
Произведём базовую очистку данных: приводим текст к нижнему регистру, удаляем HTML-теги, пунктуацию и цифры. Для моделей семейства Word2Vec основательная очистка данных не требуется.


In [None]:
def _clean_review(text):
    """Return *text* lower-cased, without HTML markup, letters and spaces only.

    The parser is named explicitly ("html.parser", shipped with Python) so the
    result does not depend on which optional parsers happen to be installed
    and bs4 does not emit its "no parser was explicitly specified" warning.
    """
    without_tags = BeautifulSoup(text.lower(), "html.parser").get_text()
    # Collapse every run of non-letter characters (punctuation, digits,
    # whitespace) into a single space.
    return re.sub(r"[^a-zA-Z]+", " ", without_tags)


# One pass over the column instead of three separate .apply() calls.
data['review'] = data['review'].apply(_clean_review)
data[:5]

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production the filming tech...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there s a family where a little boy ...,negative
4,petter mattei s love in the time of money is a...,positive


## Разделение данных на 4 файла
Необходимо для работы с библиотекой Gensim

train-neg.txt: 20000 negative movie reviews from the training data

train-pos.txt: 20000 positive movie reviews from the training data

test-neg.txt: 5000 negative movie reviews from the test data

test-pos.txt: 5000 positive movie reviews from the test data

train-unsup.txt: 50000 Unlabelled movie reviews



In [None]:
def _write_split(reviews, test_file, train_file, unsup_file):
    """Write one review per line, sending every 5th review to *test_file*.

    Reviews at positions 0, 5, 10, ... go to *test_file*, all others to
    *train_file*.  Every review is additionally written to *unsup_file*.
    """
    for n, review in enumerate(reviews):
        line = review + "\n"
        target = test_file if n % 5 == 0 else train_file
        target.write(line)
        unsup_file.write(line)


# Filter each sentiment class once, instead of re-running the boolean mask
# over the whole frame for every single review.
positive_reviews = data.loc[data.sentiment == "positive", "review"]
negative_reviews = data.loc[data.sentiment == "negative", "review"]

# `with` guarantees all five files are flushed and closed even if writing
# fails part-way.  The files are read back as UTF-8 later, so write them as
# UTF-8 explicitly rather than relying on the platform default.
with open("/content/drive/MyDrive/test-neg.txt", "w", encoding="utf-8") as f_test_neg, \
        open("/content/drive/MyDrive/test-pos.txt", "w", encoding="utf-8") as f_test_pos, \
        open("/content/drive/MyDrive/train-pos.txt", "w", encoding="utf-8") as f_train_pos, \
        open("/content/drive/MyDrive/train-neg.txt", "w", encoding="utf-8") as f_train_neg, \
        open("/content/drive/MyDrive/train-unsup.txt", "w", encoding="utf-8") as f_train_unsup:
    # Positives first, then negatives: this fixes the line order inside
    # train-unsup.txt.
    _write_split(positive_reviews, f_test_pos, f_train_pos, f_train_unsup)
    _write_split(negative_reviews, f_test_neg, f_train_neg, f_train_unsup)

## Пишем класс, который представляет данные в формате, нужном для библиотеки Gensim

In [None]:

class LabeledLineSentence(object):
    """Corpus of tagged documents read line by line from text files.

    ``sources`` maps a file path to a tag prefix.  Every line of a file
    becomes one document (split on whitespace) tagged ``<prefix>_<n>``,
    where ``n`` is the zero-based line number within that file.
    """

    def __init__(self, sources):
        """Store *sources* and verify that no prefix is used twice.

        Raises:
            ValueError: if two files map to the same prefix, because their
                documents would then get identical tags.
        """
        self.sources = sources
        # Cache filled by to_array(); None means "not loaded yet".
        self.sentences = None

        seen_prefixes = set()
        for prefix in sources.values():
            if prefix in seen_prefixes:
                raise ValueError('Non-unique prefix encountered')
            seen_prefixes.add(prefix)

    def __iter__(self):
        """Yield one ``LabeledSentence`` per line of every source file."""
        for source, prefix in self.sources.items():
            with utils.smart_open(source) as fin:
                for item_no, line in enumerate(fin):
                    words = utils.to_unicode(line).split()
                    yield LabeledSentence(words, ['%s_%s' % (prefix, item_no)])

    def to_array(self):
        """Read all sources into ``self.sentences`` and return that list."""
        # Reuse __iter__ so the parsing logic lives in exactly one place.
        self.sentences = list(self)
        return self.sentences

    def sentences_perm(self):
        """Return a freshly shuffled copy of the loaded documents.

        ``self.sentences`` itself is left in its original order.  If
        to_array() has not been called yet, the corpus is loaded first
        instead of failing with AttributeError.
        """
        if self.sentences is None:
            self.to_array()
        shuffled = list(self.sentences)
        random.shuffle(shuffled)
        return shuffled

## Подготавливаем данные

In [None]:
# Locations of the five corpus files written by the split step.
test_neg = "/content/drive/MyDrive/test-neg.txt"
test_pos = "/content/drive/MyDrive/test-pos.txt"
train_neg = "/content/drive/MyDrive/train-neg.txt"
train_pos = "/content/drive/MyDrive/train-pos.txt"
train_unsup = "/content/drive/MyDrive/train-unsup.txt"

# File path -> tag prefix.  Each document is later addressed as
# "<prefix>_<line number>", so the prefixes must be distinct.
sources = {
    test_neg: 'TEST_NEG',
    test_pos: 'TEST_POS',
    train_neg: 'TRAIN_NEG',
    train_pos: 'TRAIN_POS',
    train_unsup: 'TRAIN_UNS',
}

sentences = LabeledLineSentence(sources)

## Создаем модель
Скользящее окно размером 10; размер выходного вектора, которым кодируется каждый документ, — 300; `min_count=1` — учитываем все слова, встретившиеся в корпусе хотя бы один раз.

In [None]:
# Untrained Doc2Vec model:
#   min_count=1      keep every word that occurs at least once
#   window=10        context window size
#   vector_size=300  dimensionality of the document/word vectors
#                    (`size` is the deprecated spelling, removed in gensim 4.0)
#   sample=1e-4      down-sampling threshold for frequent words
#   negative=5       number of negative samples per positive example
#   workers=7        training threads
model = Doc2Vec(min_count=1, window=10, vector_size=300, sample=1e-4, negative=5, workers=7)

# Read the whole corpus into memory and build the vocabulary from it.
model.build_vocab(sentences.to_array())



## Обучаем модель
Количество эпох — 10, при этом перед каждой эпохой документы заново перемешиваются

In [None]:
# Ten passes over the corpus, one train() call per pass, so that each pass
# sees the documents in a new random order.
# NOTE(review): gensim's documentation recommends a single train() call with
# epochs=N; presumably each separate call restarts the learning-rate
# schedule.  Confirm before changing.
n_passes = 10
for _ in range(n_passes):
    shuffled_docs = sentences.sentences_perm()
    model.train(shuffled_docs, total_examples=model.corpus_count, epochs=1)

## Смотрим на полученное векторное пространство

In [None]:
# Nearest neighbours of the word 'good' in the learned word-vector space.
# The query goes through `model.wv`: calling most_similar() on the model
# object itself is deprecated (it triggers the warning seen in this cell's
# old output) and is removed in gensim 4.0.
model.wv.most_similar('good')

  """Entry point for launching an IPython kernel.


[('great', 0.5737320780754089),
 ('bad', 0.5302014946937561),
 ('decent', 0.5030850172042847),
 ('nice', 0.49121078848838806),
 ('terrific', 0.4088291823863983),
 ('solid', 0.4060879945755005),
 ('alright', 0.3883763551712036),
 ('well', 0.3794774115085602),
 ('excellent', 0.3667594790458679),
 ('interesting', 0.35757505893707275)]

In [None]:
# Show the learned vector for the document tagged 'TRAIN_NEG_0', i.e. the
# first line (index 0) of the file registered under the 'TRAIN_NEG' prefix.
model['TRAIN_NEG_0']

array([-1.34594236e-02,  1.13151461e-01,  5.34629039e-02, -8.19490552e-02,
        1.00003280e-01, -1.16313636e-01, -5.07149510e-02, -7.05573931e-02,
        1.41359165e-01, -1.42407075e-01,  4.94573638e-02, -7.41359964e-03,
        1.05124317e-01,  8.93052015e-03, -2.79366653e-02, -2.54827328e-02,
        1.58710867e-01,  8.31479579e-02, -9.24004987e-02, -1.66838199e-01,
       -1.61456857e-02,  1.06101476e-01, -8.76459479e-02,  1.72273651e-01,
       -1.04037728e-02,  1.62663668e-01,  8.64231884e-02,  2.01845057e-02,
        5.41579761e-02, -8.13006330e-03, -3.12526077e-02, -1.64003626e-01,
        2.93135531e-02,  7.59267434e-03,  4.66508884e-03, -4.85731289e-03,
       -2.69044731e-02,  1.07917212e-01,  4.31379229e-02, -1.26753569e-01,
        8.48559942e-03, -5.01939096e-02, -3.21475565e-02, -2.24962756e-01,
        1.20718628e-01,  1.02310292e-01, -3.89035195e-02,  7.80593902e-02,
       -1.46747410e-01, -1.69814199e-01, -1.54120833e-01, -7.29133263e-02,
        4.16233391e-02,  

## Сохраняем нашу модель Doc2Vec, т.е. векторное пространство размерности 300, кодирующее наш корпус

In [None]:
# Persist the trained model to Drive so it can be reloaded without retraining.
model.save('/content/drive/MyDrive/imdb.d2v')

In [None]:
# Restore the model from the file written by model.save().
model = Doc2Vec.load('/content/drive/MyDrive/imdb.d2v')

## Делим данные на train и test

In [None]:
# Training set for the classifier: one 300-dimensional document vector per
# row, 20000 positive reviews followed by 20000 negative ones.
n_train_per_class = 20000
train_arrays = np.zeros((2 * n_train_per_class, 300))
train_labels = np.zeros(2 * n_train_per_class)
# Label 1 = positive (first half); the second half keeps label 0 = negative.
train_labels[:n_train_per_class] = 1

for idx in range(n_train_per_class):
    train_arrays[idx] = model['TRAIN_POS_' + str(idx)]
    train_arrays[n_train_per_class + idx] = model['TRAIN_NEG_' + str(idx)]

print(train_arrays)
print(train_labels)

[[ 0.17461239  0.06736773  0.06424893 ...  0.08741826 -0.10874546
  -0.23376685]
 [-0.12048294 -0.077976    0.00411689 ...  0.24415332  0.01098458
   0.06421904]
 [ 0.12223712 -0.17312549  0.2217738  ...  0.31761038  0.07167377
   0.23271185]
 ...
 [ 0.05202795  0.10818143  0.00292161 ...  0.1318689   0.07976142
   0.03052988]
 [ 0.04295819 -0.10021428  0.07244997 ...  0.05251344  0.02696934
  -0.06941343]
 [-0.00197933 -0.00702499  0.02942873 ...  0.07246768 -0.04353519
  -0.0544379 ]]
[1. 1. 1. ... 0. 0. 0.]


In [None]:
# Held-out set for the classifier: 5000 positive document vectors followed
# by 5000 negative ones, one 300-dimensional vector per row.
n_test_per_class = 5000
test_arrays = np.zeros((2 * n_test_per_class, 300))
test_labels = np.zeros(2 * n_test_per_class)
# Label 1 = positive (first half); the second half keeps label 0 = negative.
test_labels[:n_test_per_class] = 1

for idx in range(n_test_per_class):
    test_arrays[idx] = model['TEST_POS_' + str(idx)]
    test_arrays[n_test_per_class + idx] = model['TEST_NEG_' + str(idx)]

## Обучаем ради примера на простой модели 

In [None]:
from sklearn.linear_model import LogisticRegression

# Fit a default logistic-regression classifier on the document vectors
# (fit() returns the estimator itself), then report its mean accuracy on
# the held-out vectors.
classifier = LogisticRegression().fit(train_arrays, train_labels)
classifier.score(test_arrays, test_labels)

0.8481