In [61]:
import pandas as pd
import numpy as np
%matplotlib inline
from tqdm import tqdm_notebook
from tqdm import tqdm_pandas
import re
from pymystem3 import Mystem
m = Mystem()
from tqdm import tqdm_notebook, tqdm_pandas

In [62]:
df = pd.read_csv("X_train.csv")
df['alltext'] = df['comment'].astype(str) + " "+ df['commentPositive'].astype(str) + " " + df['commentNegative'].astype(str)
stopwords = ['по','это','мой','в','я','и','на','что','быть','этот','у','о','который','из','ваш','когда','кто','за','вы','руб','рубль']

In [63]:
df['rating'] = df['reting'].astype(int)
df['rating'].value_counts()

5    9211
4    2741
1    1475
3    1277
2     883
Name: rating, dtype: int64

In [80]:
sentilex = pd.read_csv('RuSentiLex2017_revised_2utf.txt',
                       names=['word','part','initial','sentiment','source','amb1','amb2'],
                       skiprows=20,skipinitialspace=True,index_col=2)
sentilex['sentiment'] = sentilex['sentiment'].map({'negative':-1,'positive':1,'neutral':0,'positive/negative':0})
vocab = sentilex.to_dict(orient='index')

In [81]:
for w in list(vocab.keys()):
    if(vocab[w]['sentiment'] != 0):
        word = vocab[w] 
        word['sentiment'] = - word['sentiment']
        vocab["не_"+w] = word

In [82]:
from pymystem3 import Mystem
m = Mystem()
regex = re.compile("[,=]")

def parts_and_lemmas(text):
    text.replace("не очень", "не")
    partlist = list()
    stemmed = m.analyze(text)
    parts = list()
    lemmas = list()
    add_ne=False
    for i in [x for x in stemmed if 'analysis' in x.keys()]:
        if len(i['analysis'])>0:
            word = regex.split(i['analysis'][0]['gr'],maxsplit=1)[0]
            parts.append(word)
            if i['analysis'][0]['lex'] not in stopwords and len(i['analysis'][0]['lex'])>1:
                if(i['analysis'][0]['lex']!='не'):
                    if(add_ne):
                        lemmas.append("не_"+i['analysis'][0]['lex'])
                        add_ne=False
                    else:
                        lemmas.append(i['analysis'][0]['lex'])
                else:
                    add_ne = True
                
    return parts,lemmas

In [83]:
pl_df = df['alltext'].apply(parts_and_lemmas)


In [90]:
df['parts']=pl_df.apply(lambda x:x[0])
df['words']=pl_df.apply(lambda x:x[1])

In [84]:
from gensim.models import Word2Vec

model = Word2Vec.load("mvideo.w2v")
#model = Word2Vec(df.alltext, size=100, window=5, min_count=1, workers=4)


In [85]:
model.corpus_count

201030

In [92]:
#model.train(df.words,total_examples=model.corpus_count,epochs=10)

5133106

In [86]:
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer

In [91]:
import numpy as np
from keras.utils import np_utils
from keras.layers import Embedding, LSTM, Dense, Dropout, TimeDistributed, Conv1D, MaxPooling1D, Bidirectional, \
    GlobalMaxPooling1D, Flatten, BatchNormalization
import keras
from keras.models import Sequential
from keras.optimizers import Adam
def prepare_embeddings(texts,model):
        tokenizer = Tokenizer(num_words=model.corpus_count)
        texts = [" ".join(line) for line in texts]
        tokenizer.fit_on_texts(texts=texts)
        sequences = tokenizer.texts_to_sequences(texts=texts)
        word_index = tokenizer.word_index
        data = pad_sequences(sequences, maxlen=441)
        embedding_matrix = np.zeros((len(word_index) + 1, model.vector_size+1))
        for word, i in word_index.items():
            try:
                word_vec = model[word]
                sentiment = vocab[word]['sentiment'] if word in vocab.keys() else 0.0
                embedding_vector = np.hstack((word_vec, sentiment))
            except KeyError:
                embedding_vector = None
            if embedding_vector is not None:
                embedding_matrix[i] = embedding_vector
        return data, word_index, embedding_matrix


def create_model(word_index,embedding_matrix,vector_size):
        model = Sequential()
        model.add(Embedding(len(word_index) + 1, vector_size+1, weights=[embedding_matrix],
                            input_length=441, trainable=False))
        model.add(BatchNormalization())
        model.add(Conv1D(120, 5, activation='relu'))
        model.add(BatchNormalization())
        model.add(Conv1D(120, 5, activation='relu'))
        model.add(MaxPooling1D(pool_size=4))
        model.add(Flatten())
        model.add(Dropout(0.5))
        model.add(Dense(256, activation='relu'))
        model.add(Dense(6, activation='softmax'))
        adam = Adam(lr=0.001, decay=0.009)
        model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy', 'categorical_accuracy'])
        print ('Summary')
        print (model.summary())
        return model

In [92]:
from sklearn.model_selection import train_test_split
data, word_index, embedding_matrix = prepare_embeddings(df['words'],model)
X_train, X_test, y_train, y_test = train_test_split(data, keras.utils.to_categorical(df['rating']), test_size=0.2)
cnn_model = create_model(word_index,embedding_matrix,model.vector_size)

Summary
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_13 (Embedding)     (None, 441, 101)          2159380   
_________________________________________________________________
batch_normalization_19 (Batc (None, 441, 101)          404       
_________________________________________________________________
conv1d_19 (Conv1D)           (None, 437, 120)          60720     
_________________________________________________________________
batch_normalization_20 (Batc (None, 437, 120)          480       
_________________________________________________________________
conv1d_20 (Conv1D)           (None, 433, 120)          72120     
_________________________________________________________________
max_pooling1d_10 (MaxPooling (None, 108, 120)          0         
_________________________________________________________________
flatten_14 (Flatten)         (None, 12960)             0         
__

In [93]:
import keras
from sklearn.metrics import accuracy_score, f1_score
from keras.callbacks import EarlyStopping

early_stopping = EarlyStopping(monitor='val_loss', patience=2)
callbacks = [
             early_stopping,
#             metrics
            ]

cnn_model.fit(X_train, y_train, epochs=10, batch_size=128, callbacks=callbacks,
                       verbose=1, validation_data=(X_test,y_test), shuffle=True,)
print('Testing...')
scores = cnn_model.evaluate(X_test, y_test, verbose=0)
print(scores)
cnn_model.save("cnn_model.tf")


Train on 12469 samples, validate on 3118 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Testing...
[0.94375200543822657, 0.64239897366285781, 0.64239897366285781]


In [None]:
cnn_model.load_weights("cnn_model.tf")
cnn_model.fit(X_train, y_train, epochs=1, batch_size=128, callbacks=callbacks,
                       verbose=1, validation_data=(X_test,y_test), shuffle=True,)
print('Testing...')
scores = cnn_model.evaluate(X_test, y_test, verbose=0)
print(scores)