# Сентимент-анализ отзывов 

In [1]:
import pandas as pd
import numpy as np
from nltk import WordNetLemmatizer
from nltk.stem import PorterStemmer

from sklearn.feature_extraction.text import CountVectorizer 
import re

import warnings
warnings.filterwarnings("ignore")

В этом проекте мы попробуем разобраться с задачей анализа тональности отзывов на примере сентимент-анализа отзывов на товары.
Мы будем использовать данные из соревнования Kaggle Inclass https://www.kaggle.com/c/simplesentiment 


Загрузим тесты для обучения и теста и посмотрим, что они из себя представляют

In [3]:
df = pd.read_csv("products_sentiment_train.tsv", sep='\t', header=None)
test = pd.read_csv("products_sentiment_test.tsv", sep='\t', index_col=0)

In [177]:
print('Обучение:', df.shape)
print('Тест:', test.shape)

Обучение: (2000, 2)
Тест: (500, 1)


In [178]:
df.head()

Unnamed: 0,0,1
0,"2 . take around 10,000 640x480 pictures .",1
1,i downloaded a trial version of computer assoc...,1
2,the wrt54g plus the hga7t is a perfect solutio...,1
3,i dont especially like how music files are uns...,0
4,i was using the cheapie pail ... and it worked...,1


In [5]:
labels = df.iloc[:,-1]

In [179]:
sum(labels)

1274

In [6]:
labels.value_counts()

1    1274
0     726
Name: 1, dtype: int64

Видим, что обучающая выборка более-менее сбалансирована.                                 
Удалим из наших текстов лишние символы и преобразуем с помощью лемматизации и стемминга.

In [187]:
lem=WordNetLemmatizer()
ps =PorterStemmer()
lem_reviews = np.array(df.iloc[:,0].values)

for i,text in enumerate(df.iloc[:,0].values):
    text = re.sub('[^a-zA-Zа-яА-Я1-9]+', ' ', text)  
    words_lem=[lem.lemmatize(word) for word in text.split(' ')]
    words_lem_ps = [ps.stem(word) for word in words_lem]   
    sentiment =  ' '.join(words_lem_ps)
    lem_reviews[i] = sentiment

Попробуем несколько вариантов извлечения признаков из текстов и разные линейные обучающие модели 

### CountVectorizer и Tf-idf

In [3]:
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer, TfidfVectorizer
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

Попробуем несколько вариантов моделей с CountVectorizer и Tf-idf

In [188]:
print('sgd_count: ',cross_val_score(Pipeline([("vectorizer", CountVectorizer(min_df=5, max_df=2000, ngram_range=(1,3))),("classifier", SGDClassifier())]), 
                lem_reviews, labels, scoring = 'accuracy').mean())
print('sgd_tfidf: ',cross_val_score(Pipeline([("vectorizer", TfidfVectorizer(min_df=5, max_df=2000, ngram_range=(1,3))),("classifier", SGDClassifier())]), 
                                lem_reviews, labels).mean())

sgd_count:  0.75
sgd_tfidf:  0.7525000000000001


In [189]:
print('svc_count: ',cross_val_score(Pipeline([("vectorizer", CountVectorizer(min_df=5, max_df=2000, ngram_range=(1,3))),("classifier", LinearSVC())]), 
                lem_reviews, labels, scoring = 'accuracy').mean())
print('svc_tfidf: ',cross_val_score(Pipeline([("vectorizer", TfidfVectorizer(min_df=5, max_df=2000, ngram_range=(1,3))),("classifier", LinearSVC())]), 
                                lem_reviews, labels).mean())

svc_count:  0.7424999999999999
svc_tfidf:  0.772


In [190]:
print('logit_count: ',cross_val_score(Pipeline([("vectorizer", CountVectorizer(min_df=5, max_df=2000, ngram_range=(1,3))),("classifier", LogisticRegression())]), 
                lem_reviews, labels, scoring = 'accuracy').mean())
print('logit_tfidf: ',cross_val_score(Pipeline([("vectorizer", TfidfVectorizer(min_df=5, max_df=2000, ngram_range=(1,3))),("classifier", LogisticRegression())]), 
                                lem_reviews, labels, scoring = 'accuracy').mean())

logit_count:  0.767
logit_tfidf:  0.7629999999999999


Видно что tf-idf работает лучше со всеми моделями. В результате нескольких экспериментов получим следующий набор параметров

In [169]:
print('logit_tfidf: ',cross_val_score(Pipeline([("vectorizer", TfidfVectorizer(max_df=1000, ngram_range=(1,3))),("classifier", LogisticRegression())]), 
                                lem_reviews, labels, scoring = 'accuracy').mean())
print('svc_tfidf: ',cross_val_score(Pipeline([("vectorizer", TfidfVectorizer(max_df=1000, ngram_range=(1,3))),("classifier", LinearSVC())]), 
                                lem_reviews, labels, scoring = 'accuracy').mean())
print('sgd_tfidf: ',cross_val_score(Pipeline([("vectorizer", TfidfVectorizer(max_df=1000, ngram_range=(1,3))),("classifier", SGDClassifier())]), 
                                lem_reviews, labels, scoring = 'accuracy').mean())

logit_tfidf:  0.725
svc_tfidf:  0.7875
sgd_tfidf:  0.7889999999999999


In [182]:
#Запишем лучший вариант обработки данных
tfv = TfidfVectorizer(max_df=1000, ngram_range=(1,3))
tf_idf = tfv.fit_transform(lem_reviews)

Попробуем подобрать лучшие параметры для наших обучающих моделей

In [130]:
param = {'penalty':['l1', 'l2'], 'loss':['hinge','squared_hinge'], 'C':[1,2,3]}
clf = GridSearchCV(LinearSVC(random_state = 42), param, scoring = 'accuracy')
clf.fit(tf_idf, labels)

GridSearchCV(estimator=LinearSVC(random_state=42), param_grid={'C': [2, 3, 1]},
             scoring='accuracy')

In [46]:
print('svc_tfidf: ',cross_val_score(LinearSVC(C=2, loss='hinge',random_state = 42), tf_idf, labels, 
                                    scoring = 'accuracy').mean())

svc_tfidf:  0.7859999999999999


In [46]:
param = {'penalty':['l1', 'l2','elasticnet'], 'loss':['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron'], 
         'alpha':[0.0001,0.001,0.01], 'n_jobs': [-1]}
clf = GridSearchCV(estimator = SGDClassifier(random_state = 42), param_grid = param, scoring = 'accuracy')
clf.fit(tf_idf, labels)

GridSearchCV(estimator=SGDClassifier(random_state=42),
             param_grid={'alpha': [0.0001, 0.001, 0.01],
                         'loss': ['hinge', 'log', 'modified_huber',
                                  'squared_hinge', 'perceptron'],
                         'n_jobs': [-1],
                         'penalty': ['l1', 'l2', 'elasticnet']},
             scoring='accuracy')

In [45]:
print('sgd_tfidf: ',cross_val_score(SGDClassifier(alpha = 0.0001, loss='hinge', n_jobs=-1, penalty='l2'), 
                                    tf_idf, labels, scoring = 'accuracy').mean())

sgd_tfidf:  0.7875


In [183]:
param = {'penalty':['l1', 'l2','elasticnet'],'solver':['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'], 
         'C':[1,0.1,0.01], 'n_jobs': [-1]}
clf = GridSearchCV(estimator = LogisticRegression(), param_grid = param, scoring = 'accuracy')
clf.fit(tf_idf, labels)

GridSearchCV(estimator=LogisticRegression(),
             param_grid={'C': [1, 0.1, 0.01], 'n_jobs': [-1],
                         'penalty': ['l1', 'l2', 'elasticnet'],
                         'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag',
                                    'saga']},
             scoring='accuracy')

In [173]:
print('logit_tfidf: ',cross_val_score(LogisticRegression(solver='saga'), 
                                    tf_idf, labels, scoring = 'accuracy').mean())

logit_tfidf:  0.6679999999999999


Можно уменьшить размер признаков и обучить нелинейные модели

In [192]:
%%time
from sklearn.decomposition import NMF, TruncatedSVD

mf = TruncatedSVD(2000)
new_df = mf.fit_transform(tf_idf)

Wall time: 1min 35s


In [193]:
print('forest: ',cross_val_score(RandomForestClassifier(300), new_df, labels, scoring = 'accuracy').mean())
print('xgboost: ',cross_val_score(XGBClassifier(300), new_df, labels, scoring = 'accuracy').mean())

forest:  0.6565000000000001
xgboost:  0.7005000000000001


И рассмотрим вариант с многослойной нейронной сеткой

In [4]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import regularizers

In [11]:
model_net = keras.Sequential([
    keras.layers.Dense(256, activation='relu', input_shape=(46055,),kernel_regularizer=regularizers.l2(0.01)), 
    keras.layers.Dense(128, activation='relu'),   
    keras.layers.Dense(64, activation='relu'),   
    keras.layers.Dense(2, activation='softmax', activity_regularizer=regularizers.l1(0.01))
                       ])

model_net.compile(optimizer=tf.keras.optimizers.Adam(),
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

In [13]:
from tensorflow.keras.callbacks import ModelCheckpoint

checkpoint_net = keras.callbacks.ModelCheckpoint("mymodel1.hdf5", monitor='val_accuracy', save_best_only=True, 
                                                 mode='max', period='epoch')
history_net = model_net.fit(tf_idf.toarray(), df.iloc[:,-1], epochs=20, batch_size=16, validation_split=0.2, 
                            callbacks = [checkpoint_net])


Train on 1600 samples, validate on 400 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [53]:
model_net.load_weights('mymodel1.hdf5')

Обработаем нашу тестовую выборку и сделаем первую отсылку лучшей модели на проверку.

In [None]:
lem=WordNetLemmatizer()
lem_test = test.iloc[:,0].values
for i,text in enumerate(test.iloc[:,0].values):
    text = re.sub('[^a-zA-Zа-яА-Я1-9]+', ' ', text)
    words_lem=[lem.lemmatize(word) for word in text.split(' ')]
    words_lem_ps = [ps.stem(word) for word in words_lem]  
    sentiment =  ' '.join(words_lem_ps)
    lem_test[i] = sentiment
    
tf_idf_test = tfv.transform(lem_test)
new_test = mf.fit_transform(lem_test)

Получаем результат 0.778.                                      
Попробуем взять несколько моделей и проголовать большинством

In [54]:
logit = LogisticRegression(solver='saga')
svc = LinearSVC(C=2, loss='hinge',random_state = 42)
sgd = SGDClassifier(alpha = 0.0001, loss='hinge', n_jobs=-1, penalty='l2')
xgboost = XGBClassifier(300)

logit.fit(tf_idf, labels)
svc.fit(tf_idf, labels)
sgd.fit(tf_idf, labels)
xgboost.fit(new_df, labels)

sgd_pred = sgd.predict(tf_idf_test)
logit_pred = logit.predict(tf_idf_test)
svc_pred = svc.predict(tf_idf_test)
xgb_pred = xgboost.predict(new_test)
net = model_net.predict(new_test)
net_pred = [0 if i< 0.5 else 1 for i in net[:,1]]

In [84]:
all_pred = pd.DataFrame()

all_pred['sgd'] = sgd_pred
all_pred['logit'] = logit_pred
all_pred['svc'] = svc_pred
all_pred['net'] = net_pred
all_pred['xgb'] = xgb_pred

all_model = all_pred.sum(axis=1)
answer = [0 if i < 3 else 1 for i in all_model]

In [88]:
example1 = pd.read_csv("products_sentiment_sample_submission.csv")
example1['y'] = answer
example1.to_csv('submission_ton.csv', index=False)

Получаем результат 0.791

### Word2Vec и сверточная нейронная сеть

In [199]:
import gensim
from gensim.models import Word2Vec

In [195]:
lem=WordNetLemmatizer()
ps =PorterStemmer()
lem_reviews_cnn = np.array(df.iloc[:,0].values)

for i,text in enumerate(df.iloc[:,0].values):
    text = re.sub('[^a-zA-Zа-яА-Я1-9]+', ' ', text)
    words_lem=[lem.lemmatize(word) for word in text.split(' ')]
    words_lem_ps = [ps.stem(word) for word in words_lem]   
#     sentiment =  ' '.join(words_lem_ps)
    lem_reviews_cnn[i] = words_lem_ps

In [197]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Высота матрицы (максимальное количество слов в отзыве)
SENTENCE_LENGTH = 79
# Размер словаря
NUM = 500000

def get_sequences(tokenizer, x):
    sequences = tokenizer.texts_to_sequences(x)
    return pad_sequences(sequences, maxlen=SENTENCE_LENGTH)

# Cоздаем и обучаем токенизатор
tokenizer = Tokenizer(num_words=NUM)
tokenizer.fit_on_texts(lem_reviews_cnn)

# Отображаем каждый текст в массив идентификаторов токенов
x_train_seq = get_sequences(tokenizer, lem_reviews_cnn)


In [201]:
model_vec = Word2Vec(size=300, window=5, min_count=3)
model_vec.build_vocab(lem_reviews_cnn)
model_vec.train(lem_reviews_cnn, total_examples=model.corpus_count, epochs=30, report_delay=1)

(704206, 1115580)

In [20]:
DIM = model.vector_size 
# Инициализируем матрицу embedding слоя нулями
embedding_matrix = np.zeros((NUM, DIM))
# Добавляем NUM=100000 наиболее часто встречающихся слов из обучающей выборки в embedding слой
for word, i in tokenizer.word_index.items():
    if i >= NUM:
        break
    if word in model.wv.vocab.keys():
        embedding_matrix[i] = model.wv[word]

In [17]:
tweet_input = keras.layers.Input(shape=(SENTENCE_LENGTH,), dtype='int32')
tweet_encoder = keras.layers.Embedding(NUM, DIM, input_length=SENTENCE_LENGTH,
                          weights=[embedding_matrix], trainable=True)(tweet_input)

In [18]:
branches = []
# Добавляем dropout-регуляризацию
x = keras.layers.Dropout(0.2)(tweet_encoder)

for size, filters_count in [(2, 10), (3, 10), (4, 10), (5, 10)]:
    for i in range(filters_count):
        # Добавляем слой свертки
        branch = keras.layers.Conv1D(filters=1, kernel_size=size, padding='valid', activation='relu')(x)
        # Добавляем слой субдискретизации
        branch = keras.layers.GlobalMaxPooling1D()(branch)
        branches.append(branch)
# Конкатенируем карты признаков
x = keras.layers.concatenate(branches, axis=1)
# Добавляем dropout-регуляризацию
x = keras.layers.Dropout(0.2)(x)
x = keras.layers.Dense(30, activation='relu')(x)
x = keras.layers.Dense(1)(x)
output = keras.layers.Activation('sigmoid')(x)

model_cnn = keras.models.Model(inputs=[tweet_input], outputs=[output])

In [19]:
model_cnn.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model_cnn.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 300)]        0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 300, 300)     150000000   input_1[0][0]                    
__________________________________________________________________________________________________
dropout (Dropout)               (None, 300, 300)     0           embedding[0][0]                  
__________________________________________________________________________________________________
conv1d (Conv1D)                 (None, 299, 1)       601         dropout[0][0]                    
______________________________________________________________________________________________

In [20]:
from tensorflow.keras.callbacks import ModelCheckpoint

checkpoint = keras.callbacks.ModelCheckpoint("mymodel.hdf5", monitor='val_accuracy', save_best_only=True, 
                                             mode='max', period='epoch')
history = model_cnn.fit(x_train_seq, labels, batch_size=32, epochs=10, validation_split=0.2, callbacks = [checkpoint])

Train on 1600 samples, validate on 400 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [60]:
model_cnn.load_weights('mymodel.hdf5')

Посмотрим что получается на тесте

In [61]:
lem=WordNetLemmatizer()
lem_test_cnn = test.iloc[:,0].values
for i,text in enumerate(test.iloc[:,0].values):
    text = re.sub('[^a-zA-Zа-яА-Я1-9]+', ' ', text)
    words_lem=[lem.lemmatize(word) for word in text.split(' ')]
    words_lem_ps = [ps.stem(word) for word in words_lem]  
#     sentiment =  ' '.join(words_lem_ps)
    lem_test_cnn[i] = words_lem_ps
    
x_test_seq = get_sequences(tokenizer, lem_test_cnn)

predicted = model_cnn.predict(x_test_seq)
cnn_pred = [0 if i < 0.5 else 1 for i in predicted]

Получили результат 0.7822

Обучим несколько моделей на новом варианте обработаных данных

In [138]:
w2v = dict(zip(model.wv.index2word, model.wv.syn0))

vec_text=[]
for text in lem_reviews_cnn:
    vec_word=[]
    for word in text:
        if word in w2v:
            vec_word.append(w2v[word])
    mean_vec_text = np.sum(vec_word,axis=0)
    vec_text.append(mean_vec_text)


In [142]:
print('svc_tfidf: ',cross_val_score(RandomForestClassifier(300), vec_text, 
                                    df.iloc[:,-1], scoring = 'accuracy').mean())

svc_tfidf:  0.7070000000000001


In [141]:
print('sgd_tfidf: ',cross_val_score(SGDClassifier(alpha = 0.0001, n_jobs=-1, penalty='l2'), 
                                    vec_text, df.iloc[:,-1], scoring = 'accuracy').mean())

sgd_tfidf:  0.7015


Результат получился хуже, чем был.                      
Но если добавить нашу новую модель сверточной сети к тем что уже были получены раньше, то получим следующие результаты на тесте:

sgd + cnn + net + svc + xgb = 0.813                          
cnn + net + xgb = 0.780                            
xgb + cnn + svc = 0.815

### BERT

In [17]:
import torch
from pytorch_transformers import *

In [18]:
model_class, tokenizer_class, pretrained_weights = (DistilBertModel, DistilBertTokenizer, 'distilbert-base-uncased')

# Загрузка предобученной модели/токенизатора 
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)

In [9]:
#преобразуем каждый отзыв в список идентификаторов.
tokenized = df.iloc[:,0].apply((lambda x: tokenizer.encode(x, add_special_tokens=True)))

In [10]:
#добавим в конец нули, чтобы выровнять по длине отзывы
max_len = 0
for i in tokenized.values:
    if len(i) > max_len:
        max_len = len(i)

padded = np.array([i + [0]*(max_len-len(i)) for i in tokenized.values])

In [11]:
# создадим входной вектор из матрицы токенов и передадим его в DistilBERT
input_ids = torch.tensor(np.array(padded)).to(torch.int64)

with torch.no_grad():
    last_hidden_states = model(input_ids)

In [12]:
#Разрежем выход для первой позиции во всех последовательностях, возьмем все выходы скрытых нейронок 
features = last_hidden_states[0][:,0,:].numpy()

Еще раз подберем параметры обучающих моделей

In [None]:
param = {'penalty':['l1', 'l2','elasticnet'], 'loss':['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron'], 
         'alpha':[0.0001,0.001,0.01], 'n_jobs': [-1]}
clf = GridSearchCV(estimator = SGDClassifier(random_state = 42), param_grid = param, scoring = 'accuracy')
clf.fit(features, labels)
best_sgd = clf.best_params_

In [None]:
param = {'penalty':['l1', 'l2','elasticnet'],'solver':['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'], 
         'C':[1,0.1,0.01], 'n_jobs': [-1]}
clf = GridSearchCV(estimator = LogisticRegression(), param_grid = param, scoring = 'accuracy')
clf.fit(features, labels)
best_logit = clf.best_params_

In [None]:
param = {'penalty':['l1', 'l2'], 'loss':['hinge','squared_hinge'], 'C':[1,2,3]}
clf = GridSearchCV(LinearSVC(random_state = 42), param, scoring = 'accuracy')
clf.fit(features, labels)
best_svc = clf.best_params_

In [49]:
print('svc_bert: ',cross_val_score(LinearSVC(C=2, loss='hinge',random_state = 42), features, labels, 
                                   scoring = 'accuracy').mean())

print('sgd_bert: ',cross_val_score(SGDClassifier(alpha = 0.0001, loss = 'modified_huber', n_jobs=-1, penalty='l1'), 
                                    features, labels, scoring = 'accuracy').mean())

print('logit_bert: ',cross_val_score(LogisticRegression(C=1, solver='liblinear'), 
                                    features, labels, scoring = 'accuracy').mean())

print('xgboost_bert: ',cross_val_score(XGBClassifier(300), 
                                    features, labels, scoring = 'accuracy').mean())

print('forest_bert: ',cross_val_score(RandomForestClassifier(300), 
                                    features, labels, scoring = 'accuracy').mean())

svc_bert:  0.834
sgd_bert:  0.8240000000000001
logit_bert:  0.8230000000000001
xgboost_bert:  0.772
forest_bert:  0.748


Получаем хороший результат 

In [85]:
model_net = keras.Sequential([
    keras.layers.Dense(128, activation='relu', input_shape=(768,)),  
    keras.layers.Dense(64, activation='relu'),   
    keras.layers.Dropout(0.2),
    keras.layers.Dense(2, activation='softmax')
                       ])


model_net.compile(optimizer=tf.keras.optimizers.Adam(),
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

In [86]:
from tensorflow.keras.callbacks import ModelCheckpoint                    

checkpoint_net = keras.callbacks.ModelCheckpoint("mymodel1.hdf5", monitor='val_accuracy', save_best_only=True, 
                                                 mode='max', save_freq='epoch')
history_net= model_net.fit(features, labels, epochs=50, batch_size=16, validation_split=0.2, callbacks=[checkpoint_net])


Train on 1600 samples, validate on 400 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [51]:
model_net.load_weights('mymodel1.hdf5')

(2000, 768)

In [109]:
#обработаем тестовые данные
tokenized_test = test.text.apply((lambda x: tokenizer.encode(x, add_special_tokens=True)))
max_len = 0
for i in tokenized_test.values:
    if len(i) > max_len:
        max_len = len(i)

padded_test = np.array([i + [0]*(max_len-len(i)) for i in tokenized_test.values])

input_ids = torch.tensor(np.array(padded_test)).to(torch.int64)

with torch.no_grad():
    last_hidden_states = model(input_ids)
    
features_test = last_hidden_states[0][:,0,:].numpy()

svc на тесте выдал результат 0,837.                               
Попробуем также объединить модели

In [13]:
logit = LogisticRegression(C=1, solver='liblinear')
svc = LinearSVC(C=2, loss='hinge',random_state = 42)
sgd = SGDClassifier(alpha = 0.0001, loss = 'modified_huber', n_jobs=-1, penalty='l1')
xgboost = XGBClassifier(300)


In [16]:
# logit.fit(features, labels)
svc.fit(features, labels)
# sgd.fit(features, labels)
# xgboost.fit(features, labels)
# model_net.load_weights('mymodel1.hdf5')

LinearSVC(C=2, loss='hinge', random_state=42)

In [122]:
sgd_pred = sgd.predict(features_test)
logit_pred = logit.predict(features_test)
svc_pred = svc.predict(features_test)
net = model_net.predict(features_test)
net_pred = [0 if i< 0.5 else 1 for i in net[:,1]]
xgb_pred = xgboost.predict(features_test)

In [172]:
all_pred = pd.DataFrame()

all_pred['sgd'] = sgd_pred
all_pred['logit'] = logit_pred
all_pred['svc'] = svc_pred
all_pred['net'] = net_pred
all_pred['xgb'] = xgb_pred

all_model = all_pred.sum(axis=1)
answer = [0 if i < 3 else 1 for i in all_model]

In [134]:
example1 = pd.read_csv("products_sentiment_sample_submission.csv")
example1['y'] = answer
example1.to_csv('submission_ton.csv', index=False)

Как итог получаем результат 0,849 и 15 место в соревновании