In [2]:
import numpy as np
import pandas as pd
import torch
import transformers
from tqdm import notebook
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

In [3]:
df_tweets = pd.read_csv('/datasets/tweets.csv')
df_tweets.head()

Unnamed: 0,text,positive
0,"@first_timee хоть я и школота, но поверь, у на...",1
1,"Да, все-таки он немного похож на него. Но мой ...",1
2,RT @KatiaCheh: Ну ты идиотка) я испугалась за ...,1
3,"RT @digger2912: ""Кто то в углу сидит и погибае...",1
4,@irina_dyshkant Вот что значит страшилка :D\nН...,1


In [4]:
tokenizer = transformers.BertTokenizer(
    vocab_file='/datasets/ds_bert/vocab.txt')

tokenized = df_tweets['text'].apply(
    lambda x: tokenizer.encode(x, add_special_tokens=True))

max_len = 0
for i in tokenized.values:
    if len(i) > max_len:
        max_len = len(i)

padded = np.array([i + [0]*(max_len - len(i)) for i in tokenized.values])

attention_mask = np.where(padded != 0, 1, 0)

In [5]:
padded.shape

(5000, 133)

In [6]:
config = transformers.BertConfig.from_json_file(
    '/datasets/ds_bert/bert_config.json')
model = transformers.BertModel.from_pretrained(
    '/datasets/ds_bert/rubert_model.bin', config=config)

In [12]:
data = pd.DataFrame(padded).join(df_tweets['positive'])
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,124,125,126,127,128,129,130,131,132,positive
0,101,168,10934,230,11147,241,1985,358,322,54198,...,0,0,0,0,0,0,0,0,0,1
1,101,886,128,888,130,1135,988,2302,10029,801,...,0,0,0,0,0,0,0,0,0,1
2,101,88488,168,252,13771,31785,247,156,1025,1006,...,0,0,0,0,0,0,0,0,0,1
3,101,88488,168,10632,37992,11295,3773,156,108,1153,...,0,0,0,0,0,0,0,0,0,1
4,101,168,10990,11579,230,238,13194,247,11218,271,...,0,0,0,0,0,0,0,0,0,1


In [31]:
data_red = data.sample(frac=400/5000, random_state=42)
data_red

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,124,125,126,127,128,129,130,131,132,positive
1501,101,168,2651,10880,230,56267,10725,168,10018,2850,...,0,0,0,0,0,0,0,0,0,1
2586,101,168,254,6267,277,1693,230,53871,5630,293,...,0,0,0,0,0,0,0,0,0,0
2653,101,168,95280,3648,54447,65749,18362,814,17751,814,...,0,0,0,0,0,0,0,0,0,0
1055,101,358,3852,3511,14091,297,323,128,14026,14285,...,0,0,0,0,0,0,0,0,0,1
705,101,322,358,927,80200,80320,2060,128,10244,846,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2753,101,168,10805,10854,11318,233,230,250,4270,4235,...,0,0,0,0,0,0,0,0,0,0
1632,101,17991,1836,5893,323,8571,801,4014,122,122,...,0,0,0,0,0,0,0,0,0,1
1650,101,88488,168,22981,2870,2850,57798,156,168,7592,...,0,0,0,0,0,0,0,0,0,1
3860,101,168,90462,11989,230,230,12839,8419,7437,303,...,0,0,0,0,0,0,0,0,0,0


In [26]:
padded = data_red.drop('positive', axis='columns').values
padded

array([[  101,   168,  2651, ...,     0,     0,     0],
       [  101,   168,   254, ...,     0,     0,     0],
       [  101,   168, 95280, ...,     0,     0,     0],
       ...,
       [  101, 88488,   168, ...,     0,     0,     0],
       [  101,   168, 90462, ...,     0,     0,     0],
       [  101, 88488,   168, ...,     0,     0,     0]])

In [27]:
padded.shape

(400, 133)

In [28]:
batch_size = 100
embeddings = []
for i in notebook.tqdm(range(padded.shape[0] // batch_size)):
        batch = torch.LongTensor(padded[batch_size*i:batch_size*(i+1)]) 
        attention_mask_batch = torch.LongTensor(attention_mask[batch_size*i:batch_size*(i+1)])
        
        with torch.no_grad():
            batch_embeddings = model(batch, attention_mask=attention_mask_batch)
        
        embeddings.append(batch_embeddings[0][:,0,:].numpy())

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))




In [32]:
features = np.concatenate(embeddings)


# обучите и протестируйте модель
# < напишите код здесь >

train_x_col, train_y_col, test_x_col, test_y_col = train_test_split(features, data_red['positive'], 
                                                                    test_size=0.5, random_state=42)
len(train_x_col) + len(test_x_col) == len(features)

True

In [35]:
lr_model = LogisticRegression(solver='liblinear')

cv_score = cross_val_score(lr_model, features, data_red['positive'], cv=5, scoring='accuracy')
cv_score

array([0.90123457, 0.81481481, 0.85      , 0.79746835, 0.87341772])

In [34]:
cv_score.mean()

0.8473870917330834