In [1]:
import sys

# Make the parent directory importable so the `src.*` project imports below
# resolve. The membership check keeps the cell idempotent: re-running it no
# longer appends a duplicate '../' entry to sys.path each time.
if '../' not in sys.path:
    sys.path.append('../')

In [2]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from tqdm import tqdm

#transformers
from transformers import BertTokenizer
from transformers import TFBertModel
from transformers import RobertaTokenizerFast
from transformers import TFRobertaModel
from transformers import BertForSequenceClassification
from transformers import get_cosine_schedule_with_warmup
from transformers import AdamW

#torch
import torch
import torch.nn as nn
from torch.utils.data import DataLoader

#metrics
from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics import classification_report, confusion_matrix

# our functions
from src.data.make_dataset import final_dataset
from src.data.fix_balance import sampling_balance
from src.data.split_data import split
from src.data.encode_data import ohe_encoding
from src.data.vectorizer import text_vectorize

# Set seed for reproducibility.
# Defining the constant alone seeds nothing, so apply it to the global RNGs of
# the libraries imported above (NumPy and PyTorch).
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

In [3]:
# Load the raw reviews. The path is relative to the notebook's working
# directory (the same '../' project root that makes `src` importable), so the
# notebook no longer depends on one user's absolute home-directory path.
data = pd.read_csv('../data/raw/BRFRD.csv')
data.head()

Unnamed: 0,review,rating,published,full review url
0,"«Головоломка» — продукт универсальный, и, чтоб...",90,2015-6-22,http://www.kino-mir.ru/posts/view/147
1,"А если подвести итоги, то Пиксар создал, не по...",100,2015-6-18,https://www.uralweb.ru/poster/reviews/6694.html
2,И все же плюсы «Головоломки» перевешивают ее м...,100,2015-6-19,http://www.tramvision.ru/recensia/2015/golovol...
3,"На выходе из зала есть ощущение, что Pixar сде...",100,2015-6-18,http://afisha.ngs.ru/news/more/2181412/
4,"Да, перед нами настоящий, старой школы пиксаро...",100,2015-6-16,http://www.kinokadr.ru/articles/2015/06/17/ins...


#### Генерация обработанного датасета

In [4]:
# Build the processed dataset from the raw reviews with the project helper.
# NOTE(review): the recorded run emits a pandas SettingWithCopyWarning from
# `data['token_lens'] = token_lens` — presumably inside final_dataset, which
# would then be assigning to a slice; confirm and fix there with .copy()/.loc.
final_data = final_dataset(data)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['token_lens'] = token_lens


In [5]:
# Class distribution of the processed data. In the recorded run the classes are
# strongly imbalanced (15526 / 9706 / 2063 rows for classes 2 / 1 / 0).
final_data['class'].value_counts()

class
2    15526
1     9706
0     2063
Name: count, dtype: int64

#### Устранение дисбаланса классов

In [6]:
# Balance the classes with the project helper; the counts in the next cell show
# every class brought up to the majority size.
# NOTE(review): balancing runs BEFORE the train/valid split below. If
# sampling_balance works by duplicating rows, copies of the same review can end
# up in both splits and inflate validation metrics — confirm, and consider
# balancing only the training split.
data_os = sampling_balance(final_data)

In [7]:
# Verify the balancing step: in the recorded run each class has 15526 rows.
data_os['class'].value_counts()

class
2    15526
1    15526
0    15526
Name: count, dtype: int64

#### Разбиение данных на обучающую и валидационную выборки

In [8]:
# Split the balanced data into train and validation features/targets using the
# project helper (split ratio and shuffling are defined inside `split`).
# NOTE(review): presumably the split is random — confirm it is seeded so the
# partition is reproducible across runs.
X_train, X_valid, y_train, y_valid = split(data_os)

#### OneHotEncoding

In [9]:
# One-hot encode the train/validation targets with the project helper.
# NOTE(review): the results overwrite y_train / y_valid, so this cell is not
# idempotent — re-running it passes the already-encoded targets back into
# ohe_encoding. Re-run the split cell first if this cell must be repeated.
y_train, y_valid = ohe_encoding(y_train, y_valid)

#### Векторизация (TF-IDF, CountVectorizer)

In [10]:
# Vectorize the train/validation texts with the project helper (per the section
# heading: TF-IDF / CountVectorizer — the exact vectorizer is set in text_vectorize).
# NOTE(review): X_train_tf / X_valid_tf are not used by any later cell shown
# in this notebook — confirm whether this step is still needed.
X_train_tf, X_valid_tf = text_vectorize(X_train, X_valid)

In [11]:
# Sanity check: the label column and the frame index should report the same
# number of rows (one value per line).
print(len(final_data['class']), len(final_data.index), sep='\n')

27295
27295


In [12]:
# Encode every row's class as a contiguous integer id.
# The mapping is kept under its own name instead of being bound to `labels`
# and then overwritten by the list, so it stays available for decoding
# predictions later. `CLASSES` and `labels` hold the same values as before.
# NOTE(review): ids follow the order in which classes first appear in
# final_data, so they need not equal the original class values — confirm that
# downstream code expects this ordering.
CLASSES = list(final_data['class'].unique())
label_to_id = {cls: idx for idx, cls in enumerate(CLASSES)}
labels = [label_to_id[cls] for cls in final_data['class']]

In [13]:
from src.data.make_dataset import BERTDataset
from src.models.train_model import BertClassifier

In [14]:
# Reload the raw reviews for the BERT pipeline. The path is relative to the
# notebook's working directory (the same '../' project root that makes `src`
# importable) rather than one user's absolute home-directory path.
data = pd.read_csv('../data/raw/BRFRD.csv')
# The same checkpoint id is used for both the model weights and the tokenizer.
model_path = 'cointegrated/rubert-tiny'
tokenizer_path = 'cointegrated/rubert-tiny'
# Project wrapper around the classifier; trained for 4 epochs in the cells below.
bert_tiny = BertClassifier(model_path, tokenizer_path, data, epochs=4)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at cointegrated/rubert-tiny and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
# Run the classifier's data-preparation step before training.
# NOTE(review): the recorded run prints the same SettingWithCopyWarning
# (`data['token_lens'] = token_lens`) twice here — presumably the dataset
# builder is invoked for two splits; confirm and fix the assignment at its source.
bert_tiny.preparation()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['token_lens'] = token_lens
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['token_lens'] = token_lens


In [16]:
# Train the classifier; per-epoch train/validation loss and accuracy are printed.
# NOTE(review): in the recorded run validation loss rises after epoch 2
# (0.206 -> 0.213 -> 0.220) while train loss keeps falling, which looks like
# overfitting — consider fewer epochs or early stopping.
# The tuple shown as the cell output is fit()'s return value; presumably the
# last epoch's accumulated train accuracy count and loss sum — TODO confirm.
bert_tiny.fit()

100%|██████████| 5801/5801 [01:40<00:00, 57.68it/s]
100%|██████████| 1024/1024 [00:07<00:00, 144.44it/s]


Epochs: 1 | Train Loss:  0.217                 | Train Accuracy:  0.576                     | Val Loss:  0.210                         | Val Accuracy:  0.590


100%|██████████| 5801/5801 [01:30<00:00, 64.21it/s]
100%|██████████| 1024/1024 [00:07<00:00, 145.29it/s]


Epochs: 2 | Train Loss:  0.195                 | Train Accuracy:  0.634                     | Val Loss:  0.206                         | Val Accuracy:  0.605


100%|██████████| 5801/5801 [01:30<00:00, 64.32it/s]
100%|██████████| 1024/1024 [00:07<00:00, 144.10it/s]


Epochs: 3 | Train Loss:  0.173                 | Train Accuracy:  0.693                     | Val Loss:  0.213                         | Val Accuracy:  0.604


100%|██████████| 5801/5801 [01:30<00:00, 64.15it/s]
100%|██████████| 1024/1024 [00:07<00:00, 144.36it/s]

Epochs: 4 | Train Loss:  0.157                 | Train Accuracy:  0.732                     | Val Loss:  0.220                         | Val Accuracy:  0.599





(17001, 3635.786056533456)