# Проект для «Викишоп» c BERT

### Описание проекта
Интернет-магазин «Викишоп» запускает новый сервис. Теперь пользователи могут редактировать и дополнять описания товаров, как в вики-сообществах. То есть клиенты предлагают свои правки и комментируют изменения других. Магазину нужен инструмент, который будет искать токсичные комментарии и отправлять их на модерацию.
Обучите модель классифицировать комментарии на позитивные и негативные. В вашем распоряжении набор данных с разметкой о токсичности правок.
Постройте модель со значением метрики качества F1 не меньше 0.75.


In [1]:
import numpy as np
import pandas as pd
import torch
import transformers
from tqdm import notebook
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold

from sklearn.metrics import f1_score

import os
from lightgbm import LGBMClassifier

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [2]:
model_class, tokenizer_class, pretrained_weights = (transformers.DistilBertModel,
                                                    transformers.DistilBertTokenizer,
                                                    'distilbert-base-uncased')
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_transform.bias', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_projector.weight', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [3]:
def get_data(filename):
    
    """
    Reads the data from the given path.
    """

    pth1 = f'/datasets/{filename}'
    pth2 = f'data/{filename}'

    if os.path.exists(pth1):
        df = pd.read_csv(pth1)
        print(f'Reading {pth1}...')
    elif os.path.exists(pth2):
        df = pd.read_csv(pth2)
        print(f'Reading {pth2}...')
    else:
        print('Check the file path')

    return df

### Загрузка данных

In [4]:
df = get_data('toxic_comments.csv')
# Эмбеддинги буду получать для подвыборки (иначе слишком долго)
df = df.sample(1000, random_state=42)

df.head()

Reading data/toxic_comments.csv...


Unnamed: 0,text,toxic
119105,"Geez, are you forgetful! We've already discus...",0
131631,Carioca RFA \n\nThanks for your support on my ...,0
125326,"""\n\n Birthday \n\nNo worries, It's what I do ...",0
111256,Pseudoscience category? \n\nI'm assuming that ...,0
83590,"(and if such phrase exists, it would be provid...",0


In [5]:
df.shape

(1000, 2)

### Токенизация и подготовка данных

In [6]:
tokenized = df['text'].apply(lambda x: tokenizer.encode(x, add_special_tokens=True, max_length=512))

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [7]:
max_len = 0
for i in tokenized.values:
    if len(i) > max_len:
        max_len = len(i)

padded = np.array([i + [0]*(max_len - len(i)) for i in tokenized.values])

attention_mask = np.where(padded != 0, 1, 0)

In [8]:
# Извлечение эмбеддингов
batch_size = 200
embeddings = []
for i in notebook.tqdm(range(padded.shape[0] // batch_size)):
    batch = torch.LongTensor(padded[batch_size*i:batch_size*(i+1)]) 
    attention_mask_batch = torch.LongTensor(attention_mask[batch_size*i:batch_size*(i+1)])
    
    with torch.no_grad():
        batch_embeddings = model(batch, attention_mask=attention_mask_batch)
    
    embeddings.append(batch_embeddings[0][:,0,:].numpy())

  0%|          | 0/5 [00:00<?, ?it/s]

In [9]:
# # Сохраняю эмбеддинги
# with open('embeddings_1000.npy', 'wb') as f:
#     np.save(f, embeddings)
# with open('embeddings_1000.npy', 'rb') as f:
#     a = np.load(f)
# print(a)

In [10]:
features = np.concatenate(embeddings)

features.shape

In [12]:
# features = np.concatenate(embeddings)
targets = df['toxic']

train_features, test_features, train_target, test_target = train_test_split(features, targets,
                                                                            test_size=.5,
                                                                            random_state=42)

### Обучение моделей

In [13]:
train_target.value_counts(normalize=True)

0    0.884
1    0.116
Name: toxic, dtype: float64

> В обучающей выборке существенный дисбаланс классов

#### Линейная регрессия

In [14]:
lg_model = LogisticRegression(max_iter=500, class_weight='balanced')
lg_model.fit(train_features, train_target)

train_predict = lg_model.predict(train_features)
print(f'train F1: {f1_score(train_target, train_predict):.2f}')

test_predict = lg_model.predict(test_features)
print(f'test F1: {f1_score(test_target, test_predict):.2f}')

train F1: 0.94
test F1: 0.64


In [15]:
lg_CV_model = LogisticRegressionCV(cv=50, scoring='f1', max_iter=500, class_weight='balanced')
lg_CV_model.fit(train_features, train_target)

train_predict = lg_CV_model.predict(train_features)
print(f'train F1: {f1_score(train_target, train_predict):.2f}')

test_predict = lg_CV_model.predict(test_features)
print(f'test F1: {f1_score(test_target, test_predict):.2f}')

train F1: 0.97
test F1: 0.65


> Логистическая регрессия дает результат для метрики F1 на 0.1 ниже требуемого порогового

#### LGBM

In [16]:
model = LGBMClassifier(class_weight='balanced', n_estimators=500)
# Оценка качества модели
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
n_scores = cross_val_score(model, train_features, train_target, scoring='f1', cv=cv, n_jobs=-1)
# report performance
print(f'F1: {np.mean(n_scores):.2f}')

F1: 0.53


<div class="alert alert-block alert-warning">
<b> Комментарий студента</b>
    
В обсуждении прочитал, что требуется использовать пайплан, так как в противном случае на тренировочных данных будет пременяться векторизатор, обученный на всех исходных данных (test + train). Я так и не понял, как это реализовать, но пайплайны построить попробовал:

</div>

In [19]:
# В обсуждении прочитал, что требуется использовать пайплан, так как
# векторизатор обучается на всей выборке
pipeline = Pipeline([
    ('ss', StandardScaler()),
    ('logreg', LogisticRegression(max_iter=500, class_weight='balanced'))
])

# Fit pipeline on training data
pipeline.fit(train_features, train_target)
pipeline_predictions = pipeline.predict(test_features)

# Score pipeline on testing data
print(f'test F1: {f1_score(test_target, pipeline_predictions):.2f}')

test F1: 0.56


In [18]:
# Create the pipeline object
# Note this is identical to the code above
polynomial_pipeline = Pipeline([
    # ('poly', PolynomialFeatures()),
    ('ss', StandardScaler()),
    ('logreg', LogisticRegressionCV(cv=20, scoring='f1', max_iter=500, class_weight='balanced'))
])

# Create new parameter dictionary
grid_params = {
    'logreg__penalty': ('l1', 'l2'),
}

# Instantiate new gridsearch object
gs_2 = GridSearchCV(polynomial_pipeline, grid_params)

# Fit model to our training data
gs_2.fit(train_features, train_target)
gs2_predictions = gs_2.predict(test_features)

# Score pipeline on testing data
print(f'test F1: {f1_score(test_target, gs2_predictions):.2f}')

5 fits failed out of a total of 10.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/mike/opt/anaconda3/envs/practicum_new/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/mike/opt/anaconda3/envs/practicum_new/lib/python3.10/site-packages/sklearn/pipeline.py", line 394, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/Users/mike/opt/anaconda3/envs/practicum_new/lib/python3.10/site-packages/sklearn/linear_model/_logistic.py", line 2031, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  Fi

test F1: 0.59


> Все другие реализованные методы даюр результат хуже, чем логистическая регрессия. 

<div class="alert alert-block alert-warning">
<b> Комментарий студента</b>
    
Я явно упускаю что-то важное, но попробую разобраться с твоими подсказками.

</div>