<center><h1>Проект для "Викишоп" с BERT

Интернет-магазин «Викишоп» запускает новый сервис. Теперь пользователи могут редактировать и дополнять описания товаров, как в вики-сообществах. То есть клиенты предлагают свои правки и комментируют изменения других. Магазину нужен инструмент, который будет искать токсичные комментарии и отправлять их на модерацию.

# Подготовка

In [1]:
import pandas as pd
import numpy as np
import torch
from tqdm import notebook
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split, GridSearchCV
from tqdm import tqdm
import sys
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import make_scorer

In [2]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.35.0-py3-none-any.whl (7.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.9/7.9 MB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m25.3 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m21.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m24.6 MB/s[0m eta [36m0:00:00[0m
Col

In [3]:
import transformers
import transformers as ppb

In [4]:
from transformers import AutoTokenizer, AutoModelForMaskedLM

In [5]:
data = pd.read_csv('https://code.s3.yandex.net/datasets/toxic_comments.csv', engine='python').drop('Unnamed: 0', axis=1)
data = data.sample(200).reset_index(drop=True)

In [6]:
data.head()

Unnamed: 0,text,toxic
0,Vampire hunter d splitting \n\nI don't really ...,0
1,You are receiving this message because you hav...,0
2,Bullfinger\nHaving just read your message on m...,0
3,"""\n\n Carol Kivler testimonial and mechanism o...",0
4,Thanks \n\nThanks for the tip! I don't know if...,0


In [44]:
model = AutoModelForMaskedLM.from_pretrained('distilbert-base-uncased')

In [8]:
!pip install torch



In [37]:
#tokenizer = transformers.BertTokenizer(vocab_file='vocab.txt')
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')
tokenizer.model_max_length = sys.maxsize
tokenized = data['text'].apply(lambda x: tokenizer.encode(x, add_special_tokens=True, max_length=512, truncation=True))

max_len = 0
for i in tokenized.values:
    if len(i) > max_len:
        max_len = len(i)

padded = np.array([i + [0]*(max_len - len(i)) for i in tokenized.values])

attention_mask = np.where(padded != 0, 1, 0)

In [45]:
batch_size = 50
embeddings = []
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
for i in notebook.tqdm(range(padded.shape[0] // batch_size)):

        batch = torch.LongTensor(padded[batch_size*i:batch_size*(i+1)]).to(device)
        attention_mask_batch = torch.LongTensor(attention_mask[batch_size*i:batch_size*(i+1)]).to(device)

        with torch.no_grad():
          model.to(device)
          batch_embeddings = model(batch, attention_mask=attention_mask_batch)

        embeddings.append(batch_embeddings[0][:,0,:].cpu().numpy())

        del batch
        del attention_mask_batch
        del batch_embeddings

  0%|          | 0/4 [00:00<?, ?it/s]

In [41]:
features = np.concatenate(embeddings)

target = data['toxic']
features_train, features_test, target_train, target_test = train_test_split(features, target,
                                                                            test_size=0.2, random_state=12345, stratify=target)

# Обучение

In [42]:
scorer = make_scorer(f1_score, greater_is_better=False)
model = DecisionTreeClassifier()

grid_tree = GridSearchCV(estimator=model, cv=5, param_grid={'max_depth': range(1, 20)}, scoring=scorer)
grid_tree.fit(features_train, target_train)

grid_tree.best_score_ * (-1)

0.06666666666666667

In [27]:
model = RandomForestClassifier()
grid_forest = GridSearchCV(model, param_grid={'max_depth': range(1, 20), 'n_estimators': range(10, 31, 10)},
                           scoring=scorer, cv=5)
grid_forest.fit(features_train, target_train)
grid_forest.best_score_ * (-1)

-0.0

In [43]:
model = LogisticRegression(max_iter=1000)
grid_log = GridSearchCV(estimator=model, cv=5, param_grid={}, scoring=scorer)
grid_log.fit(features_train, target_train)
grid_log.best_score_ * (-1)

0.29333333333333333

Логистическая регрессия показала наилучший результат, поэтому проверим тестовую выборку на ней.

In [29]:
model = LogisticRegression(max_iter=1000)
model.fit(features_train, target_train)
predictions = model.predict(features_test)
f1_score(target_test, predictions)

0.5

# Выводы

Лучший результат показала логистическая регрессия: 0.5