# Описание проекта

В нашем распоряжении датасет с размечеными комментариями от пользователей `toxic_comments.csv`. Основная задача проекта: `Обучить модель классифицировать комментарии на негативные и позитивные`. Условие выполнения задачи: `Построить модель со значением метрики качества F1 не меньше 0.75`. В ноутбуке я использовал предобученноую модель BERT, код которой можно посмотреть в моём ноутбуке на Google Colab: [Модель BERT в Google Colab](https://colab.research.google.com/drive/1iocPsw7WVDOo1_94PXHoloRzuiwN04DX 'Модель BERT в Google Colab')

# Импорты и загрузка данных

In [3]:
import pandas as pd
import numpy as np
import re
from joblib import dump, load

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

import spacy
import en_core_web_sm

from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.metrics import f1_score, mean_absolute_error as mae
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.utils import shuffle

from tqdm.notebook import tqdm

In [5]:
data = pd.read_csv('/datasets/toxic_comments.csv')                                # full dataframe
X_bert_balanced = np.array(pd.read_csv('/datasets/features_bert_balanced.csv'))   # BERT features with class balance
y_bert_balanced = pd.read_csv('/datasets/df_bert_balanced.csv')['toxic']          # BERT target with class balance

# display
print(data.shape)                # full dataframe
print(X_bert_balanced.shape)     # BERT features with class balance
print(y_bert_balanced.shape)     # BERT target with class balance

(159571, 2)
(450, 768)
(450,)


С BERT я работал в Google Colab. Посмотреть ноутбук можно по ссылке:

[Модель BERT в Google Colab](https://colab.research.google.com/drive/1iocPsw7WVDOo1_94PXHoloRzuiwN04DX 'Модель BERT в Google Colab')

# Предобработка

In [4]:
def clean(text):
    
    text = text.lower()    
    text = re.sub(r"(?:\n|\r)", " ", text)
    text = re.sub(r"[^a-zA-Z ]+", "", text).strip()
    
    return text

data['text_clean'] = data['text'].apply(clean)
display(data.head())

Unnamed: 0,text,toxic,text_clean
0,Explanation\nWhy the edits made under my usern...,0,explanation why the edits made under my userna...
1,D'aww! He matches this background colour I'm s...,0,daww he matches this background colour im seem...
2,"Hey man, I'm really not trying to edit war. It...",0,hey man im really not trying to edit war its j...
3,"""\nMore\nI can't make any real suggestions on ...",0,more i cant make any real suggestions on impro...
4,"You, sir, are my hero. Any chance you remember...",0,you sir are my hero any chance you remember wh...


In [5]:
nlp = en_core_web_sm.load()

def lemmatize(text):
    temp = []
    for token in nlp(text):
        if token.is_stop == False:
            temp.append(token.lemma_)
    return " ".join(temp)


tqdm.pandas()
data['text_lemma'] = data['text_clean'].progress_apply(lemmatize)

  from pandas import Panel


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=159571.0), HTML(value='')))




In [6]:
dump(data, '/datasets/toxic_comments_ppg.csv')
data = load('/datasets/toxic_comments_ppg.csv')

Я сохраню предобработанные данные в файл для последующей загрузки, чтобы избежать повторного процесса лемматизации при повторном запуске ноутбука.

# Баланс в классах

In [7]:
# посмотрим на баланс классов в таргете
print(data['toxic'].value_counts(normalize=True))    # full dataframe
print(y_bert_balanced.value_counts(normalize=True))  # BERT target with class balance

0    0.898321
1    0.101679
Name: toxic, dtype: float64
1    0.555556
0    0.444444
Name: toxic, dtype: float64


Для балансировки классов я буду использовать технику увеличения выборки

In [7]:
def upsample(X, y, repeat):
    X_zeros = X[y == 0]
    X_ones = X[y == 1]
    y_zeros = y[y == 0]    
    y_ones = y[y == 1]    
    
    X_upsampled = pd.concat(
        [X_zeros] + [X_ones] * repeat)    
    y_upsampled = pd.concat(
        [y_zeros] + [y_ones] * repeat)    
    X_upsampled, y_upsampled = shuffle(        
        X_upsampled, y_upsampled, random_state=42) 
    
    return X_upsampled, y_upsampled

In [8]:
X = data['text_lemma']
y = data['toxic']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.25, random_state=42)

X_train, y_train = upsample(X_train, y_train, repeat=6)

print(y_train.value_counts(normalize=True))    # train class balance info
print(y_test.value_counts(normalize=True))     # test class balance info
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

0    0.898253
1    0.101747
Name: toxic, dtype: float64
0    0.595608
1    0.404392
Name: toxic, dtype: float64
(180508,) (180508,)
(39893,) (39893,)



# 2. Обучение

# Logit + TF IDF

In [69]:
pipe = Pipeline([
    ('vectorizer', TfidfVectorizer(ngram_range=(1, 1))),
    ('model', LogisticRegression(max_iter=150, solver='liblinear', random_state=42))
])

params = {
    'vectorizer__ngram_range': [(1, 1), (3, 1), (5, 2)],
    'model': [LogisticRegression(max_iter=150, solver='liblinear', random_state=42)],
    'model__C': [1, 5, 10, 25],
}

In [70]:
cv = KFold(n_splits=5, shuffle=True, random_state=42)

grid = GridSearchCV(pipe, param_grid=params, scoring='f1', cv=cv, n_jobs=-1)

In [71]:
%%time
grid.fit(X_train, y_train)
display(grid.best_estimator_, grid.best_params_, grid.best_score_)

Pipeline(steps=[('vectorizer', TfidfVectorizer()),
                ('model',
                 LogisticRegression(C=25, max_iter=150, random_state=42,
                                    solver='liblinear'))])

{'model': LogisticRegression(C=25, max_iter=150, random_state=42, solver='liblinear'),
 'model__C': 25,
 'vectorizer__ngram_range': (1, 1)}

0.9735486184306842

Wall time: 1min 6s


In [72]:
lr = grid.best_estimator_
lr.fit(X_train, y_train)
predictions = lr.predict(X_test)
f1_score(predictions, y_test)

0.7675321523901966

С помощью подбора параметров нам удалось достич метрики качества F1 = 0.75. Посмотрим как работает Logit + BERT

# Logit + BERT

In [73]:
X_train_bert, X_test_bert = train_test_split(X_bert_balanced, test_size=.5, random_state=42)
y_train_bert, y_test_bert = train_test_split(y_bert_balanced, test_size=.5, random_state=42)

In [74]:
pipe = Pipeline([
    ('model', LogisticRegression(max_iter=150, solver='liblinear', random_state=42))
])

params = [
        {
            'model': [LogisticRegression(max_iter=1000, solver='liblinear', \
                                         random_state=42, class_weight='balanced')],
            'model__C': [1, 5, 10, 25],
        }
]

In [75]:
cv = KFold(n_splits=5, shuffle=True, random_state=42)

grid = GridSearchCV(pipe, param_grid=params, scoring='f1', cv=cv, n_jobs=-1)

In [76]:
%%time
grid.fit(X_train_bert, y_train_bert)
display(grid.best_estimator_, grid.best_params_, grid.best_score_)

Pipeline(steps=[('model',
                 LogisticRegression(C=1, class_weight='balanced', max_iter=1000,
                                    random_state=42, solver='liblinear'))])

{'model': LogisticRegression(C=1, class_weight='balanced', max_iter=1000, random_state=42,
                    solver='liblinear'),
 'model__C': 1}

0.8602532658783453

Wall time: 487 ms


In [77]:
bert = grid.best_estimator_
bert.fit(X_train_bert, y_train_bert)
predictions = bert.predict(X_test_bert)
f1_score(predictions, y_test_bert)

0.8515625

# Общий вывод

У нас получилось достич нужного рейтинга F1. Модель Logit + BERT показала результат F1 = 0.86, это лучше чем Logit + TF IDF F1 = 0.76 на 0.10!