# Определение токсичности комментариев

Интернет-магазин запускает новый сервис, где пользователи смогут редактировать и дополнять описания товаров. Магазину нужен инструмент, который будет искать токсичные комментарии и отправлять их на модерацию. 

В распоряжении набор данных с разметкой о токсичности правок.

## Подготовка

In [1]:
#импортируем библиотеки
!pip install lightgbm
import pandas as pd
import nltk
import re
import time 
import os
import matplotlib.pyplot as plt
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('omw-1.4')
from nltk.corpus import stopwords as nltk_stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

#отключаем предупреждения
import warnings
warnings.filterwarnings('ignore')
pd.options.mode.chained_assignment = None



[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Дмитрий\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Дмитрий\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Дмитрий\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [2]:
# Load the dataset: try the local Windows path first, then the hosted path.
pth1 = 'C:\\Users\\Дмитрий\\Desktop\\Jupyter Notebook\\Project\\toxic_comments.csv'
pth2 = '/datasets/toxic_comments.csv'

if os.path.exists(pth1):
    data = pd.read_csv(pth1)
elif os.path.exists(pth2):
    data = pd.read_csv(pth2)
else:
    # Fail right here with a clear message: merely printing would leave `data`
    # undefined and the next cell would die with an unrelated-looking NameError.
    raise FileNotFoundError(
        f'toxic_comments.csv not found at {pth1!r} or {pth2!r}'
    )

In [4]:
# Print the column names, non-null counts and dtypes of the loaded frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159571 entries, 0 to 159570
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   text    159571 non-null  object
 1   toxic   159571 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 2.4+ MB


In [5]:
# Share of each class in the `toxic` target column (normalize=True gives fractions).
data['toxic'].value_counts(normalize=True)

0    0.898321
1    0.101679
Name: toxic, dtype: float64

Практически 90% сообщений нетоксичного характера. Классы несбалансированы, значит, в моделировании будем применять балансировку.

In [6]:
# Convert the comment texts to lower case.
data['text'] = data['text'].str.lower()

In [7]:
# Count missing values in each column (this checks for NaN, not for duplicates).
data.isna().sum()

text     0
toxic    0
dtype: int64

In [8]:
# Lemmatization helper: one shared lemmatizer instance reused by every call.
lemmatizer = WordNetLemmatizer()


def get_lemmas(sentence):
    """Lemmatize *sentence* and keep only Latin letters.

    The text is tokenized with ``nltk.word_tokenize``, each token is passed
    through ``lemmatizer.lemmatize`` and the tokens are re-joined with spaces.
    Every character outside ``a-z``/``A-Z`` is then replaced by a space and
    runs of whitespace are collapsed, so the returned string has its words
    separated by single spaces with no leading or trailing blanks.
    """
    lemmas = map(lemmatizer.lemmatize, nltk.word_tokenize(sentence))
    letters_only = re.sub(r'[^a-zA-Z]', ' ', ' '.join(lemmas))
    return ' '.join(letters_only.split())

In [9]:
%%time

# Lemmatize every comment and store the result in a new `lemmas` column.
data['lemmas'] = data['text'].apply(get_lemmas)

CPU times: total: 2min 25s
Wall time: 2min 26s


In [10]:
# Show the frame, now including the `lemmas` column next to the raw text.
display(data)

Unnamed: 0,text,toxic,lemmas
0,explanation\nwhy the edits made under my usern...,0,explanation why the edits made under my userna...
1,d'aww! he matches this background colour i'm s...,0,d aww he match this background colour i m seem...
2,"hey man, i'm really not trying to edit war. it...",0,hey man i m really not trying to edit war it s...
3,"""\nmore\ni can't make any real suggestions on ...",0,more i ca n t make any real suggestion on impr...
4,"you, sir, are my hero. any chance you remember...",0,you sir are my hero any chance you remember wh...
...,...,...,...
159566,""":::::and for the second time of asking, when ...",0,and for the second time of asking when your vi...
159567,you should be ashamed of yourself \n\nthat is ...,0,you should be ashamed of yourself that is a ho...
159568,"spitzer \n\numm, theres no actual article for ...",0,spitzer umm there no actual article for prosti...
159569,and it looks like it was actually you who put ...,0,and it look like it wa actually you who put on...


In [11]:
# Hold out 20% of the rows as the test set; the seed is fixed for reproducibility.
features_train, features_test, target_train, target_test = train_test_split(
    data['lemmas'],
    data['toxic'],
    test_size=0.2,
    random_state=12345,
)

# Sanity check: print the shape of each of the four pieces, one per line.
print(
    features_train.shape,
    features_test.shape,
    target_train.shape,
    target_test.shape,
    sep='\n',
)

(127656,)
(31915,)
(127656,)
(31915,)


In [12]:
# Build TF-IDF features. The vectorizer is fitted on the training texts only
# and then applied to the test texts.
# The stop-word corpus is not among the NLTK packages downloaded at the top of
# the notebook; fetch it here so the cell also runs on a fresh environment.
nltk.download('stopwords', quiet=True)
# Pass a list, not a set: scikit-learn's parameter validation (1.2+) accepts
# only 'english', a list or None for `stop_words`.
stopwords = list(nltk_stopwords.words('english'))
count_tfidf = TfidfVectorizer(stop_words=stopwords)
features_train = count_tfidf.fit_transform(features_train)
features_test = count_tfidf.transform(features_test)
print(features_train.shape)
print(features_test.shape)

(127656, 141656)
(31915, 141656)


In [14]:
# Cross-validation scheme shared by the models below: 5 stratified, shuffled folds with a fixed seed.
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=12345)   

In [15]:
# Accumulator for per-model metrics; modelling() appends one value to each list per call.
results = {'training_time': [],
           'pred_time': [],
           'cv_f1': [],
           'test_f1': []
          }

In [16]:
# Helper that fits a model, times it and appends the metrics to `results`.
def modelling(model, features_train, target_train, features_test, target_test, cv_res):
    """Fit *model*, predict on the test set and record timings and F1 scores.

    Appends one entry to each list of the module-level ``results`` dict:
    ``training_time`` and ``pred_time`` (seconds, formatted to 3 decimals),
    ``test_f1`` and ``cv_f1`` (formatted to 2 decimals), then displays the
    whole dict.

    Args:
        model: Estimator exposing ``fit`` and ``predict``; it is fitted in place.
        features_train: Training features passed to ``model.fit``.
        target_train: Training labels passed to ``model.fit``.
        features_test: Features passed to ``model.predict``.
        target_test: True labels compared against the predictions with F1.
        cv_res: Cross-validation score(s) computed beforehand. ``.mean()`` is
            called on it, so it must provide that method (the notebook passes
            the array from ``cross_val_score`` or a ``best_score_`` scalar).
    """
    # perf_counter() is monotonic and high-resolution; time.time() follows the
    # wall clock and can jump if the system time is adjusted mid-measurement.
    start_train = time.perf_counter()
    model.fit(features_train, target_train)
    stop_train = time.perf_counter()
    pred_test = model.predict(features_test)
    stop_test = time.perf_counter()

    train_seconds = stop_train - start_train
    pred_seconds = stop_test - stop_train
    # The F1 computation is deliberately outside the timed prediction interval.
    test_f1 = f1_score(target_test, pred_test)

    # setdefault keeps the helper usable even if `results` was reset to {}.
    results.setdefault('training_time', []).append(f'{train_seconds:.3f}')
    results.setdefault('pred_time', []).append(f'{pred_seconds:.3f}')
    results.setdefault('test_f1', []).append(f'{test_f1:.2f}')
    results.setdefault('cv_f1', []).append(f'{abs(cv_res.mean()):.2f}')
    display(results)

## Обучение

### Логистическая регрессия

In [17]:
# Logistic regression with class_weight='balanced'; F1 is estimated by
# cross-validation on the training features using the shared `folds` splitter.
model_lg = LogisticRegression(random_state=12345, class_weight='balanced')
cv_lg = cross_val_score(estimator=model_lg, 
                      X=features_train,
                      y=target_train, 
                      scoring='f1',
                      cv=folds, 
                      n_jobs=-1)

# Mean F1 across the folds.
print(abs(cv_lg.mean()))

0.747085649773487


In [18]:
# Fit on the training split, score on the test split and append the metrics to `results`.
modelling(model_lg, features_train, target_train, features_test, target_test, cv_lg)

{'training_time': ['4.920'],
 'pred_time': ['0.004'],
 'cv_f1': ['0.75'],
 'test_f1': ['0.75']}

### Дерево решений

In [19]:
%%time

# Decision tree: grid-search max_depth over 5..25, scored by F1 on the shared folds.
model_dt = DecisionTreeClassifier(random_state=12345,  class_weight='balanced')
parameters = {'max_depth': range(5,26,1)}
grid_dt = GridSearchCV(model_dt, param_grid=parameters, scoring='f1', n_jobs=-1, cv=folds)
grid_dt.fit(features_train, target_train)
# Best mean cross-validated F1; later passed to modelling() as the CV score.
cv_dt= grid_dt.best_score_
print(grid_dt.best_params_)
print(abs(grid_dt.best_score_))

{'max_depth': 25}
0.6163614888104212
CPU times: total: 23.5 s
Wall time: 10min 51s


In [20]:
# Re-fit the best tree (to time it), score it on the test split and record the metrics.
modelling(grid_dt.best_estimator_, features_train, target_train, features_test, target_test, cv_dt)

{'training_time': ['4.920', '28.602'],
 'pred_time': ['0.004', '0.016'],
 'cv_f1': ['0.75', '0.62'],
 'test_f1': ['0.75', '0.61']}

### Случайный лес

In [21]:
%%time

# Random forest: grid-search max_depth (11..15) and n_estimators, scored by F1 on the shared folds.
model_rf = RandomForestClassifier(random_state=12345, class_weight='balanced')
parameters = {'max_depth': range(11,16,1), 'n_estimators' : [10, 20, 40, 60, 100, 200, 300, 500]}
grid_rf = GridSearchCV(model_rf, param_grid=parameters, scoring='f1', n_jobs=-1, cv=folds)
grid_rf.fit(features_train, target_train)
# Best mean cross-validated F1; later passed to modelling() as the CV score.
cv_rf= grid_rf.best_score_
print(grid_rf.best_params_)
print(abs(grid_rf.best_score_))

{'max_depth': 15, 'n_estimators': 500}
0.3924767346800335
CPU times: total: 39 s
Wall time: 53min 45s


In [22]:
# Re-fit the best forest (to time it), score it on the test split and record the metrics.
modelling(grid_rf.best_estimator_, features_train, target_train, features_test, target_test, cv_rf)

{'training_time': ['4.920', '28.602', '34.744'],
 'pred_time': ['0.004', '0.016', '2.781'],
 'cv_f1': ['0.75', '0.62', '0.39'],
 'test_f1': ['0.75', '0.61', '0.39']}

### LGBM Classifier

In [23]:
%%time

# LightGBM: grid-search learning_rate, max_depth and n_estimators, scored by F1.
# Parallelism is requested on the estimator (n_jobs=-1); GridSearchCV itself
# is created without n_jobs.
model_lgbm = LGBMClassifier(n_jobs=-1, random_state=12345, class_weight='balanced')
parameters = {'learning_rate': [0.1, 0.2, 0.3],
              'max_depth': [3, 6, 10, 15],
              'n_estimators': [500, 1000, 1500]
             }
grid_lgbm = GridSearchCV(model_lgbm, param_grid=parameters, cv=folds, scoring='f1')

grid_lgbm.fit(features_train, target_train)
# Best mean cross-validated F1; later passed to modelling() as the CV score.
cv_lgbm = grid_lgbm.best_score_

print(grid_lgbm.best_params_)
print(grid_lgbm.best_score_)

{'learning_rate': 0.1, 'max_depth': 15, 'n_estimators': 1500}
0.7742267602204005
CPU times: total: 1d 9h 54min 59s
Wall time: 4h 17min 47s


In [24]:
# Re-fit the best LightGBM model (to time it), score it on the test split and record the metrics.
modelling(grid_lgbm.best_estimator_, features_train, target_train, features_test, target_test, cv_lgbm)

{'training_time': ['4.920', '28.602', '34.744', '156.012'],
 'pred_time': ['0.004', '0.016', '2.781', '2.454'],
 'cv_f1': ['0.75', '0.62', '0.39', '0.77'],
 'test_f1': ['0.75', '0.61', '0.39', '0.77']}

### Метод ближайших соседей (KNeighbors)

In [25]:
%%time

# k-nearest neighbours: grid-search n_neighbors in {1, 2} and the weighting scheme, scored by F1.
# No class weighting is configured for this estimator, unlike the other models.
model_kn = KNeighborsClassifier(n_jobs=-1)
parameters = {'n_neighbors': range(1, 3, 1),
              'weights': ['uniform', 'distance'],
             }
grid_kn = GridSearchCV(model_kn, param_grid=parameters, cv=folds, scoring='f1')

grid_kn.fit(features_train, target_train)
# Best mean cross-validated F1; later passed to modelling() as the CV score.
cv_kn = grid_kn.best_score_

print(grid_kn.best_params_)
print(grid_kn.best_score_)

{'n_neighbors': 1, 'weights': 'uniform'}
0.36122019624176477
CPU times: total: 1h 16min 32s
Wall time: 23min 29s


In [26]:
# Re-fit the best k-NN model (to time it), score it on the test split and record the metrics.
modelling(grid_kn.best_estimator_, features_train, target_train, features_test, target_test, cv_kn)

{'training_time': ['4.920', '28.602', '34.744', '156.012', '0.016'],
 'pred_time': ['0.004', '0.016', '2.781', '2.454', '104.273'],
 'cv_f1': ['0.75', '0.62', '0.39', '0.77', '0.36'],
 'test_f1': ['0.75', '0.61', '0.39', '0.77', '0.41']}

### Метод опорных векторов (SVM)

In [27]:
%%time

# Support vector classifier with class_weight='balanced': grid-search C over
# three values, scored by F1 on the shared folds.
model_svc = SVC(class_weight='balanced', random_state=12345)
parameters = {'C': [0.1, 1, 10]}
grid_svc = GridSearchCV(model_svc, param_grid=parameters, cv=folds, scoring='f1', n_jobs=-1)

grid_svc.fit(features_train, target_train)
# Best mean cross-validated F1; later passed to modelling() as the CV score.
cv_svc = grid_svc.best_score_

print(grid_svc.best_params_)
print(grid_svc.best_score_)

{'C': 1}
0.7759230562433203
CPU times: total: 1h 5min 40s
Wall time: 4h 23min 37s


In [28]:
# Re-fit the best SVC (to time it), score it on the test split and record the metrics.
modelling(grid_svc.best_estimator_, features_train, target_train, features_test, target_test, cv_svc)

{'training_time': ['4.920',
  '28.602',
  '34.744',
  '156.012',
  '0.016',
  '3939.265'],
 'pred_time': ['0.004', '0.016', '2.781', '2.454', '104.273', '286.000'],
 'cv_f1': ['0.75', '0.62', '0.39', '0.77', '0.36', '0.78'],
 'test_f1': ['0.75', '0.61', '0.39', '0.77', '0.41', '0.79']}

## Выводы

In [29]:
# Build the summary table: one row per metric, one column per model.
# The labels must be listed in the same order in which the models were passed
# to modelling(), because `results` stores the values positionally.
columns = pd.Series(['Logistic_Regression', 'Decision_Tree', 'Random_Forest', 'LGBM_Classifier', 'KNeighbors', 'SVC'])
table = pd.DataFrame.from_dict(results, orient='index', columns=columns)
display(table)

Unnamed: 0,Logistic_Regression,Decision_Tree,Random_Forest,LGBM_Classifier,KNeighbors,SVC
training_time,4.92,28.602,34.744,156.012,0.016,3939.265
pred_time,0.004,0.016,2.781,2.454,104.273,286.0
cv_f1,0.75,0.62,0.39,0.77,0.36,0.78
test_f1,0.75,0.61,0.39,0.77,0.41,0.79


* Модель логистической регрессии имеет самую высокую скорость предсказания и занимает 2-е место по скорости обучения. Также эта модель имеет одно из самых высоких значений метрики F1 (примерно 0.75).

* Модель решающего дерева имеет высокую скорость предсказаний, но не отличается высокой точностью и скоростью обучения.

* Модель случайного леса долго обучается и имеет очень низкую точность.

* Модель градиентного бустинга LGBM относительно долго обучается и предсказывает, но имеет высокую точность предсказаний.

* Метод ближайших соседей обучается быстрее всех, но предсказывает очень долго (медленнее только SVM) и имеет одно из самых низких значений F1 (0.36 на кросс-валидации и 0.41 на тесте).

* Метод опорных векторов обучается и предсказывает намного дольше остальных моделей (обучение — около 66 минут, предсказание — около 5 минут), но имеет самое высокое значение F1 (0.79 на тесте).

**Если у заказчика имеются высокие вычислительные мощности или скорость обучения/предсказания не важна, то стоит использовать метод опорных векторов (SVM). Если скорость важна, то градиентный бустинг LGBM. Остальные модели не имеют достаточной точности предсказаний.**