# Проект - Викишоп

## Описание проекта

Интернет-магазин запускает новый сервис. Пользователи могут редактировать и дополнять описания товаров, как в вики-сообществах. То есть клиенты предлагают свои правки и комментируют изменения других. Магазину нужен инструмент, который будет искать токсичные комментарии и отправлять их на модерацию. 
Обучим модель классифицировать комментарии на позитивные и негативные. В нашем распоряжении набор данных с разметкой о токсичности правок.
Необходимо построить модель со значением метрики качества F1 не меньше 0.75. 

# Выполнение проекта

## Выгрузка необходимых библиотек

In [1]:
import math
import numpy as np
import pandas as pd
import re
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.corpus import stopwords as nltk_stopwords
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.metrics import f1_score,classification_report
from tqdm import tqdm
import lightgbm as lgb
import transformers
from tqdm import notebook
from catboost import CatBoostClassifier
from xgboost import XGBRegressor
import nltk
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
import warnings
warnings.filterwarnings("ignore")

import nltk
nltk.download('stopwords')
nltk.download('punkt')




[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\kaz-106\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\kaz-106\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\kaz-106\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kaz-106\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\kaz-106\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## Загрузка данных

- Загрузим данные

In [2]:
try:
    data=pd.read_csv('C:/Users/kaz-106/YandexDisk/Py-projects/02_Practicum projects/11_Machine_learning_for_texts/toxic_comments.csv')
except:
    data=pd.read_csv('/datasets/toxic_comments.csv')

## Предобработка данных

In [3]:
print(data.dtypes)
print(data.info())
display(data.head())

Unnamed: 0     int64
text          object
toxic          int64
dtype: object
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159292 entries, 0 to 159291
Data columns (total 3 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   Unnamed: 0  159292 non-null  int64 
 1   text        159292 non-null  object
 2   toxic       159292 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 3.6+ MB
None


Unnamed: 0.1,Unnamed: 0,text,toxic
0,0,Explanation\nWhy the edits made under my usern...,0
1,1,D'aww! He matches this background colour I'm s...,0
2,2,"Hey man, I'm really not trying to edit war. It...",0
3,3,"""\nMore\nI can't make any real suggestions on ...",0
4,4,"You, sir, are my hero. Any chance you remember...",0


### Лемматизация, очистка от лишних символов и пробелов

In [4]:
def preprocessing(text):
    tokenized = nltk.word_tokenize(text)
    joined = ' '.join(tokenized)
    texted = re.sub(r'[^a-zA-Z]', ' ', joined)
    resulted = ' '.join(texted.split())
    return resulted

In [5]:
data['lemm_text'] = data['text'].apply(preprocessing)

In [6]:
lemmatizer = WordNetLemmatizer()

def nltk_pos_tagger(nltk_tag):
    if nltk_tag.startswith('J'):
        return wordnet.ADJ
    elif nltk_tag.startswith('V'):
        return wordnet.VERB
    elif nltk_tag.startswith('N'):
        return wordnet.NOUN
    elif nltk_tag.startswith('R'):
        return wordnet.ADV
    else:          
        return None

def lemmatize_sentence(sentence):

    nltk_tagged = nltk.pos_tag(nltk.word_tokenize(sentence))  
    wordnet_tagged = map(lambda x: (x[0], nltk_pos_tagger(x[1])), nltk_tagged)
    lemmatized_sentence = []
    
    for word, tag in wordnet_tagged:
        if tag is None:
            lemmatized_sentence.append(word)
        else:        
            lemmatized_sentence.append(lemmatizer.lemmatize(word, tag))
    return " ".join(lemmatized_sentence)

In [7]:
tqdm.pandas()
data['lemm_text'] = data['text'].progress_apply(lemmatize_sentence) 

100%|█████████████████████████████████████████████████████████████████████████| 159292/159292 [10:18<00:00, 257.49it/s]


In [8]:
data.head(5)

Unnamed: 0.1,Unnamed: 0,text,toxic,lemm_text
0,0,Explanation\nWhy the edits made under my usern...,0,Explanation Why the edits make under my userna...
1,1,D'aww! He matches this background colour I'm s...,0,D'aww ! He match this background colour I 'm s...
2,2,"Hey man, I'm really not trying to edit war. It...",0,"Hey man , I 'm really not try to edit war . It..."
3,3,"""\nMore\nI can't make any real suggestions on ...",0,`` More I ca n't make any real suggestion on i...
4,4,"You, sir, are my hero. Any chance you remember...",0,"You , sir , be my hero . Any chance you rememb..."


## Подготовка и обучение моделей

### Разделение датасетов

In [9]:
features = data['lemm_text']
target = data['toxic']
features_train,features_test,target_train,target_test=train_test_split(features,target, test_size=0.2,random_state=12345)

### Создание корпусов слов и установка стоп-слов

In [10]:
features_train_corpus = features_train.values
features_test_corpus = features_test.values

display(features_train_corpus)

stopwords = set(nltk_stopwords.words('english'))
display(len(stopwords))

array(["Bushranger you 're a GRASS with no sense of humour . Seen the South Park episode Poor and Stupid ? That 's what I be refer to , a comment obviously lose on you . But you 've show you ca n't fight your own battle and have to run cry to mummy - boo-hoo.90.204.13.4",
       "`` Need administrative help I have be block , iniquitously . Read the above request . I try to explain everything there . Hegemonic use of administrative privilege ought to not be overlook . Ask Administrator if I be harass him to such an extent where I need to be block from edit , that too for 48 hr . For goodness ' sake . What happen to do n't bite the newcomer ? Please talk to Bwilkins . See the talk page of he have a knack of be unreasonable with aplomb . here ``",
       "I 'd also like to point out that he have use a third-person plural pronoun and possessive pronoun to refer to one person ( me ) and one person 's ( my ) possession .",
       ...,
       "`` Agreed . We really should try to stick to the 

179

## Препроцессинг - векторизация

In [11]:
models=[]
results=[]

count_vect = CountVectorizer(stop_words=stopwords, dtype=np.float32)

bow_features_train = count_vect.fit_transform(features_train_corpus)
bow_features_test = count_vect.transform(features_test_corpus)

display(bow_features_train.shape)

(127433, 158268)

### Логистическая регрессия

In [12]:
#model_Logistic = LogisticRegression(max_iter=1000)
#model_Logistic.fit(bow_features_train,target_train)

#test_f1_score = f1_score(target_test, model_Logistic.predict(bow_features_test))

#print("Test F1 score (LogisticRegression):",test_f1_score)   
#models.append('Vect_LogisticRegression')
#results.append(test_f1_score)

In [13]:
def logreg(features_train, target_train, features_test, target_test):

    logreg = LogisticRegression()
    
    parameters = {'C': [0.1,1],
                 'solver': ['lbfgs','liblinear'],
                  'penalty': ['l2']}
    
    grid_search_logreg = GridSearchCV(logreg, parameters, cv=5, n_jobs=-1)
    
    grid_search_logreg.fit(features_train, target_train)
    
    print(grid_search_logreg.best_params_)
    print(classification_report(target_test, grid_search_logreg.predict(features_test)))

In [14]:
logreg(bow_features_train,target_train, bow_features_test, target_test)

{'C': 1, 'penalty': 'l2', 'solver': 'lbfgs'}
              precision    recall  f1-score   support

           0       0.96      0.99      0.98     28629
           1       0.86      0.68      0.76      3230

    accuracy                           0.96     31859
   macro avg       0.91      0.83      0.87     31859
weighted avg       0.95      0.96      0.95     31859



### LGBM

In [15]:
#model_LGBM =lgb.LGBMClassifier(n_estimators = 1000, learning_rate = 0.05)
#model_LGBM.fit(bow_features_train,target_train)

#test_f1_score = f1_score(target_test, model_LGBM.predict(bow_features_test))

#print("Test F1 score (LGBM):",test_f1_score)
#models.append('Vect_LGBMClassifier')
#results.append(test_f1_score)

In [16]:
def lgbm(features_train, target_train, features_test, target_test):
    
    lgbm_model = lgb.LGBMClassifier()
    
    parameters = {'max_depth': [5,10],
                 'n_estimators': [5,10],
                  'learning_rate': [5,10]}
    
    grid_search_lgbm = GridSearchCV(lgbm_model, parameters, cv=5, n_jobs=-1)
    
    grid_search_lgbm.fit(features_train, target_train)
    
    print(grid_search_lgbm.best_params_)
    print(classification_report(target_test, grid_search_lgbm.predict(features_test)))

In [17]:
lgbm(bow_features_train,target_train, bow_features_test, target_test)

{'learning_rate': 5, 'max_depth': 10, 'n_estimators': 5}
              precision    recall  f1-score   support

           0       0.60      0.00      0.00     28629
           1       0.10      1.00      0.18      3230

    accuracy                           0.10     31859
   macro avg       0.35      0.50      0.09     31859
weighted avg       0.55      0.10      0.02     31859



In [18]:
#lr = lgb.LGBMClassifier()
#print(lr.get_params().keys())

### CatBoost

In [19]:
#CatBoost_model = CatBoostClassifier(learning_rate=0.1, max_depth=10,iterations=10)
#CatBoost_model.fit(bow_features_train, target_train)

#test_f1_score = f1_score(target_test, CatBoost_model.predict(bow_features_test))

#print("Test F1 score (LGBM):",test_f1_score)
#models.append('Vect_CatBoostClassifier')
#results.append(test_f1_score)

In [20]:
def catboost(features_train, target_train, features_test, target_test):
    
    cat_model = CatBoostClassifier()
    
    parameters = {'depth': [2,4],
                 'iterations': [5],
                  'learning_rate': [0.1,0.5]}
    
    grid_search_cat= GridSearchCV(cat_model, parameters, cv=5, n_jobs=-1)
    
    grid_search_cat.fit(features_train, target_train)
    
    print(grid_search_cat.best_params_)
    print(classification_report(target_test, grid_search_cat.predict(features_test)))

In [21]:
catboost(bow_features_train,target_train, bow_features_test, target_test)

0:	learn: 0.3431463	total: 670ms	remaining: 2.68s
1:	learn: 0.2699141	total: 1.23s	remaining: 1.84s
2:	learn: 0.2444803	total: 1.78s	remaining: 1.19s
3:	learn: 0.2312446	total: 2.34s	remaining: 585ms
4:	learn: 0.2247160	total: 2.85s	remaining: 0us
{'depth': 4, 'iterations': 5, 'learning_rate': 0.5}
              precision    recall  f1-score   support

           0       0.93      1.00      0.96     28629
           1       0.93      0.32      0.48      3230

    accuracy                           0.93     31859
   macro avg       0.93      0.66      0.72     31859
weighted avg       0.93      0.93      0.91     31859



## Препроцессинг - TF-IDF

In [22]:
tfidf = TfidfVectorizer(stop_words=stopwords)

tfidf_features_train = tfidf.fit_transform(features_train_corpus)
tfidf_features_test = tfidf.transform(features_test_corpus)

display(tfidf_features_train.shape)

(127433, 158268)

### Логистическая регрессия

In [23]:
logreg(tfidf_features_train,target_train, tfidf_features_test, target_test)

{'C': 1, 'penalty': 'l2', 'solver': 'liblinear'}
              precision    recall  f1-score   support

           0       0.96      0.99      0.98     28629
           1       0.93      0.61      0.74      3230

    accuracy                           0.96     31859
   macro avg       0.94      0.80      0.86     31859
weighted avg       0.95      0.96      0.95     31859



### LGBM

In [24]:
lgbm(tfidf_features_train,target_train, tfidf_features_test, target_test)

{'learning_rate': 5, 'max_depth': 5, 'n_estimators': 5}
              precision    recall  f1-score   support

           0       0.91      1.00      0.95     28629
           1       0.80      0.13      0.23      3230

    accuracy                           0.91     31859
   macro avg       0.85      0.56      0.59     31859
weighted avg       0.90      0.91      0.88     31859



### CatBoost

In [25]:
catboost(tfidf_features_train,target_train, tfidf_features_test, target_test)

0:	learn: 0.3441069	total: 670ms	remaining: 2.68s
1:	learn: 0.2678566	total: 1.36s	remaining: 2.04s
2:	learn: 0.2447696	total: 2.01s	remaining: 1.34s
3:	learn: 0.2332886	total: 2.65s	remaining: 664ms
4:	learn: 0.2238455	total: 3.35s	remaining: 0us
{'depth': 4, 'iterations': 5, 'learning_rate': 0.5}
              precision    recall  f1-score   support

           0       0.93      1.00      0.96     28629
           1       0.95      0.35      0.52      3230

    accuracy                           0.93     31859
   macro avg       0.94      0.68      0.74     31859
weighted avg       0.93      0.93      0.92     31859



## Выводы

In [26]:
model_Logistic = LogisticRegression(C=1.0, penalty="l2", solver= 'lbfgs')
model_Logistic.fit(bow_features_train,target_train)

test_f1_score = f1_score(target_test, model_Logistic.predict(bow_features_test))

print("Test F1 score (LogisticRegression):",test_f1_score)   


Test F1 score (LogisticRegression): 0.7570836212854181


- Выполнена загрузка представленных текстовых данных и необходимых библиотек, загружены стоп-слова для соотвесвтующего языка (Английского)
- Проведена предобработка и анализ данных, текст лемматизирован, очищен от лишних символов и пробелов
- С помощью двух различных методов (векторизация и TF_IDF), подготовлены признаки для машинного обучения
- Обучены три различные модели (LogisticRegression, LGBM и CatBoost) для различных методов препроцессинга
- Наилучшее значение F1 показала модель LogisticRegression на CountVectoriser, это соотвествует критериям, указанных в условии задачи
- BERT не был использован из-за сложности модели и ограниченности вычислительной мощности