In [1]:
import string
import torch
import pandas as pd

import catboost
from catboost import CatBoostClassifier

from pymorphy2 import MorphAnalyzer

from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix

from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModel

import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/maksimmigur/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Загрузка данных и просмотр свойств набора данных

In [2]:
df = pd.read_csv('../data/labeled.csv')
df.head()

Unnamed: 0,comment,toxic
0,"Верблюдов-то за что? Дебилы, бл...\n",1.0
1,"Хохлы, это отдушина затюканого россиянина, мол...",1.0
2,Собаке - собачья смерть\n,1.0
3,"Страницу обнови, дебил. Это тоже не оскорблени...",1.0
4,"тебя не убедил 6-страничный пдф в том, что Скр...",1.0


## Предобработка текста

In [3]:
stop_words = stopwords.words('russian')
punctuation = string.punctuation
digits = string.digits
tokenizer = AutoTokenizer.from_pretrained("cointegrated/rubert-tiny")
vectorize_model = AutoModel.from_pretrained("cointegrated/rubert-tiny")

def remove_punctuation(text) -> str:
    without_digits = (
        ''.join([word for word in text if (word not in punctuation) and (word not in digits)])
    )
    return ' '.join([word for word in without_digits.split(' ') if word not in stop_words])

def lemmatize(text: str) -> str:
    pymorphy2_analyzer = MorphAnalyzer()
    return ' '.join([pymorphy2_analyzer.parse(word)[0].normal_form.strip() for word in text.split(' ')]).strip()

def vectorize(text: str):
    t = tokenizer(text, padding=True, truncation=True, return_tensors='pt')
    with torch.no_grad():
        model_output = vectorize_model(**{key: value.to(vectorize_model.device) for key, value in t.items()})
    embeddings = model_output.last_hidden_state[:, :, :]
    embeddings = torch.nn.functional.normalize(embeddings)

    return embeddings.cpu().mean(dim = 1).squeeze(0).numpy()

Some weights of the model checkpoint at cointegrated/rubert-tiny were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [4]:
df['comment'] = df['comment'].apply(remove_punctuation)
df['comment'] = df['comment'].apply(lemmatize)
df['comment'] = df['comment'].apply(vectorize)

In [5]:
def get_vectorized_text(text: str):
    text = remove_punctuation(text)
    text = lemmatize(text)
    text = vectorize(text)
    return text

## Построение фичей

In [6]:
vectorize_text = []

for vector in df['comment'].values:
    vectorize_text.append(vector)

In [7]:
df = pd.concat([df, pd.DataFrame(vectorize_text)], axis=1)

## Разделение выборок

In [8]:
X = df.drop(['toxic', 'comment'], axis=1)
y = df['toxic']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42
)

## Построение catboost модели

In [9]:
model_clf = CatBoostClassifier().fit(X_train, y_train)

Learning rate set to 0.029263
0:	learn: 0.6813497	total: 93.2ms	remaining: 1m 33s
1:	learn: 0.6717303	total: 113ms	remaining: 56.2s
2:	learn: 0.6624574	total: 134ms	remaining: 44.4s
3:	learn: 0.6539466	total: 154ms	remaining: 38.3s
4:	learn: 0.6444940	total: 175ms	remaining: 34.8s
5:	learn: 0.6364572	total: 196ms	remaining: 32.4s
6:	learn: 0.6291148	total: 217ms	remaining: 30.8s
7:	learn: 0.6213955	total: 240ms	remaining: 29.8s
8:	learn: 0.6145187	total: 266ms	remaining: 29.3s
9:	learn: 0.6074229	total: 294ms	remaining: 29.1s
10:	learn: 0.6007267	total: 326ms	remaining: 29.3s
11:	learn: 0.5953671	total: 364ms	remaining: 30s
12:	learn: 0.5896939	total: 410ms	remaining: 31.1s
13:	learn: 0.5845809	total: 485ms	remaining: 34.2s
14:	learn: 0.5793929	total: 534ms	remaining: 35.1s
15:	learn: 0.5741518	total: 566ms	remaining: 34.8s
16:	learn: 0.5695429	total: 598ms	remaining: 34.5s
17:	learn: 0.5647550	total: 627ms	remaining: 34.2s
18:	learn: 0.5603755	total: 654ms	remaining: 33.8s
19:	learn: 

In [10]:
true_neagtive, false_positive, false_negative, true_positive = confusion_matrix(model_clf.predict(X_test), y_test).ravel()

print(f"accuracy - {accuracy_score(model_clf.predict(X_test), y_test)}")
print(f"precision - {precision_score(model_clf.predict(X_test), y_test)}")
print(f"recall - {recall_score(model_clf.predict(X_test), y_test)}")
print(
f"""
confusion_matrix :
    true negative - {true_neagtive}
    false positive - {false_positive}
    false negative - {false_negative}
    true positive - {true_positive}
"""
)

accuracy - 0.8331599028789456
precision - 0.6773162939297125
recall - 0.7813267813267813

confusion_matrix :
    true negative - 1766
    false positive - 303
    false negative - 178
    true positive - 636



## Построение дерева решений

In [11]:
tree_model = DecisionTreeClassifier().fit(X_train, y_train)

In [12]:
true_neagtive, false_positive, false_negative, true_positive = confusion_matrix(tree_model.predict(X_test), y_test).ravel()

print(f"accuracy - {accuracy_score(tree_model.predict(X_test), y_test)}")
print(f"precision - {precision_score(tree_model.predict(X_test), y_test)}")
print(f"recall - {recall_score(tree_model.predict(X_test), y_test)}")
print(
f"""
confusion_matrix :
    true negative - {true_neagtive}
    false positive - {false_positive}
    false negative - {false_negative}
    true positive - {true_positive}
"""
)

accuracy - 0.6923343739160597
precision - 0.5505857294994675
recall - 0.5264765784114053

confusion_matrix :
    true negative - 1479
    false positive - 422
    false negative - 465
    true positive - 517



## Построение Support Vector Machine

In [13]:
svm_model = SVC().fit(X_train, y_train)

In [14]:
true_neagtive, false_positive, false_negative, true_positive = confusion_matrix(svm_model.predict(X_test), y_test).ravel()

print(f"accuracy - {accuracy_score(svm_model.predict(X_test), y_test)}")
print(f"precision - {precision_score(svm_model.predict(X_test), y_test)}")
print(f"recall - {recall_score(svm_model.predict(X_test), y_test)}")
print(
f"""
confusion_matrix :
    true negative - {true_neagtive}
    false positive - {false_positive}
    false negative - {false_negative}
    true positive - {true_positive}
"""
)

accuracy - 0.8446063128685397
precision - 0.6943556975505857
recall - 0.8019680196801968

confusion_matrix :
    true negative - 1783
    false positive - 287
    false negative - 161
    true positive - 652



## Построение Ada boost classifier

In [15]:
ada_boost_clf_model = AdaBoostClassifier().fit(X_train, y_train)

In [16]:
true_neagtive, false_positive, false_negative, true_positive = confusion_matrix(ada_boost_clf_model.predict(X_test), y_test).ravel()

print(f"accuracy - {accuracy_score(ada_boost_clf_model.predict(X_test), y_test)}")
print(f"precision - {precision_score(ada_boost_clf_model.predict(X_test), y_test)}")
print(f"recall - {recall_score(ada_boost_clf_model.predict(X_test), y_test)}")
print(
f"""
confusion_matrix :
    true negative - {true_neagtive}
    false positive - {false_positive}
    false negative - {false_negative}
    true positive - {true_positive}
"""
)

accuracy - 0.7932708983697537
precision - 0.6293929712460063
recall - 0.7044100119189511

confusion_matrix :
    true negative - 1696
    false positive - 348
    false negative - 248
    true positive - 591



## Построение Bagging Clf

In [17]:
bagging_clf = BaggingClassifier().fit(X_train, y_train)

In [18]:
true_neagtive, false_positive, false_negative, true_positive = confusion_matrix(bagging_clf.predict(X_test), y_test).ravel()

print(f"accuracy - {accuracy_score(bagging_clf.predict(X_test), y_test)}")
print(f"precision - {precision_score(bagging_clf.predict(X_test), y_test)}")
print(f"recall - {recall_score(bagging_clf.predict(X_test), y_test)}")
print(
f"""
confusion_matrix :
    true negative - {true_neagtive}
    false positive - {false_positive}
    false negative - {false_negative}
    true positive - {true_positive}
"""
)

accuracy - 0.7790496011099549
precision - 0.5154419595314164
recall - 0.7267267267267268

confusion_matrix :
    true negative - 1762
    false positive - 455
    false negative - 182
    true positive - 484



## Построение Extra trees clf

In [19]:
extra_trees_clf_model = ExtraTreesClassifier().fit(X_train, y_train)

In [20]:
true_neagtive, false_positive, false_negative, true_positive = confusion_matrix(extra_trees_clf_model.predict(X_test), y_test).ravel()

print(f"accuracy - {accuracy_score(extra_trees_clf_model.predict(X_test), y_test)}")
print(f"precision - {precision_score(extra_trees_clf_model.predict(X_test), y_test)}")
print(f"recall - {recall_score(extra_trees_clf_model.predict(X_test), y_test)}")
print(
f"""
confusion_matrix :
    true negative - {true_neagtive}
    false positive - {false_positive}
    false negative - {false_negative}
    true positive - {true_positive}
"""
)

accuracy - 0.7998612556364898
precision - 0.5015974440894568
recall - 0.8120689655172414

confusion_matrix :
    true negative - 1835
    false positive - 468
    false negative - 109
    true positive - 471



## Построение Gradient Boosting Classifier

In [21]:
gradient_clf_model = GradientBoostingClassifier().fit(X_train, y_train)

In [22]:
true_neagtive, false_positive, false_negative, true_positive = confusion_matrix(gradient_clf_model.predict(X_test), y_test).ravel()

print(f"accuracy - {accuracy_score(gradient_clf_model.predict(X_test), y_test)}")
print(f"precision - {precision_score(gradient_clf_model.predict(X_test), y_test)}")
print(f"recall - {recall_score(gradient_clf_model.predict(X_test), y_test)}")
print(
f"""
confusion_matrix :
    true negative - {true_neagtive}
    false positive - {false_positive}
    false negative - {false_negative}
    true positive - {true_positive}
"""
)

accuracy - 0.815469996531391
precision - 0.630457933972311
recall - 0.7619047619047619

confusion_matrix :
    true negative - 1759
    false positive - 347
    false negative - 185
    true positive - 592



## Улучшение качества модели Cat boost

In [23]:
params = {
    'depth': 6,
    'iterations': 1000,
    'learning_rate': 0.004677, 
    'l2_leaf_reg': 6,
    'border_count': 32,
    'thread_count': 4
}

model_catboost = CatBoostClassifier(**params).fit(X_train, y_train)

0:	learn: 0.6913111	total: 11.9ms	remaining: 11.9s
1:	learn: 0.6897679	total: 22.1ms	remaining: 11s
2:	learn: 0.6880613	total: 32.4ms	remaining: 10.8s
3:	learn: 0.6865185	total: 42.7ms	remaining: 10.6s
4:	learn: 0.6847753	total: 52.5ms	remaining: 10.4s
5:	learn: 0.6831693	total: 62.5ms	remaining: 10.4s
6:	learn: 0.6814383	total: 74.8ms	remaining: 10.6s
7:	learn: 0.6798027	total: 87ms	remaining: 10.8s
8:	learn: 0.6781694	total: 98.3ms	remaining: 10.8s
9:	learn: 0.6766161	total: 111ms	remaining: 11s
10:	learn: 0.6749039	total: 125ms	remaining: 11.2s
11:	learn: 0.6733436	total: 138ms	remaining: 11.4s
12:	learn: 0.6717415	total: 152ms	remaining: 11.5s
13:	learn: 0.6702050	total: 165ms	remaining: 11.6s
14:	learn: 0.6685737	total: 179ms	remaining: 11.7s
15:	learn: 0.6670000	total: 192ms	remaining: 11.8s
16:	learn: 0.6654214	total: 208ms	remaining: 12s
17:	learn: 0.6637115	total: 224ms	remaining: 12.2s
18:	learn: 0.6621519	total: 241ms	remaining: 12.4s
19:	learn: 0.6607101	total: 253ms	remain

In [24]:
true_neagtive, false_positive, false_negative, true_positive = confusion_matrix(model_catboost.predict(X_test), y_test).ravel()

print(f"accuracy - {accuracy_score(model_catboost.predict(X_test), y_test)}")
print(f"precision - {precision_score(model_catboost.predict(X_test), y_test)}")
print(f"recall - {recall_score(model_catboost.predict(X_test), y_test)}")
print(
f"""
confusion_matrix :
    true negative - {true_neagtive}
    false positive - {false_positive}
    false negative - {false_negative}
    true positive - {true_positive}
"""
)

accuracy - 0.8106139438085328
precision - 0.6006389776357828
recall - 0.7673469387755102

confusion_matrix :
    true negative - 1773
    false positive - 375
    false negative - 171
    true positive - 564



## Сохранение модели

In [29]:
model_catboost.save_model('model', format="cbm")