In [None]:
from catboost import Pool
import pandas
from sklearn.model_selection import train_test_split

import semantic


def lemmatize_all(texts):
    """Lemmatize every entry of ``texts`` with ``semantic.lemmatize``.

    Each entry is converted with ``str()`` before lemmatization, so a
    non-string cell is never handed to the lemmatizer as-is.

    Args:
        texts: iterable of raw messages.

    Returns:
        list holding one lemmatized result per input entry, in input order.
    """
    return [semantic.lemmatize(str(text)) for text in texts]


# Read both ';'-separated CSV files and stack them into one frame.
data = pandas.read_csv('normal.csv', sep=';')
terdata = pandas.read_csv('terror.csv', sep=';')
# ignore_index=True gives the combined frame one unique 0..n-1 index instead
# of two overlapping per-file ranges.
all_data = pandas.concat([data, terdata], ignore_index=True)

train_X = all_data['messages']
train_y = all_data['labels']

# 80/20 split with a fixed seed so the split is reproducible.
train_X, eval_X, train_y, eval_y = train_test_split(train_X, train_y, test_size=0.2, random_state=10)

# The pools below hold a single column (index 0) and it is declared as text.
text_features = [0]

lemma_train_X = lemmatize_all(train_X)
lemma_eval_X = lemmatize_all(eval_X)

train_pool = Pool(data=lemma_train_X, label=train_y, text_features=text_features)
eval_pool = Pool(data=lemma_eval_X, label=eval_y, text_features=text_features)

# Last expression of the cell: display the lemmatized training texts.
pandas.DataFrame(lemma_train_X)

In [None]:
from catboost import CatBoostClassifier

# The one tokenizer that every feature-processing entry refers to by id.
sense_tokenizer = {
    "tokenizer_id": "Sense",
    "separator_type": "BySense",
    "lowercasing": "True",
    "token_types": ["Word", "Number"],
}

# One dictionary per n-gram order; sizes and orders are given as strings.
# NOTE(review): "TriGram" is declared here but no feature_processing entry
# below lists it in "dictionaries_names" — confirm whether that is intended.
ngram_dictionaries = [
    {
        "dictionary_id": dictionary_id,
        "max_dictionary_size": "50000",
        "gram_order": str(gram_order),
    }
    for dictionary_id, gram_order in (("Word", 1), ("BiGram", 2), ("TriGram", 3))
]

# (dictionaries, calcers) pairs; every entry uses the "Sense" tokenizer.
default_processing = [
    {
        "dictionaries_names": dictionary_names,
        "feature_calcers": calcers,
        "tokenizers_names": ["Sense"],
    }
    for dictionary_names, calcers in (
        (["Word", "BiGram"], ["BoW:top_tokens_count=1000", "NaiveBayes"]),
        (["Word"], ["NaiveBayes"]),
        (["Word", "BiGram"], ["BM25"]),
    )
]

options = {
    "tokenizers": [sense_tokenizer],
    "dictionaries": ngram_dictionaries,
    "feature_processing": {"default": default_processing},
}

# Binary classifier trained on GPU with the text-processing settings above.
# NOTE(review): devices='0:1' presumably selects GPU ids 0 and 1 — confirm
# against the CatBoost documentation for the target machine.
model = CatBoostClassifier(
    loss_function='Logloss',
    eval_metric='AUC',
    custom_loss=['Accuracy'],
    iterations=1000,
    learning_rate=0.6,
    random_strength=4,
    random_seed=7,
    task_type="GPU",
    devices='0:1',
    text_processing=options,
)

# Fit against the held-out pool; use_best_model=True is passed so the eval set
# drives which iteration is kept, and plot=True shows the training chart.
model.fit(
    train_pool,
    eval_set=eval_pool,
    use_best_model=True,
    plot=True,
)

In [None]:
# Display the fitted model's `tree_count_` attribute.
# NOTE(review): the model is fitted with use_best_model=True, so this value is
# presumably allowed to be lower than iterations=1000 — confirm in CatBoost docs.
model.tree_count_

In [None]:
# Render one tree of the trained model, passing train_pool as `pool`.
# NOTE(review): tree_idx is presumably zero-based, which would make this the
# second tree rather than the first — confirm.
model.plot_tree(tree_idx=1, pool=train_pool)

In [None]:
# Write the trained model to the relative path 'terror.model'.
model.save_model('terror.model')

In [None]:
import matplotlib.pyplot as plt

# Class predictions for the held-out pool.
pred_y = model.predict(eval_pool)

# Predicted and true labels are drawn as two series in one histogram; the
# legend is what makes them distinguishable.
plt.hist([pred_y, eval_y], label=['predicted', 'true'])
plt.xlabel('Label')
plt.ylabel('Number of messages')
plt.title('Predicted vs. true labels (eval split)')
plt.legend()
plt.show()

In [None]:
from sklearn.metrics import classification_report
# classification_report takes (y_true, y_pred): the ground-truth labels go
# first. With the arguments swapped, precision and recall trade places and the
# support column counts predictions instead of true labels.
print(classification_report(eval_y, pred_y))

In [None]:
import matplotlib.pyplot as plt

# Lemmatize every message of the terror.csv frame. str() mirrors the
# preprocessing applied to the training and eval texts, so the model sees
# inputs prepared the same way.
lemma_ter_X = [semantic.lemmatize(str(ter_text)) for ter_text in terdata['messages']]

# NOTE(review): terdata was concatenated into all_data before the 80/20 split,
# so most of these rows were presumably seen during training — this is not a
# held-out evaluation.
ter_pool = Pool(data=lemma_ter_X, text_features=text_features)
pred_y = model.predict(ter_pool)

# Two series in one histogram; the legend tells them apart.
plt.hist([pred_y, terdata['labels']], label=['predicted', 'true'])
plt.xlabel('Label')
plt.ylabel('Number of messages')
plt.title('Predicted vs. true labels (terror.csv)')
plt.legend()
plt.show()

In [None]:
# Collect the terror.csv messages whose prediction is 0.
# Requires pred_y to hold the class predictions from model.predict(ter_pool);
# if it has been rebound to predict_proba output, `label == 0` compares a
# whole row and the filter is no longer meaningful.
# zip pairs message and prediction by position, so the result does not depend
# on the frame's index labels; str() keeps '\n'.join from failing on a
# non-string cell.
missed_messages = [
    str(message)
    for message, label in zip(terdata['messages'], pred_y)
    if label == 0
]
a = missed_messages  # original name kept for any other cell that reads `a`

print(len(a), '\n', '\n'.join(a))

In [None]:
# Ask for one message, lemmatize it and echo the lemmatized form.
raw_message = input('Enter message: ')
lem_text = semantic.lemmatize(raw_message)
print(lem_text)

# Single-row pool built from the lemmatized message.
input_pool = Pool(data=[lem_text], text_features=text_features)

# Print the class prediction first, then the predict_proba output.
for predictor in (model.predict, model.predict_proba):
    print(predictor(input_pool))

In [None]:
import matplotlib.pyplot as plt

# Lemmatize every message of the terror.csv frame. str() mirrors the
# preprocessing applied to the training and eval texts.
lemma_ter_X = [semantic.lemmatize(str(ter_text)) for ter_text in terdata['messages']]

ter_pool = Pool(data=lemma_ter_X, text_features=text_features)
# NOTE: this rebinds pred_y to predict_proba output. Cells that expect class
# predictions in pred_y must be run before this one, or re-run model.predict.
pred_y = model.predict_proba(ter_pool)

# Keep element 1 of every predict_proba row.
# NOTE(review): presumably the probability of the positive class — confirm.
chance_y = [p[1] for p in pred_y]

# Two series in one histogram; the legend tells them apart.
plt.hist([chance_y, terdata['labels']], label=['predict_proba[:, 1]', 'true label'])
plt.xlabel('Value')
plt.ylabel('Number of messages')
plt.title('Predicted probability vs. true labels (terror.csv)')
plt.legend()
plt.show()

In [None]:
from catboost import cv

# Training parameters for cross-validation.
# NOTE(review): unlike the classifier fitted earlier, no text_processing entry
# is set here, so the custom tokenizer/dictionary options are not passed to cv
# — confirm whether that is intended.
params = {
    'loss_function': 'Logloss',
    'iterations': 200,
    'custom_loss': 'AUC',
    'random_seed': 40,
    'learning_rate': 0.7,
}

# 5-fold stratified, shuffled cross-validation on the training pool with a
# fixed partition seed.
cv_data = cv(
    pool=train_pool,
    params=params,
    fold_count=5,
    stratified=True,
    shuffle=True,
    partition_random_seed=0,
    plot=True,
    verbose=False,
)

In [None]:
from catboost.utils import get_roc_curve
import sklearn
from sklearn import metrics

# ROC curve of the model on the eval pool; the returned triple is unpacked
# into false-positive rates, true-positive rates and thresholds.
curve = get_roc_curve(model, eval_pool)
fpr, tpr, thresholds = curve

# Area under the (fpr, tpr) curve.
roc_auc = metrics.auc(fpr, tpr)

In [None]:
import matplotlib.pyplot as plt

lw = 2
fig, ax = plt.subplots(figsize=(16, 8))

# ROC curve, with the AUC value shown in the legend entry.
ax.plot(fpr, tpr, color='darkorange', lw=lw, alpha=0.5,
        label='ROC curve (area = %0.2f)' % roc_auc)

# Dashed diagonal from (0, 0) to (1, 1) as a visual reference.
ax.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--', alpha=0.5)

ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.tick_params(axis='both', labelsize=16)
ax.grid(True)
ax.set_xlabel('False Positive Rate', fontsize=16)
ax.set_ylabel('True Positive Rate', fontsize=16)
ax.set_title('Receiver operating characteristic', fontsize=20)
ax.legend(loc="lower right", fontsize=16)
plt.show()

In [None]:
from catboost.utils import get_fpr_curve
from catboost.utils import get_fnr_curve

# Derive the FPR and FNR curves from the ROC curve computed earlier.
# NOTE: both calls rebind `thresholds`, and the first rebinds `fpr`, which the
# ROC cell also assigns — re-running the ROC plot after this cell uses these
# values instead of the ones unpacked from `curve`.
(thresholds, fpr) = get_fpr_curve(curve=curve)
(thresholds, fnr) = get_fnr_curve(curve=curve)

In [None]:
lw = 2
fig, ax = plt.subplots(figsize=(16, 8))

# Both error rates are plotted against the same threshold axis.
for rates, colour, name in ((fpr, 'blue', 'FPR'), (fnr, 'green', 'FNR')):
    ax.plot(thresholds, rates, color=colour, lw=lw, label=name, alpha=0.5)

ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.tick_params(axis='both', labelsize=16)
ax.grid(True)
ax.set_xlabel('Threshold', fontsize=16)
ax.set_ylabel('Error Rate', fontsize=16)
ax.set_title('FPR-FNR curves', fontsize=20)
ax.legend(loc="lower left", fontsize=16)
plt.show()

In [None]:
from catboost.utils import select_threshold

# Print the threshold chosen for a 1% FNR target, then for a 1% FPR target,
# both evaluated on the eval pool.
for target_rate in ({'FNR': 0.01}, {'FPR': 0.01}):
    print(select_threshold(model=model, data=eval_pool, **target_rate))