**Импорт необходимых библиотек**

In [1]:
import numpy as np
import pandas as pd
import warnings
import matplotlib.pyplot as plt

from tqdm import tqdm
from urllib.parse import urlparse
from matplotlib.colors import LinearSegmentedColormap
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import(
    accuracy_score,
    f1_score,
    precision_score,
    recall_score,
    classification_report, 
    confusion_matrix,
)
from seaborn import (
    heatmap, 
    barplot, 
    countplot, 
    set_style,
    color_palette,
    set_palette,
    histplot
)

Для красоты

In [2]:
warnings.filterwarnings("ignore")

def get_gradient_palette(cmap, n_colors):
    return [cmap(i/n_colors) for i in range(n_colors)]

colors = ["#fd7e14", "#495057"]
gradient_cmap = LinearSegmentedColormap.from_list("orange_black_grad", colors, N=256)

set_style("whitegrid")
palette = color_palette("rocket");

**Загружаем данные**

In [3]:
train = pd.read_csv("Data/train.csv")

Посмотрим на наш датасет

In [5]:
train = train.drop_duplicates()

In [6]:
X = train["url"]
y = train["result"]

In [7]:
from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer


In [8]:
tokenizer = RegexpTokenizer(r'[\w\d]+')

In [9]:
vectorizer = TfidfVectorizer(max_features=2048)

In [10]:
X_vec = vectorizer.fit_transform(X.apply(lambda x: ' '.join(tokenizer.tokenize(x))))

In [20]:

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, make_scorer, f1_score

In [21]:
random_state = 42

In [22]:
X_train, X_test, y_train, y_test = train_test_split(
    X_vec, 
    y, 
    random_state=random_state, 
    test_size=0.2, 
    shuffle=True
)

In [23]:
# Создание словаря с диапазонами значений для древесных моделей
param_grid_tree = {
    'criterion': ["gini", "entropy", "log_loss"],
    'max_depth': list(range(3, 11)) + [None], 
    'random_state': [random_state],
}

In [24]:
# Исчерпывающий поиск гиперпараметров по сетке (Grid Search) для XGBClassifier по заданным диапазонам значений 
gs = GridSearchCV(RandomForestClassifier(), param_grid=param_grid_tree, scoring=make_scorer(accuracy_score), cv=3)
gs.fit(X_train, y_train)

In [26]:
# Обучение CatBoostClassifier с лучшими гиперпараметрами на всей обучающей выборке и подсчет метрики на тестовой выборке
model = RandomForestClassifier(**gs.best_params_)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# print(f'CatBoostClassifier: {(y_test, y_pred, average='macro')}')

In [27]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.93      0.95      0.94      8015
           1       0.92      0.88      0.90      4766

    accuracy                           0.93     12781
   macro avg       0.92      0.92      0.92     12781
weighted avg       0.93      0.93      0.92     12781



In [28]:
# Исчерпывающий поиск гиперпараметров по сетке (Grid Search) для XGBClassifier по заданным диапазонам значений 
gs = GridSearchCV(RandomForestClassifier(), param_grid=param_grid_tree, scoring=make_scorer(f1_score), cv=3)
gs.fit(X_train, y_train)

In [29]:
# Обучение CatBoostClassifier с лучшими гиперпараметрами на всей обучающей выборке и подсчет метрики на тестовой выборке
model = RandomForestClassifier(**gs.best_params_)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# print(f'CatBoostClassifier: {(y_test, y_pred, average='macro')}')

In [30]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.93      0.95      0.94      8015
           1       0.92      0.88      0.90      4766

    accuracy                           0.93     12781
   macro avg       0.92      0.92      0.92     12781
weighted avg       0.93      0.93      0.92     12781



In [None]:
# Обучение CatBoostClassifier с лучшими гиперпараметрами на всей обучающей выборке и подсчет метрики на тестовой выборке
model = RandomForestClassifier(**gs.best_params_)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [47]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.93      0.96      0.95      8015
           1       0.93      0.88      0.91      4766

    accuracy                           0.93     12781
   macro avg       0.93      0.92      0.93     12781
weighted avg       0.93      0.93      0.93     12781



In [49]:
model.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'monotonic_cst': None,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 42,
 'verbose': 0,
 'warm_start': False}

In [48]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.93      0.96      0.95      8015
           1       0.93      0.88      0.91      4766

    accuracy                           0.93     12781
   macro avg       0.93      0.92      0.93     12781
weighted avg       0.93      0.93      0.93     12781



In [29]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.88      0.91      0.90      8015
           1       0.85      0.80      0.82      4766

    accuracy                           0.87     12781
   macro avg       0.86      0.85      0.86     12781
weighted avg       0.87      0.87      0.87     12781



In [None]:
# Вывод лучших гиперпараметров
gs.best_params_()

In [11]:
# Обучение CatBoostClassifier с лучшими гиперпараметрами на всей обучающей выборке и подсчет метрики на тестовой выборке
model = RandomForestClassifier()
model.fit(X_vec, y)