In [1]:
import time
import math

import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from sklearn.model_selection import ParameterGrid
from sklearn.metrics import precision_recall_fscore_support

from tqdm.notebook import tqdm

# Пункт 1 и 2

In [2]:
# Load the embeddings table: tab-separated with no header row, so pandas
# numbers the columns 0..100 (one object column and 100 float64 columns,
# as reported by df.info() in the next cell).
df = pd.read_csv("../assets/annotated-corpus/test-embeddings.tsv", sep="\t", header=None, index_col=False)

In [3]:
# Structural sanity check: row count, column count and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11911 entries, 0 to 11910
Columns: 101 entries, 0 to 100
dtypes: float64(100), object(1)
memory usage: 9.2+ MB


In [4]:
# Column 0 holds ids of the form "<label>_<number>" (e.g. "age_3351").
# Splitting once from the right keeps labels that themselves contain
# underscores (such as "not_cyberbullying") intact.
df["target"] = df[0].str.rsplit("_", n=1, expand=True)[0]
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,92,93,94,95,96,97,98,99,100,target
0,age_3351,-0.268966,-0.195137,0.185221,0.442006,-1.399686,0.458122,0.090845,-0.372571,-0.536477,...,-0.228859,0.308409,0.602842,0.037475,-0.383144,-0.151171,0.243271,0.118566,-0.137218,age
1,age_11616,-0.540151,0.308859,0.03182,0.343899,-0.737027,0.908944,-0.476058,-0.00194,-0.186527,...,-0.021206,0.141861,-0.207191,0.167495,0.345437,-0.090632,0.256034,0.3574,0.161623,age
2,age_1546,-0.277256,-0.396026,0.232861,0.595492,-1.076954,0.648078,0.039423,-0.139098,-0.631634,...,-0.121855,-0.362199,0.621111,-0.396075,-0.874889,-0.38904,0.304413,0.077354,-0.220831,age
3,age_6229,0.269636,-0.93357,0.436701,0.543866,-1.554088,0.582454,0.190633,-0.799669,-0.566312,...,-0.397914,1.153079,0.111551,0.063922,-1.197466,0.065416,0.369039,0.540426,0.092124,age
4,age_8562,-0.684108,-1.384038,0.142103,0.670762,-1.951569,0.886495,-0.431131,-0.198965,-0.063547,...,-0.4494,-0.210606,-0.299428,-0.684055,-0.388931,-0.294863,0.478414,-0.463894,0.867589,age


In [5]:
# Map the string class labels to integer codes; the fitted encoder is kept so
# the codes can be translated back to label names later.
label_encoder = LabelEncoder()
df["target_enc"] = label_encoder.fit_transform(df["target"])

In [6]:
# Show each label next to its integer code.
df[["target", "target_enc"]]

Unnamed: 0,target,target_enc
0,age,0
1,age,0
2,age,0
3,age,0
4,age,0
...,...,...
11906,not_cyberbullying,3
11907,not_cyberbullying,3
11908,not_cyberbullying,3
11909,not_cyberbullying,3


In [7]:
# Distinct labels and distinct codes, each listed in order of first appearance.
df["target"].unique(), df["target_enc"].unique()

(array(['age', 'ethnicity', 'gender', 'religion', 'other_cyberbullying',
        'not_cyberbullying'], dtype=object),
 array([0, 1, 2, 5, 4, 3]))

In [9]:
def confusion_matrix(true, pred):
    """Build a confusion matrix from true and predicted labels.

    Parameters
    ----------
    true : sequence or array
        Ground-truth labels, one per sample.
    pred : sequence or array
        Predicted labels, one per sample; must have the same shape as ``true``.

    Returns
    -------
    numpy.ndarray
        Float matrix of shape ``(C, C)`` where ``C`` is the number of distinct
        labels found in ``true`` and ``pred`` together. Entry ``[i, j]`` counts
        the samples whose true label is the i-th and whose predicted label is
        the j-th label in sorted order. For labels ``0..C-1`` the row/column
        index equals the label itself.

    Raises
    ------
    ValueError
        If ``true`` and ``pred`` differ in shape.
    """
    true_arr = np.asarray(true).ravel()
    pred_arr = np.asarray(pred).ravel()
    if true_arr.shape != pred_arr.shape:
        raise ValueError(
            f"true and pred must have the same length, got {true_arr.shape[0]} and {pred_arr.shape[0]}"
        )

    # Sorted distinct labels define the row/column order. Mapping through them
    # (instead of comparing against range(C)) keeps non-contiguous labels from
    # being silently dropped.
    labels = np.unique(np.concatenate([true_arr, pred_arr]))
    row_idx = np.searchsorted(labels, true_arr)
    col_idx = np.searchsorted(labels, pred_arr)

    mat = np.zeros((len(labels), len(labels)))
    # np.add.at accumulates repeated (row, col) pairs; plain fancy-index
    # assignment would count each distinct pair only once.
    np.add.at(mat, (row_idx, col_idx), 1)
    return mat

In [10]:
def get_precision_recall_fscore_accuracy(cm, beta=1.0):
    """Compute per-class precision, recall, F-beta and accuracy.

    Parameters
    ----------
    cm : array-like of shape (C, C)
        Confusion matrix with true classes on rows and predicted classes on
        columns.
    beta : float, default 1.0
        Weight of recall relative to precision in the F-score.

    Returns
    -------
    tuple of four numpy.ndarray, each of shape (C,)
        ``(precision, recall, fscore, accuracy)``.

    Notes
    -----
    The per-class ``accuracy`` is ``TP / row sum``, i.e. the share of each
    class's samples that were classified correctly. That is numerically the
    same quantity as ``recall``; its mean over classes is a balanced accuracy,
    not the overall ``trace / total`` accuracy.

    Any ratio whose denominator is zero (a class that is never predicted or
    never present) is reported as 0.0 instead of NaN.
    """
    cm = np.asarray(cm, dtype=float)

    def safe_divide(numerator, denominator):
        # Elementwise division that leaves 0.0 wherever the denominator is 0.
        result = np.zeros_like(numerator, dtype=float)
        np.divide(numerator, denominator, out=result, where=denominator != 0)
        return result

    true_pos = np.diag(cm)
    predicted_per_class = cm.sum(axis=0)  # TP + FP
    actual_per_class = cm.sum(axis=1)     # TP + FN

    precision = safe_divide(true_pos, predicted_per_class)
    recall = safe_divide(true_pos, actual_per_class)

    beta_sq = beta ** 2
    fscore = safe_divide(
        (1 + beta_sq) * recall * precision,
        beta_sq * precision + recall,
    )

    accuracy = safe_divide(true_pos, actual_per_class)

    return precision, recall, fscore, accuracy

In [11]:
def recall_precision(matrix_df, level = 'micro'):
    """Compute averaged recall and precision (in percent) from a confusion matrix.

    Parameters
    ----------
    matrix_df : pandas.DataFrame or array-like of shape (C, C)
        Confusion matrix with true classes on rows and predicted classes on
        columns.
    level : {'micro', 'macro', 'weighted'}, default 'micro'
        Averaging scheme:
        'micro' pools TP / FN / FP over all classes;
        'macro' is the unweighted mean of the per-class values;
        'weighted' is the per-class mean weighted by each class's share of
        the true samples.

    Returns
    -------
    tuple
        ``(recall, precision)``, both on a 0-100 scale. Note the order.

    Raises
    ------
    ValueError
        If ``level`` is not one of the supported averaging schemes.
    """
    # Accept a DataFrame as well as a plain array (e.g. a raw numpy matrix).
    arr = matrix_df.to_numpy() if hasattr(matrix_df, "to_numpy") else np.asarray(matrix_df)

    rows = arr.sum(axis=1)     # per class: TP(i) + FN(i)
    columns = arr.sum(axis=0)  # per class: TP(i) + FP(i)
    diagonals = np.diag(arr)   # per class: TP(i)

    if level == 'micro':
        # sum of TP(i) / sum of (TP(i) + FN(i))
        recall = diagonals.sum() * 100 / rows.sum()
        # sum of TP(i) / sum of (TP(i) + FP(i))
        precision = diagonals.sum() * 100 / columns.sum()
    elif level == 'macro':
        # sum of recall(i) / c
        recall = (diagonals / rows).sum() * 100 / len(diagonals)
        # sum of precision(i) / c
        precision = (diagonals / columns).sum() * 100 / len(diagonals)
    elif level == 'weighted':
        # True proportion of each class among all samples.
        support = rows / arr.sum()
        # sum of recall(i) * true proportion of the class
        recall = ((diagonals / rows) * support).sum() * 100
        # sum of precision(i) * true proportion of the class
        precision = ((diagonals / columns) * support).sum() * 100
    else:
        # Previously an unknown level fell through to an UnboundLocalError.
        raise ValueError(
            f"level must be 'micro', 'macro' or 'weighted', got {level!r}"
        )

    return recall, precision


def compute_f_score(recall, precision, beta = 1.0):
    """Return the F-beta score for the given recall and precision.

    ``beta`` sets how much more recall counts than precision; the default of
    1.0 gives the harmonic mean (F1). The result is on the same scale as the
    inputs (fractions in, fraction out; percentages in, percentage out).
    """
    beta_squared = math.pow(beta, 2)
    return ((1 + beta_squared) * recall * precision) / ((beta_squared * precision) + recall)


def compute_accuracy(matrix_df, predictions):
    """Return overall accuracy in percent.

    The number of correct predictions is taken from the diagonal of the
    confusion matrix ``matrix_df``; the sample count is ``len(predictions)``.
    """
    correct = matrix_df.to_numpy().trace()
    total = len(predictions)
    return correct * 100 / total


def get_precision_recall_fscore_accuracy_v2(cm, level="macro", beta=1.0):
    """Compute averaged precision, recall, F-beta and overall accuracy.

    Unlike the per-class variant, this returns one number per metric, all on a
    0-100 scale.

    Parameters
    ----------
    cm : pandas.DataFrame or array-like of shape (C, C)
        Confusion matrix with true classes on rows and predicted classes on
        columns.
    level : {'micro', 'macro', 'weighted'}, default 'macro'
        Averaging scheme forwarded to ``recall_precision``.
    beta : float, default 1.0
        Weight of recall relative to precision in the F-score.

    Returns
    -------
    tuple
        ``(precision, recall, fscore, accuracy)``.
    """
    # recall_precision reads the matrix through .to_numpy(), so wrap raw arrays.
    matrix_df = cm if hasattr(cm, "to_numpy") else pd.DataFrame(cm)

    recall, precision = recall_precision(matrix_df, level)
    fscore = compute_f_score(recall, precision, beta)

    # Overall accuracy: correctly classified samples over all samples.
    counts = matrix_df.to_numpy()
    accuracy = counts.trace() * 100 / counts.sum()

    return precision, recall, fscore, accuracy

In [18]:
# Hyper-parameter grid for SVC: one sub-grid per kernel. ParameterGrid expands
# each sub-grid into every combination (6 per kernel, 24 models in total).
#
# NOTE: gamma is a kernel coefficient for the poly / rbf / sigmoid kernels only,
# so for the linear kernel the "scale" and "auto" variants are the same model
# (models 1-6 in the output below come out as identical pairs).
# The order of this grid fixes the model_<n> numbering that later cells refer
# to by name (e.g. "model_17"), so reorder or prune it with care.
params = [
    { 
        "kernel": ["linear"],
        "C": [0.75, 1.0, 1.25],
        "gamma": ["scale", "auto"]
    },
    { 
        "kernel": ["poly"],
        "degree": [3],
        "C": [1.0, 1.25, 1.5],
        "gamma": ["scale", "auto"]
        # disabled option: "class_weight": [None, "balanced"]
    },
    {
        "kernel": ["rbf"],
        "C": [1.0, 1.25, 1.5],
        "gamma": ["scale", "auto"]
        # disabled option: "class_weight": [None, "balanced"]
    },
    {
        "kernel": ["sigmoid"],
        "C": [0.5, 0.75, 1.0],
        "gamma": ["scale", "auto"]
    }
]

param_grid = ParameterGrid(params)

In [19]:
# Features: every column except the id column (0) and the two label columns.
X = df[df.columns.difference([0, 'target', 'target_enc'])]
# Target: the integer-encoded class label.
y = df["target_enc"]

In [20]:
# Fit and score one SVC per parameter combination.
# NOTE: every model is scored on the same X it was fitted on, so the numbers
# below are training-set metrics, not estimates of generalization.
metrics_names = ["accuracy", "precision", "recall", "fscore", "exec_time"]

# metric name -> {"model_<n>": value}; quality metrics are per-class arrays,
# exec_time is a float (seconds spent in fit).
metrics = {name: dict() for name in metrics_names}

# Wrap the grid itself (not the enumerate iterator) so tqdm can read its length
# and render a progress bar with a known total.
for i, param in enumerate(tqdm(param_grid)):
    clf = SVC(**param)

    # Time only the fit; perf_counter is the clock intended for intervals.
    start_time = time.perf_counter()
    clf.fit(X, y)
    exec_time = time.perf_counter() - start_time

    y_pred = clf.predict(X)

    cm = confusion_matrix(y.tolist(), y_pred.tolist())
    pr, rec, fscore, acc = get_precision_recall_fscore_accuracy(cm)

    print(f"Model version №{i + 1}")
    print("params", param)
    for metr, name in zip([acc, pr, rec, fscore, exec_time], metrics_names):
        metrics[name][f"model_{i + 1}"] = metr
        # Per-class arrays are summarised by their mean; exec_time is a scalar.
        print(name, np.mean(metr))

0it [00:00, ?it/s]

  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


Model version №1
params {'C': 0.75, 'gamma': 'scale', 'kernel': 'linear'}
accuracy 0.7575182907689778
precision 0.7578145068863561
recall 0.7575182907689778
fscore 0.7564940982370721
exec_time 5.318994998931885


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


Model version №2
params {'C': 0.75, 'gamma': 'auto', 'kernel': 'linear'}
accuracy 0.7575182907689778
precision 0.7578145068863561
recall 0.7575182907689778
fscore 0.7564940982370721
exec_time 5.4707677364349365


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


Model version №3
params {'C': 1.0, 'gamma': 'scale', 'kernel': 'linear'}
accuracy 0.7584559340822224
precision 0.7589317844773887
recall 0.7584559340822224
fscore 0.7575092290863429
exec_time 6.240263223648071


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


Model version №4
params {'C': 1.0, 'gamma': 'auto', 'kernel': 'linear'}
accuracy 0.7584559340822224
precision 0.7589317844773887
recall 0.7584559340822224
fscore 0.7575092290863429
exec_time 6.519863128662109


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


Model version №5
params {'C': 1.25, 'gamma': 'scale', 'kernel': 'linear'}
accuracy 0.7584540935616509
precision 0.7589605744384441
recall 0.7584540935616509
fscore 0.7575143800754446
exec_time 7.6739161014556885


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


Model version №6
params {'C': 1.25, 'gamma': 'auto', 'kernel': 'linear'}
accuracy 0.7584540935616509
precision 0.7589605744384441
recall 0.7584540935616509
fscore 0.7575143800754446
exec_time 7.653799295425415


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


Model version №7
params {'C': 1.0, 'degree': 3, 'gamma': 'scale', 'kernel': 'poly'}
accuracy 0.7109729674785519
precision 0.8149905334845265
recall 0.7109729674785519
fscore 0.7232371877194845
exec_time 7.226579904556274


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


Model version №8
params {'C': 1.0, 'degree': 3, 'gamma': 'auto', 'kernel': 'poly'}
accuracy 0.5569432649258276
precision 0.7780057780544644
recall 0.5569432649258276
fscore 0.5882141721299154
exec_time 7.683886766433716


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


Model version №9
params {'C': 1.25, 'degree': 3, 'gamma': 'scale', 'kernel': 'poly'}
accuracy 0.7251107003087713
precision 0.821397294161832
recall 0.7251107003087713
fscore 0.7367419063422398
exec_time 8.516120195388794


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


Model version №10
params {'C': 1.25, 'degree': 3, 'gamma': 'auto', 'kernel': 'poly'}
accuracy 0.5686360301668375
precision 0.7874954898926315
recall 0.5686360301668375
fscore 0.5963555005526118
exec_time 7.385817050933838


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


Model version №11
params {'C': 1.5, 'degree': 3, 'gamma': 'scale', 'kernel': 'poly'}
accuracy 0.7389810290527876
precision 0.8282615963639754
recall 0.7389810290527876
fscore 0.7504539030624686
exec_time 8.44064736366272


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


Model version №12
params {'C': 1.5, 'degree': 3, 'gamma': 'auto', 'kernel': 'poly'}
accuracy 0.5794958921028682
precision 0.7895635053776671
recall 0.5794958921028682
fscore 0.6044096168733303
exec_time 6.741919040679932


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


Model version №13
params {'C': 1.0, 'gamma': 'scale', 'kernel': 'rbf'}
accuracy 0.8180435059743814
precision 0.8212260237235164
recall 0.8180435059743814
fscore 0.8174269658064276
exec_time 3.937371015548706


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


Model version №14
params {'C': 1.0, 'gamma': 'auto', 'kernel': 'rbf'}
accuracy 0.7817702368903762
precision 0.7839301936065723
recall 0.7817702368903762
fscore 0.7806181015031971
exec_time 3.8035740852355957


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


Model version №15
params {'C': 1.25, 'gamma': 'scale', 'kernel': 'rbf'}
accuracy 0.8266405825208069
precision 0.8293059509755781
recall 0.8266405825208069
fscore 0.8259810127810215
exec_time 3.9707419872283936


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


Model version №16
params {'C': 1.25, 'gamma': 'auto', 'kernel': 'rbf'}
accuracy 0.7872646228829278
precision 0.7892973157993944
recall 0.7872646228829278
fscore 0.7861244118856
exec_time 3.846315860748291


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


Model version №17
params {'C': 1.5, 'gamma': 'scale', 'kernel': 'rbf'}
accuracy 0.8333432147696346
precision 0.8361782536584718
recall 0.8333432147696346
fscore 0.832782672012491
exec_time 4.008533954620361


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


Model version №18
params {'C': 1.5, 'gamma': 'auto', 'kernel': 'rbf'}
accuracy 0.7918677993445554
precision 0.7940265574619582
recall 0.7918677993445554
fscore 0.7908167540480893
exec_time 3.7263071537017822


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


Model version №19
params {'C': 0.5, 'gamma': 'scale', 'kernel': 'sigmoid'}
accuracy 0.6609456639373699
precision 0.6725816566124317
recall 0.6609456639373699
fscore 0.6640911290865018
exec_time 3.5650041103363037


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


Model version №20
params {'C': 0.5, 'gamma': 'auto', 'kernel': 'sigmoid'}
accuracy 0.7117636158898767
precision 0.7109639437673287
recall 0.7117636158898767
fscore 0.710056966839169
exec_time 4.156222105026245


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


Model version №21
params {'C': 0.75, 'gamma': 'scale', 'kernel': 'sigmoid'}
accuracy 0.6432489141082411
precision 0.6603731973627661
recall 0.6432489141082411
fscore 0.6487113128370509
exec_time 3.2867889404296875


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


Model version №22
params {'C': 0.75, 'gamma': 'auto', 'kernel': 'sigmoid'}
accuracy 0.697974335133401
precision 0.6978018060832382
recall 0.697974335133401
fscore 0.697674053720097
exec_time 3.975247859954834


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


Model version №23
params {'C': 1.0, 'gamma': 'scale', 'kernel': 'sigmoid'}
accuracy 0.6352340566259573
precision 0.6468693849345032
recall 0.6352340566259573
fscore 0.6385478336222832
exec_time 3.004000425338745


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


Model version №24
params {'C': 1.0, 'gamma': 'auto', 'kernel': 'sigmoid'}
accuracy 0.6913751677767334
precision 0.6899422174913368
recall 0.6913751677767334
fscore 0.6905091728505265
exec_time 3.677698850631714


In [30]:
def grid_search(X, y, param_grid):
    """Fit one SVC per parameter combination and collect its metrics.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix.
    y : pandas.Series
        Integer-encoded class labels (must support ``.tolist()``).
    param_grid : iterable of dict
        Keyword-argument dicts for ``SVC``, e.g. a ``ParameterGrid``.

    Returns
    -------
    metrics : dict
        ``metric name -> {"model_<n>": value}`` for "accuracy", "precision",
        "recall", "fscore" (per-class arrays) and "exec_time" (seconds spent
        in ``fit``). Models are numbered from 1 in grid order.
    model_params : dict
        ``"model_<n>" -> parameter dict`` used for that model.

    Notes
    -----
    Each model is scored on the same ``X`` it was fitted on, so the metrics
    are training-set scores, not estimates of generalization. Progress and
    per-model summaries are printed as a side effect.
    """
    metrics_names = ["accuracy", "precision", "recall", "fscore", "exec_time"]
    metrics = {name: dict() for name in metrics_names}
    model_params = dict()

    # Wrap the grid itself (not the enumerate iterator) so tqdm can read its
    # length and render a progress bar with a known total.
    for i, param in enumerate(tqdm(param_grid)):
        model_key = f"model_{i + 1}"
        clf = SVC(**param)

        # Time only the fit; perf_counter is the clock intended for intervals.
        start_time = time.perf_counter()
        clf.fit(X, y)
        exec_time = time.perf_counter() - start_time

        y_pred = clf.predict(X)

        cm = confusion_matrix(y.tolist(), y_pred.tolist())
        pr, rec, fscore, acc = get_precision_recall_fscore_accuracy(cm)

        print(f"Model version №{i + 1}")
        print("params", param)
        for metr, name in zip([acc, pr, rec, fscore, exec_time], metrics_names):
            metrics[name][model_key] = metr
            # Per-class arrays are summarised by their mean.
            print(name, np.mean(metr))

        model_params[model_key] = param

    return metrics, model_params
    

In [28]:
def find_best_model_by_metrics(metric_model, metrics_names):
    """Print, for each requested metric, the model with the highest mean value.

    ``metric_model`` maps a metric name to a ``{model name: value}`` dict whose
    values may be scalars or per-class arrays; models are compared by the mean
    of their value. Nothing is returned.
    """
    for name in metrics_names:
        per_model = metric_model[name]
        best_model = max(per_model, key=lambda model: np.mean(per_model[model]))
        best_value = per_model[best_model]
        print(f"Metric {name}: model {best_model} with mean value {np.mean(best_value)}")
    

In [31]:
# Run the whole grid; keep the per-model metrics and the parameters behind each
# "model_<n>" key.
metrics_1, model_params_1 = grid_search(X, y, param_grid)

0it [00:00, ?it/s]

  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


Model version №1
params {'C': 0.75, 'gamma': 'scale', 'kernel': 'linear'}
accuracy 0.7575182907689778
precision 0.7578145068863561
recall 0.7575182907689778
fscore 0.7564940982370721
exec_time 5.8660407066345215


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


Model version №2
params {'C': 0.75, 'gamma': 'auto', 'kernel': 'linear'}
accuracy 0.7575182907689778
precision 0.7578145068863561
recall 0.7575182907689778
fscore 0.7564940982370721
exec_time 5.709137201309204


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


Model version №3
params {'C': 1.0, 'gamma': 'scale', 'kernel': 'linear'}
accuracy 0.7584559340822224
precision 0.7589317844773887
recall 0.7584559340822224
fscore 0.7575092290863429
exec_time 6.3301331996917725


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


Model version №4
params {'C': 1.0, 'gamma': 'auto', 'kernel': 'linear'}
accuracy 0.7584559340822224
precision 0.7589317844773887
recall 0.7584559340822224
fscore 0.7575092290863429
exec_time 6.476785898208618


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


Model version №5
params {'C': 1.25, 'gamma': 'scale', 'kernel': 'linear'}
accuracy 0.7584540935616509
precision 0.7589605744384441
recall 0.7584540935616509
fscore 0.7575143800754446
exec_time 7.41474986076355


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


Model version №6
params {'C': 1.25, 'gamma': 'auto', 'kernel': 'linear'}
accuracy 0.7584540935616509
precision 0.7589605744384441
recall 0.7584540935616509
fscore 0.7575143800754446
exec_time 8.094722032546997


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


Model version №7
params {'C': 1.0, 'degree': 3, 'gamma': 'scale', 'kernel': 'poly'}
accuracy 0.7109729674785519
precision 0.8149905334845265
recall 0.7109729674785519
fscore 0.7232371877194845
exec_time 6.645483016967773


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


Model version №8
params {'C': 1.0, 'degree': 3, 'gamma': 'auto', 'kernel': 'poly'}
accuracy 0.5569432649258276
precision 0.7780057780544644
recall 0.5569432649258276
fscore 0.5882141721299154
exec_time 6.914137125015259


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


Model version №9
params {'C': 1.25, 'degree': 3, 'gamma': 'scale', 'kernel': 'poly'}
accuracy 0.7251107003087713
precision 0.821397294161832
recall 0.7251107003087713
fscore 0.7367419063422398
exec_time 8.198580980300903


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


Model version №10
params {'C': 1.25, 'degree': 3, 'gamma': 'auto', 'kernel': 'poly'}
accuracy 0.5686360301668375
precision 0.7874954898926315
recall 0.5686360301668375
fscore 0.5963555005526118
exec_time 6.911543130874634


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


Model version №11
params {'C': 1.5, 'degree': 3, 'gamma': 'scale', 'kernel': 'poly'}
accuracy 0.7389810290527876
precision 0.8282615963639754
recall 0.7389810290527876
fscore 0.7504539030624686
exec_time 8.032352924346924


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


Model version №12
params {'C': 1.5, 'degree': 3, 'gamma': 'auto', 'kernel': 'poly'}
accuracy 0.5794958921028682
precision 0.7895635053776671
recall 0.5794958921028682
fscore 0.6044096168733303
exec_time 6.760808229446411


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


Model version №13
params {'C': 1.0, 'gamma': 'scale', 'kernel': 'rbf'}
accuracy 0.8180435059743814
precision 0.8212260237235164
recall 0.8180435059743814
fscore 0.8174269658064276
exec_time 3.8983800411224365


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


Model version №14
params {'C': 1.0, 'gamma': 'auto', 'kernel': 'rbf'}
accuracy 0.7817702368903762
precision 0.7839301936065723
recall 0.7817702368903762
fscore 0.7806181015031971
exec_time 3.9311397075653076


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


Model version №15
params {'C': 1.25, 'gamma': 'scale', 'kernel': 'rbf'}
accuracy 0.8266405825208069
precision 0.8293059509755781
recall 0.8266405825208069
fscore 0.8259810127810215
exec_time 3.7493607997894287


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


Model version №16
params {'C': 1.25, 'gamma': 'auto', 'kernel': 'rbf'}
accuracy 0.7872646228829278
precision 0.7892973157993944
recall 0.7872646228829278
fscore 0.7861244118856
exec_time 3.8946681022644043


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


Model version №17
params {'C': 1.5, 'gamma': 'scale', 'kernel': 'rbf'}
accuracy 0.8333432147696346
precision 0.8361782536584718
recall 0.8333432147696346
fscore 0.832782672012491
exec_time 3.9636809825897217


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


Model version №18
params {'C': 1.5, 'gamma': 'auto', 'kernel': 'rbf'}
accuracy 0.7918677993445554
precision 0.7940265574619582
recall 0.7918677993445554
fscore 0.7908167540480893
exec_time 3.524508237838745


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


Model version №19
params {'C': 0.5, 'gamma': 'scale', 'kernel': 'sigmoid'}
accuracy 0.6609456639373699
precision 0.6725816566124317
recall 0.6609456639373699
fscore 0.6640911290865018
exec_time 3.570081949234009


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


Model version №20
params {'C': 0.5, 'gamma': 'auto', 'kernel': 'sigmoid'}
accuracy 0.7117636158898767
precision 0.7109639437673287
recall 0.7117636158898767
fscore 0.710056966839169
exec_time 4.474938869476318


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


Model version №21
params {'C': 0.75, 'gamma': 'scale', 'kernel': 'sigmoid'}
accuracy 0.6432489141082411
precision 0.6603731973627661
recall 0.6432489141082411
fscore 0.6487113128370509
exec_time 3.175370931625366


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


Model version №22
params {'C': 0.75, 'gamma': 'auto', 'kernel': 'sigmoid'}
accuracy 0.697974335133401
precision 0.6978018060832382
recall 0.697974335133401
fscore 0.697674053720097
exec_time 3.8829052448272705


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


Model version №23
params {'C': 1.0, 'gamma': 'scale', 'kernel': 'sigmoid'}
accuracy 0.6352340566259573
precision 0.6468693849345032
recall 0.6352340566259573
fscore 0.6385478336222832
exec_time 2.89493989944458


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


Model version №24
params {'C': 1.0, 'gamma': 'auto', 'kernel': 'sigmoid'}
accuracy 0.6913751677767334
precision 0.6899422174913368
recall 0.6913751677767334
fscore 0.6905091728505265
exec_time 3.6250030994415283


In [35]:
# Report the top model for each quality metric (exec_time is not in the list).
find_best_model_by_metrics(metrics_1, ["accuracy", "precision", "recall", "fscore"])

Metric accuracy: model model_17 with mean value 0.8333432147696346
Metric precision: model model_17 with mean value 0.8361782536584718
Metric recall: model model_17 with mean value 0.8333432147696346
Metric fscore: model model_17 with mean value 0.832782672012491


In [38]:
# Collect every metric of the selected model. Read from metrics_1, the
# grid_search result that was ranked in the previous cell, instead of the
# module-level `metrics` dict left over from the earlier inline loop.
best_model_name = "model_17"
best_model_metrics = {
    metric_name: per_model[best_model_name]
    for metric_name, per_model in metrics_1.items()
}

In [61]:
# Per-class metric arrays (one value per encoded class) plus the fit time.
best_model_metrics

{'accuracy': array([0.93953252, 0.95639247, 0.78165711, 0.621661  , 0.76255708,
        0.93825911]),
 'precision': array([0.90415648, 0.94654242, 0.89659295, 0.70874862, 0.64952463,
        0.91150442]),
 'recall': array([0.93953252, 0.95639247, 0.78165711, 0.621661  , 0.76255708,
        0.93825911]),
 'fscore': array([0.92150511, 0.95144195, 0.83518931, 0.66235446, 0.70151692,
        0.92468828]),
 'exec_time': 4.008533954620361}

# Пункт 3

In [48]:
# Element-wise feature transforms of X.
# np.log yields NaN for negative inputs and -inf for exact zeros; both are
# expected here, so the warnings are silenced and both are mapped to 0.0
# (fillna alone would leave -inf in place).
with np.errstate(invalid="ignore", divide="ignore"):
    X_log = X.apply(np.log)
X_log = X_log.replace([np.inf, -np.inf], np.nan).fillna(0.0)

# apply() already returns a new frame, so no explicit copy is needed.
X_sin = X.apply(np.sin)

In [52]:
# Metric names, in the order the values are zipped against them in fit_predict.
metrics_names = ["accuracy", "precision", "recall", "fscore", "exec_time"]

In [56]:
def fit_predict(X, y, model_params):
    """Fit an SVC with the given parameters and report its metrics.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix.
    y : pandas.Series
        Integer-encoded class labels (must support ``.tolist()``).
    model_params : dict
        Keyword arguments for ``SVC``.

    Returns
    -------
    dict
        "accuracy", "precision", "recall", "fscore" (per-class arrays) and
        "exec_time" (seconds spent in ``fit``).

    Notes
    -----
    The model is scored on the same ``X`` it was fitted on, so the metrics are
    training-set scores. The mean of each metric is printed as a side effect.
    """
    # Use the parameters that were passed in; the previous version ignored the
    # argument and always read the global model_params_1["model_17"].
    clf = SVC(**model_params)

    # Time only the fit; perf_counter is the clock intended for intervals.
    start_time = time.perf_counter()
    clf.fit(X, y)
    exec_time = time.perf_counter() - start_time

    y_pred = clf.predict(X)
    cm = confusion_matrix(y.tolist(), y_pred.tolist())
    pr, rec, fscore, acc = get_precision_recall_fscore_accuracy(cm)

    # Kept local so the function does not depend on a notebook-level variable.
    names = ["accuracy", "precision", "recall", "fscore", "exec_time"]
    metrics = dict()
    for metr, name in zip([acc, pr, rec, fscore, exec_time], names):
        metrics[name] = metr
        print(name, np.mean(metr))

    return metrics

In [58]:
# Refit the selected configuration on the log-transformed features.
log_model_metrics = fit_predict(X_log, y, model_params_1["model_17"])

  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


accuracy 0.8821543768388674
precision 0.8825037814110784
recall 0.8821543768388674
fscore 0.8816307546242369
exec_time 9.05287504196167


In [59]:
# Refit the selected configuration on the sine-transformed features.
sin_model_metrics = fit_predict(X_sin, y, model_params_1["model_17"])

  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


accuracy 0.875356590941268
precision 0.8778817492599628
recall 0.875356590941268
fscore 0.8750483402065377
exec_time 3.8480849266052246
