In [None]:
import time
import math

import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from sklearn.model_selection import ParameterGrid
from sklearn.metrics import precision_recall_fscore_support

from tqdm.notebook import tqdm

# Пункт 1 и 2

In [None]:
# Load the embeddings table. The file is tab-separated and has no header row,
# so pandas assigns integer column labels; index_col=False keeps the first
# column as regular data instead of using it as the index.
df = pd.read_csv("../assets/annotated-corpus/test-embeddings.tsv", sep="\t", header=None, index_col=False)

In [None]:
# Print a summary of the loaded frame (column dtypes, non-null counts, memory).
df.info()

In [None]:
# Derive the class name from column 0: split once on the LAST underscore and
# keep the left part.
# NOTE(review): assumes column 0 holds identifiers shaped like "<class>_<suffix>" - confirm.
df["target"] = df[0].str.rsplit("_", n=1, expand=True)[0]
# Preview the first rows, now including the new "target" column.
df.head()

In [None]:
# Map the string class names to integer codes; the fitted encoder is kept so
# the codes can be translated back to names later.
label_encoder = LabelEncoder()
df["target_enc"] = label_encoder.fit_transform(df["target"])

In [None]:
# Show each class name next to its integer code to sanity-check the encoding.
df[["target", "target_enc"]]

In [None]:
# Distinct class names and distinct integer codes, displayed as a tuple.
df["target"].unique(), df["target_enc"].unique()

In [None]:
def confusion_matrix(true, pred):
    """Build a confusion matrix from two equally long label sequences.

    Rows correspond to true labels and columns to predicted labels. Both axes
    follow the sorted order of the distinct labels found in ``true`` and
    ``pred``; for integer labels ``0..C-1`` the label value is therefore also
    its row/column index.

    Args:
        true: 1-D sequence (list, tuple, array, Series) of ground-truth labels.
        pred: 1-D sequence of predicted labels with the same length as ``true``.

    Returns:
        ``numpy.ndarray`` of shape ``(C, C)`` with float dtype, where entry
        ``[i, j]`` is the number of samples whose true label is the i-th class
        and whose predicted label is the j-th class.

    Raises:
        ValueError: if the inputs are not one-dimensional or differ in length.
    """
    true_arr = np.asarray(true)
    pred_arr = np.asarray(pred)
    if true_arr.ndim != 1 or pred_arr.ndim != 1:
        raise ValueError("true and pred must be one-dimensional sequences")
    if true_arr.shape[0] != pred_arr.shape[0]:
        raise ValueError(
            f"true and pred must have the same length, got "
            f"{true_arr.shape[0]} and {pred_arr.shape[0]}"
        )

    # Sorted distinct labels seen in either sequence define the matrix axes.
    classes = np.union1d(true_arr, pred_arr)
    mat = np.zeros((classes.size, classes.size))
    if classes.size == 0:
        return mat

    # Translate labels to axis positions so that arbitrary (non-contiguous)
    # label values are counted instead of being silently dropped.
    rows = np.searchsorted(classes, true_arr)
    cols = np.searchsorted(classes, pred_arr)

    # np.add.at accumulates repeated (row, col) pairs; plain fancy-index
    # assignment would count each pair only once.
    np.add.at(mat, (rows, cols), 1)
    return mat

In [None]:
def get_precision_recall_fscore_accuracy(cm, beta=1.0):
    """Compute per-class precision/recall/F-score and overall accuracy.

    Args:
        cm: square confusion matrix (array-like) with true classes on the rows
            and predicted classes on the columns.
        beta: weight of recall relative to precision in the F-score
            (``1.0`` gives the F1 score).

    Returns:
        Tuple ``(precision, recall, fscore, accuracy)``. The first three are
        1-D float arrays with one value per class. ``accuracy`` is a single
        float: correctly classified samples divided by all samples. Ratios
        whose denominator is zero (a class that is never predicted or never
        present) are reported as ``0.0`` rather than NaN.
    """
    cm = np.asarray(cm, dtype=float)

    def _safe_divide(numerator, denominator):
        # Element-wise division that yields 0.0 wherever the denominator is 0,
        # so averages and rankings built on the result are never NaN.
        result = np.zeros_like(denominator, dtype=float)
        np.divide(numerator, denominator, out=result, where=denominator != 0)
        return result

    true_pos = np.diag(cm)
    predicted_per_class = cm.sum(axis=0)  # TP + FP for each class
    actual_per_class = cm.sum(axis=1)     # TP + FN for each class

    precision = _safe_divide(true_pos, predicted_per_class)
    recall = _safe_divide(true_pos, actual_per_class)

    beta_sq = beta ** 2
    fscore = _safe_divide(
        (1 + beta_sq) * recall * precision,
        beta_sq * precision + recall,
    )

    # Overall accuracy = trace / total. Dividing TP by the row sums would just
    # reproduce the per-class recall under a different name.
    total = cm.sum()
    accuracy = float(true_pos.sum() / total) if total else 0.0

    return precision, recall, fscore, accuracy

In [None]:
def recall_precision(matrix_df, level = 'micro'):
    """Compute aggregated recall and precision, in percent, from a confusion matrix.

    Args:
        matrix_df: confusion matrix exposing ``to_numpy()`` (e.g. a pandas
            DataFrame), true classes on the rows and predictions on the columns.
        level: averaging mode.
            ``'micro'``    - pool all classes: sum(TP) / sum(TP + FN or FP).
            ``'macro'``    - unweighted mean of the per-class ratios.
            ``'weighted'`` - per-class ratios weighted by the true class share.

    Returns:
        Tuple ``(recall, precision)`` expressed as percentages. Per-class
        ratios with a zero denominator count as 0; an all-zero matrix yields
        ``(0.0, 0.0)``.

    Raises:
        ValueError: if ``level`` is not one of the supported modes.
    """
    if level not in ('micro', 'macro', 'weighted'):
        raise ValueError(
            f"level must be 'micro', 'macro' or 'weighted', got {level!r}"
        )

    arr = matrix_df.to_numpy().astype(float)

    rows = arr.sum(axis=1)      # TP(i) + FN(i) for each class
    columns = arr.sum(axis=0)   # TP(i) + FP(i) for each class
    diagonals = np.diag(arr)    # TP(i)
    total = arr.sum()           # total number of instances

    if total == 0 or len(diagonals) == 0:
        return 0.0, 0.0

    def _per_class(denominator):
        # TP(i) / denominator(i), with 0 where the class has no support.
        ratio = np.zeros_like(denominator, dtype=float)
        np.divide(diagonals, denominator, out=ratio, where=denominator != 0)
        return ratio

    if level == 'micro':
        # sum of TP(i) / sum of (TP(i) + FN(i))
        recall = diagonals.sum() * 100 / rows.sum()
        # sum of TP(i) / sum of (TP(i) + FP(i))
        precision = diagonals.sum() * 100 / columns.sum()
    elif level == 'macro':
        # sum of recall(i) / c
        recall = _per_class(rows).sum() * 100 / len(diagonals)
        # sum of precision(i) / c
        precision = _per_class(columns).sum() * 100 / len(diagonals)
    else:  # 'weighted'
        class_share = rows / total
        # sum of recall(i) * true proportion of the class
        recall = (_per_class(rows) * class_share).sum() * 100
        # sum of precision(i) * true proportion of the class
        precision = (_per_class(columns) * class_share).sum() * 100

    return recall, precision


def compute_f_score(recall, precision, beta = 1.0):
    """Combine recall and precision into the F-beta score.

    Args:
        recall: recall value (scalar or numpy array), on any consistent scale
            (fractions or percentages).
        precision: precision value on the same scale and shape as ``recall``.
        beta: weight of recall relative to precision (``1.0`` gives F1).

    Returns:
        ``(1 + beta^2) * recall * precision / (beta^2 * precision + recall)``
        on the same scale as the inputs. Where the denominator is zero (both
        inputs are zero) the score is ``0.0`` instead of an error or NaN.
    """
    beta_sq = math.pow(beta, 2)
    numerator = (1 + beta_sq) * recall * precision
    denominator = (beta_sq * precision) + recall

    if np.ndim(denominator) == 0:
        # Scalar inputs: plain Python floats would raise ZeroDivisionError.
        return numerator / denominator if denominator != 0 else 0.0

    # Array inputs: divide element-wise, leaving 0.0 where undefined.
    numerator = np.asarray(numerator, dtype=float)
    denominator = np.asarray(denominator, dtype=float)
    score = np.zeros_like(denominator)
    np.divide(numerator, denominator, out=score, where=denominator != 0)
    return score


def compute_accuracy(matrix_df, predictions):
    """Return the percentage of correct predictions.

    The number of correct predictions is the trace (sum of the main diagonal)
    of the confusion matrix ``matrix_df``; it is divided by the number of
    entries in ``predictions`` and scaled to percent.
    """
    correct_count = matrix_df.to_numpy().trace()
    sample_count = len(predictions)
    return correct_count * 100 / sample_count


def get_precision_recall_fscore_accuracy_v2(cm, level="macro", beta=1.0):
    """Compute aggregated precision, recall, F-score and accuracy in percent.

    Args:
        cm: confusion matrix exposing ``to_numpy()`` (e.g. a pandas DataFrame),
            true classes on the rows and predictions on the columns.
        level: averaging mode forwarded to ``recall_precision``
            (``'micro'``, ``'macro'`` or ``'weighted'``).
        beta: weight of recall relative to precision in the F-score.

    Returns:
        Tuple ``(precision, recall, fscore, accuracy)``, all as percentages.
        Accuracy is the matrix trace divided by the total number of samples
        counted in the matrix (``0.0`` for an all-zero matrix).
    """
    recall, precision = recall_precision(cm, level)
    fscore = compute_f_score(recall, precision, beta)

    counts = cm.to_numpy()
    total = counts.sum()
    accuracy = counts.trace() * 100 / total if total else 0.0

    return precision, recall, fscore, accuracy

In [None]:
# Hyper-parameter search space for SVC: one sub-grid per kernel type.
# The iteration order of the resulting grid determines the model numbering
# used downstream ("model_1", "model_2", ...), so reordering or resizing these
# lists changes which configuration a given model name refers to.
params = [
    { 
        "kernel": ["linear"],
        "C": [0.75, 1.0, 1.25],
        # NOTE(review): per the scikit-learn docs gamma only applies to the
        # rbf/poly/sigmoid kernels, so each linear C value is presumably
        # fitted twice with identical results - confirm before trimming.
        "gamma": ["scale", "auto"]
    },
    { 
        "kernel": ["poly"],
        "degree": [3],
        "C": [1.0, 1.25, 1.5],
        "gamma": ["scale", "auto"]
        # Not searched (left disabled): "class_weight": [None, "balanced"]
    },
    {
        "kernel": ["rbf"],
        "C": [1.0, 1.25, 1.5],
        "gamma": ["scale", "auto"]
        # Not searched (left disabled): "class_weight": [None, "balanced"]
    },
    {
        "kernel": ["sigmoid"],
        "C": [0.5, 0.75, 1.0],
        "gamma": ["scale", "auto"]
    }
]

# Expand the list of sub-grids into an iterable of concrete parameter dicts.
param_grid = ParameterGrid(params)

In [None]:
# Feature matrix: every column except column 0 (the source of the class name)
# and the two label columns added above.
X = df[df.columns.difference([0, 'target', 'target_enc'])]
# Target vector: the integer-encoded class labels.
y = df["target_enc"]

In [None]:
# Sweep over every configuration in param_grid, fit an SVC and record its
# metrics in the module-level `metrics` dict, keyed by metric name and then by
# "model_<n>" (n is the 1-based position of the configuration in the grid).
# Predictions are made on the same X used for fitting, so these are
# training-set scores.
metrics = {
    "accuracy": dict(),
    "precision": dict(),
    "recall": dict(),
    "fscore" : dict(),
    "exec_time": dict()
}

metrics_names = ["accuracy", "precision", "recall", "fscore", "exec_time"]
# Wrap the grid itself (it has a length) so the progress bar can show a total;
# wrapping enumerate(...) would hide the length from tqdm.
for i, param in enumerate(tqdm(param_grid)):
    clf = SVC(**param)

    # perf_counter is a monotonic high-resolution clock meant for measuring
    # durations; only the fit call is timed.
    start_time = time.perf_counter()
    clf.fit(X, y)
    exec_time = time.perf_counter() - start_time

    y_pred = clf.predict(X)

    cm = confusion_matrix(y.tolist(), y_pred.tolist())
    pr, rec, fscore, acc = get_precision_recall_fscore_accuracy(cm)

    print(f"Model version №{i + 1}")
    print("params", param)
    # The value order must match metrics_names.
    for metr, name in zip([acc, pr, rec, fscore, exec_time], metrics_names):
        metrics[name][f"model_{i + 1}"] = metr
        print(name, np.mean(metr))

In [None]:
def grid_search(X, y, param_grid):
    """Fit one SVC per parameter combination and collect its metrics.

    Each model is fitted on ``(X, y)`` and evaluated on the same ``X``, so the
    reported values are training-set scores. Progress and the per-model means
    are printed as the search runs.

    Args:
        X: feature matrix accepted by ``SVC.fit``.
        y: target labels; must provide ``tolist()`` (e.g. a pandas Series).
        param_grid: sized iterable of keyword-argument dicts for ``SVC``
            (e.g. a ``ParameterGrid``).

    Returns:
        Tuple ``(metrics, model_params)``:
        ``metrics`` maps metric name -> {"model_<n>": value}, where the value
        is whatever ``get_precision_recall_fscore_accuracy`` returned for that
        metric, or the fit time in seconds for ``"exec_time"``;
        ``model_params`` maps "model_<n>" -> the parameter dict used, with n
        the 1-based position of the combination in ``param_grid``.
    """
    metrics_names = ["accuracy", "precision", "recall", "fscore", "exec_time"]
    metrics = {name: dict() for name in metrics_names}
    model_params = dict()

    # Wrap the grid itself (it has a length) so the progress bar can show a
    # total; wrapping enumerate(...) would hide the length from tqdm.
    for i, param in enumerate(tqdm(param_grid)):
        clf = SVC(**param)

        # perf_counter is a monotonic high-resolution clock meant for
        # measuring durations; only the fit call is timed.
        start_time = time.perf_counter()
        clf.fit(X, y)
        exec_time = time.perf_counter() - start_time

        y_pred = clf.predict(X)

        cm = confusion_matrix(y.tolist(), y_pred.tolist())
        pr, rec, fscore, acc = get_precision_recall_fscore_accuracy(cm)

        print(f"Model version №{i + 1}")
        print("params", param)
        # The value order must match metrics_names.
        for metr, name in zip([acc, pr, rec, fscore, exec_time], metrics_names):
            metrics[name][f"model_{i + 1}"] = metr
            print(name, np.mean(metr))

        model_params[f"model_{i + 1}"] = param

    return metrics, model_params
    

In [None]:
def find_best_model_by_metrics(metric_model, metrics_names):
    """Print, for each requested metric, the model with the highest mean value.

    Args:
        metric_model: mapping metric name -> {model name: value}, where a
            value is a scalar or an array-like of per-class scores.
        metrics_names: metric names to report; each must be a key of
            ``metric_model``.

    Returns:
        None. One line per metric is printed. Models whose mean is NaN are
        ranked below every model with a finite mean.
    """
    def _ranking_score(item):
        # NaN never compares greater or smaller than anything, which makes
        # max() order-dependent; push NaN means to the bottom instead.
        mean_value = np.mean(item[1])
        return -np.inf if np.isnan(mean_value) else mean_value

    for name in metrics_names:
        per_model = metric_model[name]
        if not per_model:
            # max() on an empty mapping would raise ValueError.
            print(f"Metric {name}: no models to compare")
            continue
        k, v = max(per_model.items(), key=_ranking_score)
        print(f"Metric {name}: model {k} with mean value {np.mean(v)}")
    

In [None]:
# Run the full sweep through grid_search; keep both the per-model metrics and
# the parameter dict behind each "model_<n>" name.
metrics_1, model_params_1 = grid_search(X, y, param_grid)

In [None]:
# Report the top model per quality metric (exec_time is deliberately left out
# of the list, so it is not ranked).
find_best_model_by_metrics(metrics_1, ["accuracy", "precision", "recall", "fscore"])

In [None]:
# Collect every metric recorded for "model_17" from the results returned by
# grid_search (metrics_1) - the same results the best-model report above was
# computed from. Metrics without an entry for that model are skipped.
best_model_metrics = {
    metric_name: per_model["model_17"]
    for metric_name, per_model in metrics_1.items()
    if "model_17" in per_model
}

In [None]:
# Display the metrics gathered for the selected model.
best_model_metrics

# Пункт 3

In [None]:
# Element-wise feature transforms; `apply` returns a new frame, so X itself is
# left untouched and no explicit copy is needed.
# log is undefined for negative inputs (NaN) and gives -inf for exact zeros;
# both are mapped to 0.0 so the transformed matrix contains only finite values.
with np.errstate(divide="ignore", invalid="ignore"):
    X_log = X.apply(np.log)
X_log = X_log.replace([np.inf, -np.inf], np.nan).fillna(0.0)
X_sin = X.apply(np.sin)

In [None]:
# Metric names, in reporting order.
metrics_names = ["accuracy", "precision", "recall", "fscore", "exec_time"]

In [None]:
def fit_predict(X, y, model_params):
    """Fit a single SVC with the given parameters and report its metrics.

    The model is fitted on ``(X, y)`` and evaluated on the same ``X``, so the
    values are training-set scores. The mean of each metric is printed.

    Args:
        X: feature matrix accepted by ``SVC.fit``.
        y: target labels; must provide ``tolist()`` (e.g. a pandas Series).
        model_params: keyword arguments for the ``SVC`` constructor.

    Returns:
        Dict with keys ``"accuracy"``, ``"precision"``, ``"recall"``,
        ``"fscore"`` (as returned by ``get_precision_recall_fscore_accuracy``)
        and ``"exec_time"`` (fit duration in seconds).
    """
    # Use the parameters passed in, not a hard-coded global configuration.
    clf = SVC(**model_params)

    # Only the fit call is timed; perf_counter is a monotonic clock meant for
    # measuring durations.
    start_time = time.perf_counter()
    clf.fit(X, y)
    exec_time = time.perf_counter() - start_time

    y_pred = clf.predict(X)
    cm = confusion_matrix(y.tolist(), y_pred.tolist())
    pr, rec, fscore, acc = get_precision_recall_fscore_accuracy(cm)

    metrics = {
        "accuracy": acc,
        "precision": pr,
        "recall": rec,
        "fscore": fscore,
        "exec_time": exec_time,
    }
    for name, metr in metrics.items():
        print(name, np.mean(metr))

    return metrics

In [None]:
# Re-fit the "model_17" configuration on the log-transformed features.
log_model_metrics = fit_predict(X_log, y, model_params_1["model_17"])

In [None]:
# Re-fit the "model_17" configuration on the sine-transformed features.
sin_model_metrics = fit_predict(X_sin, y, model_params_1["model_17"])