# Вторая часть проекта. Разработка модели

## 3. Расчет ошибки I рода и мощности для модели логистической регрессии

In [1]:
import numpy as np
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import pairwise_distances
from dataclasses import dataclass
from tqdm import tqdm
from itertools import product
import sys
import os

from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

from visualisations import *
from metrics import *
from classifier import *

sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '../common_tools')))
from graphs import KNN_Graph, Distance_Graph
from characterisctics_applied import *

In [2]:
columns = ['n', 'Accuracy', 'Precision', 'Recall', 'F1 Score', 'ROC AUC', 'I_error', 'Power']

In [42]:
def get_model_quality(y_pred, y_test, title, n):

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred)

    I_error = 1 - precision
    power = recall

    return {name : value for value, name in zip([n, accuracy, precision, recall, f1, roc_auc, I_error, power], columns)}

In [4]:
def fit_logistic_regression_model(n, observations_count=1000):
    clfr = DistibutionClassifier(n=n, observations_count=observations_count)
    clfr.generate_important_chars_points()
    X_train, y_train = clfr.get_points_dataset()

    model = Pipeline([
        ('scaler', StandardScaler()),
        ('logistic_regression', LogisticRegression())
    ])
    model.fit(X_train, y_train.to_numpy().flatten().ravel())

    return model

In [47]:
quality = pd.DataFrame(columns=columns)

In [None]:
for n, m in zip([50, 100, 500], [700, 200, 1500]):
    model = fit_logistic_regression_model(n, m)

    for _ in range(10):
        clfr_tester = DistibutionClassifier(n=n, observations_count=200)
        clfr_tester.generate_important_chars_points()
        X_test, y_test = clfr_tester.get_points_dataset()

        if n > 50:
            y_pred = model.predict(X_test)
        else:
            y_pred_proba = model.predict_proba(X_test)
            y_pred = (y_pred_proba[:, 1] > 0.8).astype(int)
        quality_row = get_model_quality(y_pred, y_test, f"Random forest model metrics (n = {n})", n)

        quality.loc[len(quality)] = list(quality_row.values())

In [59]:
quality[quality['n'] == 50].describe().iloc[[3, 1, 7], [6, 7]]

Unnamed: 0,I_error,Power
min,0.019868,0.615
mean,0.041746,0.6705
max,0.058824,0.74


In [60]:
quality[quality['n'] == 100].describe().iloc[[3, 1, 7], [6, 7]]

Unnamed: 0,I_error,Power
min,0.0,0.945
mean,0.014645,0.969
max,0.029557,0.985


In [61]:
quality[quality['n'] == 500].describe().iloc[[3, 1, 7], [6, 7]]

Unnamed: 0,I_error,Power
min,0.0,1.0
mean,0.0,1.0
max,0.0,1.0
