<a href="https://colab.research.google.com/github/maxGrigorenko/DistributionClassifier/blob/maxGrigorenko%2Fsecond_part/src/normal_laplace/experiments_second_part.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Классификация распределений. Вторая часть проекта
## Построение собственного классификатора

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
from itertools import product
from scipy import stats as st

from graph_common_functions import *
from distibution_functions import *

In [33]:
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder

class DistribituionClassifier:
    """Tell normal samples from Laplace samples using distance-graph features.

    For every generated sample a distance graph is built at each threshold in
    ``distances`` and five graph statistics are collected per threshold. The
    wrapped estimator ``clf`` is trained on the resulting feature table.

    Labels are encoded with ``LabelEncoder``, which sorts class names, so
    ``'laplace'`` is encoded as 0 and ``'normal'`` as 1. Precision and recall
    returned by ``estimate`` therefore treat ``'normal'`` as the positive class.

    Parameters
    ----------
    n : int
        Sample size forwarded to ``generate_normal`` / ``generate_laplace``.
    clf : estimator
        Any object exposing ``fit(X, y)`` and ``predict(X)``.
    sigma : float, default 1
        Parameter forwarded to ``generate_normal``.
    beta : float, default (1/2) ** 0.5
        Parameter forwarded to ``generate_laplace``.
    distances : iterable of float, optional
        Thresholds forwarded to ``distance_graph_constructor``; one group of
        feature columns is produced per threshold. Defaults to
        ``DEFAULT_DISTANCES``.

    Raises
    ------
    ValueError
        If ``distances`` is given but empty (no features could be built).
    """

    # Names of the statistics collected for every threshold, in column order.
    FEATURES = ('delta', 'mean_degree', 'max_degree', 'dominating_number', 'clique_number')
    # Thresholds used when the caller does not supply ``distances``.
    DEFAULT_DISTANCES = (0.3, 1.0, 2.0, 3.5)
    # Name of the label column in the generated table.
    TARGET = 'distribution'

    def __init__(self, n, clf, sigma=1, beta=(1/2) ** 0.5, distances=None):
        self.n = n
        self.sigma = sigma
        self.beta = beta
        self.clf = clf
        self.distances = tuple(self.DEFAULT_DISTANCES if distances is None else distances)
        if not self.distances:
            raise ValueError("distances must contain at least one threshold")
        # Set by ``fit``; reused by ``estimate`` so that train and test labels
        # are guaranteed to share one encoding.
        self.encoder = None

    def _column_names(self):
        """Return the feature column names followed by the label column."""
        columns = [
            f"{feature}_{i}"
            for i in range(len(self.distances))
            for feature in self.FEATURES
        ]
        columns.append(self.TARGET)
        return columns

    def make_data(self, number_of_experiments):
        """Generate a labelled feature table.

        Draws ``number_of_experiments`` samples from each of the two
        distributions ('normal' first, then 'laplace') and computes the graph
        statistics for every threshold in ``self.distances``.

        Parameters
        ----------
        number_of_experiments : int
            Number of samples generated per distribution.

        Returns
        -------
        pandas.DataFrame
            ``2 * number_of_experiments`` rows. Columns are named
            ``"<feature>_<i>"`` where ``i`` is the index of the threshold,
            followed by the string column ``"distribution"``. The column set
            is the same even when no rows are generated.
        """
        rows = []

        for distribution in ('normal', 'laplace'):
            for _ in range(number_of_experiments):
                if distribution == 'normal':
                    sample = generate_normal(self.sigma, self.n)
                else:
                    sample = generate_laplace(self.beta, self.n)

                row = {}
                for i, d in enumerate(self.distances):
                    g = distance_graph_constructor(sample, d)
                    values = (
                        g.compute_delta(),
                        g.compute_mean_degree(),
                        g.compute_max_degree(),
                        g.compute_dominating_number(d),
                        g.compute_clique_number(d),
                    )
                    for feature, value in zip(self.FEATURES, values):
                        row[f"{feature}_{i}"] = value

                row[self.TARGET] = distribution
                rows.append(row)

        # Build the frame once from the collected records; passing ``columns``
        # keeps the schema stable when ``rows`` is empty.
        return pd.DataFrame(rows, columns=self._column_names())

    def fit(self, number_of_experiments=1000):
        """Generate training data and fit the wrapped estimator.

        Parameters
        ----------
        number_of_experiments : int, default 1000
            Number of training samples generated per distribution.
        """
        data = self.make_data(number_of_experiments=number_of_experiments)
        X = data.drop(columns=self.TARGET)
        labels = data[self.TARGET]
        self.encoder = LabelEncoder().fit(labels)
        self.clf.fit(X, self.encoder.transform(labels))

    def predict(self, X):
        """Return the wrapped estimator's predictions for feature table ``X``.

        The values are whatever ``clf.predict`` returns; after ``fit`` these
        are the integer codes produced by the label encoder, not the
        distribution names.
        """
        return self.clf.predict(X)

    def estimate(self, number_of_test=100):
        """Score the fitted estimator on freshly generated data.

        Parameters
        ----------
        number_of_test : int, default 100
            Number of test samples generated per distribution.

        Returns
        -------
        dict
            Keys ``'accuracy'``, ``'precision'`` and ``'recall'``.
        """
        data = self.make_data(number_of_experiments=number_of_test)
        X_test = data.drop(columns=self.TARGET)
        labels = data[self.TARGET]

        encoder = self.encoder
        if encoder is None:
            # ``fit`` was not called on this wrapper (e.g. ``clf`` was trained
            # elsewhere): fall back to encoding the test labels on their own.
            encoder = LabelEncoder().fit(labels)

        y_test = encoder.transform(labels)
        y_pred = np.array(self.clf.predict(X_test))
        return {
            'accuracy': accuracy_score(y_test, y_pred),
            'precision': precision_score(y_test, y_pred),
            'recall': recall_score(y_test, y_pred)
        }


In [37]:
# Train on 5,000 generated samples per distribution, each of size n=100.
# The forest's seed is fixed so the model's own bootstrap / feature sampling
# is repeatable. NOTE(review): the samples come from generate_normal /
# generate_laplace, whose seeding is not visible here, so metrics may still
# vary between runs — confirm and seed the generators as well.
d_clf = DistribituionClassifier(n=100, clf=RandomForestClassifier(random_state=42))
d_clf.fit(number_of_experiments=5_000)

In [38]:
# Score the trained classifier on 1,000 freshly generated samples per
# distribution (2,000 rows in total).
d_clf.estimate(number_of_test=1_000)

{'accuracy': 0.951, 'precision': 0.9492031872509961, 'recall': 0.953}