<a href="https://colab.research.google.com/github/maxGrigorenko/DistributionClassifier/blob/maxGrigorenko%2Ffirst_part/src/experiments_first_part_3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Классификация распределений. Первая часть проекта
## 3. Построение множества $\mathscr{A}$ и оценка ошибки


In [11]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
from itertools import product
from scipy import stats as st


from graph_common_functions import *
from distibution_functions import *

In [3]:
sigma = 1
beta = (1/2) ** 0.5

In [38]:
def construct_A(n, d, number_of_experiments=1000, alpha=0.05, verbose=True, from_err2=False):
    A = []

    err = 1

    while err > alpha:
        t = 0
        f = 0
        results = []
        for t in range(number_of_experiments):
            normal_array = generate_normal(sigma, n)
            g = distance_graph_constructor(normal_array, d)
            dominating_number = g.compute_dominating_number(d)
            if dominating_number in A:
                t += 1
            else:
                f += 1
                results.append(dominating_number)

        err = f/number_of_experiments
        if verbose:
            print(f"error={err}")

        if len(results) == 0:
            return A
        res = st.mode(results)
        A.append(int(res.mode))

    return sorted(A)

In [42]:
def estimate_A(n, d, A, number_of_experiments=1000):
    t = 0
    f = 0
    for _ in range(number_of_experiments):
        laplace_array = generate_laplace(beta, n)
        g = distance_graph_constructor(laplace_array, d)
        dominating_number = g.compute_dominating_number(d)
        if dominating_number not in A:
            t += 1
        else:
            f += 1

    return t/number_of_experiments

for n in range(5, 101, 5):
    best_A = []
    best_d = 0.0
    max_p = 0
    for d_inx in range(1, 40, 3):
        d = d_inx/10
        A = construct_A(n=n, d=d, number_of_experiments=1000, alpha=0.05, verbose=False)
        p = estimate_A(n, d, A)
        if p >= max_p:
            best_A = A
            best_d = d
            max_p = p

    print(f'n={n}, max_p={max_p}, best_d={best_d}, best_A={best_A}')

n=5, max_p=0.023, best_d=3.7, best_A=[1]
n=10, max_p=0.044, best_d=3.1, best_A=[1]
n=15, max_p=0.038, best_d=3.4, best_A=[1]
n=20, max_p=0.046, best_d=3.4, best_A=[1]
n=25, max_p=0.033, best_d=0.7, best_A=[2, 3, 4, 5]
n=30, max_p=0.08, best_d=3.4, best_A=[1]
n=35, max_p=0.087, best_d=3.4, best_A=[1]
n=40, max_p=0.08, best_d=1.3, best_A=[1, 2, 3]
n=45, max_p=0.081, best_d=3.7, best_A=[1]
n=50, max_p=0.1, best_d=0.4, best_A=[4, 5, 6, 7, 8]
n=55, max_p=0.087, best_d=3.7, best_A=[1]
n=60, max_p=0.099, best_d=3.7, best_A=[1]
n=65, max_p=0.08, best_d=2.2, best_A=[2, 1]
n=70, max_p=0.194, best_d=1.9, best_A=[1, 2]
n=75, max_p=0.199, best_d=1.9, best_A=[1, 2]
n=80, max_p=0.205, best_d=1.9, best_A=[1, 2]
n=85, max_p=0.226, best_d=1.9, best_A=[1, 2]
n=90, max_p=0.229, best_d=1.9, best_A=[1, 2]
n=95, max_p=0.237, best_d=1.9, best_A=[1, 2]
n=100, max_p=0.233, best_d=1.9, best_A=[1, 2]


Видно, что даже при рассмотрении множества различных d, мощность критерия не больше 25%, а при $n < 50$ не превосходит и 10%, что говорит о малой эффективности классификатора.