In [279]:
import numpy
import pandas
import scipy.stats as st
import matplotlib as mpl
import matplotlib.pyplot as plt
import math
import random
import sys
import numpy
import os
import scipy.stats
import math

In [280]:
# Extend the module search path with sibling project directories so that the
# project-local imports below can be resolved. The paths are relative, so they
# depend on the directory the kernel was started from.
# NOTE(review): presumably the `distributions` package imported in a later cell
# is also found through one of these entries — confirm.
sys.path.append("../continuous")
sys.path.append("../continuous/measurements")
sys.path.append("../utilities")
import ad_marsaglia as ad  # supplies ad_critical_value / ad_p_value used by test_anderson_darling
from measurements import MEASUREMENTS  # instantiated from the raw sample further down

In [281]:
# Apply the "ggplot" style sheet to matplotlib.
mpl.style.use("ggplot")

In [282]:
from distributions.alpha import ALPHA
from distributions.arcsine import ARCSINE
from distributions.argus import ARGUS
from distributions.beta import BETA
from distributions.beta_prime import BETA_PRIME
from distributions.beta_prime_4p import BETA_PRIME_4P
from distributions.bradford import BRADFORD
from distributions.burr import BURR
from distributions.burr_4p import BURR_4P
from distributions.cauchy import CAUCHY
from distributions.chi_square import CHI_SQUARE
from distributions.chi_square_3p import CHI_SQUARE_3P
from distributions.dagum import DAGUM
from distributions.dagum_4p import DAGUM_4P
from distributions.erlang import ERLANG
from distributions.erlang_3p import ERLANG_3P
from distributions.error_function import ERROR_FUNCTION
from distributions.exponential import EXPONENTIAL
from distributions.exponential_2p import EXPONENTIAL_2P
from distributions.f import F
from distributions.fatigue_life import FATIGUE_LIFE
from distributions.folded_normal import FOLDED_NORMAL
from distributions.frechet import FRECHET
from distributions.f_4p import F_4P
from distributions.gamma import GAMMA
from distributions.gamma_3p import GAMMA_3P
from distributions.generalized_extreme_value import GENERALIZED_EXTREME_VALUE
from distributions.generalized_gamma import GENERALIZED_GAMMA
from distributions.generalized_gamma_4p import GENERALIZED_GAMMA_4P
from distributions.generalized_logistic import GENERALIZED_LOGISTIC
from distributions.generalized_normal import GENERALIZED_NORMAL
from distributions.generalized_pareto import GENERALIZED_PARETO
from distributions.gibrat import GIBRAT
from distributions.gumbel_left import GUMBEL_LEFT
from distributions.gumbel_right import GUMBEL_RIGHT
from distributions.half_normal import HALF_NORMAL
from distributions.hyperbolic_secant import HYPERBOLIC_SECANT
from distributions.inverse_gamma import INVERSE_GAMMA
from distributions.inverse_gamma_3p import INVERSE_GAMMA_3P
from distributions.inverse_gaussian import INVERSE_GAUSSIAN
from distributions.inverse_gaussian_3p import INVERSE_GAUSSIAN_3P
from distributions.johnson_sb import JOHNSON_SB
from distributions.johnson_su import JOHNSON_SU
from distributions.kumaraswamy import KUMARASWAMY
from distributions.laplace import LAPLACE
from distributions.levy import LEVY
from distributions.loggamma import LOGGAMMA
from distributions.logistic import LOGISTIC
from distributions.loglogistic import LOGLOGISTIC
from distributions.loglogistic_3p import LOGLOGISTIC_3P
from distributions.lognormal import LOGNORMAL
from distributions.maxwell import MAXWELL
from distributions.moyal import MOYAL
from distributions.nakagami import NAKAGAMI
from distributions.nc_chi_square import NC_CHI_SQUARE
from distributions.nc_f import NC_F
from distributions.nc_t_student import NC_T_STUDENT
from distributions.normal import NORMAL
from distributions.pareto_first_kind import PARETO_FIRST_KIND
from distributions.pareto_second_kind import PARETO_SECOND_KIND
from distributions.pert import PERT
from distributions.power_function import POWER_FUNCTION
from distributions.rayleigh import RAYLEIGH
from distributions.reciprocal import RECIPROCAL
from distributions.rice import RICE
from distributions.semicircular import SEMICIRCULAR
from distributions.trapezoidal import TRAPEZOIDAL
from distributions.triangular import TRIANGULAR
from distributions.t_student import T_STUDENT
from distributions.t_student_3p import T_STUDENT_3P
from distributions.uniform import UNIFORM
from distributions.weibull import WEIBULL
from distributions.weibull_3p import WEIBULL_3P


In [283]:
# The distribution classes imported above. The fitting loop further down
# iterates over this list, tries to build each class from `measurements`, and
# runs the goodness-of-fit tests on every one that can be constructed.
_all_distributions = [
    ALPHA,
    ARCSINE,
    ARGUS,
    BETA,
    BETA_PRIME,
    BETA_PRIME_4P,
    BRADFORD,
    BURR,
    BURR_4P,
    CAUCHY,
    CHI_SQUARE,
    CHI_SQUARE_3P,
    DAGUM,
    DAGUM_4P,
    ERLANG,
    ERLANG_3P,
    ERROR_FUNCTION,
    EXPONENTIAL,
    EXPONENTIAL_2P,
    F,
    FATIGUE_LIFE,
    FOLDED_NORMAL,
    FRECHET,
    F_4P,
    GAMMA,
    GAMMA_3P,
    GENERALIZED_EXTREME_VALUE,
    GENERALIZED_GAMMA,
    GENERALIZED_GAMMA_4P,
    GENERALIZED_LOGISTIC,
    GENERALIZED_NORMAL,
    GENERALIZED_PARETO,
    GIBRAT,
    GUMBEL_LEFT,
    GUMBEL_RIGHT,
    HALF_NORMAL,
    HYPERBOLIC_SECANT,
    INVERSE_GAMMA,
    INVERSE_GAMMA_3P,
    INVERSE_GAUSSIAN,
    INVERSE_GAUSSIAN_3P,
    JOHNSON_SB,
    JOHNSON_SU,
    KUMARASWAMY,
    LAPLACE,
    LEVY,
    LOGGAMMA,
    LOGISTIC,
    LOGLOGISTIC,
    LOGLOGISTIC_3P,
    LOGNORMAL,
    MAXWELL,
    MOYAL,
    NAKAGAMI,
    NC_CHI_SQUARE,
    NC_F,
    NC_T_STUDENT,
    NORMAL,
    PARETO_FIRST_KIND,
    PARETO_SECOND_KIND,
    PERT,
    POWER_FUNCTION,
    RAYLEIGH,
    RECIPROCAL,
    RICE,
    SEMICIRCULAR,
    TRAPEZOIDAL,
    TRIANGULAR,
    T_STUDENT,
    T_STUDENT_3P,
    UNIFORM,
    WEIBULL,
    WEIBULL_3P,
]

In [284]:
def test_chi_square(data, distribution, measurements):
    ## Parameters and preparations
    N = measurements.length
    num_bins = measurements.num_bins
    frequencies, bin_edges = numpy.histogram(data, num_bins)
    freedom_degrees = num_bins - 1 - distribution.get_num_parameters()

    ## Calculation of errors
    errors = []
    for i, observed in enumerate(frequencies):
        lower = bin_edges[i]
        upper = bin_edges[i + 1]
        expected = N * (distribution.cdf(upper) - distribution.cdf(lower))
        errors.append(((observed - expected) ** 2) / expected)

    ## Calculation of indicators
    statistic_chi2 = sum(errors)
    critical_value = scipy.stats.chi2.ppf(0.95, freedom_degrees)
    p_value = 1 - scipy.stats.chi2.cdf(statistic_chi2, freedom_degrees)
    rejected = statistic_chi2 >= critical_value

    ## Construction of answer
    result_test_chi2 = {"test_statistic": statistic_chi2, "critical_value": critical_value, "p-value": p_value, "rejected": rejected}

    return result_test_chi2

In [285]:
def test_kolmogorov_smirnov(data, distribution, measurements):
    ## Parameters and preparations
    N = measurements.length
    data.sort()
    
    ## Calculation of errors
    errors = []
    for i in range(N):
        Sn = (i + 1) / N
        if i < N - 1:
            if (data[i] != data[i + 1]):
                Fn = distribution.cdf(data[i])
                errors.append(abs(Sn - Fn))
            else:
                Fn = 0
        else:
            Fn = distribution.cdf(data[i])
            errors.append(abs(Sn - Fn))
    
    ## Calculation of indicators
    statistic_ks = max(errors)
    critical_value = scipy.stats.kstwo.ppf(0.95, N)
    p_value = 1 -  scipy.stats.kstwo.cdf(statistic_ks, N)
    rejected = statistic_ks >= critical_value
    
    ## Construction of answer
    result_test_ks = {
        "test_statistic": statistic_ks, 
        "critical_value": critical_value, 
        "p-value": p_value,
        "rejected": rejected
    }
    
    return result_test_ks

In [286]:
def test_anderson_darling(data, distribution, measurements):
    """
    Anderson-Darling goodness-of-fit test of ``data`` against ``distribution``.

    With the sample sorted as ``x_1 <= ... <= x_N`` the statistic is
    ``A2 = -N - S`` where
    ``S = sum_k (2k - 1) / N * (ln F(x_k) + ln(1 - F(x_{N + 1 - k})))``.

    Parameters
    ----------
    data : sequence of float
        The observed sample. It is NOT modified; a sorted copy is used.
    distribution : object
        Fitted distribution exposing ``cdf(x)``.
    measurements : object
        Sample summary exposing ``length`` (the sample size ``N``).

    Returns
    -------
    dict
        ``"test_statistic"`` (A2), ``"critical_value"`` (from
        ``ad.ad_critical_value(0.95, N)``), ``"p-value"`` (from
        ``ad.ad_p_value(N, A2)``) and ``"rejected"`` (A2 >= critical value).

    Raises
    ------
    ValueError
        Raised by ``math.log`` when the fitted CDF evaluates to exactly 0 or 1
        at an observation (the logarithm is undefined there).
    """
    ## Parameters and preparations
    N = measurements.length
    ordered = sorted(data)  # sorted copy: the caller's data keeps its order
    # Evaluate the CDF once per observation; each value is used in two terms.
    cdf_values = [distribution.cdf(ordered[k]) for k in range(N)]

    ## Calculation S
    S = 0
    for k in range(N):
        c1 = math.log(cdf_values[k])
        c2 = math.log(1 - cdf_values[N - k - 1])
        c3 = (2 * (k + 1) - 1) / N
        S += c3 * (c1 + c2)

    ## Calculation of indicators
    A2 = -N - S
    critical_value = ad.ad_critical_value(0.95, N)
    p_value = ad.ad_p_value(N, A2)
    rejected = A2 >= critical_value

    ## Construction of answer
    result_test_ad = {
        "test_statistic": A2,
        "critical_value": critical_value,
        "p-value": p_value,
        "rejected": rejected,
    }

    return result_test_ad

In [287]:
# Load the sample: one observation per line, decimal commas are accepted.
path = "./data_1000/data_uniform.txt"
# The context manager closes the file handle as soon as the sample is read.
with open(path, "r") as sample_distribution_file:
    # Blank lines are skipped so a stray empty line does not abort the load.
    data = [float(x.replace(",", ".")) for x in sample_distribution_file.read().splitlines() if x.strip()]

In [288]:
# Summary object built from the sample. The goodness-of-fit functions read
# `length` and `num_bins` from it, and every distribution class in the fitting
# loop is constructed from it.
measurements = MEASUREMENTS(data)

In [289]:
## Calculate the density-normalised histogram and the midpoint of every bin
num_bins = measurements.doanes_formula()
frequencies, bin_edges = numpy.histogram(data, bins=num_bins, density=True)
central_values = [(left + right) / 2 for left, right in zip(bin_edges[:-1], bin_edges[1:])]


In [290]:
# Empty results table: one column per (test, indicator) pair. Rows are added
# later, one per distribution.
goodness_of_fit_tests = ["chi_square", "kolmogorov_smirnov", "anderson_darling"]
indicators = ["test_statistic", "critical_value", "p_value", "rejected"]
columns = pandas.MultiIndex.from_product([goodness_of_fit_tests, indicators])
df = pandas.DataFrame(columns=columns)
df


Unnamed: 0_level_0,chi_square,chi_square,chi_square,chi_square,kolmogorov_smirnov,kolmogorov_smirnov,kolmogorov_smirnov,kolmogorov_smirnov,anderson_darling,anderson_darling,anderson_darling,anderson_darling
Unnamed: 0_level_1,test_statistic,critical_value,p_value,rejected,test_statistic,critical_value,p_value,rejected,test_statistic,critical_value,p_value,rejected


In [291]:
def _store_test_result(table, row_name, test_name, result):
    """
    Copy one goodness-of-fit result into ``table`` if its statistic is usable.

    A result is stored only when ``result["test_statistic"]`` is finite (not
    NaN, not +/-inf) and strictly positive; otherwise ``table`` is left
    untouched. ``result`` is the dict returned by one of the ``test_*``
    functions, whose p-value key is spelled ``"p-value"``.
    """
    statistic = result["test_statistic"]
    if numpy.isfinite(statistic) and statistic > 0:
        table.loc[row_name, (test_name, "test_statistic")] = statistic
        table.loc[row_name, (test_name, "critical_value")] = result["critical_value"]
        table.loc[row_name, (test_name, "p_value")] = result["p-value"]
        table.loc[row_name, (test_name, "rejected")] = result["rejected"]


# Fit every candidate distribution to the sample, score it by the SSE between
# the histogram density and the fitted pdf at the bin centres, and record the
# three goodness-of-fit tests. All steps are best effort: a failure is reported
# on stderr and the loop carries on. `except Exception` (rather than a bare
# `except`) lets KeyboardInterrupt / SystemExit stop the run.
for distribution_class in _all_distributions:
    distribution_name = distribution_class.__name__.lower()
    print(distribution_name)

    validate_estimation = True
    sse = 0
    try:
        distribution = distribution_class(measurements)
        pdf_values = [distribution.pdf(c) for c in central_values]
        sse = numpy.sum(numpy.power(frequencies - pdf_values, 2.0))
    except Exception as error:
        validate_estimation = False
        print(f"{distribution_name}: fit failed ({error!r})", file=sys.stderr)
    print(sse)

    if validate_estimation and not math.isnan(sse):

        try:
            chi2_test = test_chi_square(data, distribution, measurements)
            _store_test_result(df, distribution_name, "chi_square", chi2_test)
        except Exception as error:
            print(f"{distribution_name}: chi_square test failed ({error!r})", file=sys.stderr)

        try:
            ks_test = test_kolmogorov_smirnov(data, distribution, measurements)
            _store_test_result(df, distribution_name, "kolmogorov_smirnov", ks_test)
        except Exception as error:
            print(f"{distribution_name}: kolmogorov_smirnov test failed ({error!r})", file=sys.stderr)

        try:
            ad_test = test_anderson_darling(data, distribution, measurements)
            _store_test_result(df, distribution_name, "anderson_darling", ad_test)
        except Exception as error:
            print(f"{distribution_name}: anderson_darling test failed ({error!r})", file=sys.stderr)

        # Only distributions with at least one stored test get the extra columns.
        if distribution_name in df.index:
            df.loc[distribution_name, "sse"] = sse
            df.loc[distribution_name, "parameters"] = str(distribution.parameters)

alpha
3.0729302103447764e-05


arcsine
1.9837428560801653e-05
argus
0.0001939270086836214
beta
1.3682834796754279e-06
beta_prime
nan
beta_prime_4p
nan
bradford
1.070358222107061e-06
burr
0.0001568807774201922
burr_4p
9.533669733172447e-05
cauchy
5.1303261273856086e-05
chi_square
0.0005636230139595848
chi_square_3p
2.7908289614167288e-05
dagum
1.873896147717291e-05
dagum_4p
nan
erlang
3.6498212889559444e-05
erlang_3p
3.04737917878035e-05
error_function
0.00012076955057664974
exponential
4.172031782677492e-05
exponential_2p
0.0008494946102717518
f
0.00019391577364802213
fatigue_life
2.9450746289075853e-05
folded_normal
2.8711293607913532e-05
frechet
0.00013550973928096895
f_4p
2.8052984514876085e-05
gamma
3.403443632318794e-05
gamma_3p
3.03421434723408e-05
generalized_extreme_value
0.00013499371766701037
generalized_gamma
4.003520862735667e-05
generalized_gamma_4p
0.00019370200571843343
generalized_logistic
4.597232056574045e-05
generalized_normal
1.717483917036383e-06
generalized_pareto
0.00042799808330483803
gibrat


In [292]:
# `distribution` is whatever object the fitting loop above bound last, i.e. the
# most recent distribution class whose constructor succeeded — not a specific,
# named distribution.
distribution.parameters

{'alpha': 0.9650271100993315,
 'beta': 69.95317166123613,
 'loc': 100.02302334919959}

In [293]:
# Results table filled by the fitting loop: one row per distribution for which
# at least one test produced a finite, positive statistic.
df

Unnamed: 0_level_0,chi_square,chi_square,chi_square,chi_square,kolmogorov_smirnov,kolmogorov_smirnov,kolmogorov_smirnov,kolmogorov_smirnov,anderson_darling,anderson_darling,anderson_darling,anderson_darling,sse,parameters
Unnamed: 0_level_1,test_statistic,critical_value,p_value,rejected,test_statistic,critical_value,p_value,rejected,test_statistic,critical_value,p_value,rejected,Unnamed: 13_level_1,Unnamed: 14_level_1
alpha,290.338581,15.507313,0.0,True,0.07019,0.042777,0.000099,True,13.340962,2.492436,0.000001,True,0.000031,"{'alpha': 12.625000283972085, 'loc': -753.2194..."
arcsine,179.874469,16.918978,0.0,True,0.108188,0.042777,0.0,True,34.184105,2.492436,0.000001,True,0.000020,"{'a': 50.00317844, 'b': 299.87618347}"
argus,177.105485,15.507313,0.0,True,0.104543,0.042777,0.0,True,21.346384,2.492436,0.000001,True,0.000194,"{'chi': 2.3436759169138693e-07, 'loc': 5.92639..."
bradford,5.532809,15.507313,0.699402,False,0.017499,0.042777,0.914017,False,0.528404,2.492436,0.717978,False,0.000001,"{'c': 0.20342480658762682, 'min': 50.00317844,..."
burr,,,,,0.220625,0.042777,0.0,True,,,,,0.000157,"{'A': 97.41625786483931, 'B': 558.785881033373..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
triangular,1028.082724,15.507313,0.0,True,0.140012,0.042777,0.0,True,109.399922,2.492436,0.000001,True,0.000075,"{'a': 50.00317844, 'b': 299.87618347, 'c': 163..."
t_student,30123477.623402,18.307038,0.0,True,0.9988,0.042777,0.0,True,9201.018079,2.492436,0.000001,True,0.000194,{'df': 2.000368754948072}
t_student_3p,282.906749,15.507313,0.0,True,0.064685,0.042777,0.000442,True,13.002016,2.492436,0.000001,True,0.956172,"{'df': 17002.726974809608, 'loc': 171.08629178..."
uniform,8.936,16.918978,0.443203,False,0.040571,0.042777,0.072337,False,2.019777,2.492436,0.089584,False,0.000002,"{'a': 50.004178429999996, 'b': 299.87518348000..."


In [294]:
# Same table ordered by ascending SSE (smallest histogram-vs-pdf error first).
# `sort_values` returns a new frame, so `df` itself keeps its order.
df.sort_values(by=["sse"])

Unnamed: 0_level_0,chi_square,chi_square,chi_square,chi_square,kolmogorov_smirnov,kolmogorov_smirnov,kolmogorov_smirnov,kolmogorov_smirnov,anderson_darling,anderson_darling,anderson_darling,anderson_darling,sse,parameters
Unnamed: 0_level_1,test_statistic,critical_value,p_value,rejected,test_statistic,critical_value,p_value,rejected,test_statistic,critical_value,p_value,rejected,Unnamed: 13_level_1,Unnamed: 14_level_1
bradford,5.532809,15.507313,0.699402,False,0.017499,0.042777,0.914017,False,0.528404,2.492436,0.717978,False,0.000001,"{'c': 0.20342480658762682, 'min': 50.00317844,..."
johnson_sb,4.411476,14.06714,0.731349,False,0.020937,0.042777,0.764815,False,0.594449,2.492436,0.653223,False,0.000001,"{'xi': 49.16353525319014, 'lambda': 251.968413..."
uniform,8.936,16.918978,0.443203,False,0.040571,0.042777,0.072337,False,2.019777,2.492436,0.089584,False,0.000002,"{'a': 50.004178429999996, 'b': 299.87518348000..."
generalized_normal,,,,,0.499,0.042777,0.0,True,383.557293,2.492436,0.000001,True,0.000002,"{'beta': 312577591.4330473, 'miu': 174.9396811..."
trapezoidal,8.936072,14.06714,0.257279,False,0.04057,0.042777,0.072341,False,1.996339,2.492436,0.092276,False,0.000002,"{'a': 50.00317844, 'b': 50.00317845123947, 'c'..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
generalized_pareto,,,,,1.741116,0.042777,0.0,True,,,,,0.000428,"{'c': 13.78952678634105, 'miu': 50.00317844, '..."
chi_square,11056094421070.847656,18.307038,0.0,True,0.352702,0.042777,0.0,True,2222.817488,2.492436,0.000001,True,0.000564,{'df': 171}
johnson_su,,,,,0.96498,0.042777,0.0,True,2042.321596,2.492436,0.000001,True,0.000729,"{'xi': -159.32024964322633, 'lambda': -221.094..."
exponential_2p,3926619.237933,16.918978,0.0,True,0.710748,0.042777,0.0,True,3292.081736,2.492436,0.000001,True,0.000849,"{'lambda': 0.05439362401420391, 'loc': 50.0040..."


In [295]:
# Keep the distributions that at least one of the three tests did NOT reject,
# ordered by ascending SSE. A missing (NaN) "rejected" cell never equals False,
# so a test without a stored result cannot qualify a row on its own.
chi_square_passed = df[("chi_square", "rejected")].eq(False)
kolmogorov_smirnov_passed = df[("kolmogorov_smirnov", "rejected")].eq(False)
anderson_darling_passed = df[("anderson_darling", "rejected")].eq(False)
dfx = df[chi_square_passed | kolmogorov_smirnov_passed | anderson_darling_passed].sort_values(by=["sse"])
dfx

Unnamed: 0_level_0,chi_square,chi_square,chi_square,chi_square,kolmogorov_smirnov,kolmogorov_smirnov,kolmogorov_smirnov,kolmogorov_smirnov,anderson_darling,anderson_darling,anderson_darling,anderson_darling,sse,parameters
Unnamed: 0_level_1,test_statistic,critical_value,p_value,rejected,test_statistic,critical_value,p_value,rejected,test_statistic,critical_value,p_value,rejected,Unnamed: 13_level_1,Unnamed: 14_level_1
bradford,5.532809,15.507313,0.699402,False,0.017499,0.042777,0.914017,False,0.528404,2.492436,0.717978,False,1e-06,"{'c': 0.20342480658762682, 'min': 50.00317844,..."
johnson_sb,4.411476,14.06714,0.731349,False,0.020937,0.042777,0.764815,False,0.594449,2.492436,0.653223,False,1e-06,"{'xi': 49.16353525319014, 'lambda': 251.968413..."
uniform,8.936,16.918978,0.443203,False,0.040571,0.042777,0.072337,False,2.019777,2.492436,0.089584,False,2e-06,"{'a': 50.004178429999996, 'b': 299.87518348000..."
trapezoidal,8.936072,14.06714,0.257279,False,0.04057,0.042777,0.072341,False,1.996339,2.492436,0.092276,False,2e-06,"{'a': 50.00317844, 'b': 50.00317845123947, 'c'..."


In [296]:
# Convert the DataFrame into a nested dictionary shaped as
# {distribution: {top-level column: {second-level column: value}}}.
# NOTE(review): "sse" and "parameters" have an empty second-level label, so the
# recorded output carries '' keys and NaN filler entries in every inner dict —
# confirm this is really the desired structure.
df_dict = dfx.stack().groupby(level=0).apply(lambda x: x.xs(x.name).to_dict()).to_dict()

print("\nDiccionario:")
df_dict


Diccionario:


{'bradford': {'anderson_darling': {'': nan,
   'critical_value': 2.4924360636714047,
   'p_value': 0.7179782014285963,
   'rejected': False,
   'test_statistic': 0.5284044217493147},
  'chi_square': {'': nan,
   'critical_value': 15.507313055865453,
   'p_value': 0.6994024206390292,
   'rejected': False,
   'test_statistic': 5.532808671158107},
  'kolmogorov_smirnov': {'': nan,
   'critical_value': 0.042776500461245,
   'p_value': 0.9140169119008502,
   'rejected': False,
   'test_statistic': 0.017498578086039607},
  'parameters': {'': "{'c': 0.20342480658762682, 'min': 50.00317844, 'max': 299.87618347}",
   'critical_value': nan,
   'p_value': nan,
   'rejected': nan,
   'test_statistic': nan},
  'sse': {'': 1.070358222107061e-06,
   'critical_value': nan,
   'p_value': nan,
   'rejected': nan,
   'test_statistic': nan}},
 'johnson_sb': {'anderson_darling': {'': nan,
   'critical_value': 2.4924360636714047,
   'p_value': 0.6532234046933394,
   'rejected': False,
   'test_statistic': 0