In [21]:
import pandas as pd
import numpy as np
from scipy import stats
import scipy.optimize as optimize
from sympy.solvers import solve
from sympy import Symbol, log
from scipy.stats import genpareto, norm, uniform
from skgof import ks_test, cvm_test, ad_test

In [2]:
# Read max_value.csv into a DataFrame. The bare `doc` on the last line makes the
# notebook render the frame as this cell's output.
doc = pd.read_csv('max_value.csv')
doc

Unnamed: 0,all,Nitrides,Oxides,OxySulphides (complex),Sulphides
0,25.444124,6.100337,25.444124,11.997444,22.186109
1,33.86,10.65,33.86,16.71,24.08
2,31.553676,3.455058,31.553676,8.748672,28.105617
3,11032.140099,6.755921,25.485441,9.064078,11032.140099
4,57.325893,5.365266,57.325893,10.499416,40.298737
5,33.053769,5.299798,33.053769,8.627179,31.309305
6,44.92,4.99,44.92,10.36,35.04
7,45.125219,25.110689,45.125219,11.997396,27.665857
8,34.182777,8.627179,22.043353,14.318688,34.182777
9,77.28,9.69,49.9,10.83,77.28


In [18]:
def clean_data_with_MAD(col_data, threshold=3):
    """Drop outliers with the scaled median-absolute-deviation (MAD) rule.

    A value is kept when ``|value - median| / MAD < threshold``, where
    ``MAD = 1.4826 * median(|value - median|)``.

    The MAD is computed directly with NumPy, using the same 1.4826 scale that
    ``scipy.stats.median_absolute_deviation`` applied by default. That SciPy
    helper was deprecated and later removed, so calling it fails on current
    SciPy releases.

    Parameters
    ----------
    col_data : sequence, numpy array or pandas Series of numbers
        Observations to filter. Elements are taken by position, so a Series
        with a non-default index is handled correctly.
    threshold : float, default 3
        Cut-off on the scaled deviation (strict ``<``).

    Returns
    -------
    pandas.Series
        The retained values in their original order, with a fresh 0..n-1 index.
        An empty input gives an empty float Series. If the MAD is zero (more
        than half of the values are identical) no scale can be estimated, so
        the data is returned unfiltered instead of dividing by zero and
        discarding every row.
    """
    values = np.asarray(col_data)
    if values.size == 0:
        return pd.Series([], dtype=float)

    abs_dev = np.abs(values - np.median(values))
    mad = 1.4826 * np.median(abs_dev)
    if mad == 0:
        # Scaled deviations would be inf/nan; keep everything rather than nothing.
        return pd.Series(values.copy())

    # Boolean mask instead of label lookups: positional and index-agnostic.
    return pd.Series(values[abs_dev / mad < threshold])

def get_minus_mu_data(u0, data):
    """Return the exceedances of ``data`` over the threshold ``u0``.

    Each element strictly greater than ``u0`` contributes ``element - u0``;
    elements less than or equal to ``u0`` are skipped. Input order is kept and
    the result is a NumPy array (empty when nothing exceeds ``u0``).
    """
    return np.array([value - u0 for value in data if value > u0])

def get_Z(data, k, alpha):
    """Evaluate the generalized Pareto CDF at every value in ``data``.

    Parameters
    ----------
    data : array-like of numbers
        Points at which the CDF is evaluated.
    k : float
        Shape parameter passed to ``scipy.stats.genpareto``.
    alpha : float
        Scale parameter; the location is fixed at 0.

    Returns
    -------
    numpy.ndarray
        CDF values, one per input element, in input order.
    """
    # genpareto.cdf is vectorised, so one call replaces the per-element loop.
    return np.asarray(genpareto.cdf(np.asarray(data), k, loc=0, scale=alpha))

In [8]:
# Sort the 'all' column in ascending order, then drop MAD outliers. `data` ends
# up as the Series returned by clean_data_with_MAD.
data = clean_data_with_MAD(sorted(doc['all'].tolist()))

In [9]:
# Display the values that survived clean_data_with_MAD.
data

0     23.620000
1     25.444124
2     31.553676
3     33.053769
4     33.860000
5     34.182777
6     36.505256
7     42.230000
8     43.574590
9     44.360000
10    44.920000
11    45.125219
12    47.080000
13    51.220000
14    57.325893
15    62.620000
16    63.821745
17    63.929226
18    77.280000
19    81.174087
20    88.045831
dtype: float64

In [11]:
# Sort the raw 'all' column (kept in `col_data`), remove MAD outliers into
# `data`, and display the result.
col_data = doc['all'].tolist()
col_data.sort()
data = clean_data_with_MAD(col_data)
data

0     23.620000
1     25.444124
2     31.553676
3     33.053769
4     33.860000
5     34.182777
6     36.505256
7     42.230000
8     43.574590
9     44.360000
10    44.920000
11    45.125219
12    47.080000
13    51.220000
14    57.325893
15    62.620000
16    63.821745
17    63.929226
18    77.280000
19    81.174087
20    88.045831
dtype: float64

In [10]:
# n: number of observations left after cleaning.
# m: number of iterations used by the later `range(m)` loops, i.e. every
#    observation except the last one serves as a candidate threshold.
n = len(data)
m = n - 1

In [24]:
# Exceedances over the first (smallest, since `data` was built from a sorted
# list) observation, then a generalized Pareto fit with the location fixed at 0.
X = get_minus_mu_data(data[0], data)
res = genpareto.fit(X, floc=0)
# `res` is (shape, loc, scale): keep the shape as `k` and the scale as `alpha`.
k, alpha = res[0], res[2]

In [26]:
# Push the exceedances through the fitted GPD CDF, then show the transformed
# values together with how many there are.
Z = get_Z(X, k, alpha)
Z, len(Z)

(array([0.03886649, 0.16557544, 0.19584633, 0.21197501, 0.21840439,
        0.26419181, 0.37339449, 0.39826161, 0.41264533, 0.42283642,
        0.42655752, 0.46163409, 0.53366005, 0.6339466 , 0.71467674,
        0.73213476, 0.73367984, 0.90202992, 0.94055162, 0.99116901]), 20)

In [27]:
# Cramer-von Mises goodness-of-fit test of Z against Uniform(0, 1).
cvm_test(Z, uniform(0, 1))

GofResult(statistic=0.06895473215916077, pvalue=0.7636297786590774)

In [34]:
# Anderson-Darling goodness-of-fit test of Z against Uniform(0, 1); element [1]
# of the result is taken as the p-value (same slot one_round reads).
ad_test(Z, uniform(0, 1))[1]

0.8356794734405651

In [40]:
def one_round(u0, data):
    """Fit a GPD to the exceedances over ``u0`` and test the quality of the fit.

    The exceedances ``data - u0`` (for values above ``u0``) are fitted with
    ``genpareto`` at location 0 and mapped through the fitted CDF. The mapped
    values are then tested against Uniform(0, 1) with ``cvm_test`` and
    ``ad_test``.

    Both p-values are printed and returned as ``(p_w, p_a)``: first the
    ``cvm_test`` p-value, then the ``ad_test`` p-value.
    """
    exceedances = get_minus_mu_data(u0, data)
    shape, _, scale = genpareto.fit(exceedances, floc=0)
    transformed = get_Z(exceedances, shape, scale)

    _, p_w = cvm_test(transformed, uniform(0, 1))
    _, p_a = ad_test(transformed, uniform(0, 1))

    print(p_w, p_a)
    return p_w, p_a

In [45]:
# Use each of the first m observations as the threshold u0 and collect the two
# p-values that one_round returns (it also prints them, one line per round).
p_w_list, p_a_list = [], []
for i in range(m):
    p_w, p_a = one_round(data[i], data)
    p_w_list.append(p_w)
    p_a_list.append(p_a)

# Convert to arrays so later cells can slice and do arithmetic on them.
p_w_list, p_a_list = np.array(p_w_list), np.array(p_a_list)

0.7636297786590774 0.8356794734405651
0.6968489697601508 0.7159446643865592
0.8172828464522051 0.8782724428685935
0.7899867741039017 0.8328625720107135
0.7562135129484509 0.8265912413467977
0.6619442281656926 0.7491414715841764
0.5933585189847828 0.6243201537571534
0.5540367479289444 0.5160658577033204
0.5531252446239563 0.4058247450451735
0.1760573812218944 0.00670526587189102
0.5585163916534972 0.010309272149950255
0.5303994772094777 0.02068938701288181
0.32334785333029403 0.0
0.4407129034968411 0.017628326910053116
0.34328852912615393 0.011048810569262546
0.5354759897436578 0.0
0.7484364361905145 0.003438260650994618
0.7205894028334519 0.012015025501692422
0.6071088058392676 0.0003090038637676429
3.584285779290042e-08 3.584285779290042e-08


In [49]:
# Running statistic over the first j p-values in p_w_list:
#     kres_j = -(1/j) * sum_{i <= j} log(1 - p_i),   j = 1..m
# The loop index is deliberately not called `k`: that name already holds the
# fitted GPD shape parameter, and overwriting it here would corrupt any later
# re-run of a cell that passes `k` to get_Z.
kres_list = []
for n_tests in range(1, m + 1):
    kres = -1 / n_tests * np.sum(np.log(1 - p_w_list[:n_tests]))
    print(kres)
    kres_list.append(kres)

1.4423559686397156
1.3179400583303864
1.445232015838589
1.4740702043265026
1.4615486614289595
1.3987146164714779
1.3274444375142096
1.2624537234269584
1.2116785222622148
1.1098761089262608
1.083306861461632
1.056020697773258
1.0048343328651792
0.9745670482475142
0.9376299462635885
0.926949451592305
0.9536029887086578
0.9714624284793003
0.9695024356692047
0.9210273156778875


In [56]:
# 0-based position of the largest running statistic in kres_list.
# NOTE(review): this takes the arg-max. If an ordered stopping rule was intended
# (e.g. the largest index whose statistic stays below a chosen significance
# level), the selection needs to change — confirm the intended rule.
max_value = max(kres_list)
k_cap = kres_list.index(max_value)
k_cap

3

In [54]:
# NOTE(review): 3.3596052987838547 is a hard-coded number whose origin is not
# shown in this notebook — presumably pasted from an earlier result. Confirm it
# and replace it with the variable it came from so the cell survives a re-run.
3.3596052987838547*len(data)

70.55171127446096

In [67]:
# Symbolic unknown used as the variable in the sympy `solve` call.
x = Symbol('x')

sympy.core.mul.Mul

log(1 - 88.045831198687*x) + log(1 - 81.1740869389311*x) + log(1 - 77.28*x) + log(1 - 63.9292261535768*x) + log(1 - 63.8217449908651*x) + log(1 - 62.62*x) + log(1 - 57.3258926877592*x) + log(1 - 51.22*x) + log(1 - 47.08*x) + log(1 - 45.1252187450655*x) + log(1 - 44.92*x) + log(1 - 44.36*x) + log(1 - 43.5745902469986*x) + log(1 - 42.23*x) + log(1 - 36.505255851362*x) + log(1 - 34.1827770198136*x) + log(1 - 33.86*x) + log(1 - 33.0537685064112*x) + log(1 - 31.5536764398981*x) + log(1 - 25.4441235717141*x) + log(1 - 23.62*x) + 70.551711274461

In [None]:
# NOTE(review): `f` is not defined in any visible cell and this cell carries no
# execution count, so it will raise NameError on a fresh kernel unless `f` is
# defined elsewhere — confirm and add the definition before this call.
solve(f(x, np.array(data)), x)