In [None]:
# Вычисления
import numpy as np
import pandas as pd
import scipy
from sklearn.mixture import GaussianMixture
# Визуализация
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
%matplotlib notebook

In [None]:
import lab1

In [None]:
# Read the training table; every column except `floor` is loaded.
df = pd.read_csv(
    'data/raifhack_train.csv',
    usecols=lambda column: column != 'floor',
)

## Модель смеси

In [None]:
# Reproducible 10% subsample of the `osm_subway_closest_dist` column,
# moved to the log scale in the same chained expression.
bi_var = (
    df['osm_subway_closest_dist']
    .sample(frac=0.1, random_state=0)
    .apply(np.log)
)

In [None]:
# Histogram of the log-transformed subsample (nbins=50, legend hidden).
fig = px.histogram(data_frame=bi_var, template='plotly_white', nbins=50)
fig.update_layout(showlegend=False)
fig.show()

In [None]:
# Same sample drawn with seaborn: unfilled 30-bin histogram plus a KDE curve.
sns.set_style('whitegrid')
plot = sns.histplot(
    data=bi_var,
    bins=30,
    kde=True,
    fill=False,
)
# Recolour the first line on the axes to black.
plot.lines[0].set_color('black')
plot

**Количество компонент смотрим по BIC**

In [None]:
# Reshape the sample into a single-column 2-D array for GaussianMixture.
gm_x = bi_var.values.reshape(-1, 1)

# Candidate numbers of mixture components: 1 through 8.
gm_search = [*range(1, 9)]

# Fit one mixture per candidate and record its BIC on the same data.
bic_scores = []
for n_components in gm_search:
    mixture = GaussianMixture(n_components=n_components, random_state=0)
    mixture.fit(gm_x)
    bic_scores.append(mixture.bic(gm_x))

In [None]:
# BIC against the number of components, drawn as a line with point markers.
fig = px.line(x=gm_search, y=bic_scores, template='plotly_white')
fig.update_traces(mode='lines+markers')
fig.update_layout(xaxis_title='Число компонент', yaxis_title='BIC score')
fig.show()

**Посмотрим выделенные кластеры**

In [None]:
# Fit a two-component Gaussian mixture on `gm_x` and keep the component label
# predicted for every observation (same seed as the BIC search).
clusters = GaussianMixture(n_components=2, random_state=0).fit_predict(gm_x)

In [None]:
# Histogram of the log-transformed sample, coloured by mixture-component label.
px.histogram(
    data_frame=bi_var,
    color=clusters,
    nbins=50,
    template='plotly_white',
)

In [None]:
# Distinct cluster labels together with the number of observations in each.
np.unique(clusters, return_counts=True)

## Выбираем признак

In [None]:
# Alternative (disabled): analyse the component labelled 1 instead.
# var = bi_var[clusters==1].sample(150, random_state=0)

# Working sample: 150 reproducibly drawn observations from the component labelled 0.
var = bi_var.loc[clusters == 0].sample(n=150, random_state=0)

**Преобразования**

In [None]:
# Optional log transform of the selected sample, currently disabled
# (`bi_var` was already passed through np.log when it was built).
# var = var.apply(np.log)

In [None]:
# Pass the working sample through the project helper `lab1.cut_tails`.
# NOTE(review): the helper is not shown here — presumably it drops extreme
# values; confirm. The cell rebinds `var` to its own output, so re-running it
# feeds already-processed data back into the helper.
var = lab1.cut_tails(var)

In [None]:
# Determine the number of bins for the working sample.
# The rule itself lives in `lab1.get_nbins` (not shown here).
nbins = lab1.get_nbins(var)
nbins

## Описательные статистики

In [None]:
# Histogram of the working sample with a box plot in the margin; legend hidden.
fig = px.histogram(data_frame=var, template='plotly_white', marginal='box')
fig.update_layout(showlegend=False)
fig.show()

In [None]:
# `describe()` statistics of the working sample, rounded to 5 decimals.
dict(
    (stat_name, round(stat_value, 5))
    for stat_name, stat_value in var.describe().to_dict().items()
)

In [None]:
# Quantiles reported by `lab1.get_main_quantiles`, rounded to 5 decimals.
dict(
    (quantile_name, round(quantile_value, 5))
    for quantile_name, quantile_value in lab1.get_main_quantiles(var).items()
)

## Подбор параметров

In [None]:
# Theoretical distribution family that will be fitted to the sample.
theory_distr = scipy.stats.norm

In [None]:
# Estimate the parameters with the distribution's own `fit` and keep them as
# an array, so they can be scaled element-wise when building starting points.
params = np.array(theory_distr.fit(var))

# Alternative (disabled): sample mean and standard deviation as the estimate.
# params = np.array([
#     var.mean(),
#     var.std()
# ])

params

**Метод наименьших квадратов**

In [None]:
# Parameter estimate from the project helper `lab1.least_squares_method`.
# `x0` is the scipy fit scaled by 0.9, so the helper does not start exactly
# at the already-fitted values.
# NOTE(review): the objective being minimised is defined in `lab1` (not shown) — confirm.
ls_result = lab1.least_squares_method(var, theory_distr, x0=params*0.9)

In [None]:
# Print the full result object returned by the least-squares helper.
print(ls_result)

In [None]:
# The `.x` attribute of the least-squares result
# (presumably the fitted parameter vector — confirm against `lab1`).
ls_result.x

**Метод максимального правдоподобия**

In [None]:
# Parameter estimate from the project helper `lab1.max_likelihood_method`,
# started from the same scaled initial point (scipy fit times 0.9) as the
# least-squares run.
# NOTE(review): the helper's implementation is not shown here — confirm its objective.
ml_result = lab1.max_likelihood_method(var, theory_distr, x0=params*0.9)

In [None]:
# Print the full result object returned by the maximum-likelihood helper.
print(ml_result)

In [None]:
# The `.x` attribute of the maximum-likelihood result
# (presumably the fitted parameter vector — confirm against `lab1`).
ml_result.x

In [None]:
# Frozen distribution: the chosen family with the parameters stored in `params`
# (the output of `theory_distr.fit`). The `ls_result` / `ml_result` estimates
# are not used here.
theory_distr_with_params = theory_distr(*params)

## Проверка подобранного распределения

**QQ Plot**

In [None]:
# QQ plot of the working sample against the frozen fitted distribution,
# drawn by the project helper `lab1.qq_plot` (implementation not shown here).
lab1.qq_plot(var, theory_distr_with_params)

**KDE Plot**

In [None]:
# KDE comparison via the project helper `lab1.kde_plot`, currently disabled.
# lab1.kde_plot(var, theory_distr_with_params)

**Гистограмма**

In [None]:
# Histogram of the working sample together with the frozen fitted distribution,
# drawn by the project helper `lab1.hist_fitted_plot` (implementation not shown here).
lab1.hist_fitted_plot(var, theory_distr_with_params)

## Статистические тесты

**Тест Колмогорова-Смирнова**

In [None]:
# Kolmogorov-Smirnov test of the working sample against the fitted CDF.
# NOTE(review): the CDF parameters were estimated from this same sample, while
# the standard KS p-value assumes a fully specified distribution — treat the
# reported p-value as approximate.
scipy.stats.kstest(var, theory_distr_with_params.cdf)

**Тест Хи-квадрат**

In [None]:
# Observed and expected bin frequencies for the chi-square test, produced by
# the project helper `lab1.chi_square_bins` for `nbins` bins
# (binning rule not shown here).
f_obs, f_exp = lab1.chi_square_bins(var, theory_distr_with_params, nbins)
f_obs, f_exp

In [None]:
# Pearson chi-square goodness-of-fit test on the binned frequencies.
# The distribution parameters were estimated from this same sample, so the
# reference chi-square distribution loses one degree of freedom per fitted
# parameter: k - 1 - len(params) instead of the default k - 1 (k = number of bins).
scipy.stats.chisquare(f_obs=f_obs, f_exp=f_exp, ddof=len(params))

**Тест Харке — Бера (Jarque–Bera)**

In [None]:
# The Jarque-Bera test is run only when the chosen family is the normal one.
if theory_distr.name == 'norm':
    jarque_bera_result = scipy.stats.jarque_bera(var)
    print( jarque_bera_result )

## Сэмплирование

In [None]:
# Disabled: draws `sample_red` (10_000 values) with `lab1.clt_generator`, using
# params[0] as the mean and params[1] as the standard deviation.
# sample_red = lab1.clt_generator(size=10_000, n_layers=100, mean=params[0], std=params[1])

In [None]:
# Disabled: draws `sample_blue` (10_000 values) with `lab1.clt_generator`, using
# params[0] as the mean and params[1] as the standard deviation.
# sample_blue = lab1.clt_generator(size=10_000, n_layers=100, mean=params[0], std=params[1])

In [None]:
# Build the synthetic mixture sample: one generated sub-sample per mixture
# component, pooled into a single array.
#
# This cell used to concatenate `sample_red` and `sample_blue`, but the only
# cells that define them are commented out, so a fresh "Restart & Run All"
# failed here with NameError. Those disabled calls also used the same `params`
# for both halves, i.e. a single component twice. Each component is now
# prepared, fitted and sampled right here, mirroring the single-component
# pipeline of this notebook (150-point subsample -> cut_tails -> fit).
# NOTE(review): `lab1.clt_generator` is called exactly as in the disabled
# cells (size, n_layers, mean, std); confirm against its definition.
component_samples = []
for cluster_id in np.unique(clusters):
    component_var = lab1.cut_tails(
        bi_var[clusters == cluster_id].sample(150, random_state=0)
    )
    component_params = theory_distr.fit(component_var)
    component_samples.append(
        lab1.clt_generator(
            size=10_000,
            n_layers=100,
            mean=component_params[0],
            std=component_params[1],
        )
    )

sample = np.concatenate(component_samples)

In [None]:
# Empirical counterpart of the mixture: 150 reproducibly drawn observations
# from each component (label 1 first, then label 0), pooled into one array.
bi_var_sample = np.concatenate([
    bi_var[clusters == cluster_id].sample(150, random_state=0)
    for cluster_id in (1, 0)
])

In [None]:
# Histogram of the generated sample (nbins=100) with a box plot in the margin.
fig = px.histogram(
    data_frame=sample,
    template='plotly_white',
    marginal='box',
    nbins=100,
)
fig.update_layout(showlegend=False)
fig.show()

**QQ Plot**

In [None]:
# Probability levels 0.02, 0.04, ..., 0.98 (both endpoints 0 and 1 dropped).
uniform_quantiles = np.linspace(0, 1, 51)[1:-1]

# Quantiles of the observed pooled sample and of the generated sample.
observed_quantiles = np.quantile(bi_var_sample, uniform_quantiles)
generated_quantiles = np.quantile(sample, uniform_quantiles)

# Points: observed quantiles on x, generated quantiles on y.
fig = px.scatter(
    x=observed_quantiles,
    y=generated_quantiles,
    template='plotly_white'
)
# Only the scatter trace exists at this point, so only its markers are resized.
fig.update_traces(marker_size=5)

# Dashed grey y = x reference line spanning the generated-sample quantiles.
fig.add_trace(go.Scatter(
    x=generated_quantiles,
    y=generated_quantiles,
    line_color='lightgrey',
    line_dash='dash'
))

fig.update_layout(
    showlegend=False,
    xaxis_title='Фактические значения',
    yaxis_title='Теоретические значения'
)

fig.show()