In [None]:
# Вычисления
import numpy as np
import pandas as pd
import scipy
from sklearn.mixture import GaussianMixture
# Визуализация
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
%matplotlib notebook

In [None]:
import lab1

## Выбираем признак

In [None]:
# Load the training data, keeping every column except 'floor'.
df = pd.read_csv(
    'data/raifhack_train.csv',
    usecols=lambda column: column != 'floor',
)

In [None]:
# Reproducible 150-row sample (fixed seed); the modelled variable is the
# price per square metre column of that sample.
df_sample = df.sample(n=150, random_state=0)
var = df_sample['per_square_meter_price']

**Преобразования**

In [None]:
# Log-transform the target variable.
# The value is derived from df_sample instead of reassigning var from itself,
# so the cell is idempotent: re-running it no longer applies the logarithm twice.
var = np.log(df_sample['per_square_meter_price'])

In [None]:
# Pass the series through lab1.cut_tails (presumably trims extreme values --
# confirm in lab1).
# NOTE(review): var is reassigned from itself, so re-running this cell alone
# feeds the already processed series back into cut_tails.
var = lab1.cut_tails(var)

In [None]:
# Determine the number of bins (the rule itself lives in lab1.get_nbins)
# and display it.
nbins = lab1.get_nbins(var)
nbins

## Описательные статистики

In [None]:
# Interactive histogram of the variable with a box plot in the margin;
# the legend is hidden.
fig = px.histogram(var, marginal='box', template='plotly_white')
fig.update_layout(showlegend=False)
fig.show()

In [None]:
# Unfilled seaborn histogram with nbins bins and a KDE overlay.
sns.set_style('whitegrid')
plot = sns.histplot(data=var, bins=nbins, kde=True, fill=False)
# Recolour the first line on the axes (presumably the KDE curve) to black.
plot.lines[0].set_color('black')
plot

In [None]:
# Summary statistics from Series.describe(), rounded to 5 decimal places.
{stat: round(value, 5) for stat, value in var.describe().to_dict().items()}

In [None]:
# Quantiles returned by lab1.get_main_quantiles, rounded to 5 decimal places.
{label: round(value, 5) for label, value in lab1.get_main_quantiles(var).items()}

## Подбор параметров

In [None]:
# Theoretical distribution family to fit to the data (scipy.stats.gamma)
theory_distr = scipy.stats.gamma

In [None]:
# Fit the chosen family to the data with its built-in fit() and keep the
# estimates as a NumPy array so they support element-wise arithmetic.
params = np.array(theory_distr.fit(var))

# Disabled alternative kept for reference: use the sample mean and standard
# deviation as the two parameters instead of fit().
# params = np.array([
#     var.mean(),
#     var.std()
# ])

params

**Метод наименьших квадратов**

In [None]:
# Least-squares fit from lab1, started from the scipy estimates scaled by 0.9.
ls_result = lab1.least_squares_method(
    var,
    theory_distr,
    x0=0.9 * params,
)

In [None]:
# Print the full result object returned by the least-squares routine.
print(ls_result)

In [None]:
# Show the .x attribute of the result (presumably the optimised parameter
# vector, as in scipy's OptimizeResult -- confirm in lab1).
ls_result.x

**Метод максимального правдоподобия**

In [None]:
# Maximum-likelihood fit from lab1, started from the scipy estimates scaled by 0.9.
ml_result = lab1.max_likelihood_method(
    var,
    theory_distr,
    x0=0.9 * params,
)

In [None]:
# Print the full result object returned by the maximum-likelihood routine.
print(ml_result)

In [None]:
# Show the .x attribute of the result (presumably the optimised parameter
# vector, as in scipy's OptimizeResult -- confirm in lab1).
ml_result.x

In [None]:
# Ready-to-use (frozen) distribution built from the values stored in params.
# NOTE(review): this uses params (the output of theory_distr.fit), not
# ls_result.x or ml_result.x.
theory_distr_with_params = theory_distr(*params)

## Проверка подобранного распределения

**QQ Plot**

In [None]:
# QQ plot of the data against the fitted distribution (drawn by lab1.qq_plot).
lab1.qq_plot(var, theory_distr_with_params)

**Гистограмма**

In [None]:
# Histogram of the data together with the fitted distribution
# (drawn by lab1.hist_fitted_plot).
lab1.hist_fitted_plot(var, theory_distr_with_params)

## Статистические тесты

**Тест Колмогорова-Смирнова**

In [None]:
# One-sample Kolmogorov-Smirnov test of the data against the fitted CDF.
# NOTE(review): the distribution parameters were estimated from this same
# sample, so the standard KS p-value is likely optimistic -- interpret with care.
scipy.stats.kstest(var, theory_distr_with_params.cdf)

**Тест Хи-квадрат**

In [None]:
# Observed and expected bin frequencies for the chi-square test, computed by
# lab1.chi_square_bins from the data, the fitted distribution and nbins;
# both are displayed for inspection.
f_obs, f_exp = lab1.chi_square_bins(var, theory_distr_with_params, nbins)
f_obs, f_exp

In [None]:
# Pearson chi-square goodness-of-fit test.
# The distribution parameters were estimated from the same data, so every
# estimated parameter costs one degree of freedom: ddof=len(params) makes the
# p-value use k - 1 - len(params) degrees of freedom instead of the default
# k - 1 (k = number of bins), which would overstate the p-value.
scipy.stats.chisquare(f_obs=f_obs, f_exp=f_exp, ddof=len(params))

**Тест Жака-Бера**

In [None]:
# The Jarque-Bera test is run only when the chosen family is the normal
# distribution; for any other theory_distr this cell prints nothing.
if theory_distr.name == 'norm':
    print( scipy.stats.jarque_bera(var) )

## Сэмплирование

In [None]:
# Disabled alternative sampler (kept for reference): accept-reject sampling
# from the fitted distribution via lab1.accept_reject_sampling.
# sample = lab1.accept_reject_sampling(
#     theory_distr_with_params=theory_distr_with_params,
#     x_min=1,
#     x_max=9,
#     max_pdf=1,
#     size=10_000
# )

In [None]:
# Generate a sample with lab1.clt_generator using the mean and standard
# deviation of the fitted distribution.
# params is the raw output of theory_distr.fit(); for scipy.stats.gamma that
# is (shape, loc, scale), so params[0] / params[1] are NOT a mean and a std.
# The frozen distribution's own mean() and std() are correct for any family.
sample = lab1.clt_generator(
    size=10_000,
    n_layers=100,
    mean=theory_distr_with_params.mean(),
    std=theory_distr_with_params.std(),
)

In [None]:
# Histogram of the generated sample (up to 30 bins) with a box plot in the
# margin; the legend is hidden.
fig = px.histogram(sample, nbins=30, marginal='box', template='plotly_white')
fig.update_layout(showlegend=False)
fig.show()

**График QQ Plot для сэмпла**

In [None]:
# Probability levels 0.02, 0.04, ..., 0.98 (the end points 0 and 1 are dropped).
uniform_quantiles = np.linspace(0, 1, 51)[1:-1]

# Quantiles of the observed data and of the generated sample at those levels.
var_quantiles = var.quantile(uniform_quantiles).values
sample_quantiles = np.quantile(sample, uniform_quantiles)

# Scatter of sample quantiles against data quantiles.
fig = px.scatter(
    x=var_quantiles,
    y=sample_quantiles,
    template='plotly_white',
)

# Dashed y = x reference line: points on it mean the two sets of quantiles agree.
fig.add_trace(
    go.Scatter(
        x=var_quantiles,
        y=var_quantiles,
        line_color='lightgrey',
        line_dash='dash',
    )
)

# Shrink the scatter markers (first trace only).
fig.data[0].marker.size = 5

fig.update_layout(
    showlegend=False,
    xaxis_title='Фактические значения',
    yaxis_title='Теоретические значения',
)

fig.show()