$\newcommand{\Sum}{\sum\limits}
\newcommand{\Int}{\int\limits}
\newcommand{\Intf}{\int\limits_{-\infty}^{+\infty}}
\newcommand{\Prod}{\prod\limits}
\newcommand{\Max}{\max\limits}
\newcommand{\Min}{\min\limits}
\newcommand{\Lim}{\lim\limits}
\newcommand{\Var}{\mathbb{V}}
\newcommand{\Exp}{\mathbb{E}}
\newcommand{\argmax}{\arg\max}
\newcommand{\Cov}{\mathrm{Cov}}
\newcommand{\makebold}[1]{\boldsymbol{#1}}
\newcommand{\mean}[1]{\overline{#1}}
\newcommand{\Prob}{\mathcal{P}}
\newcommand{\lp}{\left}
\newcommand{\rp}{\right}
\newcommand{\boldX}{\boldsymbol{X}}
\newcommand{\boldY}{\boldsymbol{Y}}
\newcommand{\boldZ}{\boldsymbol{Z}}
\newcommand{\Poisson}{\mathrm{Poisson}}
\newcommand{\Uniform}{\mathrm{Uniform}}
\newcommand{\Binomial}{\mathrm{Binomial}}
\newcommand{\Gammap}{\mathrm{Gamma}}
\newcommand{\Normal}{\mathcal{N}}
\newcommand{\LogN}{\mathrm{LogN}}
\newcommand{\Exponential}{\mathrm{Exp}}
\newcommand{\Erlang}{\mathrm{Erlang}}
\newcommand{\Cauchy}{C}$

## Математическая статистика в машинном обучении

## Д/З №2

In [1]:
import os
import sys
import numpy as np
import pandas as pd
import scipy
from scipy import stats

## Задача 2: параметрический бутстреп для $\psi = p_1 - p_2$

In [2]:
n1 = n2 = 200
X1 = 160
X2 = 148
alpha = 0.1

# MLE оценки параметров распределений Bin(n1,p1) и Bin(n2,p2)
p1_est = X1 / n1
p2_est = X2 / n2

# Псевдовыборки из Bin(n1,p1_est) и Bin(n2,p2_est)
B = 1000
statistics = []
for i in range(B):
    p1_bootstrap = scipy.stats.binom.rvs(n1, p1_est, size=1)[0] / n1
    p2_bootstrap = scipy.stats.binom.rvs(n2, p2_est, size=1)[0] / n2
    statistics.append(p1_bootstrap - p2_bootstrap)
psi_boot = np.mean(statistics)
se_boot = np.sqrt(((np.array(statistics) - psi_boot)**2).sum() / B)
z = abs(stats.norm().ppf(alpha / 2))

print(f'Бутстреп оценка psi: {psi_boot:.4f}')
print(f'Бутстреп оценка se: {se_boot:.4f}')
print(f'Квантиль норм. распр. для alpha={alpha}: {z:.4f}')
print(f'{int(100*(1-alpha))}%-доверительный интервал: ({psi_boot - z*se_boot:.4f}, \
{psi_boot + z*se_boot:.4f})')

Бутстреп оценка psi: 0.0608
Бутстреп оценка se: 0.0422
Квантиль норм. распр. для alpha=0.1: 1.6449
90%-доверительный интервал: (-0.0086, 0.1302)


## Задача 10: цифры числа $\pi$

In [3]:
n_digits = 10
digits_occurencies = [968, 1026,1021, 974, 1014, 1046, 1021, 970, 948, 1014]
N = sum(digits_occurencies)
expected_occurencies = [N/n_digits for _ in range(n_digits)]

In [4]:
from scipy.stats import chisquare

chisquare(digits_occurencies, f_exp=expected_occurencies)

Power_divergenceResult(statistic=9.367726454709057, pvalue=0.40404520751503087)

## Задача 11: проверка автора по 4-буквенным словам в статьях

In [5]:
alpha = 0.05
X = np.array([.224, .261, .216, .239, .229, .228, .234, .216])
Y = np.array([.207, .204, .195, .209, .201, .206, .223, .222, .219, .200])

theta = 0. # H0: различий нет
theta_est = X.mean() - Y.mean()
se_est = np.sqrt(np.std(X, ddof=0)**2/X.size + np.std(Y, ddof=0)**2/Y.size)
wald = abs((theta_est - theta) / se_est)
z_alpha = scipy.stats.norm.ppf(1 - alpha / 2)
conf_interval = (theta_est - z_alpha * se_est, theta_est + z_alpha * se_est)
p_value = 2 * (1 - scipy.stats.norm.cdf(wald))

print(f'p-value: {p_value:.6f}')
print(f'Точечная оценка разности средних: {theta_est:.3f}')
print(f'95%-доверительный интервал: ({conf_interval[0]:.3f}, {conf_interval[1]:.3f})')

p-value: 0.000075
Точечная оценка разности средних: 0.022
95%-доверительный интервал: (0.011, 0.033)


In [6]:
permutations = 100_000
XY = np.hstack([X, Y])
m = X.size
differences = []

for _ in range(permutations):
    np.random.shuffle(XY)
    X_permuted, Y_permuted = XY[:m], XY[m:]
    T = abs(X_permuted.mean() - Y_permuted.mean())
    differences.append(T > theta_est)
        
p_value = np.mean(differences)
print(f'p-value: {p_value:.5f}')

p-value: 0.00084


## Задача 12: проверка частот поездов

In [7]:
trains = pd.read_csv('trains.csv')
to_B = trains.loc[trains['train_to_B'] == 1].shape[0]
to_C = trains.loc[trains['train_to_B'] == 0].shape[0]
p_0 = 0.5
total = to_B + to_C
half = p_0 * total

print(f'to_B = {to_B}, to_C = {to_C}')
print(f'p_0 = {p_0}, p_B = {to_B/total:.4f}, p_C = {to_C/total:.4f}')

alpha = 0.05
lambda_ = 2 * (to_B * np.log(to_B / total) + to_C * np.log(to_C / total) - total * np.log(p_0))
chi2_value = scipy.stats.chi2.ppf(1 - alpha, 1)
p_value = 1 - scipy.stats.chi2.cdf(lambda_, 1)

print(f'T_lambda = {lambda_:.4f}')
print(f'95% chi2 = {chi2_value:.4f}')
print(f'p-value = {p_value:.4f}')

to_B = 34, to_C = 12
p_0 = 0.5, p_B = 0.7391, p_C = 0.2609
T_lambda = 10.9648
95% chi2 = 3.8415
p-value = 0.0009


In [8]:
theta = 0.5
theta_est = to_B / total
se_est = np.sqrt(theta_est * (1 - theta_est) / total)
wald = abs((theta_est - theta) / se_est)
z_alpha = scipy.stats.norm.ppf(1 - alpha / 2)
conf_interval = (theta_est - z_alpha * se_est, theta_est + z_alpha * se_est)
p_value = 2 * (1 - scipy.stats.norm.cdf(wald))

print(f'p-value: {p_value:.6f}')
print(f'Точечная оценка разности средних: {theta_est:.3f}')
print(f'95%-доверительный интервал: ({conf_interval[0]:.3f}, {conf_interval[1]:.3f})')

p-value: 0.000221
Точечная оценка разности средних: 0.739
95%-доверительный интервал: (0.612, 0.866)
