In [1]:
import pandas as pd
import scipy.stats as st
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
df = pd.read_fwf(
    './data/babyboom.dat.txt', 
    # colspecs=colspecs, 
    header=None, 
    names=['birth_time',
           'sex',
           'Birth weight in grams',
           'Number of minutes after midnight of each birth']
)
df.head()

Unnamed: 0,birth_time,sex,Birth weight in grams,Number of minutes after midnight of each birth
0,5,1,3837,5
1,104,1,3334,64
2,118,2,3554,78
3,155,2,3838,115
4,257,2,3625,177


### Тестирум три гипотезы
1) Вес распределен нормально  
**H0**: Birth weight in grams ~ $ N(df['Birth weight in grams'].mean(), df['Birth weight in grams'].std(ddof=1))$  
2) Вес мальчиков распределен нормально  
3) Вес девочек распределен нормально  
alpha = 0.05


In [12]:
st.ks_1samp(x=df['Birth weight in grams'], 
            cdf=st.norm(df['Birth weight in grams'].mean(), df['Birth weight in grams'].std(ddof=1)).cdf)

KstestResult(statistic=0.18336357215784677, pvalue=0.09106523162555868, statistic_location=3278, statistic_sign=-1)

pval = 0.09 > alpha -> не отвергаем H0

In [13]:
st.ks_1samp(x=df.loc[df['sex'] == 1, 'Birth weight in grams'], 
            cdf=st.norm(df.loc[df['sex'] == 1, 'Birth weight in grams'].mean(), df.loc[df['sex'] == 1, 'Birth weight in grams'].std(ddof=1)).cdf)

KstestResult(statistic=0.21427812399671214, pvalue=0.3315835424835907, statistic_location=3208, statistic_sign=-1)

pval = 0.33 > alpha -> не отвергаем H0

In [15]:
st.ks_1samp(x=df.loc[df['sex'] == 2, 'Birth weight in grams'], 
            cdf=st.norm(df.loc[df['sex'] == 2, 'Birth weight in grams'].mean(), df.loc[df['sex'] == 2, 'Birth weight in grams'].std(ddof=1)).cdf)

KstestResult(statistic=0.15544307065675722, pvalue=0.5070763720555325, statistic_location=3294, statistic_sign=-1)

pval = 0.50 > alpha -> не отвергаем H0

### Во всех трех случаях не отвергаем нулевую гипотезу о том что выборки из нормального

## Ди для параметров нормальных распределний

### Параметры распределния веса

In [62]:
def find_mean_ci(val, alpha):
    return st.norm.interval(1 - alpha, loc=val.mean(), scale=val.std(ddof=1)/np.sqrt(len(df)))
def find_std_ci(val, alpha):
    var = np.var(val, ddof=1)
    ci_std = st.chi2.interval(1-alpha, df=len(val)-1)
    ci_std = [np.sqrt((len(val)-1)*var/ci_std[1]), np.sqrt((len(val)-1)*var/ci_std[0])]
    return ci_std

In [63]:
print(find_mean_ci(df['Birth weight in grams'], 0.05))
print(find_std_ci(df['Birth weight in grams'], 0.05))

(3119.9337914896673, 3431.9752994194237)
[436.27247886430507, 669.0306102925874]


### Параметры распределния веса мальчиков

In [64]:
print(find_mean_ci(df.loc[df['sex'] == 1, 'Birth weight in grams'], 0.05))
print(find_std_ci(df.loc[df['sex'] == 1, 'Birth weight in grams'], 0.05))

(2945.827160708499, 3319.0617281803898)
[473.93169852283967, 946.8331595131432]


### Параметры распределния веса девочек

In [65]:
print(find_mean_ci(df.loc[df['sex'] == 2, 'Birth weight in grams'], 0.05))
print(find_std_ci(df.loc[df['sex'] == 2, 'Birth weight in grams'], 0.05))

(3248.8304909102326, 3501.784893705152)
[335.69825689250627, 590.878537222306]


Подчиняется ли распр пуассона

In [None]:
df['']

### Задание 2

In [98]:
df = pd.read_csv('./data/euroweight.dat.txt', sep='\t', header=None, names=['id', 'weight', 'batch'])
df.head()

Unnamed: 0,id,weight,batch
0,1,7.512,1
1,2,7.502,1
2,3,7.461,1
3,4,7.562,1
4,5,7.528,1


In [100]:
st.ks_1samp(x=df['weight'], 
            cdf=st.norm(df['weight'].mean(), 
                        df['weight'].std(ddof=1)).cdf)

KstestResult(statistic=0.023353803446905852, pvalue=0.22192512324841585, statistic_location=7.526, statistic_sign=1)

In [101]:
print(find_mean_ci(df['weight'], 0.05))
print(find_std_ci(df['weight'], 0.05))

(7.519725795354421, 7.522739204645579)
[0.03334580812823817, 0.03547904750143223]


In [102]:
for batch in df['batch'].unique():
    test = st.ks_1samp(x=df.loc[df['batch'] == batch, 'weight'], 
            cdf=st.norm(df.loc[df['batch'] == batch, 'weight'].mean(), 
                        df.loc[df['batch'] == batch,'weight'].std(ddof=1)).cdf)
    print(test)

KstestResult(statistic=0.038472438278823295, pvalue=0.8389620125716332, statistic_location=7.506, statistic_sign=1)
KstestResult(statistic=0.03265930919143112, pvalue=0.9444808004928708, statistic_location=7.531, statistic_sign=1)
KstestResult(statistic=0.07792267647627489, pvalue=0.0910506298988909, statistic_location=7.473, statistic_sign=-1)
KstestResult(statistic=0.04544943866051099, pvalue=0.6628215726419289, statistic_location=7.544, statistic_sign=-1)
KstestResult(statistic=0.03499095797591667, pvalue=0.9087939122045151, statistic_location=7.55, statistic_sign=-1)
KstestResult(statistic=0.055099573639602606, pvalue=0.4186860218860704, statistic_location=7.493, statistic_sign=1)
KstestResult(statistic=0.042332335662710996, pvalue=0.7449928480338537, statistic_location=7.547, statistic_sign=-1)
KstestResult(statistic=0.0701373917985999, pvalue=0.16294891063512817, statistic_location=7.525, statistic_sign=1)


### Везде не отвергаем H0

### Задание 3

In [108]:
df = pd.read_csv('./data/iris.txt', header=None, names=['sepal length',
                                                        'sepal width',
                                                        'petal length',
                                                        'petal width',
                                                        'class'])
df.head()

Unnamed: 0,sepal length,sepal width,petal length,petal width,class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [115]:
for iris_type in df['class'].unique():
    print(st.ks_1samp(x=df.loc[df['class'] == iris_type, 'petal length'], 
                cdf=st.norm(df.loc[df['class'] == iris_type, 'petal length'].mean(), 
                            df.loc[df['class'] == iris_type, 'petal length'].std(ddof=1)).cdf))
    print(find_mean_ci(df.loc[df['class'] == iris_type, 'petal length'], 0.05))
    print(find_std_ci(df.loc[df['class'] == iris_type, 'petal length'], 0.05))

KstestResult(statistic=0.15781771984946413, pvalue=0.1486168768935111, statistic_location=1.5, statistic_sign=1)
(1.4362329416227873, 1.4917670583772127)
[0.1449397726456133, 0.2162180441236967]
KstestResult(statistic=0.11712123583515, pvalue=0.4642415222366043, statistic_location=4.4, statistic_sign=-1)
(4.184799963417372, 4.335200036582628)
[0.3925326211066252, 0.5855717450167487]
KstestResult(statistic=0.11360588567240326, pvalue=0.5028719892334554, statistic_location=5.1, statistic_sign=1)
(5.4636800770489415, 5.640319922951059)
[0.4610164093979061, 0.6877343915301193]


везде не отвергаем HO