# Тест по параметрическим критериям

In [28]:
import numpy as np
import pandas as pd
from math import sqrt
from __future__ import division

import scipy
from statsmodels.stats.weightstats import _tconfint_generic
from statsmodels.stats.proportion import proportion_confint

In [29]:
def proportions_diff_z_stat_ind(sample1, sample2):
    """Return the z-statistic for the difference of two sample proportions.

    Each sample is summed and divided by its own length, so the inputs are
    expected to hold 0/1 (or boolean) outcomes. The standard error is built
    from the pooled proportion of both samples.

    Args:
        sample1: sized iterable of 0/1 outcomes.
        sample2: sized iterable of 0/1 outcomes.

    Returns:
        (p1 - p2) divided by the pooled standard error.
    """
    size_a, size_b = len(sample1), len(sample2)

    share_a = float(sum(sample1)) / size_a
    share_b = float(sum(sample2)) / size_b
    # Pooled share of successes across both samples.
    pooled = float(share_a * size_a + share_b * size_b) / (size_a + size_b)

    std_error = np.sqrt(pooled * (1 - pooled) * (1. / size_a + 1. / size_b))
    return (share_a - share_b) / std_error

def proportions_diff_z_test(z_stat, alternative = 'two-sided'):
    """Return the p-value of a z-test from its statistic.

    The statistic is compared against the standard normal distribution.

    Args:
        z_stat: value of the z-statistic (scalar or array-like).
        alternative: 'two-sided', 'less' or 'greater'.

    Returns:
        The p-value for the requested alternative.

    Raises:
        ValueError: if ``alternative`` is not one of the three supported names.
    """
    if alternative not in ('two-sided', 'less', 'greater'):
        raise ValueError("alternative not recognized\n"
                         "should be 'two-sided', 'less' or 'greater'")

    standard_normal = scipy.stats.norm

    # The survival function is used instead of ``1 - cdf``: for large
    # statistics cdf rounds to exactly 1.0 and the subtraction would return
    # a p-value of 0.0 instead of the tiny but positive tail probability.
    if alternative == 'two-sided':
        return 2 * standard_normal.sf(np.abs(z_stat))

    if alternative == 'less':
        return standard_normal.cdf(z_stat)

    return standard_normal.sf(z_stat)

In [30]:
# 3

# Shares of successes in the two groups (10 of 34 and 4 of 16)
# and the pooled share over both groups.
mean1, mean2 = 10 / 34, 4 / 16
meanC = (4 + 10) / (16 + 34)

# Z-statistic for the difference of the two shares, pooled standard error.
pooled_std_error = sqrt(meanC * (1 - meanC) * (1 / 34 + 1 / 16))
t = (mean1 - mean2) / pooled_std_error

In [31]:
# Upper-tail probability of the standard normal distribution at t,
# i.e. the one-sided ("greater") p-value for the statistic computed above.
1 - scipy.stats.norm.cdf(t)

0.37293045872523534

In [32]:
# 4

# Read the tab-separated table, then separate the 'real' column (kept as y)
# from the remaining columns (kept as X).
X = pd.read_csv('banknotes.txt', sep='\t')
y = X['real']
X = X.drop('real', axis=1)

In [33]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [34]:
# Split the raw arrays with train_size=150; random_state is fixed so the
# split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X.values,
    y.values,
    train_size=150,
    random_state=1
)

In [35]:
# First classifier: trained and evaluated on the first three feature columns.
clf1 = LogisticRegression()
clf1.fit(X_train[:, :3], y_train)
pred1 = clf1.predict(X_test[:, :3])

# Second classifier: trained and evaluated on the remaining feature columns.
clf2 = LogisticRegression()
clf2.fit(X_train[:, 3:], y_train)
pred2 = clf2.predict(X_test[:, 3:])

In [36]:
# Element-wise comparison of each classifier's predictions with y_test:
# True marks a test object the classifier got wrong.
err1, err2 = (pred1 != y_test), (pred2 != y_test)
(err1, err2)

(array([False, False, False, False, False, False, False, False, False,
         True, False, False, False, False, False, False, False,  True,
        False, False, False,  True, False, False, False, False, False,
        False, False,  True, False, False, False, False, False, False,
        False,  True,  True, False,  True, False, False, False, False,
         True,  True,  True, False, False]),
 array([False, False, False, False, False, False, False, False, False,
        False, False, False, False, False,  True, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False]))

In [37]:
# P-value (default 'two-sided' alternative) for the difference between the two
# classifiers' error shares, using the z-statistic for related samples since
# both error masks were computed against the same y_test.
# NOTE(review): proportions_diff_z_stat_rel is defined further down in this
# file (cell In [39]); that cell has to be executed before this one.
proportions_diff_z_test(proportions_diff_z_stat_rel(err1, err2))

0.0032969384555543435

In [38]:
# 5

In [39]:
def proportions_diff_confint_ind(sample1, sample2, alpha = 0.05):
    """Return a confidence interval for the difference of two proportions.

    Each proportion is the sample's sum divided by its length, so the inputs
    are expected to hold 0/1 (or boolean) outcomes. The interval is the
    normal-approximation interval with an unpooled standard error.

    Args:
        sample1: sized iterable of 0/1 outcomes.
        sample2: sized iterable of 0/1 outcomes.
        alpha: significance level; the interval has level 1 - alpha.

    Returns:
        Tuple (left_boundary, right_boundary) for p1 - p2.
    """
    quantile = scipy.stats.norm.ppf(1 - alpha / 2.)

    size_a, size_b = len(sample1), len(sample2)
    share_a = float(sum(sample1)) / size_a
    share_b = float(sum(sample2)) / size_b

    # Standard error of the difference, each sample with its own variance.
    std_error = np.sqrt(share_a * (1 - share_a) / size_a
                        + share_b * (1 - share_b) / size_b)
    difference = share_a - share_b
    margin = quantile * std_error

    return (difference - margin, difference + margin)

def proportions_diff_z_stat_ind(sample1, sample2):
    """Return the z-statistic for the difference of two sample proportions.

    NOTE: this repeats the definition of the same name given earlier in the
    notebook; the computation is identical.

    Args:
        sample1: sized iterable of 0/1 (or boolean) outcomes.
        sample2: sized iterable of 0/1 (or boolean) outcomes.

    Returns:
        (p1 - p2) divided by the standard error built from the pooled
        proportion of both samples.
    """
    n1, n2 = len(sample1), len(sample2)
    p1, p2 = float(sum(sample1)) / n1, float(sum(sample2)) / n2

    # Pooled proportion and the variance of p1 - p2 it implies.
    pooled = float(p1*n1 + p2*n2) / (n1 + n2)
    pooled_variance = pooled * (1 - pooled) * (1. / n1 + 1. / n2)

    return (p1 - p2) / np.sqrt(pooled_variance)

def proportions_diff_confint_rel(sample1, sample2, alpha = 0.05):
    """Return a confidence interval for the difference of proportions in
    related (paired) samples.

    The samples are zipped element by element; only pairs whose two values
    disagree (1/0 or 0/1) contribute to the estimate.

    Args:
        sample1: iterable of 0/1 (or boolean) outcomes.
        sample2: iterable of 0/1 (or boolean) outcomes, paired with sample1.
        alpha: significance level; the interval has level 1 - alpha.

    Returns:
        Tuple (left_boundary, right_boundary).
    """
    quantile = scipy.stats.norm.ppf(1 - alpha / 2.)
    pairs = list(zip(sample1, sample2))
    n = len(pairs)

    # f: pairs that are 1 in the first sample and 0 in the second;
    # g: pairs that are 0 in the first sample and 1 in the second.
    f = sum(1 for first, second in pairs if first == 1 and second == 0)
    g = sum(1 for first, second in pairs if first == 0 and second == 1)

    center = float(f - g) / n
    margin = quantile * np.sqrt(float((f + g)) / n**2 - float((f - g)**2) / n**3)
    return (center - margin, center + margin)

def proportions_diff_z_stat_rel(sample1, sample2):
    """Return the z-statistic for the difference of proportions in related
    (paired) samples.

    The samples are zipped element by element; the statistic depends only on
    the counts of pairs whose two values disagree.

    Args:
        sample1: iterable of 0/1 (or boolean) outcomes.
        sample2: iterable of 0/1 (or boolean) outcomes, paired with sample1.

    Returns:
        (f - g) / sqrt(f + g - (f - g)**2 / n), where f counts (1, 0) pairs,
        g counts (0, 1) pairs and n is the number of pairs.
    """
    pairs = list(zip(sample1, sample2))
    n = len(pairs)

    f = sum(1 for first, second in pairs if first == 1 and second == 0)
    g = sum(1 for first, second in pairs if first == 0 and second == 1)

    numerator = float(f - g)
    denominator = np.sqrt(f + g - float((f - g)**2) / n)
    return numerator / denominator

In [40]:
# Left boundary of the confidence interval (default alpha = 0.05) for the
# difference of the two classifiers' error shares on related samples,
# rounded to 4 decimal places.
round(proportions_diff_confint_rel(err1, err2)[0], 4)

0.0599

In [46]:
# 6

# Hypothesised mean, standard deviation and sample size, then the sample mean.
mu0, sigma, n = 525, 100, 100
mu = 541.4

# Z-statistic for the mean: deviation from mu0 in units of sigma, scaled by
# the square root of the sample size.
effect = (mu - mu0) / sigma
t = effect * n ** 0.5
# Upper-tail p-value together with the statistic itself.
(1 - scipy.stats.norm.cdf(t), t)

(0.05050258347410397, 1.639999999999998)

In [47]:
# 7

# Same computation as in the previous cell with the sample mean set to 541.5;
# mu0 and sigma are reused from that cell.
n, mu = 100, 541.5
t = (mu - mu0) / sigma * n ** 0.5
# Upper-tail p-value, rounded to 4 decimal places.
p_value = 1 - scipy.stats.norm.cdf(t)
round(p_value, 4)

0.0495