In [91]:
import numpy as np
import pandas as pd
import scipy.stats
from statsmodels.stats.weightstats import *

In [14]:
def proportions_diff_z_stat_ind(a, n1, b, n2, alpha = 0.05):
    """Z statistic for the difference between two independent proportions.

    Parameters
    ----------
    a, n1 : numerator count and sample size of the first sample (p1 = a / n1).
    b, n2 : numerator count and sample size of the second sample (p2 = b / n2).
    alpha : not used by the computation; kept in the signature so that
        existing calls passing it keep working.

    Returns
    -------
    (p1 - p2) / sqrt(P * (1 - P) * (1/n1 + 1/n2)), where P is the pooled
    proportion of both samples.
    """
    p1 = float(a) / n1
    p2 = float(b) / n2

    # Pooled proportion: the two sample proportions weighted by sample size.
    pooled = float(p1*n1 + p2*n2) / (n1 + n2)

    return (p1 - p2) / np.sqrt(pooled * (1 - pooled) * (1. / n1 + 1. / n2))

In [12]:
def proportions_diff_z_test(z_stat, alternative = 'two-sided'):
    """Convert a z statistic into a p-value under the standard normal law.

    Parameters
    ----------
    z_stat : value of the z statistic.
    alternative : 'two-sided', 'less' or 'greater'.

    Returns
    -------
    The p-value for the requested alternative.

    Raises
    ------
    ValueError if `alternative` is not one of the three accepted strings.
    """
    if alternative not in ('two-sided', 'less', 'greater'):
        raise ValueError("alternative not recognized\n"
                         "should be 'two-sided', 'less' or 'greater'")

    # The upper tail is taken from the survival function rather than
    # 1 - cdf(z): the subtraction rounds to exactly 0.0 once cdf(z) is
    # indistinguishable from 1, whereas sf() keeps the tiny tail probability.
    if alternative == 'two-sided':
        return 2 * scipy.stats.norm.sf(np.abs(z_stat))

    if alternative == 'less':
        return scipy.stats.norm.cdf(z_stat)

    # Only 'greater' is left after the validation above.
    return scipy.stats.norm.sf(z_stat)

In [65]:
# Q3 - 0.3729

In [16]:
# One-sided ('greater') p-value for two independent samples with proportions 10/34 and 4/16.
proportions_diff_z_test(proportions_diff_z_stat_ind(10,34,4,16), 'greater')

0.37293045872523534

In [100]:
# Load the tab-separated banknotes file; the path is relative to the notebook's working directory.
data = pd.read_csv('banknotes.txt',sep='\t')

In [101]:
# Preview the first rows to check that the columns were parsed as expected.
data.head()

Unnamed: 0,X1,X2,X3,X4,X5,X6,real
0,214.8,131.0,131.1,9.0,9.7,141.0,1
1,214.6,129.7,129.7,8.1,9.5,141.7,1
2,214.8,129.7,129.7,8.7,9.6,142.2,1
3,214.8,129.7,129.6,7.5,10.4,142.0,1
4,215.0,129.6,129.7,10.4,7.7,141.8,1


In [102]:
# (rows, columns) of the loaded table.
data.shape

(200, 7)

In [103]:
from sklearn import cross_validation
from sklearn.linear_model import LogisticRegression

In [104]:
# Names of the six feature columns, X1 through X6.
feature_columns = ['X{}'.format(i) for i in range(1, 7)]

In [105]:
# Hold out part of the rows for evaluation; random_state=1 pins the shuffle
# so the split is the same on every run. No test_size is given, so the
# library's default split fraction applies.
# NOTE(review): sklearn.cross_validation was deprecated in scikit-learn 0.18
# and removed in 0.20; on newer versions train_test_split is provided by
# sklearn.model_selection.
train_X, test_X, train_y, test_y = cross_validation.train_test_split(
    data[feature_columns], data.real, random_state=1)

In [120]:
# Classifier 1: logistic regression on the first three features only.
clf1 = LogisticRegression()
clf1.fit(train_X[['X1', 'X2', 'X3']], train_y)
predicts1 = clf1.predict(test_X[['X1', 'X2', 'X3']])

# Classifier 2: logistic regression on the last three features only.
clf2 = LogisticRegression()
clf2.fit(train_X[['X4', 'X5', 'X6']], train_y)
predicts2 = clf2.predict(test_X[['X4', 'X5', 'X6']])

In [121]:
# Per-observation error indicators on the test set: True where a classifier's
# prediction differs from the true label. Both are computed against the same
# test_y, so the two samples are paired (related), not independent.
sample1 = (predicts1 != test_y)
sample2 = (predicts2 != test_y)

In [122]:
def proportions_diff_confint_rel(sample1, sample2, alpha = 0.05):
    """Confidence interval for the difference of proportions in related samples.

    Parameters
    ----------
    sample1, sample2 : paired sequences of 0/1 (or boolean) outcomes.
        Pairs are built with zip(), so surplus elements of a longer input
        are ignored.
    alpha : significance level; the interval has confidence 1 - alpha.

    Returns
    -------
    (left_boundary, right_boundary) of the interval centred on (f - g) / n,
    where f counts pairs equal to (1, 0), g counts pairs equal to (0, 1)
    and n is the number of pairs.
    """
    z = scipy.stats.norm.ppf(1 - alpha / 2.)

    # Materialise the pairs: on Python 3 zip() returns a one-shot iterator
    # that has no len() and would be empty on the second pass.
    pairs = list(zip(sample1, sample2))
    n = len(pairs)

    # Discordant pairs only; concordant pairs do not affect the difference.
    f = sum(1 for x in pairs if x[0] == 1 and x[1] == 0)
    g = sum(1 for x in pairs if x[0] == 0 and x[1] == 1)

    center = float(f - g) / n
    half_width = z * np.sqrt(float((f + g)) / n**2 - float((f - g)**2) / n**3)
    return (center - half_width, center + half_width)

In [123]:
def proportions_diff_z_stat_rel(sample1, sample2):
    """Z statistic for the difference of proportions in related samples.

    Parameters
    ----------
    sample1, sample2 : paired sequences of 0/1 (or boolean) outcomes.
        Pairs are built with zip(), so surplus elements of a longer input
        are ignored.

    Returns
    -------
    (f - g) / sqrt(f + g - (f - g)**2 / n), where f counts pairs equal to
    (1, 0), g counts pairs equal to (0, 1) and n is the number of pairs.
    """
    # Materialise the pairs: on Python 3 zip() returns a one-shot iterator
    # that has no len() and would be empty on the second pass.
    pairs = list(zip(sample1, sample2))
    n = len(pairs)

    f = sum(1 for x in pairs if x[0] == 1 and x[1] == 0)
    g = sum(1 for x in pairs if x[0] == 0 and x[1] == 1)

    return float(f - g) / np.sqrt(f + g - float((f - g)**2) / n )

In [124]:
# Two-sided p-value for the paired comparison of the two classifiers' error
# indicators. Written as a print() call with a single argument so the line
# runs unchanged on both Python 2 and Python 3.
print("p-value: {}".format(proportions_diff_z_test(proportions_diff_z_stat_rel(sample1, sample2))))

p-value: 0.00329693845555


In [125]:
# Q4 - 3

In [129]:
# Interval for the difference between the two classifiers' error proportions.
conf = proportions_diff_confint_rel(sample1,sample2)
# str.format() does not treat '%' specially, so a single '%' is correct here;
# '%%' would be printed literally as two percent signs.
print("95% confidence interval for a difference between proportions: [{:.4f}, {:.4f}]".format(conf[0], conf[1]))

95%% confidence interval for a difference between proportions: [0.0599, 0.3001]


In [127]:
# Q5 - 0.0599

In [55]:
def calc_Z(x, nu, st, n):
    """Return (x - nu) divided by st / sqrt(n).

    x  : observed value.
    nu : reference value subtracted from x.
    st : spread that is scaled down by sqrt(n).
    n  : number of observations.
    """
    deviation = float(x - nu)
    scale = float(st) / np.sqrt(n)
    return deviation / scale

In [57]:
# Z statistic for x = 541.4 against nu = 525 with st = 100 and n = 100,
# i.e. (541.4 - 525) / (100 / sqrt(100)).
Z = calc_Z(541.4, 525, 100, 100)

In [63]:
# One-sided ('greater') p-value for Z. Despite its name, the helper only maps
# a z value to a standard-normal tail probability, so it applies here too.
proportions_diff_z_test(Z,'greater')

0.050502583474103968

In [68]:
# Q6 - 0.0505

In [64]:
# Same one-sided test as for Z, but with x = 541.5 instead of 541.4.
proportions_diff_z_test(calc_Z(541.5, 525, 100, 100),'greater')

0.049471468033648103

In [None]:
#Q7 - 0.0495