In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import statsmodels.api as sm
from scipy import stats

## 1. Comparing Two or More Population Proportions

In [4]:
# Success counts (`updays`) and number of trials (`total_days`) for each of
# the five groups being compared.
updays = np.asarray([20, 15, 17, 14, 12])
total_days = np.asarray([25, 26, 26, 25, 23])

# H0: all groups share the same population proportion
# H1: at least one population proportion differs
# The third element of the result is a pair: (observed table, expected table).
prop_test = sm.stats.proportions_chisquare(updays, total_days)
chi2stat, pval, (table, expected) = prop_test

# Show only the test statistic and its p-value as the cell output.
(chi2stat, pval)

(5.106469646959223, 0.27654646724115983)

In [6]:
# Observed counts returned by proportions_chisquare: one row per group,
# columns are (successes, failures), i.e. (updays, total_days - updays).
table

array([[20,  5],
       [15, 11],
       [17,  9],
       [14, 11],
       [12, 11]])

In [5]:
# Expected counts returned by proportions_chisquare, laid out like `table`
# (one row per group; successes, failures) under a single common proportion.
expected

array([[15.6  ,  9.4  ],
       [16.224,  9.776],
       [16.224,  9.776],
       [15.6  ,  9.4  ],
       [14.352,  8.648]])

## 2. Test of Independence

In [8]:
# Contingency table of camera brand counts, one row per customer age group.
data = pd.DataFrame(
    [
        ("18-34", 30, 16, 8),
        ("35-51", 22, 25, 19),
        ("52+", 8, 9, 13),
    ],
    columns=["age_group", "canon", "nikon", "sony"],
)

data

Unnamed: 0,age_group,canon,nikon,sony
0,18-34,30,16,8
1,35-51,22,25,19
2,52+,8,9,13


In [10]:
# H0: camera brand and customer age group are independent of one another
# H1: camera brand and customer age group are not independent of one another

# Skip the first column (the age_group labels) so only the counts are tested.
brand_counts = data.iloc[:, 1:]
independence_test = stats.chi2_contingency(brand_counts)
chi2stat, pval, dof, expected = independence_test

(chi2stat, pval, dof, expected)

(12.145454545454545,
 0.01630177772323308,
 4,
 array([[21.6, 18. , 14.4],
        [26.4, 22. , 17.6],
        [12. , 10. ,  8. ]]))

# Nonparametric Statistics

- Nonparametric statistics is a term used to describe procedures that rely on fewer assumptions about the probability distribution of the population of interest than parametric statistics do. Nonparametric procedures can also be used for data that are at the nominal or ordinal level of measurement. Finally, they can be used to perform hypothesis tests about the population median, which parametric procedures cannot do.

### 1. The Wilcoxon Rank-Sum Test for Two Independent Samples

- The Wilcoxon rank-sum test is a nonparametric procedure that determines whether two populations have the same probability distribution based on evidence from two independent samples.

In [3]:
# Two independent samples to compare.
s1 = np.asarray([24, 31, 26, 33, 28, 22, 27, 23])
s2 = np.asarray([32, 32, 29, 25, 43, 35, 45])

# H0: the two sets of measurements are drawn from the same distribution.
# H1: values in one sample tend to be larger than values in the other
#     (two-sided alternative, the default of stats.ranksums).
stats.ranksums(s1, s2)

RanksumsResult(statistic=-2.1988227369598095, pvalue=0.027890529120246504)

In [4]:
# Second pair of independent samples (11 and 12 observations).
g1 = np.asarray([743, 792, 798, 1260, 748, 614, 709, 678, 557, 952, 917])
g2 = np.asarray(
    [947, 1318, 567, 465, 1137, 1442, 1229, 1122, 1421, 1152, 617, 1287]
)

# H0: both samples are drawn from the same distribution.
# H1: values in one sample tend to be larger than values in the other.
stats.ranksums(g1, g2)

RanksumsResult(statistic=-1.8463723646899908, pvalue=0.06483815699206645)

### 2. The Wilcoxon Signed-Rank Test for Two Dependent Samples
- The Wilcoxon signed-rank test is a nonparametric procedure that can be used to compare two dependent samples. The populations from which they were drawn need not be normally distributed.

In [6]:
# Paired observations: element i of each array belongs to the same pair.
# NOTE: "asile" is a misspelling of "aisle"; the variable names are kept
# unchanged so any other cell that refers to them keeps working.
end_asile = np.asarray([64, 54, 126, 97, 37, 74, 117, 90, 81])
middle_asile = np.asarray([72, 41, 100, 62, 40, 60, 122, 62, 78])

# H0: the paired differences (end - middle) are symmetric about zero.
# H1: the paired differences are not symmetric about zero.
stats.wilcoxon(end_asile, middle_asile)

WilcoxonResult(statistic=8.5, pvalue=0.12890625)

### 3. The Kruskal–Wallis One-Way ANOVA

- The Kruskal–Wallis test is a nonparametric procedure that allows us to determine whether there is a difference in the medians of three or more populations.

In [7]:
# Three independent samples of unequal size (5, 6 and 7 observations).
s1 = np.asarray([157, 76, 215, 235, 349])
s2 = np.asarray([798, 296, 371, 396, 129, 234])
s3 = np.asarray([129, 571, 129, 316, 253, 543, 115])

# H0: the population medians of all the groups are equal.
# H1: at least one group's population median differs from the others.
stats.kruskal(s1, s2, s3)

KruskalResult(statistic=1.935642733777443, pvalue=0.3799098212636963)

### 4. The Spearman Rank-Order Correlation Coefficient
- The Spearman rank-order correlation coefficient, $r_s$, measures the strength and direction of the relationship between two variables but only requires that the data be at the ordinal level of measurement.

In [8]:
# Paired observations: element i of each array belongs to the same pair.
study_hours = np.asarray([3, 6, 4, 4, 3, 2])
exam_grade = np.asarray([86, 95, 92, 83, 78, 79])

# H0: there is no monotonic association between the two variables.
# H1: there is a monotonic association (two-sided).
stats.spearmanr(study_hours, exam_grade)

SpearmanrResult(correlation=0.7650368522374495, pvalue=0.0763256359782481)