## Shapiro-Wilk Test

### Tests whether a data sample has a Gaussian distribution.
#### H0: the sample has a Gaussian distribution.
#### H1: the sample does not have a Gaussian distribution.

In [4]:
from scipy.stats import shapiro

In [5]:
from sklearn import datasets
import pandas as pd

In [6]:
# load iris dataset
iris = datasets.load_iris()
# Since this is a bunch, create a dataframe
iris_df=pd.DataFrame(iris.data)
iris_df['class']=iris.target

iris_df.columns=['sepal_len', 'sepal_wid', 'petal_len', 'petal_wid', 'class']
iris_df.dropna(how="all", inplace=True) # remove any empty lines


In [7]:
iris_df.describe(include='all')

Unnamed: 0,sepal_len,sepal_wid,petal_len,petal_wid,class
count,150.0,150.0,150.0,150.0,150.0
mean,5.843333,3.057333,3.758,1.199333,1.0
std,0.828066,0.435866,1.765298,0.762238,0.819232
min,4.3,2.0,1.0,0.1,0.0
25%,5.1,2.8,1.6,0.3,0.0
50%,5.8,3.0,4.35,1.3,1.0
75%,6.4,3.3,5.1,1.8,2.0
max,7.9,4.4,6.9,2.5,2.0


In [91]:
def shapiro_test_sig(pval):
    if pval > 0.05:
         return 'Probably Gaussian'
    else:
         return 'Probably not Gaussian'
        
df=pd.DataFrame()        
df['p_val']=iris_df.apply(lambda x:round(shapiro(x)[1],3), axis=0)
df['sig']=df['p_val'].apply(lambda x:shapiro_test_sig(x))

df

Unnamed: 0,p_val,sig
sepal_len,0.01,Probably not Gaussian
sepal_wid,0.101,Probably Gaussian
petal_len,0.0,Probably not Gaussian
petal_wid,0.0,Probably not Gaussian
class,0.0,Probably not Gaussian


### 2. Pearson’s Correlation Coefficient

### Tests whether two samples have a linear relationship.
#### H0: the two samples are independent.
#### H1: there is a dependency between the samples.

In [35]:
from scipy.stats import pearsonr

In [72]:
#temp=[col1,col2,p_val,sig]
df=pd.DataFrame([temp],columns=['col1','col2','p_val','sig'])
df

Unnamed: 0,col1,col2,p_val,sig
0,petal_len,petal_wid,0.0,Probably dependent


In [98]:
line="_".join(["_" for i in range(30)])
num_cols=len(iris_df.columns)

def pearson_test_sig(pval):
    if pval > 0.05:
         return 'Probably independent'
    else:
         return 'Probably dependent'

df=pd.DataFrame()
for ind,col1 in enumerate(iris_df.columns):
    #print(line)
    for col2 in iris_df.columns[ind+1:num_cols]:
        p_val=round(pearsonr(iris_df[col1], iris_df[col2])[1],3)
        sig=pearson_test_sig(p_val)
        #print(f"{col1},{col2}, p_val={p_val}, sig= {sig}")
        temp=pd.DataFrame([[col1,col2,p_val,sig]],columns=['col1','col2','p_val','sig'])
        df=pd.concat([temp,df],axis=0)
df

Unnamed: 0,col1,col2,p_val,sig
0,petal_wid,class,0.0,Probably dependent
0,petal_len,class,0.0,Probably dependent
0,petal_len,petal_wid,0.0,Probably dependent
0,sepal_wid,class,0.0,Probably dependent
0,sepal_wid,petal_wid,0.0,Probably dependent
0,sepal_wid,petal_len,0.0,Probably dependent
0,sepal_len,class,0.0,Probably dependent
0,sepal_len,petal_wid,0.0,Probably dependent
0,sepal_len,petal_len,0.0,Probably dependent
0,sepal_len,sepal_wid,0.152,Probably independent


### Other Correlation tests

#### Spearman’s Rank Correlation

### Tests whether two samples have a monotonic relationship.

#### H0: the two samples are independent.
#### H1: there is a dependency between the samples

#### Chi-Squared Test

### Tests whether two categorical variables are related or independent.

#### H0: the two samples are independent.
#### H1: there is a dependency between the samples.

In [99]:
from scipy.stats import chi2_contingency


line="_".join(["_" for i in range(30)])
num_cols=len(iris_df.columns)

def pearson_test_sig(pval):
    if pval > 0.05:
         return 'Probably independent'
    else:
         return 'Probably dependent'

df=pd.DataFrame()
for ind,col1 in enumerate(iris_df.columns):
    #print(line)
    for col2 in iris_df.columns[ind+1:num_cols]:
        p_val=round(chi2_contingency(iris_df[col1], iris_df[col2])[1],3)
        sig=pearson_test_sig(p_val)
        #print(f"{col1},{col2}, p_val={p_val}, sig= {sig}")
        temp=pd.DataFrame([[col1,col2,p_val,sig]],columns=['col1','col2','p_val','sig'])
        df=pd.concat([temp,df],axis=0)
df

Unnamed: 0,col1,col2,p_val,sig
0,petal_wid,class,1.0,Probably independent
0,petal_len,class,1.0,Probably independent
0,petal_len,petal_wid,1.0,Probably independent
0,sepal_wid,class,1.0,Probably independent
0,sepal_wid,petal_wid,1.0,Probably independent
0,sepal_wid,petal_len,1.0,Probably independent
0,sepal_len,class,1.0,Probably independent
0,sepal_len,petal_wid,1.0,Probably independent
0,sepal_len,petal_len,1.0,Probably independent
0,sepal_len,sepal_wid,1.0,Probably independent


### Stationarity 

### Augmented Dickey-Fuller Unit Root Test, Tests whether a time series has a unit root, e.g. has a trend or more generally is autoregressive.

#### H0: a unit root is present (series is non-stationary).
#### H1: a unit root is not present (series is stationary).

In [100]:
# Example of the Augmented Dickey-Fuller unit root test
from statsmodels.tsa.stattools import adfuller
data = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
stat, p, lags, obs, crit, t = adfuller(data)
print('stat=%.3f, p=%.3f' % (stat, p))
if p > 0.05:
	print('Probably not Stationary')
else:
	print('Probably Stationary')
    
adfuller(data)[1]

stat=0.992, p=0.994
Probably not Stationary


0.9941824998493046

In [101]:
from statsmodels.tsa.stattools import adfuller


line="_".join(["_" for i in range(30)])
num_cols=len(iris_df.columns)

def adfuller_test_sig(pval):
    if pval > 0.05:
         return 'Probably not Stationary'
    else:
         return 'Probably Stationary'

df=pd.DataFrame()        
df['p_val']=iris_df.apply(lambda x:round(adfuller(x)[1],3), axis=0)
df['sig']=df['p_val'].apply(lambda x:adfuller_test_sig(x))

df


Unnamed: 0,p_val,sig
sepal_len,0.565,Probably not Stationary
sepal_wid,0.112,Probably not Stationary
petal_len,0.583,Probably not Stationary
petal_wid,0.659,Probably not Stationary
class,0.801,Probably not Stationary


### Student’s t-test

### Tests whether the means of two independent samples are significantly different.

#### H0: the means of the samples are equal.
#### H1: the means of the samples are unequal.

In [102]:
# Example of the Student's t-test
from scipy.stats import ttest_ind
data1 = [0.873, 2.817, 0.121, -0.945, -0.055, -1.436, 0.360, -1.478, -1.637, -1.869]
data2 = [1.142, -0.432, -0.938, -0.729, -0.846, -0.157, 0.500, 1.183, -1.075, -0.169]
stat, p = ttest_ind(data1, data2)
print('stat=%.3f, p=%.3f' % (stat, p))
if p > 0.05:
 print('Probably the same distribution')
else:
 print('Probably different distributions')

stat=-0.326, p=0.748
Probably the same distribution


### Paired T-test
#### H0: the means of the samples are equal.
#### H1: the means of the samples are unequal.

In [103]:
# Example of the Paired Student's t-test
from scipy.stats import ttest_rel
data1 = [0.873, 2.817, 0.121, -0.945, -0.055, -1.436, 0.360, -1.478, -1.637, -1.869]
data2 = [1.142, -0.432, -0.938, -0.729, -0.846, -0.157, 0.500, 1.183, -1.075, -0.169]
stat, p = ttest_rel(data1, data2)
print('stat=%.3f, p=%.3f' % (stat, p))
if p > 0.05:
 print('Probably the same distribution')
else:
 print('Probably different distributions')

stat=-0.334, p=0.746
Probably the same distribution


### Analysis of Variance Test (ANOVA)

### Tests whether the means of two or more independent samples are significantly different.

#### H0: the means of the samples are equal.
#### H1: one or more of the means of the samples are unequal.

In [104]:
# Example of the Analysis of Variance Test
from scipy.stats import f_oneway
data1 = [0.873, 2.817, 0.121, -0.945, -0.055, -1.436, 0.360, -1.478, -1.637, -1.869]
data2 = [1.142, -0.432, -0.938, -0.729, -0.846, -0.157, 0.500, 1.183, -1.075, -0.169]
data3 = [-0.208, 0.696, 0.928, -1.148, -0.213, 0.229, 0.137, 0.269, -0.870, -1.204]
stat, p = f_oneway(data1, data2, data3)
print('stat=%.3f, p=%.3f' % (stat, p))
if p > 0.05:
	print('Probably the same distribution')
else:
	print('Probably different distributions')

stat=0.096, p=0.908
Probably the same distribution
