T-test using Python and Numpy:
https://towardsdatascience.com/inferential-statistics-series-t-test-using-numpy-2718f8f9bf2f

How to Code the Student’s t-Test from Scratch in Python:
https://machinelearningmastery.com/how-to-code-the-students-t-test-from-scratch-in-python/
    
Statistical Comparison of Two Groups:
https://www.texasoft.com/tutorial-statistics-compare-2-groups.htm
    
Independent t-test example in R:
https://www.kaggle.com/kappernielsen/independent-t-test-example

In [83]:
import pandas as pd
import numpy as np
from scipy import stats
dataset = pd.read_csv('C:/Users/ShatheepR/Desktop/data.csv')

In [84]:
dataset = dataset.fillna(0)
dataset.head(3)

Unnamed: 0,URN_IDM_COMP,ATTENDED?,20191H-20202H,2019_Qtr1,2019_Qtr2,2019_Qtr3,2019_Qtr4,2020_Qtr1,2020_Qtr2,2020_Qtr3,2020_Qtr4
0,100951108049095,YES,2.154446,2.180879,22.31417,0.006,2.148446,0.0,0.0,0.131084,0.456592
1,106701107167088,YES,11.157098,6.705002,66.108255,4.009514,2.135232,2.359942,2.65241,3.3815,44.37754
2,107271106276398,NO,140.826056,3.399301,90.712748,16.654267,23.963908,19.355122,80.852759,20.779204,25.300066


In [85]:
dataset['20191H-20202H'] = dataset['2019_Qtr3'] + dataset['2019_Qtr4']

In [86]:
dataset.rename(columns = {'20191H-20202H': 'target'}, inplace = True)

In [87]:
dataset = dataset.filter(["ATTENDED?", "target"]) #, "URN_IDM_COMP"
dataset.head(3)

Unnamed: 0,ATTENDED?,target
0,YES,2.154446
1,YES,6.144746
2,NO,40.618175


In [88]:
dataset['target'].sum()

9577.831995999999

In [89]:
## Define 2 distributions
a = dataset[(dataset["ATTENDED?"] == 'YES') & (dataset["target"] >0)]["target"]
b = dataset[(dataset["ATTENDED?"] == 'NO') & (dataset["target"] >0)]["target"]
dataset = dataset[dataset["target"] >0]["target"]

In [90]:
# Check whether the distribution is Gaussian or not through Shapiro-Wilk Test
from scipy.stats import shapiro
#stat, p = shapiro(dataset)

# Example of the D'Agostino's K^2 Normality Test
from scipy.stats import normaltest
stat, p = normaltest(dataset)

print('stat={0:.3f}, p={0:.3f}' .format(stat, p))
if p > 0.05:
    print('Probably Gaussian')
else:
    print('Probably not Gaussian')

stat=748.948, p=748.948
Probably not Gaussian


In [91]:
# Example of the Anderson-Darling Normality Test
from scipy.stats import anderson
result = anderson(dataset)
print('stat={0:.3g}'.format(result.statistic))
for i in range(len(result.critical_values)):
    sl, cv = result.significance_level[i], result.critical_values[i]
    if result.statistic < cv:
        print('Probably Gaussian at the %.1f%% level' % (sl))
    else:
        print('Probably not Gaussian at the %.1f%% level' % (sl))

stat=142
Probably not Gaussian at the 15.0% level
Probably not Gaussian at the 10.0% level
Probably not Gaussian at the 5.0% level
Probably not Gaussian at the 2.5% level
Probably not Gaussian at the 1.0% level


In [92]:
# Example of the Analysis of Variance Test - ANOVA
from scipy.stats import f_oneway
stat, p = f_oneway(a, b)
print('stat={0:.3g}, p={0:.3g}'.format(stat, p))
if p > 0.05:
    print('Probably the same distribution')
else:
    print('Probably different distributions')

stat=3.59, p=3.59
Probably the same distribution


Nonparametric Statistical Hypothesis Tests
https://www.kaggle.com/shashwatwork/guide-to-statistical-hypothesis-tests-in-python

In [93]:
# Example of the Mann-Whitney U Test
#Distribution of two data samples are equal or not.
from scipy.stats import mannwhitneyu
stat, p = mannwhitneyu(a, b)
print('stat={0:.3g}, p={0:.3g}'.format(stat, p))
if p > 0.05:
    print('Probably the same distribution')
else:
    print('Probably different distributions')

stat=1.86e+04, p=1.86e+04
Probably different distributions


In [94]:
## Calculate the Standard Deviation
#Calculate the variance to get the standard deviation
#For unbiased max likelihood estimate we have to divide the var by N-1, and therefore the parameter ddof = 1
var_a = a.var(ddof=1)
var_b = b.var(ddof=1)
print("Variance of a: " + str(var_a))
print("Variance of b: " + str(var_b))

Variance of a: 3785.5725137647855
Variance of b: 3816.833601336582


In [95]:
#std deviation
td = np.sqrt((var_a/ a.shape[0]) + (var_b/ b.shape[0])) # Gives number of rows
td

6.124854497519931

In [96]:
## Calculate the t-statistics
t = (a.mean() - b.mean())/td

In [97]:
## Compare with the critical t-value
#Degrees of freedom
#N = dataset.shape[0]
df = a.shape[0] + b.shape[0] - 2
df

562

In [98]:
# calculate the critical t value
alpha = 0.05
cv = stats.t.ppf(1.0 - alpha, df)
cv

1.6475694620295673

In [99]:
#p-value after comparison with the t 
p = 1 - stats.t.cdf(abs(t),df=df)

print("t = " + str(t))
print("p = " + str(2*p))
### You can see that after comparing the t statistic with the critical t value (computed internally) we get a good p value of 0.0005 and thus we reject the null hypothesis and thus it proves that the mean of the two distributions are different and statistically significant.
## Cross Checking with the internal scipy function
# Use scipy.stats.ttest_ind.
t2, p2 = stats.ttest_ind(a,b, equal_var=False) #, equal_var=False
print("ttest_ind:            t2 = %g  p2 = %g" % (t2, p2))

t = 1.8984330749459415
p = 0.058151053012823306
ttest_ind:            t2 = 1.89843  p2 = 0.0589607


In [100]:
# interpret via critical value
if abs(t) <= cv:
    print('Accept null hypothesis that the means are equal.')
else:
    print('Reject the null hypothesis that the means are equal.')

Reject the null hypothesis that the means are equal.


In [101]:
# interpret via p-value
if p > alpha:
    print('Accept null hypothesis that the means are equal.')
else:
    print('Reject the null hypothesis that the means are equal.')

Reject the null hypothesis that the means are equal.
