In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats

#Fixing the seed so that every run of the notebook draws the same random numbers.
np.random.seed(42)

#The standard deviations and per-group sample sizes we simulate.
variability = [5, 20, 50]
sizes = [10, 100, 10000]

#Accumulators for the stacked observations and the labels describing each one.
data, groups, labels_var, labels_size = [], [], [], []

#For every (variability, sample size) combination, draw one sample per group:
#group1 has a true mean of 20 and group2 a true mean of 35.
#`+=` on a list adds every element of the right-hand iterable (same effect as extend).
for var in variability:
    for size in sizes:
        data += list(np.random.normal(loc=20, scale=var, size=size))
        data += list(np.random.normal(loc=35, scale=var, size=size))
        #Both groups share the same variability and size labels, hence 2 * size entries.
        labels_var += [var] * (2 * size)
        labels_size += [size] * (2 * size)
        groups += ['group1'] * size + ['group2'] * size

In [2]:
#Assembling the observations and their labels into one data frame, then previewing both ends.
#Note: `data` is rebound from the raw list of observations to the data frame here,
#so re-run the generation cell first if this cell ever has to be executed again.

columns = {
    'data': data,
    'groups': groups,
    'variability': labels_var,
    'size': labels_size,
}
data = pd.DataFrame(columns)

for preview in (data.head(), data.tail()):
    print(preview)


        data  groups  variability  size
0  22.483571  group1            5    10
1  19.308678  group1            5    10
2  23.238443  group1            5    10
3  27.615149  group1            5    10
4  18.829233  group1            5    10
            data  groups  variability   size
60655  21.543283  group2           50  10000
60656 -22.251146  group2           50  10000
60657  78.923211  group2           50  10000
60658  34.827867  group2           50  10000
60659   4.355907  group2           50  10000


What does set_index do? What does xs do? What does stats.ttest_ind do? How does `tval, pval = ...` capture the results?

xs ==> Cross-section. It can only be used to get values, not to set them. Arguments: key, axis (defaults to 0; axis 0 = rows, axis 1 = columns), and level, which names the index levels that the key refers to when the index is multi-level.

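ttest_ind ==> From scipy.stats. Returns the t-statistic and the p-value for two independent samples given as arrays (a, b). equal_var=True says to perform a standard independent two-sample t-test, which assumes equal population variances; equal_var=False performs Welch's t-test, which does not.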

In [3]:
# Promote the three label columns to a MultiIndex so that any
# (group, size, variability) slice of the stacked data can be selected by key.
index_levels = ('groups', 'size', 'variability')
data_test = data.set_index(list(index_levels))

print(data_test.head())

# One t-value and one p-value is collected per (size, variability) combination
# (p-values are discussed a little later).
tvalues, pvalues = [], []

# For each combination of sample size and variability, compare group2 against
# group1 with the standard (equal-variance) independent two-sample t-test.
observations = data_test['data']
for size in sizes:
    for var in variability:
        a = observations.xs(('group1', size, var), level=index_levels)
        b = observations.xs(('group2', size, var), level=index_levels)
        tval, pval = stats.ttest_ind(b, a, equal_var=True)
        tvalues.append(tval)
        pvalues.append(pval)

                              data
groups size variability           
group1 10   5            22.483571
            5            19.308678
            5            23.238443
            5            27.615149
            5            18.829233




    1. $\bar{y}_1=5$, $\bar{y}_2=8$, $s_1=1$, $s_2=3$, $N_1=200$, $N_2=500$
    2. $\bar{y}_1=1090$, $\bar{y}_2=999$, $s_1=400$, $s_2=30$, $N_1=900$, $N_2=100$
    3. $\bar{y}_1=45$, $\bar{y}_2=40$, $s_1=45$, $s_2=40$, $N_1=2000$, $N_2=2000$

Answers:
    1 = -19.7814
    2 = 6.6585
    3 = 3.7139 

1. (5 - 8) / sqrt(1^2/200 + 3^2/500) = -3/0.15166 = -19.78

In [None]:
# 2. (1090 - 999) / sqrt(400^2/900 + 30^2/100) = 91/13.6667 = 6.6585

In [6]:
# Exercise 1: simulate two samples with the stated means, standard deviations
# and sizes (mean 5, sd 1, n 200 versus mean 8, sd 3, n 500).
datas1_1 = np.random.normal(5, 1, 200)
datas1_2 = np.random.normal(8, 3, 500)

# equal_var=False runs Welch's t-test, which matches the hand formula
# (difference of means) / sqrt(s1^2/N1 + s2^2/N2).
tvals1, pvals1 = stats.ttest_ind(datas1_1, datas1_2, equal_var=False)

# The samples are random draws, so the printed t-value only approximates
# the analytic answer computed from the summary statistics.
print(tvals1)

-19.923227475186202


In [7]:
# Exercise 2: simulate two samples with the stated means, standard deviations
# and sizes (mean 1090, sd 400, n 900 versus mean 999, sd 30, n 100).
datas2_1 = np.random.normal(1090, 400, 900)
datas2_2 = np.random.normal(999, 30, 100)

# equal_var=False runs Welch's t-test, which matches the hand formula
# (difference of means) / sqrt(s1^2/N1 + s2^2/N2).
tvals2, pvals2 = stats.ttest_ind(datas2_1, datas2_2, equal_var=False)

# The samples are random draws, so the printed t-value only approximates
# the analytic answer computed from the summary statistics.
print(tvals2)

6.845018927703528


In [13]:
# Exercise 3: simulate two samples with the stated means, standard deviations
# and sizes (mean 45, sd 45, n 2000 versus mean 40, sd 40, n 2000).
datas3_1 = np.random.normal(45, 45, 2000)
datas3_2 = np.random.normal(40, 40, 2000)

# equal_var=False runs Welch's t-test, consistent with the other two exercises
# and with the hand formula (difference of means) / sqrt(s1^2/N1 + s2^2/N2).
# With equal sample sizes the t-value is the same as in the pooled test, but
# the p-value no longer relies on the two standard deviations being equal.
tvals3, pvals3 = stats.ttest_ind(datas3_1, datas3_2, equal_var=False)

# The samples are random draws, so the printed t-value only approximates
# the analytic answer computed from the summary statistics.
print(tvals3)

4.022283395246554
