In [1]:
import numpy as np
import pandas as pd

In [5]:
import scipy.stats as stats
from scipy.stats import skew, kurtosis, mode, binom, poisson, norm, expon

# Descriptive Statistics: describe()

In [7]:
np.random.seed(42)
data = pd.DataFrame({
    'Var1': np.random.rand(10),
    'Var2': np.random.rand(10),
    'Var3': np.random.rand(10),
    'Var4': np.random.rand(10),
    'Var5': np.random.rand(10),
    'c': ['X', 'X', 'Y', 'Y', 'Y', 'X', 'X', 'Y', 'Y', 'Y']
})

In [8]:
data.describe(include='all')

Unnamed: 0,Var1,Var2,Var3,Var4,Var5,c
count,10.0,10.0,10.0,10.0,10.0,10
unique,,,,,,2
top,,,,,,Y
freq,,,,,,6
mean,0.520137,0.395268,0.400387,0.509271,0.404557,
std,0.315866,0.302371,0.23386,0.343095,0.270406,
min,0.058084,0.020584,0.04645,0.065052,0.034389,
25%,0.210649,0.190638,0.222791,0.204047,0.203336,
50%,0.599887,0.297736,0.411216,0.523849,0.403444,
75%,0.726014,0.501554,0.57287,0.777356,0.54005,


In [9]:
data.groupby(by='c')

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x137c27950>

In [52]:
''' describe: Calculate descriptive statistics for each numerical column in the DataFrame.df: DataFrame
        cols: list of numerical columns to compute statistics (optional)
        id_vars: id column
    Dependencies: pandas, numpy
'''
def describe(df: pd.DataFrame, cols=[], group=''):

    if group!='': df = df.pivot(columns=group) # Splits tidy dataframe
    if len(cols)>0: df = df[cols] # Isolate specified columns

    descriptive = pd.DataFrame()    
    descriptive['mean'] = df.mean() # Mean
    descriptive['median'] = df.median() # Median
    descriptive['variance'] = df.var() # Variance
    descriptive['std_dev'] = df.std() # Standard Deviation
    descriptive['mad'] = df.apply(lambda x: np.median(np.abs(x - x.median()))) # Median Absolute Deviation
    descriptive['min'] = df.min() # Minimum
    descriptive['max'] = df.max() # Maximum
    descriptive['range'] = df.max() - df.min() # Range
    descriptive['skewness'] = df.apply(lambda x: skew(x, nan_policy='omit')) # Skewness
    descriptive['kurtosis'] = df.apply(lambda x: kurtosis(x, nan_policy='omit')) # Kurtosis
    descriptive['count'] = df.count() # Count (non-missing observations)
    descriptive['sum'] = df.sum() # Sum
    descriptive['25%'] = df.quantile(0.25)  # Quantiles (25%, 50%, 75%)
    descriptive['50%'] = df.quantile(0.50)
    descriptive['75%'] = df.quantile(0.75)

    return descriptive

In [40]:
import pyMUZ.gen.io as io

In [41]:
lc = io.get('/Users/marczepeda/Documents/Liau_Lab/Projects/2.ZF_degraders/1.Molecular_Biology/MUZ120/NGS/out/LC_outcomes.csv')
lc

Unnamed: 0,sample,edit,count,fraction
0,MUZ120-38_S38_L001_R1_001,WT,2712,0.546995
1,MUZ120-38_S38_L001_R1_001,H167R,2145,0.432634
2,MUZ120-38_S38_L001_R1_001,"H163P, H167R",4,0.000807
3,MUZ120-38_S38_L001_R1_001,"H167R, G169R",4,0.000807
4,MUZ120-38_S38_L001_R1_001,"R143Q, H167R",3,0.000605
...,...,...,...,...
1563,MUZ120-34_S34_L001_R1_001,G169W,1,0.000232
1564,MUZ120-34_S34_L001_R1_001,"N159T, H191Y",1,0.000232
1565,MUZ120-34_S34_L001_R1_001,N179I,1,0.000232
1566,MUZ120-34_S34_L001_R1_001,"F145L, N159T",1,0.000232


In [48]:
describe(lc,cols=['count','fraction'])

Unnamed: 0,mean,median,variance,std_dev,mad,min,max,range,skewness,kurtosis,count,sum,25%,50%,75%
count,59.283163,1.0,173362.998261,416.368825,0.0,1.0,5590.0,5589.0,8.880312,88.816906,1568,92956.0,1.0,1.0,1.0
fraction,0.017219,0.000247,0.012536,0.111963,8.4e-05,0.000163,0.982928,0.982764,7.215052,53.05061,1568,27.0,0.000193,0.000247,0.000463


In [49]:
desc = describe(lc,cols=['fraction'],group='sample')

In [51]:
desc.loc['fraction']['25%']

sample
MUZ120-31_S31_L001_R1_001    0.000237
MUZ120-32_S32_L001_R1_001    0.000456
MUZ120-33_S33_L001_R1_001    0.000461
MUZ120-34_S34_L001_R1_001    0.000232
MUZ120-35_S35_L001_R1_001    0.000209
MUZ120-36_S36_L001_R1_001    0.000427
MUZ120-37_S37_L001_R1_001    0.000198
MUZ120-38_S38_L001_R1_001    0.000202
MUZ120-39_S39_L001_R1_001    0.000219
MUZ120-40_S40_L001_R1_001    0.000163
MUZ120-41_S41_L001_R1_001    0.000175
MUZ120-42_S42_L001_R1_001    0.000193
MUZ120-43_S43_L001_R1_001    0.000456
MUZ120-44_S44_L001_R1_001    0.000523
MUZ120-45_S45_L001_R1_001    0.000759
MUZ120-46_S46_L001_R1_001    0.000571
MUZ120-47_S47_L001_R1_001    0.000646
MUZ120-48_S48_L001_R1_001    0.000883
MUZ120-49_S49_L001_R1_001    0.000247
MUZ120-50_S50_L001_R1_001    0.000513
MUZ120-51_S51_L001_R1_001    0.000414
MUZ120-52_S52_L001_R1_001    0.000452
MUZ120-53_S53_L001_R1_001    0.000492
MUZ120-54_S54_L001_R1_001    0.000463
MUZ120-55_S55_L001_R1_001    0.000169
MUZ120-56_S56_L001_R1_001    0.000183
MUZ12

In [45]:
describe(lc,cols=['fraction'],group='sample')

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,median,variance,std_dev,mad,min,max,range,skewness,kurtosis,count,sum,25%,50%,75%
Unnamed: 0_level_1,sample,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
fraction,MUZ120-31_S31_L001_R1_001,0.012987,0.000237,0.006201,0.078743,,0.000237,0.581771,0.581533,6.344116,39.681758,77,1.0,0.000237,0.000237,0.000237
fraction,MUZ120-32_S32_L001_R1_001,0.021739,0.000456,0.020638,0.143659,,0.000456,0.974897,0.974441,6.559096,41.021911,46,1.0,0.000456,0.000456,0.000456
fraction,MUZ120-33_S33_L001_R1_001,0.023256,0.000461,0.013526,0.1163,,0.000461,0.726602,0.726141,5.50412,29.973192,43,1.0,0.000461,0.000461,0.000461
fraction,MUZ120-34_S34_L001_R1_001,0.015385,0.000232,0.008714,0.093351,,0.000232,0.703223,0.702991,6.65502,44.806615,65,1.0,0.000232,0.000232,0.000232
fraction,MUZ120-35_S35_L001_R1_001,0.011628,0.000209,0.011003,0.104896,,0.000209,0.973075,0.972866,9.110785,81.008219,86,1.0,0.000209,0.000209,0.000209
fraction,MUZ120-36_S36_L001_R1_001,0.028571,0.000427,0.027576,0.16606,,0.000427,0.982928,0.982501,5.659441,30.029323,35,1.0,0.000427,0.000427,0.000427
fraction,MUZ120-37_S37_L001_R1_001,0.013333,0.000198,0.011722,0.108268,,0.000198,0.937797,0.937599,8.467692,69.803572,75,1.0,0.000198,0.000198,0.000396
fraction,MUZ120-38_S38_L001_R1_001,0.011765,0.000202,0.00565,0.075168,,0.000202,0.546995,0.546793,6.421744,39.806366,85,1.0,0.000202,0.000202,0.000202
fraction,MUZ120-39_S39_L001_R1_001,0.012346,0.000219,0.010271,0.101344,,0.000219,0.910706,0.910487,8.761694,75.149848,81,1.0,0.000219,0.000219,0.000219
fraction,MUZ120-40_S40_L001_R1_001,0.009901,0.000163,0.004572,0.067617,,0.000163,0.504248,0.504085,6.910534,45.881758,101,1.0,0.000163,0.000163,0.000163


In [44]:
describe(lc,cols=['count'],group='sample')

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,median,variance,std_dev,mad,min,max,range,skewness,kurtosis,count,sum,25%,50%,75%
Unnamed: 0_level_1,sample,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
count,MUZ120-31_S31_L001_R1_001,54.714286,1.0,110055.390977,331.745974,,1.0,2451.0,2450.0,6.344116,39.681758,77,4213.0,1.0,1.0,1.0
count,MUZ120-32_S32_L001_R1_001,47.630435,1.0,99071.571498,314.756368,,1.0,2136.0,2135.0,6.559096,41.021911,46,2191.0,1.0,1.0,1.0
count,MUZ120-33_S33_L001_R1_001,50.44186,1.0,63632.062016,252.253963,,1.0,1576.0,1575.0,5.50412,29.973192,43,2169.0,1.0,1.0,1.0
count,MUZ120-34_S34_L001_R1_001,66.353846,1.0,162104.294712,402.621776,,1.0,3033.0,3032.0,6.65502,44.806615,65,4313.0,1.0,1.0,1.0
count,MUZ120-35_S35_L001_R1_001,55.709302,1.0,252564.585089,502.558042,,1.0,4662.0,4661.0,9.110785,81.008219,86,4791.0,1.0,1.0,1.0
count,MUZ120-36_S36_L001_R1_001,66.942857,1.0,151382.820168,389.079452,,1.0,2303.0,2302.0,5.659441,30.029323,35,2343.0,1.0,1.0,1.0
count,MUZ120-37_S37_L001_R1_001,67.306667,1.0,298704.512793,546.538665,,1.0,4734.0,4733.0,8.467692,69.803572,75,5048.0,1.0,1.0,2.0
count,MUZ120-38_S38_L001_R1_001,58.329412,1.0,138891.985434,372.682151,,1.0,2712.0,2711.0,6.421744,39.806366,85,4958.0,1.0,1.0,1.0
count,MUZ120-39_S39_L001_R1_001,56.271605,1.0,213374.425309,461.924697,,1.0,4151.0,4150.0,8.761694,75.149848,81,4558.0,1.0,1.0,1.0
count,MUZ120-40_S40_L001_R1_001,60.594059,1.0,171245.303564,413.817959,,1.0,3086.0,3085.0,6.910534,45.881758,101,6120.0,1.0,1.0,1.0


In [32]:
data[['Var1','Var2','Var3','Var4','Var5']].mean()

Var1    0.520137
Var2    0.395268
Var3    0.400387
Var4    0.509271
Var5    0.404557
dtype: float64

In [35]:
data[['Var1','Var2','Var3','Var4','Var5']].mode()

Unnamed: 0,Var1,Var2,Var3,Var4,Var5
0,0.058084,0.020584,0.04645,0.065052,0.034389
1,0.155995,0.181825,0.139494,0.097672,0.122038
2,0.156019,0.183405,0.199674,0.170524,0.184854
3,0.37454,0.212339,0.292145,0.304614,0.25878
4,0.598658,0.291229,0.366362,0.440152,0.311711
5,0.601115,0.304242,0.45607,0.607545,0.495177
6,0.708073,0.431945,0.514234,0.684233,0.520068
7,0.731994,0.524756,0.592415,0.808397,0.54671
8,0.866176,0.832443,0.611853,0.948886,0.662522
9,0.950714,0.96991,0.785176,0.965632,0.90932


In [39]:
describe(data,cols=['Var1','Var2','Var3','Var4','Var5'])

Unnamed: 0,mean,median,variance,std_dev,mad,min,max,range,skewness,kurtosis,count,sum,25%,50%,75%
Var1,0.520137,0.599887,0.099771,0.315866,0.245818,0.058084,0.950714,0.892631,-0.222491,-1.34747,10,5.201367,0.210649,0.599887,0.726014
Var2,0.395268,0.297736,0.091428,0.302371,0.12506,0.020584,0.96991,0.949325,0.82907,-0.475119,10,3.952678,0.190638,0.297736,0.501554
Var3,0.400387,0.411216,0.05469,0.23386,0.190918,0.04645,0.785176,0.738726,0.033596,-1.037475,10,4.003872,0.222791,0.411216,0.57287
Var4,0.509271,0.523849,0.117714,0.343095,0.318937,0.065052,0.965632,0.90058,0.022949,-1.464473,10,5.092707,0.204047,0.523849,0.777356
Var5,0.404557,0.403444,0.07312,0.270406,0.181627,0.034389,0.90932,0.874932,0.374111,-0.737048,10,4.04557,0.203336,0.403444,0.54005


In [22]:
data.pivot?

[0;31mSignature:[0m
[0mdata[0m[0;34m.[0m[0mpivot[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0;34m*[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mcolumns[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mindex[0m[0;34m=[0m[0mtyping[0m[0;34m.[0m[0mLiteral[0m[0;34m[[0m[0;34m<[0m[0mno_default[0m[0;34m>[0m[0;34m][0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mvalues[0m[0;34m=[0m[0mtyping[0m[0;34m.[0m[0mLiteral[0m[0;34m[[0m[0;34m<[0m[0mno_default[0m[0;34m>[0m[0;34m][0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m [0;34m->[0m [0;34m'DataFrame'[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Return reshaped DataFrame organized by given index / column values.

Reshape data (produce a "pivot" table) based on column values. Uses
unique values from specified `index` / `columns` to form axes of the
resulting DataFrame. This function does not support data
aggregation, multiple values will result in a MultiIndex in the
columns. See the :ref:`User Guide

In [6]:
data.groupby?

[0;31mSignature:[0m
[0mdata[0m[0;34m.[0m[0mgroupby[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mby[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0maxis[0m[0;34m:[0m [0;34m'Axis'[0m [0;34m=[0m [0;36m0[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mlevel[0m[0;34m:[0m [0;34m'IndexLabel | None'[0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mas_index[0m[0;34m:[0m [0;34m'bool'[0m [0;34m=[0m [0;32mTrue[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0msort[0m[0;34m:[0m [0;34m'bool'[0m [0;34m=[0m [0;32mTrue[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mgroup_keys[0m[0;34m:[0m [0;34m'bool'[0m [0;34m=[0m [0;32mTrue[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mobserved[0m[0;34m:[0m [0;34m'bool'[0m [0;34m=[0m [0;32mFalse[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mdropna[0m[0;34m:[0m [0;34m'bool'[0m [0;34m=[0m [0;32mTrue[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m [0;34m->[0m [0

# Statistical Test Flow Chart: test()

In [None]:
''' test: Statistical test flow chart
        question: difference, correlation, categories
        factors: # of factors
        same: same (True) or different (False) subjects
        compare: list of comparisons
        para: parameteric (True) or nonparametric (False)
    Dependencies:
'''
def test(df: pd.DataFrame(), compare: list(), question: str(), factors=1, same=False, para=True):

    if question=='difference':
        if factors==1:
            if not same:
                if para==True:
                    if len(compare)==2:
                        print('Statistical Test: T-test \nAdvice: Only use if you have two groups or if you are comparing just two of the n groups and are not concerned about inflating the Type I error rate (e.g., False Positives).')
                        t_stat, p_value = stats.ttest_ind(df[compare[0]],df[compare[1]])
                        return pd.DataFrame({'t_stat': t_stat, 'p_value': p_value},index=pd.Index([','.join(compare)],name='compare')).reset_index()
                    elif len(compare)>2:
                        print('Statistical Test: 1-way Anova\nAdvice: If you want to compare all three groups simultaneously to determine if there is a significant difference in their means.')
                        print('Follow up with Tukey\'s Honestly Significant Difference (HSD) Test,Bonferroni Correction for t-test, Holm’s Sequential Bonferroni Procedure')
                        f_stat, p_value = stats.f_oneway(*(df[col] for col in compare))
                        return pd.DataFrame({'f_stat': f_stat, 'p_value': p_value},index=pd.Index([','.join(compare)],name='compare')).reset_index()
                    else: print('Error: Invalid compare. List needs to contain 2 or more stings')
                else:
                    print('Statistical Test: Mann Whitney U Test\nWIP...')
                    f_stat, p_value = stats.f_oneway(df[compare])
            
            else:
                if para==True:
                    if len(compare)==2:
                        print('Statistical Test: Paired T-test \nWIP...')
                        t_stat, p_value, dof = stats.ttest_ind(df[compare])
                        return pd.DataFrame({'t_stat': t_stat, 'p_value': p_value, 'dof':dof})
                    elif len(compare)>2:
                        print('Statistical Test: Repeated 1-way Anova\nWIP...')
                        f_stat, p_value = stats.f_oneway(df[compare])
                    else: print('Error: Invalid compare. List needs to contain 2 or more stings')
                else: return ''
        elif factors>1: print('Statistical Test: 2 way anova, General Linear (Mixed) Model, etc.\nNot Included...')
        else: print('Error: Invalid factors. Number needs to be greater or equal to 1.')
    
    elif question=='correlation':
        return ''
    
    elif question=='category':
        return ''
    
    else:
        print('Error: Invalid question. Needs to be difference, correlation, or category.')

need to reformat to tidy data and then it will be good for pairwise turkey_hsd

In [65]:
from statsmodels.stats.multicomp import pairwise_tukeyhsd
pairwise_tukeyhsd?

[0;31mSignature:[0m [0mpairwise_tukeyhsd[0m[0;34m([0m[0mendog[0m[0;34m,[0m [0mgroups[0m[0;34m,[0m [0malpha[0m[0;34m=[0m[0;36m0.05[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Calculate all pairwise comparisons with TukeyHSD confidence intervals

Parameters
----------
endog : ndarray, float, 1d
    response variable
groups : ndarray, 1d
    array with groups, can be string or integers
alpha : float
    significance level for the test

Returns
-------
results : TukeyHSDResults instance
    A results class containing relevant data and some post-hoc
    calculations, including adjusted p-value

Notes
-----
This is just a wrapper around tukeyhsd method of MultiComparison

See Also
--------
MultiComparison
tukeyhsd
statsmodels.sandbox.stats.multicomp.TukeyHSDResults
[0;31mFile:[0m      ~/anaconda3/lib/python3.11/site-packages/statsmodels/stats/multicomp.py
[0;31mType:[0m      function

In [67]:
pairwise_tukeyhsd(endog=data[['Var1','Var2','Var3','Var4','Var5']],groups=['Var1','Var2','Var3','Var4','Var5'])

ValueError: data has 10 elements and groups has 5

In [None]:
from statsmodels.stats.multicomp import pairwise_tukeyhsd

# Assuming you have a DataFrame df with a column 'values' and a column 'group'
tukey_result = pairwise_tukeyhsd(data['values'], data['group'], alpha=0.05)
print(tukey_result)

Post-hoc tests are used after a one-way ANOVA when you find a statistically significant difference between the group means. These tests help you determine which specific groups differ from each other. Here are some common post-hoc tests used after a one-way ANOVA:

1. Tukey's Honestly Significant Difference (HSD) Test
Purpose: The most commonly used post-hoc test, Tukey's HSD, compares all possible pairs of group means to identify which ones differ significantly.
Usage: It's suitable when you have equal or unequal group sizes.
Implementation:
In Python, you can use pairwise_tukeyhsd from statsmodels.stats.multicomp.
python
Copy code
from statsmodels.stats.multicomp import pairwise_tukeyhsd

# Assuming you have a DataFrame df with a column 'values' and a column 'group'
tukey_result = pairwise_tukeyhsd(df['values'], df['group'], alpha=0.05)
print(tukey_result)
2. Bonferroni Correction
Purpose: Adjusts the significance level to account for the number of comparisons being made, reducing the risk of Type I errors.
Usage: You apply it when performing multiple pairwise t-tests. It's a more conservative approach.
Implementation:
You would manually adjust the p-value threshold by dividing the original alpha level by the number of comparisons.
3. Scheffé's Test
Purpose: A very conservative test, Scheffé's test can be used when comparing complex comparisons (not just pairwise). It's suitable when you suspect non-pairwise differences.
Usage: It's less commonly used due to its conservative nature.
Implementation: It's available in some statistical software, but not directly implemented in popular Python libraries.
4. Dunnett's Test
Purpose: Compares multiple treatment groups against a single control group.
Usage: Ideal when you have one control group and want to compare it against several other groups.
Implementation: Specialized packages or functions may be required for this in Python.
5. Holm’s Sequential Bonferroni Procedure
Purpose: A step-down procedure that adjusts p-values to control the family-wise error rate. It's less conservative than the Bonferroni correction.
Usage: Useful when dealing with multiple comparisons.
Implementation: You can use multipletests from statsmodels.
python
Copy code
from statsmodels.stats.multitest import multipletests

# Assuming pvals is a list or array of p-values from multiple tests
pvals_corrected = multipletests(pvals, method='holm')
print(pvals_corrected)
6. Dunn’s Test
Purpose: A non-parametric test used when the ANOVA assumptions are violated. It is used after a Kruskal-Wallis test (non-parametric alternative to ANOVA).
Usage: Suitable when you have non-normally distributed data.
Implementation: The scikit-posthocs package can be used for Dunn's test in Python.

In [57]:
stats.f_oneway(data[['Var1','Var2','Var3','Var4','Var5']])

TypeError: at least two inputs are required; got 1.

In [59]:
data

Unnamed: 0,Var1,Var2,Var3,Var4,Var5,c
0,0.37454,0.020584,0.611853,0.607545,0.122038,X
1,0.950714,0.96991,0.139494,0.170524,0.495177,X
2,0.731994,0.832443,0.292145,0.065052,0.034389,Y
3,0.598658,0.212339,0.366362,0.948886,0.90932,Y
4,0.156019,0.181825,0.45607,0.965632,0.25878,Y
5,0.155995,0.183405,0.785176,0.808397,0.662522,X
6,0.058084,0.304242,0.199674,0.304614,0.311711,X
7,0.866176,0.524756,0.514234,0.097672,0.520068,Y
8,0.601115,0.431945,0.592415,0.684233,0.54671,Y
9,0.708073,0.291229,0.04645,0.440152,0.184854,Y


In [62]:
compare = ['Var1','Var2']
t_stat, p_value = stats.ttest_ind(data[compare[0]],data[compare[1]])
pd.DataFrame({'t_stat': t_stat, 'p_value': p_value},index=pd.Index([','.join(compare)],name='compare')).reset_index()

Unnamed: 0,compare,t_stat,p_value
0,"Var1,Var2",0.903049,0.378427


In [64]:
compare = ['Var1','Var2','Var3']
f_stat, p_value = stats.f_oneway(*(data[col] for col in compare))
pd.DataFrame({'f_stat': f_stat, 'p_value': p_value},index=pd.Index([','.join(compare)],name='compare')).reset_index()

Unnamed: 0,compare,f_stat,p_value
0,"Var1,Var2,Var3",0.609184,0.5511


In [54]:
stats.f_oneway?

[0;31mSignature:[0m [0mstats[0m[0;34m.[0m[0mf_oneway[0m[0;34m([0m[0;34m*[0m[0msamples[0m[0;34m,[0m [0maxis[0m[0;34m=[0m[0;36m0[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Perform one-way ANOVA.

The one-way ANOVA tests the null hypothesis that two or more groups have
the same population mean.  The test is applied to samples from two or
more groups, possibly with differing sizes.

Parameters
----------
sample1, sample2, ... : array_like
    The sample measurements for each group.  There must be at least
    two arguments.  If the arrays are multidimensional, then all the
    dimensions of the array must be the same except for `axis`.
axis : int, optional
    Axis of the input arrays along which the test is applied.
    Default is 0.

Returns
-------
statistic : float
    The computed F statistic of the test.
pvalue : float
    The associated p-value from the F distribution.

Warns
-----
    Raised if all values within each of the input arrays are

In [53]:
stats.ttest_ind?

[0;31mSignature:[0m
[0mstats[0m[0;34m.[0m[0mttest_ind[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0ma[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mb[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0maxis[0m[0;34m=[0m[0;36m0[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mequal_var[0m[0;34m=[0m[0;32mTrue[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mnan_policy[0m[0;34m=[0m[0;34m'propagate'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mpermutations[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mrandom_state[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0malternative[0m[0;34m=[0m[0;34m'two-sided'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mtrim[0m[0;34m=[0m[0;36m0[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0;34m*[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mkeepdims[0m[0;34m=[0m[0;32mFalse[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Calculate the T-test f