## How to Carry out Descriptive Statistics in Python
This Jupyter Notebook contains a lot of descriptive statistic examples and how to carry them out in Python. Note, this is the code for the blog post (https://www.marsja.se/pandas-python-descriptive-statistics/). 

In [1]:
import numpy as np
import pandas as pd
from scipy.stats import trim_mean, kurtosis
from scipy.stats.mstats import mode, gmean, hmean

### Simulate Data using Python:

In [2]:
N = 20
P = ["noise","quiet"]
Q = [1,2,3]

values = [[998,511], [1119,620], [1300,790]]

mus = np.concatenate([np.repeat(value, N) for value in values])

data = pd.DataFrame(data = {'id': [subid for subid in range(N)]*(len(P)*len(Q))
    ,'iv1': np.concatenate([np.array([p]*N) for p in P]*len(Q))
    ,'iv2': np.concatenate([np.array([q]*(N*len(P))) for q in Q])
    ,'rt': np.random.normal(mus, scale=112.0, size=N*len(P)*len(Q))})

### Summary Statistics using Pandas:

In [3]:
data.describe()

Unnamed: 0,id,iv2,rt
count,120.0,120.0,120.0
mean,9.5,2.0,895.187075
std,5.790459,0.81992,303.933688
min,0.0,1.0,256.061279
25%,4.75,1.0,621.471263
50%,9.5,2.0,901.873513
75%,14.25,3.0,1165.150726
max,19.0,3.0,1527.972882


#### Grouped Descriptive Statistics:

In [4]:
grouped_data = data.groupby(['iv1', 'iv2'])
grouped_data['rt'].describe().unstack()

Unnamed: 0_level_0,count,count,count,mean,mean,mean,std,std,std,min,...,25%,50%,50%,50%,75%,75%,75%,max,max,max
iv2,1,2,3,1,2,3,1,2,3,1,...,3,1,2,3,1,2,3,1,2,3
iv1,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
noise,20.0,20.0,20.0,1026.951799,1135.312393,1297.068445,116.655172,106.88239,103.210228,845.521745,...,1227.450836,1001.114388,1118.947202,1284.946095,1084.212908,1224.959876,1339.939327,1239.15892,1360.870424,1527.972882
quiet,20.0,20.0,20.0,484.960325,647.627357,779.202128,97.308013,110.419398,128.151469,256.061279,...,696.056131,479.535898,645.823263,781.572539,556.72283,727.725189,859.728299,647.879485,852.280828,1032.398233


#### Getting the Mean Values in Pandas:

In [5]:
grouped_data['rt'].mean().reset_index()

Unnamed: 0,iv1,iv2,rt
0,noise,1,1026.951799
1,noise,2,1135.312393
2,noise,3,1297.068445
3,quiet,1,484.960325
4,quiet,2,647.627357
5,quiet,3,779.202128


In [6]:
grouped_data['rt'].aggregate(np.mean).reset_index()

Unnamed: 0,iv1,iv2,rt
0,noise,1,1026.951799
1,noise,2,1135.312393
2,noise,3,1297.068445
3,quiet,1,484.960325
4,quiet,2,647.627357
5,quiet,3,779.202128


### Geometric & Harmonic Mean in Python

#### SciPy and Pandas Method:

In [7]:
grouped_data['rt'].apply(gmean, axis=None).reset_index()

Unnamed: 0,iv1,iv2,rt
0,noise,1,1020.844654
1,noise,2,1130.412242
2,noise,3,1293.177643
3,quiet,1,474.585599
4,quiet,2,638.322355
5,quiet,3,768.855048


#### Harmonic using Scipy & Pandas:

In [8]:
grouped_data['rt'].apply(hmean, axis=None).reset_index()

Unnamed: 0,iv1,iv2,rt
0,noise,1,1014.932257
1,noise,2,1125.368076
2,noise,3,1289.285734
3,quiet,1,462.783258
4,quiet,2,628.544592
5,quiet,3,758.169721


#### Trimmed Mean in Python

In [9]:
trimmed_mean = grouped_data['rt'].apply(trim_mean, .1)
trimmed_mean.reset_index()

Unnamed: 0,iv1,iv2,rt
0,noise,1,1020.32715
1,noise,2,1137.905279
2,noise,3,1294.063819
3,quiet,1,490.396308
4,quiet,2,646.668327
5,quiet,3,780.845745


### Pandas Median

In [10]:
# Pandas Only:
# grouped_data['rt'].median().reset_index()
# Pandas + NumPy
grouped_data['rt'].aggregate(np.median).reset_index()

Unnamed: 0,iv1,iv2,rt
0,noise,1,1001.114388
1,noise,2,1118.947202
2,noise,3,1284.946095
3,quiet,1,479.535898
4,quiet,2,645.823263
5,quiet,3,781.572539


### Scipy Mode

In [11]:
grouped_data['rt'].apply(mode, axis=None).reset_index()

Unnamed: 0,iv1,iv2,rt
0,noise,1,"([845.5217454205024], [1.0])"
1,noise,2,"([882.9328302299256], [1.0])"
2,noise,3,"([1067.3199212997692], [1.0])"
3,quiet,1,"([256.0612793705232], [1.0])"
4,quiet,2,"([403.0922504450879], [1.0])"
5,quiet,3,"([539.8025053743281], [1.0])"


### Median, Standard Deviation, Mean, and Trimmed Mean in a Pandas Dataframe

In [12]:
descr = grouped_data['rt'].aggregate([np.median, np.std, np.mean]).reset_index()
descr['trimmed_mean'] = pd.Series(trimmed_mean.values, index=descr.index)
descr

Unnamed: 0,iv1,iv2,median,std,mean,trimmed_mean
0,noise,1,1001.114388,116.655172,1026.951799,1020.32715
1,noise,2,1118.947202,106.88239,1135.312393,1137.905279
2,noise,3,1284.946095,103.210228,1297.068445,1294.063819
3,quiet,1,479.535898,97.308013,484.960325,490.396308
4,quiet,2,645.823263,110.419398,647.627357,646.668327
5,quiet,3,781.572539,128.151469,779.202128,780.845745


### Measures of Variability in Python

### Pandas Standard deviation

In [13]:
grouped_data['rt'].std().reset_index()

Unnamed: 0,iv1,iv2,rt
0,noise,1,116.655172
1,noise,2,106.88239
2,noise,3,103.210228
3,quiet,1,97.308013
4,quiet,2,110.419398
5,quiet,3,128.151469


### Inter quartile range

In [14]:
grouped_data.describe()['rt'][['25%', '50%', '75%']]

Unnamed: 0_level_0,Unnamed: 1_level_0,25%,50%,75%
iv1,iv2,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
noise,1,940.329336,1001.114388,1084.212908
noise,2,1090.169199,1118.947202,1224.959876
noise,3,1227.450836,1284.946095,1339.939327
quiet,1,438.928725,479.535898,556.72283
quiet,2,575.828113,645.823263,727.725189
quiet,3,696.056131,781.572539,859.728299


### Pandas Variance

In [15]:
grouped_data['rt'].var().reset_index()

Unnamed: 0,iv1,iv2,rt
0,noise,1,13608.429057
1,noise,2,11423.845263
2,noise,3,10652.351196
3,quiet,1,9468.849482
4,quiet,2,12192.443446
5,quiet,3,16422.7989


In [16]:
## Saving Pandas Summary Statistics

In [17]:
descr.to_csv('Descriptive_Statistics_in_Python.csv', index=False)