In [64]:
import pandas as pd
# Allow up to 1000 rows to be rendered before pandas truncates a displayed frame.
pd.options.display.max_rows = 1000

### Advantages of using Pandas statistical functions <br> Missing values (NaN) are excluded before the computations

In [65]:
# Load the literacy-rate CSV; the file is decoded as Latin-1.
dfRate = pd.read_csv(
    "World2017literacyRate.csv",
    encoding="latin1",
)

In [66]:
# Load the infant-mortality / GNI CSV; the file is decoded as Latin-1.
dfMortality = pd.read_csv(
    "InfantMortalityGNI.csv",
    encoding="latin1",
)

In [67]:
# Preview the first five rows of the literacy-rate table.
dfRate.head(n=5)

Unnamed: 0,Country,Literacy_Rate
0,Albania,96
1,Algeria,73
2,Angola,70
3,Antigua and Barbuda,99
4,Argentina,98


In [68]:
# Preview the first five rows of the infant-mortality / GNI table.
dfMortality.head(n=5)

Unnamed: 0,Countries,GNI_PER_CAPITA,Infant_Mortality
0,Afghanistan,590.0,73.2
1,Albania,4290.0,14.0
2,Algeria,4800.0,25.5
3,Andorra,,2.8
4,Angola,4040.0,86.5


## Correlation with Pandas <br> method : {‘pearson’, ‘kendall’, ‘spearman’}; the default is ‘pearson’



### Pearson correlation coefficient
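1. Assumes both variables are approximately normally distributed
2. Assumes a linear relationship between the variables
3. Assumes there are no outliers in the data (the coefficient is sensitive to them)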

In [69]:
# Pairwise Pearson correlation (the default method) of the numeric columns.
# The text column 'Countries' is excluded explicitly: pandas >= 2.0 no longer
# drops non-numeric columns silently and would raise on it.
dfMortality.select_dtypes(include="number").corr()

Unnamed: 0,GNI_PER_CAPITA,Infant_Mortality
GNI_PER_CAPITA,1.0,-0.510167
Infant_Mortality,-0.510167,1.0


In [70]:
# Same matrix with the method named explicitly.
# Only numeric columns are passed in: pandas >= 2.0 raises on the text column
# 'Countries' instead of dropping it silently.
dfMortality.select_dtypes(include="number").corr(method="pearson")

Unnamed: 0,GNI_PER_CAPITA,Infant_Mortality
GNI_PER_CAPITA,1.0,-0.510167
Infant_Mortality,-0.510167,1.0


### Spearman rank correlation
1. Is a nonparametric test
2. Assumes a monotonic relationship
3. Is less sensitive to outliers than Pearson's correlation

In [71]:
# Spearman rank correlation of the numeric columns.
# Only numeric columns are passed in: pandas >= 2.0 raises on the text column
# 'Countries' instead of dropping it silently.
dfMortality.select_dtypes(include="number").corr(method="spearman")

Unnamed: 0,GNI_PER_CAPITA,Infant_Mortality
GNI_PER_CAPITA,1.0,-0.8695
Infant_Mortality,-0.8695,1.0


### Kendall tau rank correlation
1. Is a nonparametric alternative to Pearson's correlation
2. Requires a monotonic relationship

In [72]:
# Kendall tau rank correlation of the numeric columns.
# Only numeric columns are passed in: pandas >= 2.0 raises on the text column
# 'Countries' instead of dropping it silently.
dfMortality.select_dtypes(include="number").corr(method="kendall")

Unnamed: 0,GNI_PER_CAPITA,Infant_Mortality
GNI_PER_CAPITA,1.0,-0.67478
Infant_Mortality,-0.67478,1.0


### Other statistical functions available with Pandas <br> Kurtosis uses Fisher’s definition (kurtosis of a normal distribution == 0.0)

In [73]:
# Descriptive statistics of the literacy-rate column, kept under separate
# names so that later cells can print them.
literacy = dfRate['Literacy_Rate']

Count, Min, Max = literacy.count(), literacy.min(), literacy.max()
Mode = literacy.mode()       # a Series, not a scalar
SEM = literacy.sem()         # standard error of the mean
Skew = literacy.skew()
Kurtosis = literacy.kurt()

In [74]:
# Print each statistic next to its label; print() inserts its default single
# space between the label and the value.
summary_lines = [
    ("The count of observations is: ", Count),
    ("The minimum observation is: ", Min),
    ("The maximum observation is: ", Max),
    ("The mode is:", Mode),
    ("The standard error of the mean is: ", SEM),
    ("The kurtosis is: ", round(Kurtosis, 2)),
]
for label, value in summary_lines:
    print(label, value)

The count of observations is:  147
The minimum observation is:  29
The maximum observation is:  100
The mode is: 0    100
dtype: int64
The standard error of the mean is:  1.5304295856
The kurtosis is:  0.27


In [75]:
# Mode is a Series; take its first entry so a single value is printed.
first_mode = Mode.iloc[0]
print("Mode is", first_mode)

Mode is 100


### Computing percentiles within Pandas

In [76]:
# 25th percentile of the literacy rates; 'linear' interpolation is requested
# explicitly to match numpy percentile.
literacy_rates = dfRate['Literacy_Rate']
Percentile25th = literacy_rates.quantile(q=0.25, interpolation='linear')

In [77]:
# Display the 25th percentile computed in the previous cell.
Percentile25th

71.0

In [78]:
# Summary statistics of the frame; the 25% row can be compared with the
# quantile computed above.
dfRate.describe()

Unnamed: 0,Literacy_Rate
count,147.0
mean,82.469388
std,18.555473
min,29.0
25%,71.0
50%,90.0
75%,98.0
max,100.0


In [None]:
# Preview the first five rows of the literacy-rate table.
dfRate.head(n=5)

### Ranking data within Pandas <br> method : {‘average’, ‘min’, ‘max’, ‘first’, ‘dense’}

        average: average rank of group
        min: lowest rank in group
        max: highest rank in group
        first: ranks assigned in order they appear in the array
        dense: like ‘min’, but rank always increases by 1 between groups



In [87]:
# Rank countries by literacy rate, highest rate first. With method='min',
# tied countries all receive the lowest rank of their group; pct=False keeps
# ordinal ranks rather than percentile ranks.
dfRate['Rank'] = dfRate['Literacy_Rate'].rank(
    method='min',
    ascending=False,
    pct=False,
)

In [88]:
# Show up to 160 rows of the ranked table (display.max_rows was raised to 1000
# at the top of the notebook, so the output is not truncated).
dfRate.head(n=160)

Unnamed: 0,Country,Literacy_Rate,Rank
0,Albania,96,42.0
1,Algeria,73,104.0
2,Angola,70,112.0
3,Antigua and Barbuda,99,17.0
4,Argentina,98,28.0
5,Armenia,100,1.0
6,Azerbaijan,100,1.0
7,Bahrain,92,65.0
8,Bangladesh,57,128.0
9,Belarus,100,1.0
