In [1]:
import pandas as pd
import numpy as np
import scipy.stats as sp

In [2]:
# Load the 2017 world literacy-rate table, decoding the file as Latin-1.
csv_path = "World2017literacyRate.csv"
df = pd.read_csv(csv_path, encoding='latin1')



In [3]:
# Preview the first five rows of the freshly loaded data.
df.head(n=5)

Unnamed: 0,Country,Literacy_Rate
0,Albania,96
1,Algeria,73
2,Angola,70
3,Antigua and Barbuda,99
4,Argentina,98


In [4]:
# Print a structural summary of the frame: index range, columns,
# non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 147 entries, 0 to 146
Data columns (total 2 columns):
Country          147 non-null object
Literacy_Rate    147 non-null int64
dtypes: int64(1), object(1)
memory usage: 2.4+ KB


### Application goal: Use `scipy.stats.percentileofscore` and pandas `qcut` to: <br>
> 1. Add a percentile value for each country's literacy rate
> 2. Classify the data using the same percentiles into Low, Middle and High tercile groups
> 3. We will use `sp.percentileofscore(x, a, kind='weak')`, which returns the percentage of values in `x` that are less than or equal to `a`

In [5]:
# Literacy-rate column; serves as the reference distribution for the percentiles.
x = df.loc[:, 'Literacy_Rate']

In [6]:
# Weak percentile of each rate: the percentage of all rates that are
# less than or equal to it. Values are assigned positionally, row by row.
df['Percentile'] = list(
    map(lambda a: sp.percentileofscore(x, a, kind='weak'), x)
)

In [7]:
# Preview the first five rows, now including the Percentile column.
df.head(n=5)

Unnamed: 0,Country,Literacy_Rate,Percentile
0,Albania,96,72.108844
1,Algeria,73,29.931973
2,Angola,70,24.489796
3,Antigua and Barbuda,99,89.115646
4,Argentina,98,81.632653


In [9]:
# Order the rows from the lowest percentile to the highest
# (ascending order is the sort_values default).
df = df.sort_values('Percentile')

In [10]:
# Preview the five rows with the lowest percentiles after sorting.
df.head(n=5)

Unnamed: 0,Country,Literacy_Rate,Percentile
18,Burkina Faso,29,1.360544
93,Niger,29,1.360544
80,Mali,31,2.040816
25,Chad,34,2.721088
46,Ethiopia,39,3.401361


In [11]:
# Cut the literacy rates into three quantile-based bins (terciles) and
# label them from lowest to highest.
tercile_labels = ['Low', 'Middle', 'High']
df['Classification'] = pd.qcut(df['Literacy_Rate'], q=3, labels=tercile_labels)

In [13]:
# Show the first twenty rows together with their tercile classification.
df.head(n=20)

Unnamed: 0,Country,Literacy_Rate,Percentile,Classification
18,Burkina Faso,29,1.360544,Low
93,Niger,29,1.360544,Low
80,Mali,31,2.040816,Low
25,Chad,34,2.721088,Low
46,Ethiopia,39,3.401361,Low
53,Guinea,41,4.081633,Low
115,Sierra Leone,42,5.442177,Low
10,Benin,42,5.442177,Low
55,Haiti,49,6.122449,Low
48,Gambia,50,7.482993,Low


In [14]:
# Group the literacy rates by tercile label.
# 'Classification' is a categorical column (it comes from pd.qcut), and
# grouping on a categorical key without `observed=` emits a FutureWarning on
# recent pandas because the default is changing. Passing observed=False
# pins the behaviour this notebook was written against: every category
# gets a group, whether or not it has rows.
bygroup = df.groupby(['Classification'], observed=False)['Literacy_Rate']

In [16]:
def mad(values):
    """Return the mean absolute deviation of ``values`` around their mean.

    Replaces the 'mad' string aggregation, which relied on ``Series.mad``
    (deprecated in pandas 1.5 and removed in pandas 2.0). The function is
    named ``mad`` on purpose: pandas uses the callable's name as the
    column label, so the resulting table keeps its 'mad' column.
    """
    return (values - values.mean()).abs().mean()


# Per-tercile summary statistics of the literacy rate, rounded to one decimal.
df2 = bygroup.aggregate(
    ['count', 'min', 'max', 'mean', 'median', 'std', mad, 'skew']
).round(1)

In [17]:
# Display the summary-statistics table, one row per classification group.
df2

Unnamed: 0_level_0,count,min,max,mean,median,std,mad,skew
Classification,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Low,51,29,78,60.1,61,13.3,10.6,-0.8
Middle,50,83,95,90.4,91,3.5,2.9,-0.6
High,46,96,100,98.7,99,1.3,1.1,-0.7
