In [21]:
import pandas as pd
import numpy as np

### GroupBy

In [2]:
# Load the students' exam-performance dataset from the CSV file located
# next to the notebook.
students = pd.read_csv(filepath_or_buffer='StudentsPerformance.csv')

In [3]:
# Preview the first ten rows of the dataset.
students.head(n=10)

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75
5,female,group B,associate's degree,standard,none,71,83,78
6,female,group B,some college,standard,completed,88,95,92
7,male,group B,some college,free/reduced,none,40,43,39
8,male,group D,high school,free/reduced,completed,64,64,67
9,female,group B,high school,free/reduced,none,38,60,50


In [4]:
# Columns holding the exam scores.
# They are listed by name rather than sliced by position (`columns[-3:]`):
# once a derived column such as 'mean' is appended to the frame, re-running a
# positional slice would silently pick up the wrong columns.
provas = ['math score', 'reading score', 'writing score']

In [5]:
# Display the list of exam-score column names selected above.
provas

['math score', 'reading score', 'writing score']

In [6]:
# Per-student average over the exam columns, rounded to two decimals and
# stored in a new 'mean' column.
students['mean'] = (
    students[provas]
    .mean(axis='columns')
    .round(decimals=2)
)

In [7]:
# Display the frame, which now includes the 'mean' column.
students

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score,mean
0,female,group B,bachelor's degree,standard,none,72,72,74,72.67
1,female,group C,some college,standard,completed,69,90,88,82.33
2,female,group B,master's degree,standard,none,90,95,93,92.67
3,male,group A,associate's degree,free/reduced,none,47,57,44,49.33
4,male,group C,some college,standard,none,76,78,75,76.33
...,...,...,...,...,...,...,...,...,...
995,female,group E,master's degree,standard,completed,88,99,95,94.00
996,male,group C,high school,free/reduced,none,62,55,55,57.33
997,female,group C,high school,free/reduced,completed,59,71,65,65.00
998,female,group D,some college,standard,completed,68,78,77,74.33


In [8]:
# Number of distinct values in each column.
students.nunique()

gender                           2
race/ethnicity                   5
parental level of education      6
lunch                            2
test preparation course          2
math score                      81
reading score                   72
writing score                   77
mean                           194
dtype: int64

In [9]:
# List the column labels of the frame.
students.columns

Index(['gender', 'race/ethnicity', 'parental level of education', 'lunch',
       'test preparation course', 'math score', 'reading score',
       'writing score', 'mean'],
      dtype='object')

In [10]:
# Show the dtype of every column.
students.dtypes

gender                          object
race/ethnicity                  object
parental level of education     object
lunch                           object
test preparation course         object
math score                       int64
reading score                    int64
writing score                    int64
mean                           float64
dtype: object

In [11]:
# Gender split in absolute numbers (row count per gender).
students['gender'].value_counts()

female    518
male      482
Name: gender, dtype: int64

In [15]:
# Percentage of female students, computed from the data itself instead of
# from a hand-typed count, so the figure stays correct if the dataset changes.
(students['gender'] == 'female').sum() / students.shape[0] * 100

51.800000000000004

In [16]:
# Gender split as a percentage of all rows.
students['gender'].value_counts(normalize=True).mul(100)

female    51.8
male      48.2
Name: gender, dtype: float64

In [17]:
# Statistical summary of the numeric columns.
students.describe()

Unnamed: 0,math score,reading score,writing score,mean
count,1000.0,1000.0,1000.0,1000.0
mean,66.089,69.169,68.054,67.77058
std,15.16308,14.600192,15.195657,14.257311
min,0.0,17.0,10.0,9.0
25%,57.0,59.0,57.75,58.33
50%,66.0,70.0,69.0,68.33
75%,77.0,79.0,79.0,77.67
max,100.0,100.0,100.0,100.0


Vamos obter estatísticas descritivas de acordo com algumas variáveis categóricas do dataset.

In [18]:
# Show only the first row.
students.head(1)

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score,mean
0,female,group B,bachelor's degree,standard,none,72,72,74,72.67


In [19]:
# Mean math score for each gender.
(
    students
    .groupby('gender')['math score']
    .mean()
)

gender
female    63.633205
male      68.728216
Name: math score, dtype: float64

Vamos supor que, caso o aluno tenha pontuado acima de 70 pontos, tenha sido aprovado. Assim, vamos mapear os possíveis casos de forma agregada.

In [22]:
# Descriptive statistics of the math score for each gender.
# The aggregations are given as string aliases: passing the builtins `min`/`max`
# or NumPy callables (`np.mean`, `np.std`, `np.median`) to groupby `agg` is
# deprecated in recent pandas, and the strings keep the pandas implementations
# (e.g. the sample standard deviation) that produced these numbers.
students.groupby(by='gender')['math score'].agg(['min', 'mean', 'std', 'median', 'max'])

Unnamed: 0_level_0,min,mean,std,median,max
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
female,0,63.633205,15.491453,65,100
male,27,68.728216,14.356277,69,100


In [24]:
# Min, mean, median and max of every exam column for each gender.
# String aliases are used instead of builtin/NumPy callables, which groupby
# `agg` deprecates in recent pandas versions.
students.groupby('gender')[provas].agg(['min', 'mean', 'median', 'max'])

Unnamed: 0_level_0,math score,math score,math score,math score,reading score,reading score,reading score,reading score,writing score,writing score,writing score,writing score
Unnamed: 0_level_1,min,mean,median,max,min,mean,median,max,min,mean,median,max
gender,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
female,0,63.633205,65,100,17,72.608108,73,100,10,72.467181,74,100
male,27,68.728216,69,100,23,65.473029,66,100,15,63.311203,64,100


Agora vamos agrupar por gênero, lunch (tipo de refeição do aluno) e test preparation course (curso preparatório para a prova).

In [25]:
# Aggregations to apply in the grouped summaries, given as pandas string
# aliases. Passing builtin/NumPy callables (np.mean, np.std, np.median, min,
# max) to groupby `agg` is deprecated in recent pandas; the aliases select the
# same pandas implementations and produce the same column labels.
metrics = ['mean', 'std', 'median', 'min', 'max']

In [27]:
# Display the frame again before building the grouped summaries.
students

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score,mean
0,female,group B,bachelor's degree,standard,none,72,72,74,72.67
1,female,group C,some college,standard,completed,69,90,88,82.33
2,female,group B,master's degree,standard,none,90,95,93,92.67
3,male,group A,associate's degree,free/reduced,none,47,57,44,49.33
4,male,group C,some college,standard,none,76,78,75,76.33
...,...,...,...,...,...,...,...,...,...
995,female,group E,master's degree,standard,completed,88,99,95,94.00
996,male,group C,high school,free/reduced,none,62,55,55,57.33
997,female,group C,high school,free/reduced,completed,59,71,65,65.00
998,female,group D,some college,standard,completed,68,78,77,74.33


In [28]:
# Apply every aggregation in `metrics` to the exam columns for each
# (gender, lunch, test preparation course) combination, then transpose so the
# group combinations become columns and the (exam, metric) pairs become rows.
(
    students
    .groupby(['gender', 'lunch', 'test preparation course'])[provas]
    .agg(metrics)
    .transpose()
)

Unnamed: 0_level_0,gender,female,female,female,female,male,male,male,male
Unnamed: 0_level_1,lunch,free/reduced,free/reduced,standard,standard,free/reduced,free/reduced,standard,standard
Unnamed: 0_level_2,test preparation course,completed,none,completed,none,completed,none,completed,none
math score,mean,60.785714,52.890756,71.131579,66.530233,65.639344,60.609524,75.955752,69.832512
math score,std,14.219697,15.502755,12.807985,13.792446,14.336472,13.548698,12.83229,13.278492
math score,median,63.0,53.0,69.0,65.0,65.0,61.0,76.0,70.0
math score,min,23.0,0.0,32.0,19.0,39.0,27.0,46.0,30.0
math score,max,93.0,81.0,99.0,100.0,100.0,93.0,100.0,97.0
reading score,mean,73.428571,63.831933,79.798246,73.386047,65.786885,59.07619,72.60177,64.719212
reading score,std,13.776236,15.140203,11.382474,13.124308,13.559885,13.355851,13.177727,13.00552
reading score,median,74.5,64.0,79.5,72.0,68.0,59.0,74.0,66.0
reading score,min,40.0,17.0,51.0,29.0,37.0,23.0,41.0,26.0
reading score,max,100.0,92.0,100.0,100.0,100.0,90.0,100.0,96.0


In [29]:
# Mean of every exam column for each parental level of education.
(
    students
    .groupby(by='parental level of education')[provas]
    .mean()
)

Unnamed: 0_level_0,math score,reading score,writing score
parental level of education,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
associate's degree,67.882883,70.927928,69.896396
bachelor's degree,69.389831,73.0,73.381356
high school,62.137755,64.704082,62.44898
master's degree,69.745763,75.372881,75.677966
some college,67.128319,69.460177,68.840708
some high school,63.497207,66.938547,64.888268


In [31]:
# Mean of every exam column for each (race/ethnicity, parental level of
# education) pair. The string alias 'mean' replaces `np.mean`, because passing
# NumPy callables to groupby `agg` is deprecated in recent pandas versions.
(students.groupby(['race/ethnicity', 'parental level of education'])
 [provas]).agg('mean')

Unnamed: 0_level_0,Unnamed: 1_level_0,math score,reading score,writing score
race/ethnicity,parental level of education,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
group A,associate's degree,61.0,67.071429,63.571429
group A,bachelor's degree,67.166667,68.083333,68.333333
group A,high school,60.444444,62.888889,60.5
group A,master's degree,57.666667,64.666667,67.666667
group A,some college,63.888889,65.777778,65.0
group A,some high school,58.916667,62.083333,58.583333
group B,associate's degree,66.097561,69.585366,68.243902
group B,bachelor's degree,69.3,72.95,71.65
group B,high school,59.791667,63.458333,61.25
group B,master's degree,67.166667,80.166667,77.166667
