# Class 4. Intermediate Python & AI 

# Numerical EDA & Feature Engineering

### Table of contents

1. Numerical EDA (Class 4 Code)
    1. Univariate analysis
    2. Bivariate analysis (Categorical + Numerical)
    3. Multivariate analysis
2. Feature Engineering

In [1]:
import pandas as pd

In [61]:
# Load the exams CSV into a DataFrame. The path is relative, so it is resolved
# against the current working directory of the notebook kernel.
df = pd.read_csv('../../datasets/exams_mod_cleaned.csv')

In [62]:
# Preview the first rows to check that the file was parsed as expected.
df.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score,id_student,Year,Age
0,male,group A,high school,standard,completed,67,67,63,1000,2022,17.0
1,male,group E,some college,free/reduced,none,59,60,50,1002,2022,17.0
2,male,group B,high school,standard,none,77,78,68,1003,2022,17.0
3,male,group E,associate's degree,standard,completed,78,73,68,1004,2022,17.0
4,female,group D,high school,standard,none,63,77,76,1005,2022,17.0


In [63]:
# Count the missing values in each column.
df.isna().sum()

gender                         0
race/ethnicity                 0
parental level of education    0
lunch                          0
test preparation course        0
math score                     0
reading score                  0
writing score                  0
id_student                     0
Year                           0
Age                            0
dtype: int64

In [64]:
# Number of rows for each distinct value of the 'gender' column.
df['gender'].value_counts()

male      497
female    466
Name: gender, dtype: int64

## 2. Feature Engineering

### Parental Education

In [4]:
# Number of rows for each parental education level; these are the labels that
# get grouped into the binary 'Education' feature in the next cell.
df['parental level of education'].value_counts()

some college          219
associate's degree    196
high school           194
some high school      180
bachelor's degree     107
master's degree        67
Name: parental level of education, dtype: int64

In [5]:
# Collapse the parental education levels into a binary 'Education' feature.
# Any level not listed in `college_levels` (including missing values) is
# labelled 'no college'.
# NOTE(review): "associate's degree" ends up in the 'no college' bucket while
# "some college" counts as 'went to college' -- confirm this grouping is intended.
college_levels = ["master's degree", "bachelor's degree", "some college"]

# Vectorized membership test instead of a Python-level loop with append.
attended_college = df['parental level of education'].isin(college_levels)
df['Education'] = attended_college.map(
    {True: 'went to college', False: 'no college'}
)

In [6]:
# Preview the frame to check the newly added 'Education' column.
df.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score,id_student,Year,Age,Education
0,male,group A,high school,standard,completed,67,67,63,1000,2022,17.0,no college
1,male,group E,some college,free/reduced,none,59,60,50,1002,2022,17.0,went to college
2,male,group B,high school,standard,none,77,78,68,1003,2022,17.0,no college
3,male,group E,associate's degree,standard,completed,78,73,68,1004,2022,17.0,no college
4,female,group D,high school,standard,none,63,77,76,1005,2022,17.0,no college


In [7]:
# Number of rows in each of the two 'Education' buckets.
df['Education'].value_counts()

no college         570
went to college    393
Name: Education, dtype: int64

### BMI Calculation

In [8]:
# Add helper columns for the BMI example. Every row receives the same constant,
# so these are placeholder values rather than per-student measurements.
# NOTE(review): presumably metres and kilograms -- units are not stated here.
df['height'] = 1.70
df['weight'] = 70

In [9]:
# BMI = weight / height^2, rounded to two decimals.
height_squared = df['height'] * df['height']
df['BMI'] = (df['weight'] / height_squared).round(2)

In [10]:
# The helper columns were only needed to derive 'BMI'; remove them in place.
df.drop(columns=['height', 'weight'], inplace=True)

In [11]:
# Preview the frame to check the new 'BMI' column and that the helper
# 'height' / 'weight' columns are gone.
df.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score,id_student,Year,Age,Education,BMI
0,male,group A,high school,standard,completed,67,67,63,1000,2022,17.0,no college,24.22
1,male,group E,some college,free/reduced,none,59,60,50,1002,2022,17.0,went to college,24.22
2,male,group B,high school,standard,none,77,78,68,1003,2022,17.0,no college,24.22
3,male,group E,associate's degree,standard,completed,78,73,68,1004,2022,17.0,no college,24.22
4,female,group D,high school,standard,none,63,77,76,1005,2022,17.0,no college,24.22


### Mean of the exam scores

In [12]:
# Per-student average of the three exam scores, rounded to two decimals.
score_total = df['reading score'] + df['math score'] + df['writing score']
df['exams mean'] = (score_total / 3).round(2)

In [13]:
# Preview the frame to check the new 'exams mean' column.
df.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score,id_student,Year,Age,Education,BMI,exams mean
0,male,group A,high school,standard,completed,67,67,63,1000,2022,17.0,no college,24.22,65.67
1,male,group E,some college,free/reduced,none,59,60,50,1002,2022,17.0,went to college,24.22,56.33
2,male,group B,high school,standard,none,77,78,68,1003,2022,17.0,no college,24.22,74.33
3,male,group E,associate's degree,standard,completed,78,73,68,1004,2022,17.0,no college,24.22,73.0
4,female,group D,high school,standard,none,63,77,76,1005,2022,17.0,no college,24.22,72.0


# Categorical Encoding

In [14]:
# Encode each education level as the integer code pandas assigns to it once
# the column is converted to the 'category' dtype.
df["parent_ed_cat_encode"] = (
    df["parental level of education"].astype('category').cat.codes
)

In [15]:
# Show the original labels next to their integer codes.
df[['parental level of education', 'parent_ed_cat_encode']].head()

Unnamed: 0,parental level of education,parent_ed_cat_encode
0,high school,2
1,some college,4
2,high school,2
3,associate's degree,0
4,high school,2


#### You can also do it with scikit-learn

In [34]:
from sklearn import preprocessing

In [36]:
# Same label encoding, this time with scikit-learn: learn the set of labels and
# convert the column to integer codes in a single call. `le` stays fitted.
le = preprocessing.LabelEncoder()
df["parent_ed_cat_encode_2"] = le.fit_transform(df['parental level of education'])

In [42]:
# Check that the pandas and scikit-learn encodings agree row by row.
# The comparison has to happen element-wise *before* reducing with .all():
# calling .all() on each column first only compares two scalar booleans (each
# column's "are all codes non-zero?"), which can be equal even when the
# encodings differ.
assert (df["parent_ed_cat_encode"] == df["parent_ed_cat_encode_2"]).all()

# One-Hot Encoding

In [47]:
# One-hot encode a single column: one indicator column per distinct value.
# `columns=` only selects columns when a DataFrame is passed, so it is not
# given here, where the input is a single Series.
pd.get_dummies(df['gender']).head()

Unnamed: 0,female,male
0,0,1
1,0,1
2,0,1
3,0,1
4,1,0


In [44]:
# One-hot encode all categorical columns of the frame at once.
# drop_first=True leaves out the first level of every encoded column, so a
# column with k levels yields k-1 indicator columns.
data = pd.get_dummies(
    df,
    columns=[
        'gender',
        'race/ethnicity',
        'parental level of education',
        'lunch',
        'test preparation course',
    ],
    drop_first=True,
)
data.head()

Unnamed: 0,math score,reading score,writing score,id_student,Year,Age,Education,BMI,exams mean,parent_ed_cat_encode,...,race/ethnicity_group C,race/ethnicity_group D,race/ethnicity_group E,parental level of education_bachelor's degree,parental level of education_high school,parental level of education_master's degree,parental level of education_some college,parental level of education_some high school,lunch_standard,test preparation course_none
0,67,67,63,1000,2022,17.0,no college,24.22,65.67,2,...,0,0,0,0,1,0,0,0,1,0
1,59,60,50,1002,2022,17.0,went to college,24.22,56.33,4,...,0,0,1,0,0,0,1,0,0,1
2,77,78,68,1003,2022,17.0,no college,24.22,74.33,2,...,0,0,0,0,1,0,0,0,1,1
3,78,73,68,1004,2022,17.0,no college,24.22,73.0,0,...,0,0,1,0,0,0,0,0,1,0
4,63,77,76,1005,2022,17.0,no college,24.22,72.0,2,...,0,1,0,0,1,0,0,0,1,1


# Data Normalization 

In [49]:
# Create the min-max scaler used to normalize the score columns below.
scaler = preprocessing.MinMaxScaler()

In [50]:
# Fit the scaler on the math and reading scores and store the rescaled values
# in two new columns (same order as the source columns).
score_columns = ['math score', 'reading score']
normalized_columns = ['norm_math', 'norm_read']
df[normalized_columns] = scaler.fit_transform(df[score_columns])

In [51]:
# Preview the frame to check the new 'norm_math' and 'norm_read' columns.
df.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score,id_student,Year,Age,Education,BMI,exams mean,parent_ed_cat_encode,parent_ed_cat_encode_2,norm_math,norm_read
0,male,group A,high school,standard,completed,67,67,63,1000,2022,17.0,no college,24.22,65.67,2,2,0.62069,0.547945
1,male,group E,some college,free/reduced,none,59,60,50,1002,2022,17.0,went to college,24.22,56.33,4,4,0.528736,0.452055
2,male,group B,high school,standard,none,77,78,68,1003,2022,17.0,no college,24.22,74.33,2,2,0.735632,0.69863
3,male,group E,associate's degree,standard,completed,78,73,68,1004,2022,17.0,no college,24.22,73.0,0,0,0.747126,0.630137
4,female,group D,high school,standard,none,63,77,76,1005,2022,17.0,no college,24.22,72.0,2,2,0.574713,0.684932


In [59]:
# Standalone min-max scaling demo on four values in a single feature column:
# the smallest value (10) maps to 0 and the largest (53) maps to 1.
# NOTE: this rebinds `data` and `scaler`, replacing the objects that earlier
# cells stored under those names.
data = [[value] for value in (10, 25, 53, 42)]
scaler = preprocessing.MinMaxScaler()
scaler.fit_transform(data)

array([[0.        ],
       [0.34883721],
       [1.        ],
       [0.74418605]])