In [1]:
# Notebook parameters: both paths are relative to the directory the notebook runs from.
# Input: pickled master table read by the "Load Data" cell.
in_file_path = "../data/processed/data_master.pkl"
# Output: pickled model-ready table written by the final cell.
out_file_path = "../data/processed/data_model_master.pkl"

In [2]:
import inspect
import os
import sys
from os.path import dirname

# Put the parent of the working directory at the front of the import path so
# that the later `from src.utils import ...` resolves.
# NOTE(review): relies on the kernel's working directory being one level below
# the folder that contains `src` — confirm when running from elsewhere.
currentdir = os.getcwd()
parentdir = dirname(currentdir)
sys.path.insert(0, parentdir)

# Libraries

In [3]:
# Silence all warnings before the heavy imports so import-time warnings are
# hidden as well. NOTE(review): this is a blanket filter and also hides
# deprecation notices from pandas / scikit-learn.
import warnings
warnings.filterwarnings('ignore')

# Data handling and plotting.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# feature_engine 1.0 renamed the `categorical_encoders` module to `encoding`
# and `MeanCategoricalEncoder` to `MeanEncoder`. Try the legacy location first
# and fall back to the new one, so the notebook imports cleanly on both old and
# current releases. The name is only referenced by the commented-out
# "Mean Encoder" cells.
try:
    from feature_engine.categorical_encoders import MeanCategoricalEncoder
except ImportError:
    from feature_engine.encoding import MeanEncoder as MeanCategoricalEncoder

from sklearn.preprocessing import StandardScaler, LabelEncoder

# Project helper; importable because the parent directory was put on sys.path.
from src.utils import dump_to_pickle

# Show up to 100 rows / columns when a DataFrame is displayed.
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)

# Load Data

In [4]:
# Load the master table from `in_file_path`.
# NOTE(review): unpickling can execute arbitrary code — only load pickle files
# that come from a trusted source.
df = pd.read_pickle(in_file_path)

In [5]:
df.shape

(2101, 25)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2101 entries, 0 to 2354
Data columns (total 25 columns):
NIM                  2101 non-null object
form_number          2101 non-null object
major_code_opcs      2101 non-null object
major_name_opcs      2101 non-null object
Last_CGPA            2101 non-null float64
ENG                  2101 non-null float64
MATH                 2101 non-null float64
BIO                  2101 non-null float64
CHEM                 2101 non-null float64
PHY                  2101 non-null float64
ECON                 2101 non-null float64
GEO                  2101 non-null float64
SOC                  2101 non-null float64
FINAL                2101 non-null float64
major_code_oa        2101 non-null object
major_name_oa        2101 non-null object
gender               2101 non-null object
school_prop          2101 non-null object
school_name          2002 non-null object
school_state         2101 non-null object
school_geo_unit      2101 non-null object
c

In [7]:
# Missing values per column. In the recorded output only `school_name` has
# nulls (99); that column is listed in `cols_to_drop` below instead of being
# imputed.
df.isnull().sum()

NIM                   0
form_number           0
major_code_opcs       0
major_name_opcs       0
Last_CGPA             0
ENG                   0
MATH                  0
BIO                   0
CHEM                  0
PHY                   0
ECON                  0
GEO                   0
SOC                   0
FINAL                 0
major_code_oa         0
major_name_oa         0
gender                0
school_prop           0
school_name          99
school_state          0
school_geo_unit       0
curriculum_name       0
fail                  0
has_changed_major     0
faculty               0
dtype: int64

In [8]:
df.head()

Unnamed: 0,NIM,form_number,major_code_opcs,major_name_opcs,Last_CGPA,ENG,MATH,BIO,CHEM,PHY,ECON,GEO,SOC,FINAL,major_code_oa,major_name_oa,gender,school_prop,school_name,school_state,school_geo_unit,curriculum_name,fail,has_changed_major,faculty
0,1011180001,1831100382,1011,Manajemen,3.25,73.25,70.75,0.0,0.0,0.0,86.5,73.75,79.25,30.8,1011,Manajemen,Female,SMA,SMAN 1 SIDIKALANG,SUMUT,SUM,Social,0,0,BS
1,1011180002,1811100709,1011,Manajemen,1.06,77.75,64.75,0.0,0.0,0.0,79.25,80.0,76.25,25.95,1011,Manajemen,Male,SMA,SMA Islam Al-Azhar 8 Bekasi,JABAR,JAV,Social,1,0,BS
2,1011180003,1811100388,1011,Manajemen,2.07,70.25,66.75,0.0,0.0,0.0,79.5,77.5,82.25,27.4,1011,Manajemen,Male,SMA,SMA Katolik St. Peter,NTT,SUN,Social,0,0,BS
3,1011180004,1831100016,1011,Manajemen,2.91,82.25,85.0,0.0,0.0,0.0,71.75,77.75,72.75,28.4,1011,Manajemen,Male,SMA,SMA El Shadai Magelang,JATENG,JAV,Social,0,0,BS
4,1011180005,1811100684,1011,Manajemen,3.28,85.25,78.0,80.25,75.5,78.5,0.0,0.0,0.0,33.9,1011,Manajemen,Male,SMK,SMK Eran Batu 2,SULSEL,SUL,Science,0,0,BS


# Feature Engineering

#### Drop Unused Features

In [9]:
# Columns removed before modelling, kept as two lists that are concatenated
# when the drop is applied.

# Record identifiers. NOTE(review): `gender` is grouped here although it is
# not an identifier — confirm that excluding it from the features is intended.
id_cols = ['NIM', 'form_number', 'gender']

# Other columns left out of the model table.
# NOTE(review): `Last_CGPA` is presumably dropped because it is closely tied
# to the `fail` target — confirm.
cols_to_drop = [
    'Last_CGPA',
    'major_code_opcs', 'major_code_oa', 'major_name_oa',
    'school_name',
    'curriculum_name',
    'has_changed_major',
]

In [10]:
# Build the working frame: `df` itself is left untouched, `data` is a new
# DataFrame without the identifier and unused columns.
data = df.drop(columns=id_cols + cols_to_drop)

#### Rename Columns

In [11]:
# Map the raw column names to the names used from here on: the subject-score
# columns get an `hs_` prefix (presumably "high school" — confirm) and
# `major_name_opcs` becomes plain `major_name`.
column_renames = {
    'major_name_opcs': 'major_name',
    'ENG': 'hs_eng',
    'MATH': 'hs_math',
    'BIO': 'hs_bio',
    'CHEM': 'hs_chem',
    'PHY': 'hs_phy',
    'ECON': 'hs_econ',
    'GEO': 'hs_geo',
    'SOC': 'hs_soc',
    'FINAL': 'hs_final',
}

# Rebind instead of mutating in place; the resulting frame is the same.
data = data.rename(columns=column_renames)

#### School Prop: Uniting Non-Dominant Categories

In [12]:
# sch_prop = data.school_prop.unique()
# sch_prop_oth = [c for c in sch_prop if c != 'SMA']

In [13]:
# data.loc[data.school_prop.isin(sch_prop_oth), 'school_prop'] = 'Other'

In [14]:
# sns.countplot(data.school_prop)

#### School Geographical Unit: Uniting Non-Dominant Categories

In [15]:
# sch_geo_unit = data.school_geo_unit.unique()
# sch_geo_unit_oth = [c for c in sch_geo_unit if c != 'JAV']

In [16]:
# data.loc[data.school_geo_unit.isin(sch_geo_unit_oth), 'school_geo_unit'] = 'Other'

In [17]:
# sns.countplot(data.school_geo_unit)

## Transform Categorical and Numerical Features

In [18]:
# Target column, kept as a one-element list so it can be concatenated with
# other column lists.
target_var = ['fail']

# Categorical features; this order is also the column order of `cat_df`.
cat_vars = [
    'major_name',
    'school_prop', 'school_geo_unit', 'school_state',
    'faculty',
]

# Every remaining column that is neither categorical nor the target is treated
# as numeric; the original column order of `data` is preserved.
non_numeric = set(cat_vars) | set(target_var)
num_vars = [col for col in data.columns if col not in non_numeric]

### Encode Categorical Features

In [19]:
# Categorical features plus the target, still holding the original labels.
cat_df = data[cat_vars+target_var]

In [20]:
# Encode on a copy so `cat_df` keeps the raw labels; `cat_df['major_name']` is
# read again when the major-name mapping is saved.
encoded_data = cat_df.copy()

###### Mean Encoder

In [21]:
# mean_encoder = MeanCategoricalEncoder(variables=cat_vars)

In [22]:
# mean_encoder.fit(encoded_data, encoded_data[target_var])

In [23]:
# encoded_data = mean_encoder.transform(encoded_data)

In [24]:
# encoded_data.head()

###### Label Encoder

In [25]:
# Label encoder instance; it is fitted in the cells below.
le = LabelEncoder()

In [26]:
# Integer-encode every categorical column with its own LabelEncoder.
# Refitting one shared encoder on each column would discard all mappings but
# the last one. Keeping one fitted encoder per column in `label_encoders`
# makes every mapping available for `inverse_transform` or for encoding new
# data. The encoded values are the same either way, because each column is
# fitted independently.
# NOTE(review): label encoding imposes an arbitrary ordering on nominal
# categories — fine for tree-based models, questionable for linear ones.
label_encoders = {}
for col in cat_vars:
    col_encoder = LabelEncoder()
    encoded_data[col] = col_encoder.fit_transform(encoded_data[col])
    label_encoders[col] = col_encoder

In [27]:
# Save the major_name -> integer code lookup for the app.
# The encoder is refitted on the raw `major_name` labels so that `classes_`
# describes this column specifically.
le.fit(cat_df['major_name'])

major_labels = le.classes_
major_codes = le.transform(major_labels)
le_dict = dict(zip(major_labels, major_codes))

dump_to_pickle(le_dict, "../app/data/major_le_dict.pkl")

In [28]:
encoded_data.head()

Unnamed: 0,major_name,school_prop,school_geo_unit,school_state,faculty,fail
0,14,4,6,33,0,0
1,14,4,0,7,0,1
2,14,4,7,21,0,0
3,14,4,0,9,0,0
4,14,5,5,27,0,0


### Scale Numerical Features: Standard Scaler

In [29]:
# StandardScaler with default settings (centres each feature and scales it to
# unit variance); it is fitted in the next cell.
scaler = StandardScaler()

In [30]:
# Fit the scaler on the numeric columns and wrap the resulting array back into
# a DataFrame that keeps the original column labels and row index, so it lines
# up with `encoded_data` when the two are concatenated.
# NOTE(review): the scaler is fitted on the full table here — if a train/test
# split happens downstream, test statistics leak into the scaling.
numeric_block = data[num_vars]
scaled_values = scaler.fit_transform(numeric_block)
scaled_data = pd.DataFrame(
    scaled_values,
    columns=numeric_block.columns,
    index=data.index,
)

In [31]:
scaled_data.head()

Unnamed: 0,hs_eng,hs_math,hs_bio,hs_chem,hs_phy,hs_econ,hs_geo,hs_soc,hs_final
0,-1.048565,-0.848793,-1.031378,-1.030655,-1.030533,1.223719,0.889068,1.019979,-0.152911
1,-0.451131,-1.541255,-1.031378,-1.030655,-1.030533,1.041207,1.045342,0.945211,-1.307819
2,-1.446854,-1.310434,-1.031378,-1.030655,-1.030533,1.047501,0.982833,1.094747,-0.962537
3,0.146302,0.795803,-1.031378,-1.030655,-1.030533,0.852403,0.989084,0.857982,-0.724412
4,0.544591,-0.012069,0.937199,0.833885,0.921328,-0.95383,-0.954972,-0.955133,0.585277


#### Concatenate Scaled and Encoded Features

In [32]:
# Place the scaled numeric features and the encoded categorical features
# (which also carry the `fail` target) side by side, aligned on the row index.
feature_blocks = [scaled_data, encoded_data]
transformed_data = pd.concat(feature_blocks, axis="columns")

In [33]:
transformed_data.head()

Unnamed: 0,hs_eng,hs_math,hs_bio,hs_chem,hs_phy,hs_econ,hs_geo,hs_soc,hs_final,major_name,school_prop,school_geo_unit,school_state,faculty,fail
0,-1.048565,-0.848793,-1.031378,-1.030655,-1.030533,1.223719,0.889068,1.019979,-0.152911,14,4,6,33,0,0
1,-0.451131,-1.541255,-1.031378,-1.030655,-1.030533,1.041207,1.045342,0.945211,-1.307819,14,4,0,7,0,1
2,-1.446854,-1.310434,-1.031378,-1.030655,-1.030533,1.047501,0.982833,1.094747,-0.962537,14,4,7,21,0,0
3,0.146302,0.795803,-1.031378,-1.030655,-1.030533,0.852403,0.989084,0.857982,-0.724412,14,4,0,9,0,0
4,0.544591,-0.012069,0.937199,0.833885,0.921328,-0.95383,-0.954972,-0.955133,0.585277,14,5,5,27,0,0


# Dump to Pickle

In [34]:
# Write the model-ready table to `out_file_path`.
# NOTE(review): `dump_to_pickle` comes from src.utils; the call passes
# (object, path), presumably matching its signature — confirm.
# NOTE(review): the fitted `scaler` is not saved anywhere in this notebook —
# confirm whether anything downstream needs it to transform new inputs.
dump_to_pickle(transformed_data, out_file_path)