# NP Predictive Analytics Assignment - Data Preparation
In this notebook we prepare the dataset for modeling.

In [45]:
import os
import joblib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import RandomOverSampler

from data import dataset, dataprep

# Apply seaborn's "whitegrid" style to plots drawn in this notebook.
sns.set(style="whitegrid")

# Render matplotlib figures inline in the cell output.
%matplotlib inline
# Load the autoreload extension and select mode 2, which reloads modules
# before code is executed so that edits to imported project modules are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Load Data
Load dataframe from EDA:

In [49]:
# Read the dataframe from the feather file in the build directory and
# display its row count as the cell output.
dataset_path = os.path.join("build", "dataset.feather")
df = pd.read_feather(dataset_path)
len(df)

1044

## Inputs and Outputs
Extract & separate the target output variable from the rest of the data
- final scores `G3` for regression task
- final grades `G3Binned` for classification task


In [53]:
# Names of the two target columns: one for regression, one for classification.
reg_col, classify_col = dataprep.reg_target_var, dataprep.classify_target_var

# Everything except the two targets is treated as model input.
inputs_df = df.drop(columns=[reg_col, classify_col])
final_scores, final_grades = df[reg_col], df[classify_col]

Encode categories into numeric integers

In [54]:
# Derive both the (code, label) mapping and the integer codes from the
# categorical column in `df` rather than from `final_grades` itself.
# Overwriting `final_grades` with its own `.cat.codes` made this cell fail on
# a second run (an integer Series has no `.cat` accessor); reading from `df`
# keeps the cell idempotent.
grade_categorical = df[dataprep.classify_target_var].cat
grade_mapping = list(enumerate(grade_categorical.categories))
final_grades = grade_categorical.codes
grade_mapping

[(0, 'F'), (1, 'D'), (2, 'C'), (3, 'B'), (4, 'A')]

## Split Dataset
Set aside a hold-out test set for evaluating the models later
- use stratified sampling by final grade `G3Binned` to select examples for test set

> Since the dataset is relatively small, we stratify the sample by the 
> final grade to limit the error introduced by sampling bias

In [55]:
# Generate shuffled, stratified *positional* indexes that define the split.
# A fixed seed makes the split reproducible under Restart & Run All.
SPLIT_SEED = 42
TEST_SIZE = 200  # number of examples held out for the test set

train_idxs, test_idxs = train_test_split(list(range(len(df))),
                                         shuffle=True,
                                         stratify=final_grades,
                                         test_size=TEST_SIZE,
                                         random_state=SPLIT_SEED)

# Split inputs and both targets by position. `.iloc` is used throughout so
# the three selections always pick the same rows; plain `series[idxs]` is
# label-based and only agrees with `.iloc` when the index is exactly 0..n-1.
train_in_df, test_in_df = inputs_df.iloc[train_idxs], inputs_df.iloc[test_idxs]
train_scores, test_scores = final_scores.iloc[train_idxs], final_scores.iloc[test_idxs]
train_grades, test_grades = final_grades.iloc[train_idxs], final_grades.iloc[test_idxs]

## Build Data Pipeline
Build a Data Pipeline to preprocess the dataset:
1. Transform features into form suitable for modeling
    - numeric features - feature scale using Z-score transform
    - categorical features - feature extraction by one hot encoding
    - binary features - feature extraction by ordinal encoding
   

In [56]:
# Each entry is (step name, transformer, columns the transformer is applied to).
feature_steps = [
    ("num_pipeline", StandardScaler(), dataprep.num_features),   # numeric features
    ("cat_pipeline", OneHotEncoder(), dataprep.cat_features),    # categorical features
    ("bin_pipeline", OrdinalEncoder(), dataprep.bin_features),   # binary features
]
df_transformer = ColumnTransformer(feature_steps)

In [57]:
# The column transformer is currently the only step of the data pipeline.
pipeline_steps = [("transformer", df_transformer)]
pipeline = Pipeline(pipeline_steps)

### TODO: things to try in data preparation
- PCA
- Isolation forest to remove outliers

## Prepare Data
Use the built data pipelines to prepare dataset for modeling

In [58]:
# Fit on the training inputs only, then apply the fitted pipeline to the
# training and test inputs in turn.
pipeline.fit(train_in_df)
train_data, test_data = (pipeline.transform(split_df)
                         for split_df in (train_in_df, test_in_df))

### Data Imbalance
Randomly duplicate training examples whose final grade belongs to a minority class/label to combat data imbalance.

> This is important as the data imbalance would cause the classifier
> to be biased towards the majority class/label instead of trying to
> predict all classes/labels accurately.

In [59]:
# Randomly over-sample the classification training set so the grade classes
# are balanced. Only the training split is resampled; the test split is left
# untouched. A fixed seed keeps the resampled set identical across runs.
RESAMPLE_SEED = 42
resampler = RandomOverSampler(random_state=RESAMPLE_SEED)
rs_train_data, rs_train_grades = resampler.fit_resample(train_data, train_grades)

## Commit Data
Commit prepared data and objects used to prepare data

In [62]:
# Collect every array to persist under the key it is stored with in the
# archive, then write them all to one compressed .npz file.
prepared_arrays = {
    # held-out test split (shared by both tasks)
    "test_data": test_data,
    "test_scores": test_scores,
    "test_grades": test_grades,
    # regression training data (not resampled)
    "reg_train_data": train_data,
    "reg_train_scores": train_scores,
    # classification training data (over-sampled)
    "classify_train_data": rs_train_data,
    "classify_train_grades": rs_train_grades,
    # indexes that define the train/test split
    "train_idxs": train_idxs,
    "test_idxs": test_idxs,
}
npz_path = os.path.join("build", "dataprep.npz")
np.savez_compressed(npz_path, **prepared_arrays)

In [63]:
# Persist the fitted pipeline together with the (code, label) grade mapping.
# The call is the last expression so the written path is shown as cell output.
prep_objects = {"pipeline": pipeline, "grade_mapping": grade_mapping}
joblib.dump(prep_objects, os.path.join("build","dataprep.joblib"))

['build/dataprep.joblib']