# NP Predictive Analytics Assignment - Data Preparation
In this notebook we prepare the dataset for modeling

In [1]:
import os
import joblib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA

from data.dataset import *

sns.set(style="whitegrid")

## Load Data
Load dataframe from EDA:

In [2]:
df = pd.read_feather(os.path.join("build", "dataset.feather"))

## Inputs and Outputs
Extract & separate the target output variable `G3` from the rest of the data

In [3]:
final_scores = df[target_var]
inputs_df = df.drop(columns=target_var)

## Split Dataset
Set aside hold out test set for later cross validation

In [4]:
train_idxs, test_idxs = train_test_split(list(range(len(df))), 
                                         shuffle=True, 
                                         test_size=100)
train_in_df, test_in_df = inputs_df.iloc[train_idxs], inputs_df.iloc[test_idxs]
train_scores, test_scores = final_scores[train_idxs], final_scores[test_idxs]

## Build Data Pipeline
Build a Data Pipeline to preprocess the dataset:
- numeric features - feature scale using Z-score transform
- categorical features - feature extraction by one hot encoding
- binary features - feature extraction by ordinal encoding



In [5]:
pipeline = ColumnTransformer([
    ("num_pipeline", StandardScaler(), num_features),
    ("cat_pipeline", OneHotEncoder(), cat_features),
    ("bin_pipeline", OrdinalEncoder(), bin_features)
])

## Prepare Data
Use data pipeline to prepare data

In [6]:
pipeline.fit(train_in_df)
train_data = pipeline.transform(train_in_df)
test_data = pipeline.transform(test_in_df)

### Dimensionality Reducation
Apply PCA to reduce no. of features, perserving 99% of the variance

In [7]:
pca = PCA(n_components=0.99, svd_solver="full")
pca.fit(train_data)
train_pca = pca.transform(train_data)
test_pca = pca.transform(test_data)

In [8]:
print(f"no. of PCA components: {train_data.shape[-1]}")

no. of PCA components: 45


## Commit Data
Commit prepared data and data pipeline

In [9]:
np.savez_compressed(os.path.join("build", "dataprep.npz"),
                    train_data=train_data,
                    test_data=test_data,
                    train_pca=train_pca,
                    test_pca=test_pca,
                    train_scores=train_scores,
                    test_scores=test_scores,
                    train_idxs=train_idxs, test_idxs=test_idxs)

In [10]:
joblib.dump({
    "pca": pca,
    "pipeline": pipeline
}, os.path.join("build","dataprep.joblib"))

['build/dataprep.joblib']