# Prototyping

This notebook is a sandbox for experimenting with pieces of the pipeline.

In [1]:
import os
import pickle
from pprint import pprint

import numpy
import pandas as pd
import pandas_profiling
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer, OneHotEncoder

from ml.data import process_data_for_training, process_data_for_inference
from ml.model import train_model, infer, compute_model_metrics
from train_model import \
    DATA_FILE_PATH, MODEL_FILE_PATH, LABEL, CATEGORICAL_FEATURES

# Ask Qt to use its "offscreen" platform plugin for this kernel process.
# NOTE(review): presumably needed so a Qt-backed dependency can run without a
# display; confirm which import requires it. It is set after the imports
# above, so it only affects Qt initialisation that happens later.
os.environ['QT_QPA_PLATFORM'] = 'offscreen'

In [2]:
# Read the cleaned dataset from disk, then show its (rows, columns) shape.
data_frame = pd.read_csv(filepath_or_buffer=DATA_FILE_PATH)
data_frame.shape

(32561, 15)

In [3]:
# Caption first; the bare last expression then gets the rich table display.
print("DataFrame head (1st 5 rows):")
data_frame.head(n=5)

DataFrame head (1st 5 rows):


Unnamed: 0,age,workclass,fnlgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [4]:
# Hold out 20% of the rows for testing.
# The seed is fixed so that Restart & Run All yields the same partition; without
# it every run shuffles differently and the results downstream of this cell
# are not comparable between runs.
data_frame_train, data_frame_test = train_test_split(
    data_frame,
    test_size=0.20,
    random_state=42,
)

In [5]:
# Process the training split. The returned input_encoder and label_binarizer
# are kept because the test split is processed with them in the next step.
X_train, y_train, input_encoder, label_binarizer = process_data_for_training(
    label=LABEL,
    categorical_features=CATEGORICAL_FEATURES,
    data_frame=data_frame_train,
)

In [6]:
# Process the held-out split, reusing the encoder and binarizer returned by
# process_data_for_training rather than creating new ones.
X_test, y_test = process_data_for_inference(
    label=LABEL,
    label_binarizer=label_binarizer,
    input_encoder=input_encoder,
    categorical_features=CATEGORICAL_FEATURES,
    data_frame=data_frame_test,
)

In [7]:
# Train model.
# X_train and y_train are the outputs of process_data_for_training for the
# training split.
# NOTE(review): the annotation assumes train_model returns a
# RandomForestClassifier; confirm against ml.model.
model: RandomForestClassifier = train_model(X_train, y_train)

In [8]:
# Short aliases for the test split used while prototyping.
# NOTE(review): df holds only the first three test rows, whereas X and y alias
# the entire test split; confirm this size mismatch is intended.
df: pd.DataFrame = data_frame_test[:3]
# The notebook imports the package as plain `numpy` (there is no `np` alias),
# and module-level annotations are evaluated when the cell runs, so the
# annotations must spell out `numpy.ndarray` to avoid a NameError.
X: numpy.ndarray = X_test  # m x n float64 array.
y: numpy.ndarray = y_test  # 1D int64 array where elements are 0 or 1.