# Classification Flow  
Michael Massone & Nelson Farrell  
Final Project Phase 2  
DS 5220  
Stevin Morin PhD.  


This notebook contains step $4$ of the final project.
___

### Import Packages

In [28]:
import pandas as pd
import numpy as np
import sys
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, TargetEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error, root_mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
# from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import time
from pathlib import Path
import os

### Set Up ```sys``` Path to Enable ```.py``` Imports

In [29]:
# Resolve the project root as the parent of the directory the notebook runs from,
# so that the project's local .py modules can be imported.
path = Path.cwd()
path_to_project_directory = path.parent

# Re-running this cell must not stack duplicate entries on sys.path, so the
# project root is inserted only when it is not already present.
if str(path_to_project_directory) not in sys.path:
    sys.path.insert(1, str(path_to_project_directory))

# NOTE: only sys.path is modified here; the process working directory is unchanged.
print(f"The working directory has been set to: {str(path_to_project_directory)}")

The working directory has been set to: /Users/nelsonfarrell/Documents/Northeastern/5220/final_project


### Import Python Modules

In [30]:
# Import only the helpers this notebook calls instead of using a wildcard import,
# so it is clear where each name comes from and the notebook namespace is not
# silently filled with unrelated names. Add further names here if more are needed.
from modules.phase1_utils import (
    get_missingness,
    perform_the_train_test_split,
    separate_unique_columns,
)

### Helper Functions

In [31]:
# Placeholder: no notebook-local helper functions are defined in this cell.
pass

___
### Parameters

In [32]:
# --- Directory layout (every location hangs off the project root) ---
path_to_data_folder = path_to_project_directory / "data"
path_to_kaggle_data_folder = path_to_data_folder / "raw_from_kaggle"
path_split_data_folder = path_to_data_folder / "data_splits"

# --- Input file ---
kaggle_data_file_name = "Train.csv"

# --- Target and train/test split settings ---
target_attr = "Segmentation"
test_size = 0.20
train_test_split_random_state = 42
split_folder = f"{path_split_data_folder}/"  # string form of the folder, with a trailing slash

# --- Preprocessing settings ---
missingness_threshold = 0.20
nominal_imputer_strategy = "most_frequent"
numerical_imputer_strategy = "mean"

___
### Set Up Timer

In [33]:
# Record the wall-clock start time of the run (seconds since the epoch).
start = time.time()

___
### Read In Data

In [34]:
# Load the raw Kaggle training file. The path is joined with pathlib's "/" operator
# instead of string concatenation, so pathlib handles the separator.
df = pd.read_csv(path_to_kaggle_data_folder / kaggle_data_file_name)

___
### Inspect Data Size

In [35]:
# Report the (rows, columns) shape, then show the first rows as the cell output.
print(f"Size of the dataset: {df.shape}")
df.head()

Size of the dataset: (8068, 11)


Unnamed: 0,ID,Gender,Ever_Married,Age,Graduated,Profession,Work_Experience,Spending_Score,Family_Size,Var_1,Segmentation
0,462809,Male,No,22,No,Healthcare,1.0,Low,4.0,Cat_4,D
1,462643,Female,Yes,38,Yes,Engineer,,Average,3.0,Cat_4,A
2,466315,Female,Yes,67,Yes,Engineer,1.0,Low,1.0,Cat_6,B
3,461735,Male,Yes,67,Yes,Lawyer,0.0,High,2.0,Cat_6,B
4,462669,Female,Yes,40,Yes,Entertainment,,High,6.0,Cat_6,A


___
### Check for Missingness in the Target

In [36]:
# Remove observations that have no target label and report how many rows were lost.
num_rows_train_df_pre = len(df)
print(f"The shape of train set PRE to dropping rows where the target missing: {df.shape}")

# Only the target column is checked for missing values; other columns are left alone.
df = df.dropna(subset = [target_attr])

num_rows_train_df_post = len(df)
print(f"The shape of train set POST to dropping rows where the target missing: {df.shape}")
print(f"Number of rows dropped: {num_rows_train_df_pre - num_rows_train_df_post}")

The shape of train set PRE to dropping rows where the target missing: (8068, 11)
The shape of train set POST to dropping rows where the target missing: (8068, 11)
Number of rows dropped: 0


___
### Perform Train/Test Split

In [37]:
# Gather the split settings in one place, then hand them to the project helper.
split_kwargs = {
    "df": df,
    "test_size": test_size,
    "train_test_split_random_state": train_test_split_random_state,
    "split_folder": split_folder,
    "prefix": None,
    "val": None,
    "stratify": True,
}
return_dict = perform_the_train_test_split(**split_kwargs)

# Pull the training attributes and the training target out of the returned dictionary.
train_cap_x_df, train_y_df = (
    return_dict[key] for key in ("train_cap_x_df", "train_y_df")
)

*************************

df.shape:
(8068, 11)

target class fractional balance:
Segmentation
D    0.281111
A    0.244422
C    0.244175
B    0.230293
Name: count, dtype: float64

*************************

train_df.csv:
(6454, 10) (6454, 1)

target class fractional balance:
Segmentation
D    0.281066
A    0.244500
C    0.244190
B    0.230245
Name: count, dtype: float64

*************************

test_df.csv
(1614, 10) (1614, 1)

target class fractional balance:
Segmentation
D    0.281289
A    0.244114
C    0.244114
B    0.230483
Name: count, dtype: float64


___
### Perform Train Validation Split

In [38]:
# Placeholder: the train/validation split will be implemented later.
pass

___
### Check Attribute ```dtypes```

In [39]:
# Show the dtype of every attribute in the training set.
train_cap_x_df.dtypes

ID                   int64
Gender              object
Ever_Married        object
Age                  int64
Graduated           object
Profession          object
Work_Experience    float64
Spending_Score      object
Family_Size        float64
Var_1               object
dtype: object

___
### Identify Attributes Above ```Missingness``` Threshold

In [40]:
# Run the project helper on the training attributes with the configured threshold.
# The result is read later through its "missingness_drop_list" key.
missingness_results_dict = get_missingness(train_cap_x_df, missingness_threshold)

ID missingness = 0.0
Gender missingness = 0.0
Ever_Married missingness = 0.017198636504493336
Age missingness = 0.0
Graduated missingness = 0.00914161760148745
Profession missingness = 0.01642392314843508
Work_Experience missingness = 0.10009296560272699
Spending_Score missingness = 0.0
Family_Size missingness = 0.040904865199876045
Var_1 missingness = 0.009296560272699102

missingness_drop_list:
[]


___
### Identify Non-Machine Learning Attributes

These attributes do not contain information that will benefit a machine learning algorithm.
1. Identification columns
2. Columns with very low variance, i.e., $0.0$

In [41]:
# Run the project helper on the training attributes; its printed output lists the
# non-ML and ML attributes. The result is read later through its "non_ML_attr" key.
non_ml_attributes_results = separate_unique_columns(train_cap_x_df)

*****************************
non_ML_attr:
ID

*****************************
ML_attr:
Gender
Ever_Married
Age
Graduated
Profession
Work_Experience
Spending_Score
Family_Size
Var_1

*****************************
non_ml_attribute_list:
['ID']


___
### Identify Attributes to Exclude from Machine Learning

In [42]:
# List the training column names for manual review.
train_cap_x_df.columns

Index(['ID', 'Gender', 'Ever_Married', 'Age', 'Graduated', 'Profession',
       'Work_Experience', 'Spending_Score', 'Family_Size', 'Var_1'],
      dtype='object')

In [43]:
# Attributes excluded from machine learning by manual review of the columns.
ml_attributes_drop_list = [] # None were identified

___
### Establish Machine Learning Attribute Configuration

In [44]:
# Merge the three sources of excluded attributes into one ignore list:
# the missingness drop list, the non-ML attributes, and the manual drop list.
ml_ignore_list = [
    *missingness_results_dict["missingness_drop_list"],
    *non_ml_attributes_results["non_ML_attr"],
    *ml_attributes_drop_list,
]

print("These attributes will be ignored by the machine learning algorithm:")
for position, ignored_attr in enumerate(ml_ignore_list, start = 1):
    print(f"\t{position}. {ignored_attr}")

These attributes will be ignored by the machine learning algorithm:
	1. ID


In [45]:
# Re-display the dtypes as a reference for sorting attributes into numerical and nominal groups.
train_cap_x_df.dtypes

ID                   int64
Gender              object
Ever_Married        object
Age                  int64
Graduated           object
Profession          object
Work_Experience    float64
Spending_Score      object
Family_Size        float64
Var_1               object
dtype: object

In [46]:
def _print_numbered(heading, attrs):
    """Print ``heading``, each attribute as a 1-based numbered line, then a blank line."""
    print(heading)
    for position, name in enumerate(attrs, start = 1):
        print(f"{position}. {name}")
    print()


# Numerical attributes are listed by hand
numerical_attr = ["Age", "Work_Experience", "Family_Size"]

# Every column that is neither numerical nor ignored is treated as nominal
excluded_attrs = [*numerical_attr, *ml_ignore_list]
nominal_attr = [col for col in train_cap_x_df.columns if col not in excluded_attrs]

# Confirm the three groups together cover every column of the train set
n_accounted_for = len(numerical_attr) + len(nominal_attr) + len(ml_ignore_list)
assert train_cap_x_df.shape[1] == n_accounted_for
print("All attributes in the train set have been accounted for.")
print()

_print_numbered("ML Ignore List:", ml_ignore_list)
_print_numbered("Numerical Attributes:", numerical_attr)
_print_numbered("Nominal Attributes:", nominal_attr)

print(f"Total Number of ML Attributes: {len(nominal_attr) + len(numerical_attr)}")


All attributes in the train set have been accounted for.

ML Ignore List:
1. ID

Numerical Attributes:
1. Age
2. Work_Experience
3. Family_Size

Nominal Attributes:
1. Gender
2. Ever_Married
3. Graduated
4. Profession
5. Spending_Score
6. Var_1

Total Number of ML Attributes: 9


___
### Build Composite Estimators

#### Define Estimators Involved in Experiment

In [47]:
# Placeholder: the estimators involved in the experiment will be defined here later.
pass

#### Numerical Transformer
Transformations Performed:
1. Imputation
2. Standardization (zero mean, unit variance)

In [48]:
# Numerical branch: fill missing values first, then standardize each attribute.
numerical_steps = [
    ("imputer", SimpleImputer(strategy = numerical_imputer_strategy)),
    ("scaler", StandardScaler()),
]
numerical_transformer = Pipeline(steps = numerical_steps)

#### Nominal Transformer 

Transformations Performed:
1. Imputation
2. Target encoding

In [49]:
# Nominal branch: fill missing categories, then target-encode them.
# TargetEncoder.fit_transform uses shuffled cross fitting, so without a fixed seed
# the encoded values change from run to run. The notebook's existing seed is reused
# here so the transformed data is reproducible.
target_encoder_random_state = train_test_split_random_state
nominal_transformer = Pipeline(
    steps = [
        ("imputer", SimpleImputer(strategy = nominal_imputer_strategy)),
        ("te", TargetEncoder(random_state = target_encoder_random_state)),
    ]
)


#### Preprocessor

In [50]:
# Route each attribute group through its own transformer. Columns in neither list
# are dropped; "drop" is ColumnTransformer's default remainder, spelled out here.
column_transformers = [
    ("nominal", nominal_transformer, nominal_attr),
    ("numerical", numerical_transformer, numerical_attr),
]
preprocessor = ColumnTransformer(transformers = column_transformers, remainder = "drop")

### END FLOW  
___

### Out of Flow Inspection

### Inspect ```train_cap_x_df```

In [51]:
# Display the training attributes as they are before any transformation is applied.
train_cap_x_df

Unnamed: 0,ID,Gender,Ever_Married,Age,Graduated,Profession,Work_Experience,Spending_Score,Family_Size,Var_1
917,465905,Female,No,32,Yes,Artist,9.0,Low,1.0,Cat_6
3398,462903,Male,Yes,72,Yes,Entertainment,,Average,2.0,Cat_6
2045,467901,Female,No,33,Yes,Entertainment,1.0,Low,4.0,Cat_6
8060,463613,Female,Yes,48,Yes,Artist,0.0,Average,6.0,Cat_6
4604,459859,Female,Yes,28,No,Doctor,9.0,Low,1.0,Cat_7
...,...,...,...,...,...,...,...,...,...,...
3822,463101,Female,No,27,No,Homemaker,8.0,Low,1.0,Cat_6
5864,467844,Male,No,37,Yes,Healthcare,0.0,Low,2.0,Cat_6
3589,460706,Female,No,27,No,Engineer,6.0,Low,6.0,Cat_4
1489,464339,Male,No,26,No,Artist,0.0,Low,2.0,Cat_6


### Fit Transformer to ```train_cap_x_df```

In [52]:
# Fit the preprocessor on the training attributes and target, and transform the
# training attributes in the same call. The target is passed because the nominal
# branch uses a TargetEncoder.
transformed_data = preprocessor.fit_transform(train_cap_x_df, train_y_df)

### Inspect ```transformed_data```

In [53]:
# Display the data after transformation. No column names are supplied, so the
# columns are labeled by position.
pd.DataFrame(transformed_data)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,17,18,19,20,21,22,23,24,25,26
0,0.243244,0.239439,0.251686,0.265631,0.247608,0.149167,0.120971,0.482195,0.247163,0.264938,...,0.186572,0.135702,0.401234,0.231499,0.233558,0.288986,0.245955,-0.695320,1.942754,-1.227022
1,0.245438,0.222543,0.237927,0.294092,0.242267,0.285554,0.328199,0.143954,0.247163,0.264938,...,0.285090,0.457601,0.076363,0.231499,0.233558,0.288986,0.245955,1.703982,0.000000,-0.560068
2,0.244058,0.240658,0.251695,0.263589,0.247837,0.145081,0.119631,0.487389,0.246470,0.266102,...,0.182932,0.133865,0.404032,0.234783,0.237108,0.284869,0.243238,-0.635337,-0.513120,0.773838
3,0.247053,0.237791,0.254625,0.260530,0.243665,0.289776,0.328103,0.138429,0.245474,0.266634,...,0.298813,0.451700,0.074576,0.232011,0.236686,0.284071,0.247230,0.264401,-0.820105,2.107745
4,0.247053,0.237791,0.254625,0.260530,0.243665,0.289776,0.328103,0.138429,0.242648,0.168093,...,0.180810,0.136181,0.404898,0.244097,0.251789,0.220645,0.283446,-0.935250,1.942754,-1.227022
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6449,0.247053,0.237791,0.254625,0.260530,0.245563,0.142475,0.120418,0.491482,0.242648,0.168093,...,0.180810,0.136181,0.404898,0.232011,0.236686,0.284071,0.247230,-0.995233,1.635769,-1.227022
6450,0.242197,0.223899,0.235382,0.298522,0.245563,0.142475,0.120418,0.491482,0.245474,0.266634,...,0.180810,0.136181,0.404898,0.232011,0.236686,0.284071,0.247230,-0.395407,-0.820105,-0.560068
6451,0.240253,0.239828,0.254922,0.264996,0.242106,0.154094,0.117737,0.486003,0.244560,0.168937,...,0.185932,0.133142,0.404753,0.308695,0.217910,0.088121,0.385106,-0.995233,1.021801,2.107745
6452,0.248289,0.222025,0.234620,0.295065,0.242106,0.154094,0.117737,0.486003,0.244560,0.168937,...,0.185932,0.133142,0.404753,0.228281,0.235933,0.285995,0.249788,-1.055215,-0.820105,-0.560068


### Inspect ```preprocessor```

In [54]:
# Display the preprocessor as the cell output.
preprocessor