## Classification Flow  
This notebook contains a machine learning classification flow


### Modules/Packages

In [1]:
import os
import sys
import time
from pathlib import Path

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error, root_mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
# from sklearn.model_selection import train_test_split

### Set Import Path

In [2]:
path = Path.cwd()
path_to_project_directory = path.parent
sys.path.insert(1, str(path_to_project_directory))
print(f"The path the project directory has been set to: {str(path_to_project_directory)}")

The path the project directory has been set to: /Users/mikey/Library/Mobile Documents/com~apple~CloudDocs/Code/roux_class_files/DS5220/final_project/customer-segmentation


### Import Modules

In [3]:
import modules.phase1_utils as utils 

### Helper Functions

In [4]:
# Functions
pass

### Parameters

In [5]:
# Paths
path_to_data_folder = path_to_project_directory / "data"
path_to_kaggle_data_folder = path_to_data_folder / "raw_from_kaggle"
path_split_data_folder = path_to_data_folder / "data_splits"

# File names
kaggle_data_file_name = "Train.csv"

# Other
target_attr = "Segmentation"
test_size = 0.20
train_test_split_random_state = 42
missingness_threshold = 0.20
split_folder = str(path_split_data_folder) + "/"

### Set Timer

In [6]:
start = time.time()

### Read in Raw Data

In [7]:
df = pd.read_csv(str(path_to_kaggle_data_folder) + "/" + kaggle_data_file_name)

### Inspect Data Size

In [8]:
print(f"Size of the dataset: {df.shape}")
df.head()

Size of the dataset: (8068, 11)


Unnamed: 0,ID,Gender,Ever_Married,Age,Graduated,Profession,Work_Experience,Spending_Score,Family_Size,Var_1,Segmentation
0,462809,Male,No,22,No,Healthcare,1.0,Low,4.0,Cat_4,D
1,462643,Female,Yes,38,Yes,Engineer,,Average,3.0,Cat_4,A
2,466315,Female,Yes,67,Yes,Engineer,1.0,Low,1.0,Cat_6,B
3,461735,Male,Yes,67,Yes,Lawyer,0.0,High,2.0,Cat_6,B
4,462669,Female,Yes,40,Yes,Entertainment,,High,6.0,Cat_6,A


### Inspect Missingness in Target Vector

In [9]:
print(f"Dataset size pre dropping observation with missing target: {df.shape}")
df = df.dropna(subset = target_attr)
print(f"Dataset size post dropping observation with missing target: {df.shape}")

Dataset size pre dropping observation with missing target: (8068, 11)
Dataset size post dropping observation with missing target: (8068, 11)


### Perform Test Train Split

In [10]:
utils.perform_the_train_test_split(df = df,
                                   target_attr = target_attr, 
                                   test_size = test_size, 
                                   train_test_split_random_state = train_test_split_random_state,
                                   split_folder = split_folder,
                                   prefix = None, 
                                   val = None, 
                                   stratify = True)

*************************

df.shape:
(8068, 11)

target class fractional balance:
Segmentation
D    0.281111
A    0.244422
C    0.244175
B    0.230293
Name: count, dtype: float64

*************************

train_df.csv:
(6454, 10) (6454, 1)

target class fractional balance:
Segmentation
D    0.281066
A    0.244500
C    0.244190
B    0.230245
Name: count, dtype: float64

*************************

test_df.csv
(1614, 10) (1614, 1)

target class fractional balance:
Segmentation
D    0.281289
A    0.244114
C    0.244114
B    0.230483
Name: count, dtype: float64


{'train_cap_x_df':           ID  Gender Ever_Married  Age Graduated     Profession  \
 917   465905  Female           No   32       Yes         Artist   
 3398  462903    Male          Yes   72       Yes  Entertainment   
 2045  467901  Female           No   33       Yes  Entertainment   
 8060  463613  Female          Yes   48       Yes         Artist   
 4604  459859  Female          Yes   28        No         Doctor   
 ...      ...     ...          ...  ...       ...            ...   
 3822  463101  Female           No   27        No      Homemaker   
 5864  467844    Male           No   37       Yes     Healthcare   
 3589  460706  Female           No   27        No       Engineer   
 1489  464339    Male           No   26        No         Artist   
 2661  459407  Female           No   37       Yes         Doctor   
 
       Work_Experience Spending_Score  Family_Size  Var_1  
 917               9.0            Low          1.0  Cat_6  
 3398              NaN        Average       