# Prepare the Data
This notebook contains final project step $4$
___

### Import Packages

In [39]:
import pandas as pd
import numpy as np
import sys
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error, root_mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
# from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import time
from pathlib import Path
import os

### Set Up ```sys``` Path to Enable ```.py``` Imports

In [40]:
path = Path.cwd()
path_to_project_directory = path.parent
sys.path.insert(1, str(path_to_project_directory))
print(f"The working directory has been set to: {str(path_to_project_directory)}")

The working directory has been set to: /Users/nelsonfarrell/Documents/Northeastern/5220/final_project


### Import Python Modules

In [41]:
from modules.phase1_utils import * 

### Helper Functions

In [42]:
pass

### Parameters

In [43]:
path_to_data_directory = "/data/data_splits/"
train_data_file_name = "train_df.csv"
target_attr = "Segmentation"
missingness_threshold = 0.20
nominal_imputer_strategy = "most_frequent"

### Set Up Timer

In [44]:
start = time.time()

___
### Read In Train Data

In [45]:
orig_df = pd.read_csv(str(path_to_project_directory) + path_to_data_directory + train_data_file_name)
train_df = orig_df.copy()
del orig_df

In [46]:
train_df.head()

Unnamed: 0,index,ID,Gender,Ever_Married,Age,Graduated,Profession,Work_Experience,Spending_Score,Family_Size,Var_1,Segmentation
0,917,465905,Female,No,32,Yes,Artist,9.0,Low,1.0,Cat_6,A
1,3398,462903,Male,Yes,72,Yes,Entertainment,,Average,2.0,Cat_6,B
2,2045,467901,Female,No,33,Yes,Entertainment,1.0,Low,4.0,Cat_6,B
3,8060,463613,Female,Yes,48,Yes,Artist,0.0,Average,6.0,Cat_6,A
4,4604,459859,Female,Yes,28,No,Doctor,9.0,Low,1.0,Cat_7,A


___
### Check for Missingness in the Target

In [47]:
num_rows_train_df_pre = train_df.shape[0]
print(f"The shape of train set PRE to dropping rows where the target missing: {train_df.shape}")
train_df = train_df.dropna(subset = target_attr)
num_rows_train_df_post = train_df.shape[0]
print(f"The shape of train set POST to dropping rows where the target missing: {train_df.shape}")
print(f"Number of rows dropped: {num_rows_train_df_pre - num_rows_train_df_post}")

The shape of train set PRE to dropping rows where the target missing: (6454, 12)
The shape of train set POST to dropping rows where the target missing: (6454, 12)
Number of rows dropped: 0


___
### Perform Train/Test Split

In [48]:
# This has already been performed
pass

___
### Perform Train Validation Split

In [49]:
# This will implemented later
pass

___
### Check Attribute ```dtypes```

In [50]:
train_df.dtypes

index                int64
ID                   int64
Gender              object
Ever_Married        object
Age                  int64
Graduated           object
Profession          object
Work_Experience    float64
Spending_Score      object
Family_Size        float64
Var_1               object
Segmentation        object
dtype: object

___
### Identify Attributes Above ```Missingness``` Threshold

In [51]:
missingness_results_dict = get_missingness(train_df, missingness_threshold)

index missingness = 0.0
ID missingness = 0.0
Gender missingness = 0.0
Ever_Married missingness = 0.017198636504493336
Age missingness = 0.0
Graduated missingness = 0.00914161760148745
Profession missingness = 0.01642392314843508
Work_Experience missingness = 0.10009296560272699
Spending_Score missingness = 0.0
Family_Size missingness = 0.040904865199876045
Var_1 missingness = 0.009296560272699102
Segmentation missingness = 0.0

missingness_drop_list:
[]


___
### Identify Non-Machine Learning Attributes

These attributes do not contain information that will benefit a machine learning algorithm.
1. Identification columns
2. Columns with very low variance, i.e., $0.0$

In [52]:
non_ml_attributes_results = separate_unique_columns(train_df)

*****************************
non_ML_attr:
index
ID

*****************************
ML_attr:
Gender
Ever_Married
Age
Graduated
Profession
Work_Experience
Spending_Score
Family_Size
Var_1
Segmentation

*****************************
non_ml_attribute_list:
['index', 'ID']


___
### Identify Attributes to Exclude from Machine Learning

In [53]:
train_df.columns

Index(['index', 'ID', 'Gender', 'Ever_Married', 'Age', 'Graduated',
       'Profession', 'Work_Experience', 'Spending_Score', 'Family_Size',
       'Var_1', 'Segmentation'],
      dtype='object')

In [54]:
ml_attributes_drop_list = [] # None were identified

___
### Establish Machine Learning Attribute Configuration

In [55]:
ml_ignore_list = missingness_results_dict["missingness_drop_list"] \
                 + non_ml_attributes_results["non_ML_attr"] \
                 + ml_attributes_drop_list

print(f"These attributes will be ignored by the machine learning algorithm:")
for idx, attr in enumerate(ml_ignore_list):
    print(f"\t{idx + 1}. {attr}")

These attributes will be ignored by the machine learning algorithm:
	1. index
	2. ID


In [56]:
train_df.dtypes

index                int64
ID                   int64
Gender              object
Ever_Married        object
Age                  int64
Graduated           object
Profession          object
Work_Experience    float64
Spending_Score      object
Family_Size        float64
Var_1               object
Segmentation        object
dtype: object

In [57]:
# Set the numerical attributes list
numerical_attr = ["Age", "Work_Experience", "Family_Size"]

# Set the nominal attributes list
nominal_attr = [attr for attr in train_df.columns if attr not in numerical_attr and attr not in ml_ignore_list]

# Confirm all attributes are accounted for
assert (train_df.shape[1] == len(numerical_attr) + len(nominal_attr) + len(ml_ignore_list))
print(f"All attributes in the train set have been accounted for.")
print()

print("ML Ignore List:")
for idx, attr in enumerate(ml_ignore_list):
    print(f"{idx + 1}. {attr}")
print()

print("Numerical Attributes:")
for idx, attr in enumerate(numerical_attr):
    print(f"{idx + 1}. {attr}")
print()

print("Nominal Attributes:")
for idx, attr in enumerate(nominal_attr):
    print(f"{idx + 1}. {attr}")
print()

print(f"Total Number of ML Attributes: {len(nominal_attr) + len(numerical_attr)}")


All attributes in the train set have been accounted for.

ML Ignore List:
1. index
2. ID

Numerical Attributes:
1. Age
2. Work_Experience
3. Family_Size

Nominal Attributes:
1. Gender
2. Ever_Married
3. Graduated
4. Profession
5. Spending_Score
6. Var_1
7. Segmentation

Total Number of ML Attributes: 10


___
### Build Composite Estimators

#### Define Estimators Involved in Experiment

In [58]:
# This will be implemented later
pass

#### Numerical Transformer
Transformations Performed:
1. Imputation
2. Normalization

In [59]:
numerical_transformer = Pipeline(
                            steps = [("imputer", SimpleImputer()),
                                     ("scaler", StandardScaler())]
                                )

#### Nominal Transformer 

Transformations Performed:
1. Imputation
2. OneHotEncoding

In [60]:
nominal_transformer = Pipeline(
                        steps = [("imputer", SimpleImputer(strategy = nominal_imputer_strategy)),
                                 ("ohe", OneHotEncoder())] 
                              )


#### Preprocessor

In [61]:
preprocessor = ColumnTransformer(
                        transformers = [("nominal", nominal_transformer, nominal_attr),
                                        "numerical", numerical_transformer, numerical_attr]
                                )