In [7]:
# Import libraries
import os
import pickle

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import KNNImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [8]:
# Read data
def read_data(filename, base_path="../Data/raw/"):
    """Load one CSV file from the raw-data directory into a DataFrame.

    Parameters
    ----------
    filename : str
        Name of the CSV file, e.g. ``'train.csv'``.
    base_path : str, optional
        Directory that contains the file. Defaults to the project's raw-data
        folder, so existing ``read_data('train.csv')`` calls are unchanged.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the file.
    """
    # os.path.join copes with a base path that has or lacks a trailing slash.
    return pd.read_csv(os.path.join(base_path, filename))

In [9]:
# Load the three raw splits in one go; map() calls read_data in the listed order.
df_train, df_val, df_test = map(read_data, ('train.csv', 'val.csv', 'test.csv'))

In [10]:
# Preview the first rows of the training split to sanity-check the loaded columns.
df_train.head()

Unnamed: 0,customer_id,surname,credit_score,geography,gender,age,tenure,balance,num_of_products,has_cr_card,is_active_member,salary,churn,age_group
0,15692132,wilkinson,717,spain,female,22,6,101060.25,1,0,1,84699.56,0,18-32
1,15684173,chang,687,spain,female,44,7,0.0,3,1,0,155853.52,1,33-47
2,15652999,milne,742,germany,male,33,1,137937.95,1,1,1,51387.1,0,33-47
3,15583090,komar,581,spain,female,29,8,0.0,2,1,0,46735.19,0,18-32
4,15678206,yeh,464,france,male,46,6,161798.53,1,1,0,182944.47,0,33-47


#### Feature selection

In [11]:
# Column groups used throughout the notebook.
# Label column the model should predict.
target = 'churn'

# Categorical columns that are one-hot encoded.
cat_features = ['geography', 'gender', 'age_group']

# Numeric columns that are standardised.
num_features = [
    'credit_score',
    'tenure',
    'balance',
    'num_of_products',
    'salary',
]

# Every model input, in the column order used when slicing the raw frames.
# 'has_cr_card' and 'is_active_member' belong to neither group above.
features = [
    *num_features,
    'geography',
    'gender',
    'has_cr_card',
    'is_active_member',
    'age_group',
]

In [12]:
# Split the data
def get_df_xy(data, feature_cols=None, target_col=None):
    """Split a frame into a feature matrix and a target vector.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding both the feature columns and the target column.
    feature_cols : sequence of str, optional
        Columns to use as model inputs. Defaults to the notebook-level
        ``features`` list.
    target_col : str, optional
        Name of the label column. Defaults to the notebook-level ``target``.

    Returns
    -------
    tuple (pandas.DataFrame, pandas.Series)
        Independent copies of the selected features and of the target, so
        later edits do not write back into ``data``.
    """
    # Fall back to the notebook globals only when the caller gives no override.
    if feature_cols is None:
        feature_cols = features
    if target_col is None:
        target_col = target
    return data[list(feature_cols)].copy(), data[target_col].copy()
    
# Build the feature matrix and target vector for each split.
X_train, y_train = get_df_xy(df_train)
X_val, y_val = get_df_xy(df_val)
# The test split must come from df_test. Reading df_val here would make the
# saved "test" set a copy of the validation set.
X_test, y_test = get_df_xy(df_test)

#### Data Transformation

In [13]:
# Declare how each column group is preprocessed.
# Numeric columns are standardised with StandardScaler.
numeric_step = ('num_scaler', StandardScaler(), num_features)
# Categorical columns are one-hot encoded into a dense (non-sparse) array.
categorical_step = ('cat_oh_en', OneHotEncoder(sparse_output=False), cat_features)

preprocessor = ColumnTransformer(
    transformers=[numeric_step, categorical_step],
    # Columns not listed in either step are passed through unchanged.
    remainder='passthrough',
)

In [14]:
# Fit the scaler and the encoder on the training split only.
# fit() returns the transformer itself, so no reassignment is needed.
preprocessor.fit(X_train)

# Apply the fitted transformation to the training features.
x_train_trans = preprocessor.transform(X_train)

In [15]:
# Get the feature names
# Each output column is prefixed with the name of the step that produced it
# ('num_scaler__', 'cat_oh_en__'), or with 'remainder__' for passthrough columns.
preprocessor.get_feature_names_out()

array(['num_scaler__credit_score', 'num_scaler__tenure',
       'num_scaler__balance', 'num_scaler__num_of_products',
       'num_scaler__salary', 'cat_oh_en__geography_france',
       'cat_oh_en__geography_germany', 'cat_oh_en__geography_spain',
       'cat_oh_en__gender_female', 'cat_oh_en__gender_male',
       'cat_oh_en__age_group_18-32', 'cat_oh_en__age_group_33-47',
       'cat_oh_en__age_group_48-62', 'cat_oh_en__age_group_63-77',
       'cat_oh_en__age_group_78-92', 'remainder__has_cr_card',
       'remainder__is_active_member'], dtype=object)

In [16]:
def _to_feature_frame(X):
    """Transform ``X`` with the fitted preprocessor and return a labelled DataFrame.

    The result keeps the row index of ``X`` and uses the preprocessor's output
    feature names as column labels.
    """
    return pd.DataFrame(
        data=preprocessor.transform(X),
        columns=preprocessor.get_feature_names_out(),
        index=X.index,
    )


x_train_transform = _to_feature_frame(X_train)
x_val_transform = _to_feature_frame(X_val)
x_test_transform = _to_feature_frame(X_test)

In [17]:
# Save the data
def save_data(X, y, filename, base_path='../Data/process/'):
    """Write a feature frame and its target to ``X_<filename>.csv`` / ``y_<filename>.csv``.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix to save.
    y : pandas.Series or pandas.DataFrame
        Target values to save.
    filename : str
        Split name used in the output file names, e.g. ``'train'``.
    base_path : str, optional
        Output directory. Defaults to the project's processed-data folder, so
        existing three-argument calls are unchanged.

    Notes
    -----
    Both files are written without the index column.
    """
    # Create the output directory up front so to_csv does not fail on a fresh checkout.
    os.makedirs(base_path, exist_ok=True)

    # Define the filepath to save data.
    x_filepath = os.path.join(base_path, f"X_{filename}.csv")
    y_filepath = os.path.join(base_path, f"y_{filename}.csv")

    # Save to csv
    X.to_csv(x_filepath, index=False)
    y.to_csv(y_filepath, index=False)
    
# Write every split to disk; the last item of each tuple is the file-name suffix.
for X_split, y_split, split_name in (
    (x_train_transform, y_train, 'train'),
    (x_test_transform, y_test, 'test'),
    (x_val_transform, y_val, 'validation'),
):
    save_data(X_split, y_split, split_name)

In [20]:
# Persist the fitted preprocessor to disk.
# NOTE(review): only unpickle this file from a trusted location, because
# pickle.load can execute arbitrary code.
processor_path = '../Model/processor.bin'
# Create the target directory if it is missing so open() does not fail.
os.makedirs(os.path.dirname(processor_path), exist_ok=True)
with open(processor_path, 'wb') as f:
    pickle.dump(preprocessor, f)