# Feature Engineering

In [1]:
# Import libraries
# type: ignore
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

In [2]:
# Read data
def read_data(dataset, data_dir='../Data/raw'):
    """Load one dataset split from ``<data_dir>/<dataset>-data.csv``.

    Parameters
    ----------
    dataset : str
        Split name used as the file-name prefix, e.g. ``'train'``.
    data_dir : str, optional
        Directory containing the CSV files. Defaults to ``'../Data/raw'``,
        the location that used to be hard-coded here.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the CSV file.
    """
    return pd.read_csv(f'{data_dir}/{dataset}-data.csv')

# Load the train, validation and test splits, in that order.
train_data, val_data, test_data = (
    read_data(split) for split in ('train', 'validation', 'test')
)

In [3]:
## Function to create new features.
def add_features(data, reference=None):
    """Add engineered feature columns to ``data`` in place.

    Columns added: ``credit_score_label``, ``age_label``, ``age_young``,
    ``age_old``, ``age_label_proportion``, ``avg_credit_score_by_age``,
    ``zero_balance``, ``new_customer``, ``old_customer``,
    ``single_product``, ``mt3_product`` and ``is_female``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to extend. Must contain ``credit_score``, ``age``, ``balance``,
        ``tenure``, ``num_of_products`` and ``gender``.
    reference : pandas.DataFrame, optional
        Frame supplying the age-group statistics (group share and mean
        credit score). Needs ``age`` and ``credit_score``. Defaults to the
        module-level ``train_data`` so that statistics always come from the
        training split.

    Returns
    -------
    None
        ``data`` is modified in place; ``reference`` is never modified.

    Notes
    -----
    An age outside ``(17, 100]`` gets a missing ``age_label``; the integer
    cast of ``age_label_proportion`` then fails.
    """
    if reference is None:
        reference = train_data

    # Credit score bands. pd.cut bins are right-inclusive: (299, 579] -> 'Poor'.
    credit_bins = [299, 579, 669, 739, 799, 850]
    credit_labels = ['Poor', 'Fair', 'Good', 'Very Good', 'Excellent']
    data['credit_score_label'] = pd.cut(
        data['credit_score'], bins=credit_bins, labels=credit_labels
    )

    # Age bands and young/old flags.
    age_bins = [17, 30, 40, 55, 70, 100]
    age_labels = ['18-30', '31-40', '41-55', '56-70', '>70']
    data['age_label'] = pd.cut(data['age'], bins=age_bins, labels=age_labels)
    data['age_young'] = (data['age'] < 35).astype('int')
    data['age_old'] = (data['age'] > 60).astype('int')

    # Bin the reference ages locally rather than reading
    # reference['age_label']: that column only exists once this function has
    # already been run on the reference frame, which used to make the result
    # depend on call order.
    reference_age_label = pd.cut(reference['age'], bins=age_bins, labels=age_labels)

    # Share of each age band in the reference data, as a rounded percentage.
    age_proportion_dict = round(
        reference_age_label.value_counts(normalize=True) * 100
    ).to_dict()
    data['age_label_proportion'] = (
        data['age_label'].map(age_proportion_dict).astype('int')
    )

    # Mean credit score per age band in the reference data.
    avg_credit_score = (
        reference['credit_score']
        .groupby(reference_age_label, observed=False)
        .mean()
        .to_dict()
    )
    data['avg_credit_score_by_age'] = data['age_label'].map(avg_credit_score)

    # Balance
    data['zero_balance'] = (data['balance'] == 0).astype('int')

    # Tenure
    data['new_customer'] = (data['tenure'] == 0).astype('int')
    data['old_customer'] = (data['tenure'] > 8).astype('int')

    # Number of products
    data['single_product'] = (data['num_of_products'] == 1).astype('int')
    data['mt3_product'] = (data['num_of_products'] > 3).astype('int')

    # Gender (case-insensitive match)
    data['is_female'] = (data['gender'].str.lower() == 'female').astype(int)
        

In [4]:
# Extend each split in place. Process train_data first: the lookups inside
# add_features are derived from it.
add_features(data=train_data)
add_features(data=val_data)
add_features(data=test_data)

## Feature Selection

In [5]:
# List the columns now present on the training frame (raw + engineered).
train_data.columns

Index(['customer_id', 'surname', 'credit_score', 'geography', 'gender', 'age',
       'tenure', 'balance', 'num_of_products', 'has_cr_card',
       'is_active_member', 'salary', 'churn', 'credit_score_label',
       'age_label', 'age_young', 'age_old', 'age_label_proportion',
       'avg_credit_score_by_age', 'zero_balance', 'new_customer',
       'old_customer', 'single_product', 'mt3_product', 'is_female'],
      dtype='object')

In [6]:
# Feature selection
# Numeric columns: these are the ones handed to the scaler.
num_features = [
    'credit_score',
    'age',
    'tenure',
    'num_of_products',
    'balance',
    'salary',
    'age_label_proportion',
    'avg_credit_score_by_age',
]

# Categorical columns: these are the ones handed to the one-hot encoder.
cat_features = [
    'geography',
    'credit_score_label',
    'age_label',
]

# 0/1 indicator columns that are concatenated as-is after transformation.
new_features = [
    'has_cr_card',
    'is_active_member',
    'age_young',
    'age_old',
    'zero_balance',
    'new_customer',
    'old_customer',
    'single_product',
    'mt3_product',
    'is_female',
]

# Full predictor list, in numeric -> categorical -> indicator order.
features = [*num_features, *cat_features, *new_features]

target = 'churn'

In [7]:
# Separate the predictors (X) from the target (y) for the training split.
X_train, y_train = train_data[features], train_data[target]

## Feature Transformation

In [9]:
# Transform numeric and categorical features
def scale_data(scaler, data, num_features):
    """Apply a fitted scaler to the numeric columns of ``data``.

    Parameters
    ----------
    scaler : object
        Fitted transformer exposing ``get_feature_names_out()`` and
        ``transform()`` (e.g. a fitted ``StandardScaler``).
    data : pandas.DataFrame
        Frame containing the columns listed in ``num_features``.
    num_features : list of str
        Names of the columns to scale.

    Returns
    -------
    pandas.DataFrame
        Scaled values with lower-cased column names, indexed like ``data``.
    """
    # Build a real list: a bare map() iterator can only be consumed once.
    columns = [name.lower() for name in scaler.get_feature_names_out()]
    scaled_values = scaler.transform(data[num_features])

    # Keep the input's index. A fresh RangeIndex would misalign rows when the
    # result is concatenated column-wise with frames that still carry
    # ``data``'s index.
    return pd.DataFrame(scaled_values, columns=columns, index=data.index)

def encode_data(encoder, data, cat_features):
    """Apply a fitted encoder to the categorical columns of ``data``.

    Parameters
    ----------
    encoder : object
        Fitted transformer exposing ``get_feature_names_out()`` and
        ``transform()`` (e.g. a fitted ``OneHotEncoder`` producing dense
        output).
    data : pandas.DataFrame
        Frame containing the columns listed in ``cat_features``.
    cat_features : list of str
        Names of the columns to encode.

    Returns
    -------
    pandas.DataFrame
        Integer-typed encoded columns with lower-cased names, indexed like
        ``data``.
    """
    # Build a real list: a bare map() iterator can only be consumed once.
    columns = [name.lower() for name in encoder.get_feature_names_out()]
    encoded_values = encoder.transform(data[cat_features])

    # Keep the input's index. A fresh RangeIndex would misalign rows when the
    # result is concatenated column-wise with frames that still carry
    # ``data``'s index.
    return pd.DataFrame(
        encoded_values, columns=columns, index=data.index, dtype=int
    )

In [10]:
# Fit the standard scaler on the numeric training columns.
scaler = StandardScaler().fit(X_train[num_features])

# Fit the one-hot encoder (dense output) on the categorical training columns.
oh_en = OneHotEncoder(sparse_output=False).fit(X_train[cat_features])

In [11]:
def transform_data(data):
    """Build the model-ready feature frame for one split.

    Scales ``num_features`` with the fitted ``scaler``, one-hot encodes
    ``cat_features`` with the fitted ``oh_en``, and appends the
    ``new_features`` columns of ``data`` unchanged.

    The pieces are joined column-wise; ``pd.concat(axis=1)`` aligns rows on
    index labels, so all three pieces must share the same row labels.
    """
    pieces = [
        scale_data(scaler, data, num_features),
        encode_data(oh_en, data, cat_features),
        data[new_features],
    ]
    return pd.concat(pieces, axis=1)

In [12]:
# Training split
X_train_transform = transform_data(X_train)

# Validation split: select predictors/target, then transform the predictors.
X_val, y_val = val_data[features], val_data[target]
X_val_transform = transform_data(X_val)

# Test split: same steps as validation.
X_test, y_test = test_data[features], test_data[target]
X_test_transform = transform_data(X_test)

In [13]:
# Shapes (rows, columns) of the transformed train, validation and test sets;
# the column counts should match across all three.
X_train_transform.shape, X_val_transform.shape, X_test_transform.shape

((6000, 31), (2000, 31), (2000, 31))

In [14]:
# Save to csv
def save_tocsv(data, filename, out_dir='../Data/process'):
    """Write ``data`` to ``<out_dir>/<filename>.csv`` without the index.

    Parameters
    ----------
    data : pandas.DataFrame or pandas.Series
        Object to persist; anything with a pandas-style ``to_csv`` works.
    filename : str
        File name without the ``.csv`` extension.
    out_dir : str, optional
        Target directory. Defaults to ``'../Data/process'``, the location
        that used to be hard-coded here.

    Returns
    -------
    None
    """
    data.to_csv(f'{out_dir}/{filename}.csv', index=False)

In [15]:
# Persist the transformed feature matrices, one CSV per split.
save_tocsv(data=X_train_transform, filename='X_train_transformed')
save_tocsv(data=X_val_transform, filename='X_validation_transformed')
save_tocsv(data=X_test_transform, filename='X_test_transformed')

# Persist the matching target columns.
save_tocsv(data=y_train, filename='y_train')
save_tocsv(data=y_val, filename='y_validation')
save_tocsv(data=y_test, filename='y_test')

In [16]:
# Save the fitted scaler and encoder together as a (scaler, oh_en) tuple.
# NOTE: unpickling can execute arbitrary code, so only load this file from a
# trusted location.
import pickle

# Plain 'wb' is enough: the file is only written here, never read back.
with open('../Model/transformer.pkl', 'wb') as f:
    pickle.dump((scaler, oh_en), f)