# Data Transformation

In [1]:
# Import libraries
# type: ignore
import pickle

import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

In [2]:
# Read data
def read_data(dataset):
    """Load one raw dataset split from disk.

    Parameters
    ----------
    dataset : str
        Split name interpolated into the file name, e.g. ``'train'``
        resolves to ``../Data/raw/train-data.csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed CSV contents.
    """
    csv_path = f'../Data/raw/{dataset}-data.csv'
    return pd.read_csv(csv_path)

# Load the three raw splits in a fixed order: train, validation, test.
train_data, valid_data, test_data = map(read_data, ('train', 'validation', 'test'))

## Feature Engineering

In [3]:
# Function to create new features.
def add_features(data, reference=None):
    """Add engineered feature columns to ``data`` in place.

    Columns written: ``credit_score_label``, ``age_label``, ``age_young``,
    ``age_old``, ``age_label_proportion``, ``avg_credit_score_by_age``,
    ``zero_balance``, ``new_customer``, ``old_customer``, ``single_product``,
    ``mt3_product`` and ``is_female``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to augment. Must contain ``credit_score``, ``age``, ``balance``,
        ``tenure``, ``num_of_products`` and ``gender``. It is mutated in place.
    reference : pandas.DataFrame, optional
        Frame whose ``age`` and ``credit_score`` columns supply the per-age-group
        statistics (share of rows and mean credit score). Defaults to the
        notebook-level ``train_data`` so validation/test splits are encoded
        with training statistics only. ``reference`` is never modified.

    Returns
    -------
    None

    Notes
    -----
    The reference age groups are re-derived from ``reference['age']`` on every
    call, so the function no longer requires that the training frame was
    passed through ``add_features`` first.
    Ages outside ``(17, 100]`` receive no ``age_label``; the integer cast of
    ``age_label_proportion`` then raises because of the resulting NaN.
    """
    if reference is None:
        # Fall back to the notebook's training frame so existing
        # ``add_features(frame)`` calls behave as before.
        reference = train_data

    # credit_scores
    credit_bins = [299, 579, 669, 739, 799, 850]
    credit_labels = ['Poor', 'Fair', 'Good', 'Very Good', 'Excellent']
    data['credit_score_label'] = pd.cut(data['credit_score'], bins=credit_bins, labels=credit_labels)

    # Age
    age_bins = [17, 30, 40, 55, 70, 100]
    age_labels = ['18-30', '31-40', '41-55', '56-70', '>70']
    data['age_label'] = pd.cut(data['age'], bins=age_bins, labels=age_labels)
    data['age_young'] = (data['age'] < 35).astype('int')
    data['age_old'] = (data['age'] > 60).astype('int')

    # Bin the reference ages with the same edges instead of reading a
    # pre-computed ``age_label`` column, which only exists after the
    # reference frame itself has been processed.
    reference_age_label = pd.cut(reference['age'], bins=age_bins, labels=age_labels)

    # Age proportion (percent of reference rows per age group, rounded)
    age_proportion_dict = round(reference_age_label.value_counts(normalize=True) * 100).to_dict()
    data['age_label_proportion'] = data['age_label'].map(age_proportion_dict).astype('int')

    # Average credit score by age group; empty groups are kept (as NaN)
    avg_credit_score = (
        reference['credit_score']
        .groupby(reference_age_label, observed=False)
        .mean()
        .to_dict()
    )
    # Mapping a categorical column can yield a categorical result; cast to
    # float so the column is plainly numeric for the scaler.
    data['avg_credit_score_by_age'] = data['age_label'].map(avg_credit_score).astype(float)

    # Balance
    data['zero_balance'] = (data['balance'] == 0).astype('int')

    # Tenure
    data['new_customer'] = (data['tenure'] == 0).astype('int')
    data['old_customer'] = (data['tenure'] > 8).astype('int')

    # Number of products
    data['single_product'] = (data['num_of_products'] == 1).astype('int')
    data['mt3_product'] = (data['num_of_products'] > 3).astype('int')

    # Gender (case-insensitive match)
    data['is_female'] = (data['gender'].str.lower() == 'female').astype(int)
        

In [4]:
# Augment every split in place; the training frame goes first because
# add_features reads train_data['age_label'] for the other splits.
for frame in (train_data, valid_data, test_data):
    add_features(frame)

### Feature Selection

In [5]:
# Feature selection
# Numerical columns (these are passed to the scaler).
num_features = [
    'credit_score',
    'age',
    'tenure',
    'num_of_products',
    'balance',
    'salary',
    'age_label_proportion',
    'avg_credit_score_by_age',
]

# Categorical columns (these are passed to the one-hot encoder).
cat_features = [
    'geography',
    'credit_score_label',
    'age_label',
]

# Flag columns carried through without transformation.
# NOTE(review): 'has_cr_card' and 'is_active_member' are not created by
# add_features; presumably they come straight from the raw data.
new_features = [
    'has_cr_card',
    'is_active_member',
    'age_young',
    'age_old',
    'zero_balance',
    'new_customer',
    'old_customer',
    'single_product',
    'mt3_product',
    'is_female',
]

# Full model input, in order: numerical, categorical, flags.
features = [*num_features, *cat_features, *new_features]

# Label column.
target = 'churn'

### Feature Transformation

In [6]:
# Apply transformation to features
def get_scaled_data(scaler, data, num_features):
    """Scale the numerical columns of ``data`` with an already-fitted scaler.

    Parameters
    ----------
    scaler : object
        Fitted transformer exposing ``transform`` (e.g. a ``StandardScaler``).
    data : pandas.DataFrame
        Frame containing every column listed in ``num_features``.
    num_features : list of str
        Columns to scale; also used as the output column names.

    Returns
    -------
    pandas.DataFrame
        Scaled values with columns ``num_features`` and the same row index as
        ``data``.
    """
    # Apply scaling transformation
    scaled_data = scaler.transform(data[num_features])
    # Keep the caller's row index: a fresh 0..n-1 index would misalign rows
    # when this frame is later concatenated column-wise with ``data``.
    scaled_df = pd.DataFrame(scaled_data, columns=num_features, index=data.index)
    return scaled_df

def get_encoded_data(encoder, data, cat_featues):
    """One-hot encode the categorical columns of ``data`` with a fitted encoder.

    Parameters
    ----------
    encoder : object
        Fitted encoder exposing ``get_feature_names_out`` and ``transform``
        (e.g. a ``OneHotEncoder`` producing dense output).
    data : pandas.DataFrame
        Frame containing every column listed in ``cat_featues``.
    cat_featues : list of str
        Columns to encode. The misspelled parameter name is kept so existing
        keyword callers keep working.

    Returns
    -------
    pandas.DataFrame
        Integer indicator columns named after the encoder's lower-cased
        output feature names, with the same row index as ``data``.
    """
    # Materialise the lower-cased names as a list (a bare ``map`` object is a
    # one-shot iterator).
    column_names = [str(name).lower() for name in encoder.get_feature_names_out()]

    # Encode the columns that were passed in, not the notebook-level
    # ``cat_features`` global the argument used to be shadowed by.
    encoded_data = encoder.transform(data[cat_featues])

    # Keep the caller's row index so a later column-wise concat with ``data``
    # aligns row for row.
    encoded_df = pd.DataFrame(encoded_data, columns=column_names, index=data.index, dtype=int)
    return encoded_df

def transform(data):
    """Build the model-ready frame for one dataset split.

    Relies on the notebook-level ``scaler``, ``oh_en``, ``num_features``,
    ``cat_features``, ``new_features`` and ``target``.

    Parameters
    ----------
    data : pandas.DataFrame
        Split that already contains the engineered feature columns.

    Returns
    -------
    pandas.DataFrame
        Column-wise concatenation of the scaled numerical columns, the
        one-hot encoded categorical columns, the untouched flag columns and
        the target column, in that order.
    """
    pieces = [
        get_scaled_data(scaler, data, num_features),    # scaled numericals
        get_encoded_data(oh_en, data, cat_features),    # one-hot categoricals
        data[new_features],                             # flag columns, as-is
        data[target],                                   # label
    ]
    return pd.concat(pieces, axis=1)

In [8]:
# Fit both transformers on the training split only; the validation and test
# splits are transformed with these fitted objects.
scaler = StandardScaler().fit(train_data[num_features])          # numerical features
oh_en = OneHotEncoder(sparse_output=False).fit(train_data[cat_features])  # categorical features

# Transform every split with the fitted scaler and encoder.
train_transform = transform(train_data)
valid_transform = transform(valid_data)
test_transform = transform(test_data)

In [9]:
# Save to csv
def save_to_csv(data, filename):
    """Write ``data`` to ``../Data/process/<filename>.csv`` without the index.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to persist.
    filename : str
        File stem; the ``.csv`` extension is appended here.
    """
    output_path = f'../Data/process/{filename}.csv'
    data.to_csv(output_path, index=False)

In [12]:
# Save transformed datasets
# Persist each transformed split under its own file stem.
for frame, stem in (
    (train_transform, 'train_transform'),
    (valid_transform, 'valid_transform'),
    (test_transform, 'test_transform'),
):
    save_to_csv(frame, stem)

In [13]:
# Save scaler and encoder model
# Persist the fitted scaler and encoder together as one (scaler, encoder)
# tuple. ``pickle`` is imported in the notebook's import cell.
# Plain 'wb' is enough: the file is only written, never read back here.
# NOTE: only unpickle this file from a trusted location; loading a pickle
# can execute arbitrary code.
with open('../Model/transformer.pkl', 'wb') as f:
    pickle.dump((scaler, oh_en), f)