# Week 12 - Save and package your model for deployment

In [15]:
import pandas as pd
import numpy as np
# import dask.dataframe as dd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from datetime import datetime
from sklearn.metrics import roc_auc_score, f1_score, confusion_matrix
from sklearn.linear_model import LogisticRegression
from collections import defaultdict
from sklearn.preprocessing import MinMaxScaler
from joblib import Parallel, delayed
import pickle

In [2]:
# Show every column when a wide DataFrame is displayed.
pd.set_option('display.max_columns', None)

# Single location of the raw CSV files: relocating the project now means
# editing one line instead of three hard-coded absolute paths.
DATA_DIR = '/home/jupyter-fagundem/applied_analytics_project/data/raw'

# low_memory=False has pandas parse the training file in one pass instead of
# in chunks, which avoids mixed-dtype inference on the columns.
train = pd.read_csv(f'{DATA_DIR}/train_final.csv', low_memory=False)
validation = pd.read_csv(f'{DATA_DIR}/val_set_final.csv')
test = pd.read_csv(f'{DATA_DIR}/test_4_11.csv')

Renaming and dropping columns so that the datasets share the same columns

In [3]:
# Rename the training frame's 'country' column to 'country_spain'
# (presumably the name the other datasets already use -- TODO confirm).
train = train.rename({'country': 'country_spain'}, axis='columns')

In [4]:
# Columns removed from every dataset.
drop = ['join_channel', 'province_name', 'employee_index', 'segment', 'total_products']

# Each frame first loses its 'Unnamed: 0' column (presumably the index that was
# written out with the CSV), then the shared columns above, then any extra
# columns that only that particular file carries.
train = train.drop(columns=['Unnamed: 0']).drop(columns=drop)

validation = (
    validation
    .drop(columns=['Unnamed: 0'])
    .drop(columns=drop + ['payroll_acct.1', 'first_contract_date', 'primary_cust',
                          'last_date_primary', 'deceased'])
)

test = (
    test
    .drop(columns=['Unnamed: 0'])
    .drop(columns=drop + ['payroll_acct.1'])
)

## Reading into the data

Setting products we want to predict

In [5]:
# Target columns: one binary classifier is trained per product listed here.
products = [
    'savings_acct',
    'guarantees',
    'current_acct',
    'derivada_acct',
    'payroll_acct',
    'junior_acct',
    'mas_particular_acct',
    'particular_acct',
    'particular_plus_acct',
    'short_term_depo',
    'medium_term_depo',
    'long_term_depo',
    'e_acct',
    'funds',
    'mortgage',
    'pension',
    'loans',
    'taxes',
    'credit_card',
    'securities',
    'home_acct',
    'pensions_2',
    'direct_debt',
]

# Pre-processing

Defining our Xs and Ys

In [6]:
# Work on copies so the feature engineering below leaves `train` and `test` untouched.
train_2, test_2 = train.copy(), test.copy()

# Transformation #2  

For transformation #2 we add the date column as one of the features. To do so, we calculate the time since purchase relative to the month we are trying to predict, June 2016. For this transformation to make sense, we also keep the first transformation; since the timeline of purchases now matters, we keep the duplicate clients' purchases instead of keeping only the last one.

In [7]:
# Parse the date strings and snap each one to the first day of its month.
train_2['date'] = (
    pd.to_datetime(train_2['date'], format='%Y-%m-%d')
    .dt.to_period('M')
    .dt.to_timestamp()
)

# Prediction date (June 28, 2016) that elapsed time is measured against.
reference_date = pd.to_datetime("2016-06-28")

# Whole calendar months between each record's month and the reference month.
train_2['months_since_purchase'] = (
    (reference_date.year - train_2['date'].dt.year) * 12
    + (reference_date.month - train_2['date'].dt.month)
)

print(train_2[['date', 'months_since_purchase']])

              date  months_since_purchase
0       2016-04-01                      2
1       2015-07-01                     11
2       2016-04-01                      2
3       2015-08-01                     10
4       2016-03-01                      3
...            ...                    ...
5757281 2016-05-01                      1
5757282 2015-08-01                     10
5757283 2015-11-01                      7
5757284 2016-05-01                      1
5757285 2016-01-01                      5

[5757286 rows x 2 columns]


In [8]:
# Adding feature on test dateased
test_2['date'] = pd.to_datetime(test_2['date'], format='%Y-%m-%d')
test_2['date'] = test_2['date'].dt.to_period('M').dt.to_timestamp()

test_2['months_since_purchase'] = (reference_date.year - test_2['date'].dt.year) * 12 + \
                              (reference_date.month - test_2['date'].dt.month)

print(test_2[['date', 'months_since_purchase']])

              date  months_since_purchase
0       2015-06-01                     12
1       2016-02-01                      4
2       2015-07-01                     11
3       2016-03-01                      3
4       2016-02-01                      4
...            ...                    ...
1236739 2016-02-01                      4
1236740 2016-02-01                      4
1236741 2015-08-01                     10
1236742 2016-05-01                      1
1236743 2016-04-01                      2

[1236744 rows x 2 columns]


In [9]:
# Features: everything except the customer identifier, the raw date and the
# product (target) columns. Targets: one column per product.
X_train_2 = train_2.drop(columns=['customer_code', 'date'] + products)
y_train_2 = train_2.loc[:, products]

X_test_2 = test_2.drop(columns=['customer_code', 'date'] + products)
y_test_2 = test_2.loc[:, products]

## Training

In [10]:
# Defining the best training parameter
params = {'C': 10, 'solver': 'liblinear', 'max_iter': 300}

Dataset with second transformation

In [11]:
def _score_predictions(y_true, y_pred, y_pred_proba):
    """Return ROC AUC, F1 score and confusion matrix for one product and one split.

    ROC AUC is computed from the positive-class probabilities; F1 and the
    confusion matrix are computed from the hard class predictions.
    """
    return {
        'ROC AUC': roc_auc_score(y_true, y_pred_proba),
        'F1 Score': f1_score(y_true, y_pred),
        'Confusion Matrix': confusion_matrix(y_true, y_pred),
    }


# metrics[dataset name][split][product] -> dict of scores
metrics = defaultdict(lambda: defaultdict(dict))

# Fitted classifier for each product, keyed by product name.
trained_models = {}

# Fit one independent binary logistic regression per product on 'train_2'
# and score it on both the training and the test data.
for product in products:
    # Labels for this product only
    y_train_2_product = y_train_2[product].values
    y_test_2_product = y_test_2[product].values

    clf = LogisticRegression(**params)
    clf.fit(X_train_2, y_train_2_product)

    # Saving the model to the dictionary
    trained_models[product] = clf

    # Hard predictions and positive-class probabilities (column 1)
    y_train_2_pred = clf.predict(X_train_2)
    y_train_2_pred_proba = clf.predict_proba(X_train_2)[:, 1]
    y_test_2_pred = clf.predict(X_test_2)
    y_test_2_pred_proba = clf.predict_proba(X_test_2)[:, 1]

    metrics['train_2']['train'][product] = _score_predictions(
        y_train_2_product, y_train_2_pred, y_train_2_pred_proba
    )
    metrics['train_2']['test'][product] = _score_predictions(
        y_test_2_product, y_test_2_pred, y_test_2_pred_proba
    )


In [27]:
# `metrics` is a defaultdict built with a lambda factory, which pickle cannot
# serialize. Convert the outer AND the inner defaultdict level into plain
# dicts: dict(metrics) alone leaves the inner defaultdicts in place, so they
# end up in the pickle and silently create keys on lookup after loading.
metrics_dict = {name: dict(by_split) for name, by_split in metrics.items()}

In [12]:
# One row per split ('train', then 'test') holding the unweighted mean of each
# metric across all products.
summary_data_2 = [
    [
        'train_2',
        dataset,
        np.mean([metrics['train_2'][dataset][p]['ROC AUC'] for p in products]),
        np.mean([metrics['train_2'][dataset][p]['F1 Score'] for p in products]),
    ]
    for dataset in ['train', 'test']
]

# Tabulate the averages and print them without the index column.
summary_df_2 = pd.DataFrame(
    summary_data_2,
    columns=['Dataset', 'Type', 'Avg ROC AUC', 'Avg F1 Score'],
)
print("\nEvaluated Model on Dataset: train_2")
print(summary_df_2.to_string(index=False))



Evaluated Model on Dataset: train_2
Dataset  Type  Avg ROC AUC  Avg F1 Score
train_2 train     0.885926      0.111536
train_2  test     0.883623      0.212467


## Pickle the model

In [32]:
# Persist the per-product evaluation metrics.
with open('multi_label_metrics.pkl', 'wb') as metrics_file:
    pickle.dump(metrics_dict, metrics_file)

# Persist the averaged summary table.
with open('summary_df.pkl', 'wb') as summary_file:
    pickle.dump(summary_df_2, summary_file)

# Persist the fitted classifiers themselves (a dict keyed by product name).
# Without this the models exist only in kernel memory and cannot be deployed.
# NOTE: unpickle with the same scikit-learn version that was used for training.
with open('trained_models.pkl', 'wb') as models_file:
    pickle.dump(trained_models, models_file)

In [None]:
def _read_pickle(path):
    """Open `path` in binary mode and return the unpickled object.

    Only use on files this notebook wrote itself: unpickling untrusted data
    can execute arbitrary code.
    """
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Load pickle files
loaded_metrics = _read_pickle('multi_label_metrics.pkl')
loaded_summary_df = _read_pickle('summary_df.pkl')

# Print results
print("Loaded Metrics for Individual Products:")
print(loaded_metrics)
print("\n Summary DataFrame:")
print(loaded_summary_df.to_string(index=False))

Loaded Metrics for Individual Products:
{'train_2': defaultdict(<class 'dict'>, {'train': {'savings_acct': {'ROC AUC': 0.8709668741130299, 'F1 Score': 0.0, 'Confusion Matrix': array([[5756696,       0],
       [    590,       0]], dtype=int64)}, 'guarantees': {'ROC AUC': 0.9693476237983898, 'F1 Score': 0.0, 'Confusion Matrix': array([[5757167,       0],
       [    119,       0]], dtype=int64)}, 'current_acct': {'ROC AUC': 0.7465365034040649, 'F1 Score': 0.789796652125073, 'Confusion Matrix': array([[1154329, 1042230],
       [ 556769, 3003958]], dtype=int64)}, 'derivada_acct': {'ROC AUC': 0.8791666642562761, 'F1 Score': 0.0, 'Confusion Matrix': array([[5755008,       0],
       [   2278,       0]], dtype=int64)}, 'payroll_acct': {'ROC AUC': 0.8638669338040359, 'F1 Score': 0.0008201085725826688, 'Confusion Matrix': array([[5430634,      85],
       [ 326433,     134]], dtype=int64)}, 'junior_acct': {'ROC AUC': 0.9995948324888868, 'F1 Score': 0.8910813874404896, 'Confusion Matrix': arra