# Introduction
Source: https://www.kaggle.com/competitions/elo-merchant-category-recommendation/data

### Table of Contents
- [Libraries](#libraries)
- [Utils](#utils)
- [Datasets](#datasets)
- [Machine Learning](#custom-model)

### Libraries <a id="libraries"></a>

In [1]:
import pandas as pd
from pandas import DataFrame
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import random
%matplotlib inline

# Tools
import math
import datetime
from typing import List, Union

# ML Tools
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV
from sklearn.metrics import mean_squared_error

# Regression Models
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
import lightgbm as lgb
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import RFE

# CONSTANTS
SEED = 123
TEST_PERC = 0.05
INPUT_ELO_DIR = '/kaggle/input/elo-merchant-category-recommendation'
INPUT_PREPROCESSED_DIR = '/kaggle/input/cz4041-preprocessed'

import warnings
warnings.filterwarnings("ignore")

import unittest
import threading

np.random.seed(400)
random.seed(300)

### Utilts <a id="utils"></a>

In [2]:
def summarizeDF(df:DataFrame)->DataFrame:
    """This function shows a basic summary of the given dataframe.
    
    Parameters
    ----------
    df: pandas DataFrame
    This specifies the dataframe to be summarized.
    
    Returns
    -------
    pandas DataFrame: This is a table of summary of the given dataset.
    """    
    variables, dtypes, count, unique, missing, pc_missing = [], [], [], [], [], []
    
    for item in df.columns:
        variables.append(item)
        dtypes.append(df[item].dtype)
        count.append(len(df[item]))
        unique.append(len(df[item].unique()))
        missing.append(df[item].isna().sum())
        pc_missing.append(round((df[item].isna().sum() / len(df[item])) * 100, 2))

    output = pd.DataFrame({
        'column_name': variables, 
        'dtype': dtypes,
        'count': count,
        'unique': unique,
        'missing': missing, 
        'percentage_missing_data': pc_missing
    })    
        
    return output

def preprocess_data(df:DataFrame=None)->DataFrame:
    """This function preprocess the data into a specific form for the computation.
    Given a DataFrame (df), impute with mode.
    
    Parameters
    ----------
    df: pandas DataFrame
    This specifies the data to be preprocessed.
    
    Returns
    -------
    DataFrame: This specifies the preprocessed DataFrame.
    """
    if df is None:
        raise Exception("Expected a DataFrame, no DataFrame supplied.")
    
    df_copy = df.copy()
    for col in df.columns[df.isnull().any()]:
        df_copy[col].fillna(df_copy['card_id'].map(df_copy.groupby('card_id')[col].apply(lambda x: x.mode().iloc[0] if not x.isnull().all() else np.nan)).fillna(df_copy[col].mode().iloc[0]), inplace=True)

    return df_copy

def feature_engineering(df:DataFrame=None)->DataFrame:
    """This function perform feature engineering on the input Data"""
    
    def get_new_columns(name:str, aggs:list)->list: # Nested function for feature engineering
        """This function creates new column names for the aggregation of the features."""
        return [name + '_' + k + '_' + agg for k in aggs.keys() for agg in aggs[k]]
    
    # Make copy of df
    df_historical_transactions_copy = df.copy()
 
    # Convert DT columns to Pandas DT
    df_historical_transactions_copy['purchase_date'] = pd.to_datetime(df_historical_transactions_copy['purchase_date'])
 
    # Feature Engineer columns from purchase_date
    df_historical_transactions_copy['year'] = df_historical_transactions_copy['purchase_date'].dt.year
    df_historical_transactions_copy['weekofyear'] = df_historical_transactions_copy['purchase_date'].dt.isocalendar().week
    df_historical_transactions_copy['month'] = df_historical_transactions_copy['purchase_date'].dt.month
    df_historical_transactions_copy['dayofweek'] = df_historical_transactions_copy['purchase_date'].dt.dayofweek
    df_historical_transactions_copy['weekend'] = (df_historical_transactions_copy.purchase_date.dt.weekday >=5).astype(int)
    df_historical_transactions_copy['hour'] = df_historical_transactions_copy['purchase_date'].dt.hour
 
    # Encode Binary Features
    df_historical_transactions_copy['authorized_flag'] = df_historical_transactions_copy['authorized_flag'].map({"Y":1, 'N':0})
    df_historical_transactions_copy['category_1'] = df_historical_transactions_copy['category_1'].map({'Y':1, 'N':0})
 
    # Feature Engineer Month Diff/Lag
    df_historical_transactions_copy['month_diff'] = ((datetime.datetime.today() - df_historical_transactions_copy['purchase_date']).dt.days)//30
    df_historical_transactions_copy['month_diff'] += df_historical_transactions_copy['month_lag']
    
    # Getting Centrality of the Data
    aggs = {}
    for col in ['month','hour','weekofyear','dayofweek','year', 'state_id','subsector_id']:
        aggs[col] = ['nunique']
 
    # Feature Engineering using Univariate Analysis
    aggs['authorized_flag'] = ['sum', 'mean']
    aggs['card_id'] = ['size']
    aggs['category_1'] = ['sum', 'mean']
    aggs['installments'] = ['sum','max','min','mean','var']
    aggs['month_lag'] = ['max','min','mean','var']
    aggs['purchase_amount'] = ['sum','max','min','mean','var']
    aggs['purchase_date'] = ['max','min']
    aggs['month_diff'] = ['mean']
    aggs['weekend'] = ['sum', 'mean']
 
    for col in ['category_2','category_3']:
        df_historical_transactions_copy[col+'_mean'] = df_historical_transactions_copy.groupby([col])['purchase_amount'].transform('mean')
        aggs[col+'_mean'] = ['mean']    
 
    new_columns = get_new_columns('hist',aggs)
    
    # Group Aggregations by card_id
    df_historical_transactions_copy_group = df_historical_transactions_copy.groupby('card_id').agg(aggs)
 
    # Remove Multilevel Indexing with New Column Names
    df_historical_transactions_copy_group.columns = new_columns
    
    # Reset Index
    df_historical_transactions_copy_group.reset_index(drop=False,inplace=True)
    
    # Cast variable to pandas Datetime
    df_historical_transactions_copy_group['hist_purchase_date_max'] = pd.to_datetime(df_historical_transactions_copy_group['hist_purchase_date_max'])
    df_historical_transactions_copy_group['hist_purchase_date_min'] = pd.to_datetime(df_historical_transactions_copy_group['hist_purchase_date_min'])

    return df_historical_transactions_copy_group

def merge_data(key:str=None, dfs:List[DataFrame]=None)->DataFrame:
    """This function takes in multiple dataframes and performs a left outer join on a key.
    
    Parameters
    ----------
    key: str
    This species the joining key.
    
    dfs: list of pandas DataFrame
    This specifies the list of DataFrames to perform left outer join based on a key.
    
    Returns
    -------
    pandas DataFrame: This specifies the resultant DataFrame from the merging operation.
    """
    
    # Sanity Check
    if key is None:
        raise Exception("Expected a key, no key supplied.")
        
    if not isinstance(key, str):
        raise Exception(f"Expected type str for key, {type(key)} was supplied.")
    
    if dfs is None or not len(dfs) == 2:
        raise Exception("Expected at least two DataFrame.")
        
    if any(type(x) != DataFrame for x in dfs):
        raise Exception("At least one DataFrame is not the correct DataType.")
        
    # Iterate through DataFrames to perform merge operation
    df_res = dfs[0]
    
    for df in dfs[1:]:
        df_res = pd.merge(left=df_res, right=df, how='left', left_on=key, right_on=key)
    
    return df_res

def zhenjie_miracle(df:DataFrame)->DataFrame:
    """This function perform feature engineering on purchase_max and purchase_min and one-hot encoding on the 
    input Data which must be a merged dataframe of train dataset and trans (hist and new) dataset after running 
    feature_engineering function"""
    
    # Make copy of df
    df_historical_transactions_copy_group = df.copy()

    brazil_holiday_list=[ 
            '01-01-17', '14-02-17', '28-08-17', '14-04-17', '16-04-17', '21-04-17',
            '01-05-17', '15-06-17', '07-09-17', '12-10-17', '02-11-17', '15-11-17', 
            '24-12-17', '25-12-17', '31-12-17',
            '01-01-18', '14-02-18', '28-08-18', '14-04-18', '16-04-18', '21-04-18',
            '01-05-18', '15-06-18', '07-09-18', '12-10-18', '02-11-18', '15-11-18', 
            '24-12-18', '25-12-18', '31-12-18'
      ]
    df_historical_transactions_copy_group['purchase_max_is_holiday'] = df_historical_transactions_copy_group['hist_purchase_date_max'].isin(brazil_holiday_list).astype(int)
    df_historical_transactions_copy_group['purchase_min_is_holiday'] = df_historical_transactions_copy_group['hist_purchase_date_min'].isin(brazil_holiday_list).astype(int)
    
    df_historical_transactions_copy_group_dummies = pd.get_dummies(df_historical_transactions_copy_group['feature_1'], prefix='feature_1', drop_first=True)
    df_historical_transactions_copy_group = pd.concat([df_historical_transactions_copy_group, df_historical_transactions_copy_group_dummies], axis=1)
    df_historical_transactions_copy_group_dummies = pd.get_dummies(df_historical_transactions_copy_group['feature_2'], prefix='feature_2', drop_first=True)
    df_historical_transactions_copy_group = pd.concat([df_historical_transactions_copy_group, df_historical_transactions_copy_group_dummies], axis=1)

    return df_historical_transactions_copy_group


def pengaik_miracle(df:DataFrame=None)->DataFrame:
    
    """This function perform feature engineering on average monthly purchase amount raw of positive month lags 
    over that of negative. It also performs feature engineering on the ratio of purchase amount raw of 
    month_lag=i/month_lag=i-1 for each card_id and returns the average ratio as a column
    input Data which must be a concat dataframe of trans (hist and new) dataset """
    
    transactions_copy = df.copy()
    
    # Reverse purchase_amount
    transactions_copy['purchase_amount_raw'] = np.round(transactions_copy['purchase_amount'] / 0.00150265118 + 497.06, 2)

    # Group transactions_copy by card_id and month_lag
    grouped_transactions_copy = transactions_copy.groupby(['card_id', 'month_lag']).agg({'purchase_amount_raw': 'mean'}).reset_index()

    # Separate transactions_copy into two groups based on month_lag
    lag_le_0 = grouped_transactions_copy[grouped_transactions_copy['month_lag'] <= 0]
    lag_gt_0 = grouped_transactions_copy[grouped_transactions_copy['month_lag'] > 0]

    # Calculate the monthly average purchase amount for each group
    lag_le_0_monthly_average_raw = lag_le_0.groupby('card_id')['purchase_amount_raw'].mean().reset_index().rename(columns={'purchase_amount_raw': 'monthly_average_purchase_amount_raw_for_month_lag_le_0'})
    lag_gt_0_monthly_average_raw = lag_gt_0.groupby('card_id')['purchase_amount_raw'].mean().reset_index().rename(columns={'purchase_amount_raw': 'monthly_average_purchase_amount_raw_for_month_lag_gt_0'})

    # Merge the new columns with the original transactions_copy dataframe
    transactions_copy = transactions_copy.merge(lag_le_0_monthly_average_raw, on='card_id', how='left')
    transactions_copy = transactions_copy.merge(lag_gt_0_monthly_average_raw, on='card_id', how='left')

    transactions_copy['ratio_between_ave_monthly_purchase_raw_for_positive_and_negative'] = transactions_copy['monthly_average_purchase_amount_raw_for_month_lag_gt_0'] / transactions_copy['monthly_average_purchase_amount_raw_for_month_lag_le_0']

    # Find the minimum month_lag for each card_id and set the index to 'card_id'
    min_month_lag_per_card = transactions_copy.groupby('card_id', as_index=False)['month_lag'].min().set_index('card_id')

    # Fill in missing month_lag values for each card_id
    unique_card_ids = transactions_copy['card_id'].unique()
    min_month_lag = transactions_copy['month_lag'].min()
    max_month_lag = transactions_copy['month_lag'].max()

    complete_data = []

    for card_id in unique_card_ids:
        # Use .loc[] accessor to look up the minimum month_lag for each card_id
        card_min_month_lag = min_month_lag_per_card.loc[card_id]['month_lag']
        for month_lag in range(card_min_month_lag, max_month_lag + 1):
            complete_data.append([card_id, month_lag, 0])

    complete_transactions_copy = pd.DataFrame(complete_data, columns=['card_id', 'month_lag', 'purchase_amount_raw'])

    # Compute the purchase_amount_raw sum for each card_id and month_lag combination
    grouped_transactions_copy = transactions_copy.groupby(['card_id', 'month_lag'], as_index=False)['purchase_amount_raw'].sum()

    # Merge the complete_transactions_copy dataframe with the grouped_transactions_copy dataframe
    merged_transactions_copy = pd.merge(complete_transactions_copy, grouped_transactions_copy, on=['card_id', 'month_lag'], how='left', suffixes=('', '_y'))
    merged_transactions_copy['purchase_amount_raw'] = merged_transactions_copy['purchase_amount_raw_y'].fillna(merged_transactions_copy['purchase_amount_raw'])

    # Calculate the ratio of purchase_amount_raw for each month_lag=i/month_lag=i-1
    merged_transactions_copy['prev_month_purchase_amount'] = merged_transactions_copy.groupby('card_id')['purchase_amount_raw'].shift(1)
    merged_transactions_copy['ratio'] = np.where(merged_transactions_copy['prev_month_purchase_amount'] != 0, merged_transactions_copy['purchase_amount_raw'] / merged_transactions_copy['prev_month_purchase_amount'], np.nan)

    # Compute the average of these ratios for each card_id
    average_ratios = merged_transactions_copy.groupby('card_id', as_index=False)['ratio'].mean()

    # Handling division by zero cases by replacing np.inf with np.nan and then replacing np.nan with a suitable value (e.g., 1)
    average_ratios['ratio'] = average_ratios['ratio'].replace([np.inf, -np.inf], np.nan).fillna(1)

    # Merge average_ratios with transactions_copy DataFrame
    feature_engineered_transactions_copy = transactions_copy.merge(average_ratios, on='card_id', how='left')

    return feature_engineered_transactions_copy



ModelRegressor = Union[LinearRegression, DecisionTreeRegressor, RandomForestRegressor]
def feature_selection(approach:str="RFE", 
                      k:int=10, 
                      train:DataFrame=None, 
                      test:DataFrame=None,
                      model:ModelRegressor=None)->List[str]:
    """This function performs feature selection based on the user's choice of approach.
    
    Usage
    -----
    >> features = feature_selection(approach="LGBM", train=X_train, test=y_train)
    """
    if approach == 'LGBM':
        # LGTM Regressor to pick out important features
        gbm = lgb.LGBMRegressor()
        gbm.fit(train, test)

        # Feature Important Viz
        fea_imp_ = pd.DataFrame({'variable':train.columns, 'feature_importance':gbm.feature_importances_})
        fea_imp_sorted = fea_imp_.sort_values(by='feature_importance', ascending=False)
        return list(fea_imp_sorted[:k]['variable'])
    if approach == 'RFE':
        rfe = RFE(estimator=model, n_features_to_select=k)
        rfe = rfe.fit(train, test)

        # summarize the ranking of the attributes
        fea_rank_ = pd.DataFrame({'variable': train.columns, 'feature_importance':rfe.ranking_})
        fea_rank_sorted = fea_rank_.sort_values(by='feature_importance', ascending=False)
        return list(fea_rank_sorted[:k]['variable'])

    
def build_train_test_sets(df:DataFrame=None, 
                          features:List[str]=None, 
                          target:str=None, 
                          verbose:int=0, 
                          **kwargs:dict)->List[DataFrame]:
    """This function splits the given dataframe into train and test data.
    
    Parameters
    ----------
    Args:
        df: DataFrame
        This specifies the source DataFrame.

        features: list of str
        This list containing a str-type elements specifies the name of the features.

        target: str
        This specifies the target variable.
        
        verbose: int-type
            This species the verbosity of the function.
    
    Kwargs
        A dict mapping the corresponding parameters for scikit learn model selection. 
        
        {"test_size": 0.05, "seed": None}
    
        If a key from the keys argument is missing from the settings, the default will be used.
        
    Returns
    -------
    list: This species the list containing X_train, X_test, y_test and y_train DataFrame.
    """
    # Default 
    model_params = {
        'test_size': 0.05,
        "seed": None
    }
    
    # Sanity Check
    if df is None:
        raise ValueError("Expected a DataFrame, no DataFrame supplied.")
        
    if features is None:
        raise Exception("Expected a features list, no features list supplied.")
    
    if not isinstance(features, list):
        raise Exception(f"Expected list datatype for features, {type(features)} was supplied.")
        
    if not isinstance(target, str):
        raise Exception(f"Expected str datatype for target, {type(target)} was supplied.")
        
    # Check for Kwargs
    if "test_size" in kwargs:
        model_params['test_size'] = kwargs["test_size"]
    if "seed" in kwargs:
        model_params['seed'] = kwargs["seed"]
        
    seed = "No Seed" if model_params["seed"] is None else model_params['seed']
    if verbose != 0:
        print(f"***Parameters for Model Selection***\ntest_size {model_params['test_size']}\nseed: {seed}\n")
    
    if seed != "No Seed":
        X_train, X_test, y_train, y_test = train_test_split(df[features], df[[target]], test_size=model_params['test_size'])
        return X_train, X_test, y_train, y_test
    else:
        X_train, X_test, y_train, y_test = train_test_split(df[features], df[[target]], test_size=model_params['test_size'], random_state=model_params['seed'])
        return X_train, X_test, y_train, y_test
    
def train_eval_model(model:ModelRegressor,  X_train, X_test, y_train, y_test, name)->DataFrame:
    """This function trains and evaluates the model.
    By default, the score used it RMSE.

    Parameters
    ----------

    Args:
        model: model
        This species the model to use for training and evaluation.
       
    Kwargs
        A dict mapping the corresponding parameters for training and test data. 
        
        {"X_train": ..., "X_test": ..., "y_train": ..., "y_test": ...}
    
    Return 
    ------
    dataframe: This specifies resets from testing the model.
    """
    # Train
    model.fit(X_train, y_train)

    # Pred
    y_pred = model.predict(X_test)

    # Eval
    print(f"{name} Score:", math.sqrt(mean_squared_error(y_test, y_pred)))
    
    return y_pred


def Bayesian_Optimization(objective_function, parameters_dict, n_init_random_explorations=10, n_iter = 50):
    """Find the hyperparameters that maximizes a given objective (e.g test result)
    Parameters
    ----------
    Args:
        objective_function: function
        function that outputs a value, which BO will try to maximize. 

        parameters_dict: Dictionary
        Contain hyperparameters that you want to optimize. Key is hyperparameter name, value is (min, max) value of that hyperparameters
        
        n_init_random_explorations: int
        Number of random sets of hyperparameters to try. <n_init_random_explorations> random hp sets are explored before <n_iter> systematic explorations are run
        
        n_iter: int
        Number of iterations to run
        
    Returns
    -------
    Dict: Best set of hyperparameters


    Usage
    -------
    The key in pbounds must match the parameters in objective_function!!!

    def objective_function(n, beta, gamma) -> float:
      n = int(n)
      model = BetaVae(
          n_latent=n,
          beta=beta,
          n_chan=N_CHAN,
          input_d=INPUT_DIM,
          batch=BATCH,
          gamma = gamma,
          )
      model.train_self(
          data_path=TRAIN_PATH,
          epochs=1,
          weights_file=f'bvae_n{n}_b{beta}_{"bw" if N_CHAN == 1 else ""}_'
                      f'{INPUT_DIM[0]}x{INPUT_DIM[1]}.pt')
      return model.test(TEST_PATH, iters=1)

    parameters_dict = {'n': (5, 200), 'beta': (0.1,30), 'gamma': (0.001, 30)}

    def Bayesian_Optimization(objective_function, parameters_dict, n_iter = 50):
      optimizer = BayesianOptimization(
          f=objective_function,
          pbounds= parameters_dict,
          verbose=2,
          random_state=1)
    
    best_hypers = Bayesian_Optimization(objective_function, parameters_dict, n_iter = 50)
    """
    optimizer = BayesianOptimization(
        f=objective_function, #define before this function
        pbounds=parameters_dict, 
        random_state=1)
    optimizer.maximize(init_points=n_init_random_explorations, n_iter=n_iter)
    print('#################################################################')
    print(f'Found Network with Optimal target result of {optimizer.max["target"]}')
    print(f'Parameters: {optimizer.max["params"]}')
    print('#################################################################')
    return optimizer.max["params"]


def createData(df:DataFrame=None, df_t:DataFrame=None)->DataFrame:
    """This function transform the given datasets into a suitable dataset for training/testing.
    
    algorithm
        0. (optional) Subset the data based on card_ids in train/test (improve performance)
        1. PA Miracle
        2. Impute with Mode
        3. Feature Engineering
        4. Merge df_transactions from 1 - 3 to train/test (IMPORTANT)
        5. ZJ Miracle
        6. Remove unnecessary columns
    endalgorithm
    
    Parameters
    ----------
    df: DataFrame
    This specifies the dataframe containing transactions details. Ideally, this should be a combination of 
    new_historical and historical transactions datagframes.
    
    df_t: DataFrame
    This specifies the train or test dataframe.
    
    Returns
    -------
    DataFrame: Valid DataFrame after the preprocessing and imputations
    """
    # Run Pengaik's Miracle
    df_transactions_t = pengaik_miracle(df)
    print("Card_ID Uniqueness (PA Miracle)", len(df_transactions_t['card_id'].unique()) == len(df_t['card_id'].unique()))
    
    # Impute with Mode
    df_impute_mode = preprocess_data(df=df_transactions_t)
    print("Card_ID Uniqueness (Impute Mode)", len(df_impute_mode['card_id'].unique()) == len(df_t['card_id'].unique()))
    
    # Store PA's Ratios
    df_ratios = df_impute_mode[['card_id',
       'monthly_average_purchase_amount_raw_for_month_lag_le_0',
       'monthly_average_purchase_amount_raw_for_month_lag_gt_0',
       'ratio_between_ave_monthly_purchase_raw_for_positive_and_negative',
       'ratio']].drop_duplicates()

    # Sanity Check for n_rows and uniqueness of card_id
    print("Card_ID Uniqueness", len(df_ratios['card_id'].unique()) == len(df_test['card_id'].unique()))
    
    # Sanity Check for n_rows and uniqueness of card_id
    print("Card_ID Uniqueness (PA Ratio)", len(df_ratios['card_id'].unique()) == len(df_test['card_id'].unique()))
    
    # Feature Engineering
    df_aggregated_cols = feature_engineering(df_impute_mode)
    print("Card_ID Uniqueness (Feature Engineering)", len(df_aggregated_cols['card_id'].unique()) == len(df_t['card_id'].unique()))
    
    # Merge PA's Ratio
    df_aggregated_cols = merge_data('card_id', [df_ratios, df_aggregated_cols])

    # Sanity Check for n_rows and uniqueness of card_id
    print("Card_ID Uniqueness (Merge PA Ratio)", len(df_aggregated_cols['card_id'].unique()) == len(df_test['card_id'].unique()))

    # Merge Transactions and T
    df_t_merge = merge_data('card_id', [df_t, df_aggregated_cols])
    print("Card_ID Uniqueness (Merging)", len(df_t_merge['card_id'].unique()) == len(df_t['card_id'].unique()))
    
    # Execute Zhen Jie's Miracle
    df_t_merge = zhenjie_miracle(df_t_merge)
    print("Card_ID Uniqueness (ZJ Miracle)", len(df_t_merge['card_id'].unique()) == len(df_t['card_id'].unique()))
    
    # Impute Missing Data
    df_t_merge.fillna("2017-01", inplace=True)

    # Engineered by Zhen Jie so Remove
    df_t_merge.drop(columns=['hist_purchase_date_max', 'hist_purchase_date_min'], inplace=True)
    print("Card_ID Uniqueness (Imputations and Drop)", len(df_test_merge['card_id'].unique()) == len(df_test['card_id'].unique()))
    
    # Final Sanity Check
    print("***FINAL***")
    print('n_rows:', format(df_t_merge.shape[0], "_"), end='\n\n')
    print("Columns:\n", ", ".join(df_t_merge.columns), sep='')
    
    return df_t_merge

### Datasets <a id="datasets"></a>

1. Customer has a ```card_id``` as uuid.
2. Each customer can make at least one transaction to merchants.
3. Merchant has ```merchant_id``` as uuid.

In [3]:
# Historical Transactions
tp = pd.read_csv(f'{INPUT_ELO_DIR}/historical_transactions.csv', iterator=True, chunksize=2_000_000)  # gives TextFileReader, which is iterable with chunks of 1000 rows.
df_historical_transactions = pd.concat(tp, ignore_index=True) 

# New Historical Transactions
tp = pd.read_csv(f'{INPUT_ELO_DIR}/new_merchant_transactions.csv', iterator=True, chunksize=2_000_000)  # gives TextFileReader, which is iterable with chunks of 1000 rows.
df_new_historical_transactions = pd.concat(tp, ignore_index=True) 

# Train Data
df_train = pd.read_csv(f'{INPUT_ELO_DIR}/train.csv')

# Engineered Train Data 
tp = pd.read_csv(f'{INPUT_PREPROCESSED_DIR}/output.csv',index_col=0, iterator=True, chunksize=5_000_000)  # gives TextFileReader, which is iterable with chunks of 1000 rows.
df_train_merge = pd.concat(tp, ignore_index=True)

# Machine Learning - Build Custom-Model <a id="custom-model"></a>
From the previous notebook, experiments of binary classifier yield the following results:
- Best sampling strategy: SMOTE,R-Under(1:4),R-Over
- Best binary classifier: Bag-LGBM

As a recap of the previous notebook, the model architecture is depicted as follows:

<img src="./model_architecture.png"/>

There are 6 steps to achieve this:
1. Try sampling methods for binary classifier
2. Explore binary classification models
3. Explore regression models
4. Hyperparameter tuning for the selected binary classification model
5. Hyperparameter tuning for the regression models
6. Explore and hyperparameter-tune meta-model

Some setup before running Step 4 and beyond

In [4]:
# Import libraries used in this case
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression, LinearRegression, Lasso, Ridge
from sklearn.kernel_ridge import KernelRidge
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, GradientBoostingClassifier, BaggingClassifier
from sklearn.neural_network import MLPRegressor
from lightgbm import LGBMClassifier
from bayes_opt import BayesianOptimization

# Imbalanced Learning (Sampling)
import imblearn
from imblearn.under_sampling import RandomUnderSampler, TomekLinks, NearMiss
from imblearn.over_sampling import RandomOverSampler, SMOTE

# Metrics
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score, ConfusionMatrixDisplay, confusion_matrix
from sklearn.model_selection import KFold, cross_val_score

# Utilities
from collections import Counter

df_train_merge['outlier'] = df_train_merge['target'] < -20
features = list(df_train_merge.drop(columns=['target', 'card_id', 'outlier']).columns)

In [5]:
features_impt_clf = feature_selection(k=40, train=df_train_merge[features], test=df_train_merge['target'], model=Ridge(random_state=SEED))
print("********* Top 40 features for bin clf *********")
print(features_impt_clf)

def custom_classifier_train(df, bin_classifier, samplers, rare_threshold, show_matrix=False, cv=True, verbose=True):
    # Prepare data
    features = list(df.drop(columns=['target', 'card_id', 'outlier']).columns)
    target = 'outlier'
    scores = []
    
    def train_and_score(X_train, y_train, X_test, y_test):
        X_res, y_res = X_train, y_train
        for sampler in samplers:
            X_res, y_res = sampler.fit_resample(X_res, y_res)
        bin_classifier.fit(X_res, y_res)

        # Visualise the performance of the Classifier x Sampler
        y_pred = bin_classifier.predict_proba(X_test)[:, 1] > rare_threshold

        if show_matrix:
            cm = confusion_matrix(y_test, y_pred)
            ConfusionMatrixDisplay(cm).plot()
        return f1_score(y_test, y_pred)
    
    # Model that predicts a data point is rare    
    X, y = df[features_impt_clf], df[[target]]
    if not cv:
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=SEED)
        score = train_and_score(X_train, y_train, X_test, y_test)
    else:
        kf = KFold(n_splits=5, shuffle=True, random_state=SEED)
        for i, (train_idx, val_idx) in enumerate(kf.split(X, y)):
            if verbose:
                print(f"Fold {i}")
            X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
            X_test, y_test = X.iloc[val_idx], y.iloc[val_idx]
            score = train_and_score(X_train, y_train, X_test, y_test)
            scores.append(score)
        score = np.mean(scores)
        
    return score

********* Top 40 features for bin clf *********
['hist_purchase_amount_var', 'monthly_average_purchase_amount_raw_for_month_lag_le_0', 'ratio', 'hist_installments_var', 'hist_purchase_amount_sum', 'hist_purchase_amount_max', 'hist_weekend_mean', 'hist_month_lag_mean', 'hist_month_lag_var', 'hist_purchase_amount_min', 'hist_purchase_amount_mean', 'hist_month_diff_mean', 'hist_weekend_sum', 'hist_category_2_mean_mean', 'hist_month_lag_max', 'hist_category_3_mean_mean', 'purchase_max_is_holiday', 'purchase_min_is_holiday', 'feature_1_2', 'feature_1_3', 'feature_1_4', 'feature_1_5', 'feature_2_2', 'hist_month_lag_min', 'feature_1', 'feature_2', 'hist_state_id_nunique', 'feature_3', 'monthly_average_purchase_amount_raw_for_month_lag_gt_0', 'ratio_between_ave_monthly_purchase_raw_for_positive_and_negative', 'hist_month_nunique', 'hist_hour_nunique', 'hist_weekofyear_nunique', 'hist_dayofweek_nunique', 'hist_year_nunique', 'hist_subsector_id_nunique', 'hist_installments_mean', 'hist_authorize

In [6]:
sor = RandomOverSampler(random_state=SEED)
sur_random_half = RandomUnderSampler(random_state=SEED)   # Set rare:non-rare as 1:1
sur_random_fifth = RandomUnderSampler(sampling_strategy={0: df_train_merge.outlier.sum()*7}, random_state=SEED)   # Set rare:non-rare as 1:4
su_tl = TomekLinks()
so_smote = SMOTE()
su_nm = NearMiss()

samplers = [[sor], [sur_random_half], [sur_random_fifth], [su_tl], [so_smote], [su_nm], [so_smote, sur_random_fifth, sor], [so_smote, sur_random_half, sor],
           [so_smote, su_tl]]
sampler_names = ['R-Over', 'R-Under(1:1)', 'R-Under(1:4)', 'Tomek', 'SMOTE', 'NearMiss', 'SMOTE,R-Under(1:4),R-Over', 
                 'SMOTE,R-Under(1:1),R-Over', 'SMOTE,Tomek']

## 4. HP Tuning for Binary Classifier

In [8]:
def custom_classifier_objfunc(rare_threshold, n_estimators, max_depth, num_leaves):
    # RF, SMOTE oversampler performs the best over any combination of BinClassifier x Sampler x Weights (upweighted or not)
    clf_baglgb = BaggingClassifier(LGBMClassifier(max_depth=int(max_depth), num_leaves=int(num_leaves), random_state=SEED),
                                   n_estimators=int(n_estimators),
                                   random_state=SEED)
    
    sor = RandomOverSampler(random_state=SEED)
    sur_random_fifth = RandomUnderSampler(sampling_strategy={0: df_train_merge.outlier.sum()*7}, random_state=SEED)   # Set rare:non-rare as 1:4
    so_smote = SMOTE()
    samplers = [so_smote, sur_random_fifth, sor]
    
    # Arguments
    df = df_train_merge
    bin_classifier = clf_baglgb
    
    score = custom_classifier_train(df, bin_classifier, samplers, rare_threshold, verbose=False)
    return score


pbounds = { 
    'rare_threshold': (0.05, 0.85),
    'n_estimators': (10, 100),
    'max_depth': (2, 10),
    'num_leaves': (2, 50)
}

# best_params_clf = Bayesian_Optimization(custom_classifier_objfunc, pbounds, n_init_random_explorations=5, n_iter=10)  # COMPLETED
best_params_clf = {'max_depth': 5.1741397938453595, 'n_estimators': 58.493506060302124, 'num_leaves': 22.121336691358152, 'rare_threshold': 0.5981756003174076}

In [9]:
# Train the Classifier Model
bin_classifier = BaggingClassifier(LGBMClassifier(max_depth=int(best_params_clf['max_depth']), num_leaves=int(best_params_clf['num_leaves']), random_state=SEED),
                                   n_estimators=int(best_params_clf['n_estimators']), random_state=SEED)

# Results from previous version: best_samplers
best_samplers = [so_smote, sur_random_fifth, sor]

X_res, y_res = df_train_merge[features_impt_clf], df_train_merge['outlier']
for sampler in best_samplers:
    X_res, y_res = sampler.fit_resample(X_res, y_res)
bin_classifier.fit(X_res, y_res)

BaggingClassifier(base_estimator=LGBMClassifier(max_depth=5, num_leaves=22,
                                                random_state=123),
                  n_estimators=58, random_state=123)

## 5. HP Tuning of the First-level Regression Models
The first cell is similar to the first two cells in Step 3 to have the necessary setup

In [10]:
# Create three datasets: to train regression on full dataset; concentrated outlier dataset; less concentrated
global df_rare
global df_non_rare
df_rare, df_non_rare = df_train_merge[df_train_merge['outlier'] == 1], df_train_merge[df_train_merge['outlier'] == 0]
n = int(0.2 * len(df_rare))

df_outlier_more = pd.concat([df_rare, df_non_rare.sample(n)])
df_outlier_less = pd.concat([df_non_rare, df_rare.sample(n)])

global features_impt_full
features_impt_full = feature_selection(approach='LGBM', k=35, train=df_train_merge[features], test=df_train_merge['target'])
print("*********************** Top 35 features for Regressors **************************")
print(features_impt_full)

*********************** Top 35 features for Regressors **************************
['hist_month_diff_mean', 'hist_month_lag_mean', 'hist_month_lag_var', 'ratio_between_ave_monthly_purchase_raw_for_positive_and_negative', 'hist_authorized_flag_mean', 'monthly_average_purchase_amount_raw_for_month_lag_gt_0', 'hist_category_1_sum', 'hist_category_1_mean', 'hist_weekend_mean', 'ratio', 'monthly_average_purchase_amount_raw_for_month_lag_le_0', 'hist_purchase_amount_max', 'hist_purchase_amount_min', 'hist_weekofyear_nunique', 'hist_installments_sum', 'hist_purchase_amount_var', 'hist_month_lag_max', 'hist_category_2_mean_mean', 'hist_purchase_amount_sum', 'hist_subsector_id_nunique', 'hist_category_3_mean_mean', 'hist_installments_mean', 'hist_installments_var', 'hist_card_id_size', 'hist_month_nunique', 'hist_purchase_amount_mean', 'hist_authorized_flag_sum', 'hist_hour_nunique', 'hist_weekend_sum', 'hist_month_lag_min', 'feature_2', 'feature_1', 'hist_state_id_nunique', 'feature_1_3', 'hist

In [11]:
def custom_reg_train(df, reg, features, cv=True, verbose=True):
    # Prepare data
#     features = list(df.drop(columns=['target', 'card_id', 'outlier']).columns)
    target = 'target'
    costs = []
    
    # Model that predicts a data point is rare    
    X, y = df[features], df[[target]]
    if not cv:
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=SEED)
        reg.fit(X_train, y_train)
        y_pred = reg.predict(X_test)
        cost = np.sqrt(mean_squared_error(y_test, y_pred))
    else:
        costs = np.sqrt(-cross_val_score(reg, X, y, cv=5, scoring='neg_mean_squared_error'))
        cost = costs.mean()
        
    return cost

As all 3 regressors should be LGBM, the same `custom_reg_objfunc()` is used

As the total tuning time of each regressor exceeds 12 hours, the hyperparams were obtained by having 3 notebooks that each tune a regressor. The line with `# COMPLETE` indicates the original line that is executed when that specific notebook is tuning that regressor, whereas the next line below that is the exact hyperparams that yield the best performance. For instance, the code to tune the regressor on concentrated rare points ends with the following two lines:
```py
# best_params_reg_rare = Bayesian_Optimization(custom_reg_objfunc, pbounds) # COMPLETED
best_params_reg_rare = {...REDACTED...}
```

In [15]:
def custom_reg_objfunc(num_leaves, learning_rate, n_estimators, max_depth, min_split_gain, min_child_weight):
    
    def train_and_eval(df_rare, df_non_rare):
        cost = np.NaN
        # Prepare data
        features = features_impt_full
        target = 'target'
        
        for i in range(5):
            df = pd.concat([df_rare, df_non_rare.sample(n)])
            df_outlier_less = pd.concat([df_non_rare, df_rare.sample(n)])
            
            X, y = df[features], df[[target]]
            costs = np.sqrt(-cross_val_score(reg, X, y, cv=5, scoring='neg_mean_squared_error'))
            cost = costs.mean()
        
        return cost
    
    reg = lgb.LGBMRegressor(
        num_leaves=int(num_leaves),
        learning_rate=learning_rate,
        n_estimators=int(n_estimators),
        max_depth=int(max_depth),
        min_split_gain=min_split_gain,
        min_child_weight=min_child_weight,
        random_state=SEED
    )

    # Arguments
    df = df_train_merge
    
    cost = train_and_eval(df_rare, df_non_rare)
    return -cost


pbounds = {
    'num_leaves': (5, 50),
    'learning_rate': (0.01, 0.5),
    'n_estimators': (100, 1000),
    'max_depth': (3, 10),
    'min_split_gain': (0.001, 0.1),
    'min_child_weight': (5, 50)
}

# best_params_reg_rare = Bayesian_Optimization(custom_reg_objfunc, pbounds) # COMPLETED
best_params_reg_rare = {'learning_rate': 0.015170978234846356, 'max_depth': 8.182998613590625, 'min_child_weight': 36.101012674105675, 'min_split_gain': 0.0666169341089749, 'n_estimators': 954.3342602764994, 'num_leaves': 11.971532436721898}

In [16]:
def custom_reg_objfunc(num_leaves, learning_rate, n_estimators, max_depth, min_split_gain, min_child_weight):
    
    def train_and_eval(df_rare, df_non_rare):
        cost = np.NaN
        # Prepare data
        features = features_impt_full
        target = 'target'
        
        for i in range(5):
            df = pd.concat([df_non_rare, df_rare.sample(n)])
            
            X, y = df[features], df[[target]]
        
            costs = np.sqrt(-cross_val_score(reg, X, y, cv=5, scoring='neg_mean_squared_error'))
            cost = costs.mean()
        
        return cost
    
    reg = lgb.LGBMRegressor(
        num_leaves=int(num_leaves),
        learning_rate=learning_rate,
        n_estimators=int(n_estimators),
        max_depth=int(max_depth),
        min_split_gain=min_split_gain,
        min_child_weight=min_child_weight,
        random_state=SEED
    )

    # Arguments
    df = df_train_merge
    
    cost = train_and_eval(df_rare, df_non_rare)
    return -cost


pbounds = {
    'num_leaves': (5, 50),
    'learning_rate': (0.01, 0.5),
    'n_estimators': (100, 1000),
    'max_depth': (3, 10),
    'min_split_gain': (0.001, 0.1),
    'min_child_weight': (5, 50)
}

# best_params_reg_non_rare = Bayesian_Optimization(custom_reg_objfunc, pbounds) # COMPLETE
best_params_reg_non_rare = {'learning_rate': 0.10180087793228383, 'max_depth': 3.240145102154466, 'min_child_weight': 46.87061090154842, 'min_split_gain': 0.008862420216352685, 'n_estimators': 592.576399719306, 'num_leaves': 48.80270066231908}

In [17]:
def custom_reg_objfunc(num_leaves, learning_rate, n_estimators, max_depth, min_split_gain, min_child_weight):
    
    def train_and_eval(df):
        cost = np.NaN
        # Prepare data
        features = features_impt_full
        target = 'target'
            
        X, y = df[features], df[[target]]
        
        costs = np.sqrt(-cross_val_score(reg, X, y, cv=5, scoring='neg_mean_squared_error'))
        cost = costs.mean()
        
        return cost
    
    reg = lgb.LGBMRegressor(
        num_leaves=int(num_leaves),
        learning_rate=learning_rate,
        n_estimators=int(n_estimators),
        max_depth=int(max_depth),
        min_split_gain=min_split_gain,
        min_child_weight=min_child_weight,
        random_state=SEED
    )

    # Arguments
    df = df_train_merge
    
    cost = train_and_eval(df)
    return -cost


pbounds = {
    'num_leaves': (5, 50),
    'learning_rate': (0.01, 0.5),
    'n_estimators': (100, 1000),
    'max_depth': (3, 10),
    'min_split_gain': (0.001, 0.1),
    'min_child_weight': (5, 50)
}

# best_params_reg_full = Bayesian_Optimization(custom_reg_objfunc, pbounds) # COMPLETE
best_params_reg_full = {'learning_rate': 0.01, 'max_depth': 10.0, 'min_child_weight': 32.80371387644972, 'min_split_gain': 0.001, 'n_estimators': 757.8955675847295, 'num_leaves': 23.642715185230898}

Train all regressors with their best hyperparams

In [18]:
reg_full = lgb.LGBMRegressor(
    num_leaves=int(best_params_reg_full['num_leaves']),
    learning_rate=best_params_reg_full['learning_rate'],
    n_estimators=int(best_params_reg_full['n_estimators']),
    max_depth=int(best_params_reg_full['max_depth']),
    min_split_gain=best_params_reg_full['min_split_gain'],
    min_child_weight=best_params_reg_full['min_child_weight'],
    random_state=SEED
)
reg_rare = lgb.LGBMRegressor(
    num_leaves=int(best_params_reg_rare['num_leaves']),
    learning_rate=best_params_reg_rare['learning_rate'],
    n_estimators=int(best_params_reg_rare['n_estimators']),
    max_depth=int(best_params_reg_rare['max_depth']),
    min_split_gain=best_params_reg_rare['min_split_gain'],
    min_child_weight=best_params_reg_rare['min_child_weight'],
    random_state=SEED
)
reg_non_rare = lgb.LGBMRegressor(
    num_leaves=int(best_params_reg_non_rare['num_leaves']),
    learning_rate=best_params_reg_non_rare['learning_rate'],
    n_estimators=int(best_params_reg_non_rare['n_estimators']),
    max_depth=int(best_params_reg_non_rare['max_depth']),
    min_split_gain=best_params_reg_non_rare['min_split_gain'],
    min_child_weight=best_params_reg_non_rare['min_child_weight'],
    random_state=SEED
)

reg_full.fit(df_train_merge[features_impt_full], df_train_merge['target'])
reg_rare.fit(df_outlier_more[features_impt_full], df_outlier_more['target'])
reg_non_rare.fit(df_outlier_less[features_impt_full], df_outlier_less['target'])

LGBMRegressor(learning_rate=0.10180087793228383, max_depth=3,
              min_child_weight=46.87061090154842,
              min_split_gain=0.008862420216352685, n_estimators=592,
              num_leaves=48, random_state=123)

## 6. Explore and HP Tune Meta Model

In [19]:
from sklearn.preprocessing import StandardScaler
rare_prob_scaler = StandardScaler()
overall_pred_scaler = StandardScaler()
low_rare_pred_scaler = StandardScaler()
high_rare_pred_scaler = StandardScaler()

# Prepare Dataset
rare_prob = bin_classifier.predict_proba(df_train_merge[features_impt_clf])[:, 1].reshape(-1,1)
overall_pred = reg_full.predict(df_train_merge[features_impt_full]).reshape(-1,1)
low_rare_pred = reg_rare.predict(df_train_merge[features_impt_full]).reshape(-1,1)
high_rare_pred = reg_non_rare.predict(df_train_merge[features_impt_full]).reshape(-1,1)


rare_prob_scaled = rare_prob_scaler.fit_transform(rare_prob).ravel()
overall_pred_scaled = overall_pred_scaler.fit_transform(overall_pred).ravel()
low_rare_pred_scaled = low_rare_pred_scaler.fit_transform(low_rare_pred).ravel()
high_rare_pred_scaled = high_rare_pred_scaler.fit_transform(high_rare_pred).ravel()

df_stack = pd.DataFrame({'low_conc_pred': low_rare_pred_scaled, 
                         'high_conc_pred': high_rare_pred_scaled, 
                         'overall_pred': overall_pred_scaled, 
                         'rare_prob': rare_prob_scaled,
                         'target': df_train_merge['target']
                        })
features_stack = df_stack.drop(['target'], axis=1).columns

In [20]:
# Individual models
reg_lr = LinearRegression()
reg_ridge = Ridge(random_state=SEED)
reg_lasso = Lasso(random_state=SEED)
reg_mlp = MLPRegressor(random_state=SEED, max_iter=300)
reg_gbm = lgb.LGBMRegressor()

# Prepare variables
regressors = [reg_lr, reg_ridge, reg_lasso, reg_mlp, reg_gbm]
model_names = ['Linear', 'Ridge', 'Lasso', 'MLP', 'LGBM']
dict_models = {reg_name: reg_object for reg_name, reg_object in zip(model_names, regressors)}

df = df_stack  # Need to change to latest
results = []

for reg in regressors:
    print(f"Using model {reg.__class__.__name__}")
    cost = custom_reg_train(df, reg, features_stack)
    results.append(cost)

df_exp_reg = pd.DataFrame({'RMSE': results}, index=model_names)
name_best_reg = df_exp_reg['RMSE'].idxmin()
best_reg = dict_models[name_best_reg]
print(f"*** Decision: Choose {name_best_reg} ***")
df_exp_reg

Using model LinearRegression
Using model Ridge
Using model Lasso
Using model MLPRegressor
Using model LGBMRegressor
*** Decision: Choose MLP ***


Unnamed: 0,RMSE
Linear,3.521088
Ridge,3.521088
Lasso,3.666426
MLP,3.481544
LGBM,3.541103


In [21]:
def custom_reg_objfunc(hidden_layer_sizes, max_iter, learning_rate_init):
    
    def train_and_eval(df):
        cost = np.NaN
        
        # Prepare data
        features = list(df.drop(columns=['target']).columns)
        target = 'target'
        
        X, y = df[features], df[[target]]
        costs = np.sqrt(-cross_val_score(reg, X, y, cv=5, scoring='neg_mean_squared_error'))
        cost = costs.mean()
        
        return cost
    
    reg = MLPRegressor(hidden_layer_sizes=int(hidden_layer_sizes), 
                       max_iter=int(max_iter), 
                       learning_rate_init=learning_rate_init, 
                       random_state=SEED)

    # Arguments
    df = df_stack
    
    cost = train_and_eval(df)
    return -cost


pbounds = {
    'hidden_layer_sizes': (5, 100),
    'max_iter': (1, 10),
    'learning_rate_init': (0.0001, 0.01)
}

# best_params_reg_meta = Bayesian_Optimization(custom_reg_objfunc, pbounds, n_init_random_explorations=10, n_iter=50) # COMPLETE
best_params_reg_meta = {'hidden_layer_sizes': 37.17855696044765, 'learning_rate_init': 0.002646738138804039, 'max_iter': 9.9545564395736}

|   iter    |  target   | hidden... | learni... | max_iter  |
-------------------------------------------------------------
| [0m1        [0m | [0m-3.494   [0m | [0m44.62    [0m | [0m0.007231 [0m | [0m1.001    [0m |
| [0m2        [0m | [0m-3.502   [0m | [0m33.72    [0m | [0m0.001553 [0m | [0m1.831    [0m |
| [95m3        [0m | [95m-3.486   [0m | [95m22.69    [0m | [95m0.003521 [0m | [95m4.571    [0m |
| [95m4        [0m | [95m-3.483   [0m | [95m56.19    [0m | [95m0.00425  [0m | [95m7.167    [0m |
| [0m5        [0m | [0m-3.493   [0m | [0m24.42    [0m | [0m0.008793 [0m | [0m1.246    [0m |
| [0m6        [0m | [0m-3.489   [0m | [0m68.69    [0m | [0m0.004231 [0m | [0m6.028    [0m |
| [0m7        [0m | [0m-3.488   [0m | [0m18.34    [0m | [0m0.002061 [0m | [0m8.207    [0m |
| [0m8        [0m | [0m-3.495   [0m | [0m96.98    [0m | [0m0.003203 [0m | [0m7.231    [0m |
| [0m9        [0m | [0m-3.496   [0m | [0m88.2

In [22]:
reg_meta = MLPRegressor(hidden_layer_sizes=int(best_params_reg_meta['hidden_layer_sizes']), 
                        max_iter=int(best_params_reg_meta['max_iter']), 
                        learning_rate_init=best_params_reg_meta['learning_rate_init'], 
                        random_state=SEED)
reg_meta.fit(df_stack[features_stack], df_stack['target'])

MLPRegressor(hidden_layer_sizes=56, learning_rate_init=0.0022311977431408187,
             max_iter=7, random_state=123)

# Generate Output for Test Dataset

In [23]:
def model_for_production(dataset):
    features = dataset.drop(columns=['card_id']).columns
    
    rare_prob = bin_classifier.predict_proba(dataset[features_impt_clf])[:, 1].reshape(-1,1)
    overall_pred = reg_full.predict(dataset[features_impt_full]).reshape(-1,1)
    low_rare_pred = reg_rare.predict(dataset[features_impt_full]).reshape(-1,1)
    high_rare_pred = reg_non_rare.predict(dataset[features_impt_full]).reshape(-1,1)


    rare_prob_scaled = rare_prob_scaler.transform(rare_prob).ravel()
    overall_pred_scaled = overall_pred_scaler.transform(overall_pred).ravel()
    low_rare_pred_scaled = low_rare_pred_scaler.transform(low_rare_pred).ravel()
    high_rare_pred_scaled = high_rare_pred_scaler.transform(high_rare_pred).ravel()

    df_stack = pd.DataFrame({'low_conc_pred': low_rare_pred_scaled, 
                             'high_conc_pred': high_rare_pred_scaled, 
                             'overall_pred': overall_pred_scaled, 
                             'rare_prob': rare_prob_scaled
                            })
    return reg_meta.predict(df_stack)

In [24]:
# Prepare Test Dataset
df_test_merge = pd.read_csv(f'{INPUT_PREPROCESSED_DIR}/test_merged.csv', index_col=0)
df_test_merge.drop(['first_active_month'], axis=1, inplace=True)

card_id = df_test_merge.card_id.unique()

# Predict
y_test_predict = model_for_production(df_test_merge)
df_test_predict = pd.DataFrame({'card_id': card_id, 'target': y_test_predict})

# Save to csv
df_test_predict.to_csv('best_of_custom_model.csv', index=False)