## Packages

In [None]:
# Numerical Analysis
import pandas as pd
import numpy as np
import os
from collections import Counter

# Plotting
import matplotlib.pyplot as plt
import seaborn as sns

# Machine Learning
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import permutation_importance
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, precision_score, recall_score, confusion_matrix, accuracy_score, roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.tree import DecisionTreeClassifier

import xgboost as xgb


# Sampling
from imblearn.over_sampling import RandomOverSampler, SMOTE, SMOTENC
from imblearn.under_sampling import RandomUnderSampler

# Data Setup

In [None]:
# Load the training and evaluation CSVs from the current working directory
trainFileName = 'train_ScotiaDSD.csv'
testFileName = 'test_ScotiaDSD.csv'
data, evalData = (
    pd.read_csv(os.path.join(os.getcwd(), fileName))
    for fileName in (trainFileName, testFileName)
)

In [None]:
# Column-name groups used throughout the notebook:
# BINARY_COLUMNS (not including FRAUD_FLAG), STRING_COLUMNS, NUMERICAL_COLUMNS
# BINARY_COLUMNS: flag-style feature columns treated as binary; the FRAUD_FLAG target is not listed.
BINARY_COLUMNS = [
 'CARD_NOT_PRESENT',
 'FLAG_LX',
 'FLAG_ATM',
 'FLAG_AUTO',
 'FLAG_CASH',
 'FLAG_LS',
 'FLAG_DISCOUNT',
 'FLAG_RECREA',
 'FLAG_ELCTRNCS',
 'FLAG_REG_AMT',
 'FLAG_FASTFOOD',
 'FLAG_GAS',
 'FLAG_HIGH_AMT',
 'FLAG_HIGH_RECREA',
 'FLAG_INTERNET',
 'FLAG_INTERNATIONAL',
 'FLAG_JEWELRY',
 'FLAG_LOW_AMT',
 'FLAG_MANUAL_ENTRY',
 'FLAG_PHONE_ORDER',
 'FLAG_PURCHASE_EXCLUDING_GAS',
 'FLAG_PLANNED',
 'FLAG_RISKY',
 'FLAG_SWIPE',
 'FLAG_TRAVEL_ONLY',
 'FLAG_TRAVEL_AND_ENTERTAINMENT',
 'FLAG_WEEKEND']

# Text columns; data_preprocessing drops these after deriving EVENT_TIME_IN_SECONDS from EVENT_TIME.
STRING_COLUMNS = ['TRANSACTION_ID',
'USER_AGENT',
'CITY',
'EVENT_TIME']

# Numeric (non-flag) feature columns. Judging by the names: calendar fields, credit/amount fields,
# rolling 7-day and 30-day MEAN/MAX/STD/SUM/COUNT aggregates, and two previous-month macro indicators.
# Later cells append engineered and external feature names to this list.
NUMERICAL_COLUMNS = ['EVENT_MONTH',
 'EVENT_DAY_OF_WEEK',
 'AVAIL_CRDT',
 'AMOUNT',
 'CREDIT_LIMIT',
 'MEAN_AUTO_PAST_7DAY',
 'MEAN_LS_PAST_7DAY',
 'MEAN_RECREA_PAST_7DAY',
 'MEAN_REG_AMT_PAST_7DAY',
 'MEAN_FASTFOOD_PAST_7DAY',
 'MEAN_HIGH_AMT_PAST_7DAY',
 'MEAN_HIGH_RECREA_PAST_7DAY',
 'MEAN_INTERNET_PAST_7DAY',
 'MEAN_INTERNATIONAL_PAST_7DAY',
 'MEAN_JEWELRY_PAST_7DAY',
 'MEAN_LOW_AMT_PAST_7DAY',
 'MEAN_MANUAL_ENTRY_PAST_7DAY',
 'MEAN_PHONE_ORDER_PAST_7DAY',
 'MEAN_PLANNED_PAST_7DAY',
 'MEAN_SWIPE_PAST_7DAY',
 'MEAN_TRAVEL_AND_ENTERTAINMENT_PAST_7DAY',
 'MEAN_WEEKEND_PAST_7DAY',
 'MAX_CASH_PAST_7DAY',
 'MAX_LS_PAST_7DAY',
 'MAX_RECREA_PAST_7DAY',
 'MAX_HIGH_AMT_PAST_7DAY',
 'MAX_HIGH_RECREA_PAST_7DAY',
 'MAX_INTERNET_PAST_7DAY',
 'MAX_PHONE_ORDER_PAST_7DAY',
 'MAX_PURCHASE_EXCLUDING_GAS_PAST_7DAY',
 'MAX_SWIPE_PAST_7DAY',
 'MAX_WEEKEND_PAST_7DAY',
 'STD_LX_PAST_7DAY',
 'STD_FASTFOOD_PAST_7DAY',
 'STD_HIGH_AMT_PAST_7DAY',
 'STD_INTERNET_PAST_7DAY',
 'STD_LOW_AMT_PAST_7DAY',
 'STD_PURCHASE_EXCLUDING_GAS_PAST_7DAY',
 'STD_SWIPE_PAST_7DAY',
 'STD_TRAVEL_AND_ENTERTAINMENT_PAST_7DAY',
 'SUM_LX_PAST_7DAY',
 'SUM_AUTO_PAST_7DAY',
 'SUM_LS_PAST_7DAY',
 'SUM_RECREA_PAST_7DAY',
 'SUM_GAS_PAST_7DAY',
 'SUM_HIGH_AMT_PAST_7DAY',
 'SUM_INTERNET_PAST_7DAY',
 'SUM_INTERNATIONAL_PAST_7DAY',
 'SUM_LOW_AMT_PAST_7DAY',
 'SUM_MANUAL_ENTRY_PAST_7DAY',
 'SUM_PHONE_ORDER_PAST_7DAY',
 'SUM_PURCHASE_EXCLUDING_GAS_PAST_7DAY',
 'SUM_PARTIAL_PAST_7DAY',
 'SUM_PLANNED_PAST_7DAY',
 'SUM_SWIPE_PAST_7DAY',
 'SUM_WEEKEND_PAST_7DAY',
 'COUNT_AUTO_PAST_7DAY',
 'COUNT_ELCTRNCS_PAST_7DAY',
 'COUNT_GAS_PAST_7DAY',
 'COUNT_HIGH_AMT_PAST_7DAY',
 'COUNT_INTERNET_PAST_7DAY',
 'COUNT_LOW_AMT_PAST_7DAY',
 'COUNT_MANUAL_ENTRY_PAST_7DAY',
 'COUNT_PHONE_ORDER_PAST_7DAY',
 'COUNT_PURCHASE_EXCLUDING_GAS_PAST_7DAY',
 'COUNT_SWIPE_PAST_7DAY',
 'COUNT_TRAVEL_AND_ENTERTAINMENT_PAST_7DAY',
 'COUNT_WEEKEND_PAST_7DAY',
 'MEAN_AUTO_PAST_30DAY',
 'MEAN_DISCOUNT_PAST_30DAY',
 'MEAN_RECREA_PAST_30DAY',
 'MEAN_ELCTRNCS_PAST_30DAY',
 'MEAN_REG_AMT_PAST_30DAY',
 'MEAN_HIGH_AMT_PAST_30DAY',
 'MEAN_INTERNET_PAST_30DAY',
 'MEAN_LOW_AMT_PAST_30DAY',
 'MEAN_MANUAL_ENTRY_PAST_30DAY',
 'MEAN_PHONE_ORDER_PAST_30DAY',
 'MEAN_PURCHASE_EXCLUDING_GAS_PAST_30DAY',
 'MEAN_PLANNED_PAST_30DAY',
 'MEAN_SWIPE_PAST_30DAY',
 'MEAN_TRAVEL_AND_ENTERTAINMENT_PAST_30DAY',
 'MEAN_WEEKEND_PAST_30DAY',
 'MAX_AUTO_PAST_30DAY',
 'MAX_LS_PAST_30DAY',
 'MAX_ELCTRNCS_PAST_30DAY',
 'MAX_FASTFOOD_PAST_30DAY',
 'MAX_HIGH_RECREA_PAST_30DAY',
 'MAX_MANUAL_ENTRY_PAST_30DAY',
 'MAX_PHONE_ORDER_PAST_30DAY',
 'MAX_PARTIAL_PAST_30DAY',
 'MAX_RISKY_PAST_30DAY',
 'MAX_WEEKEND_PAST_30DAY',
 'STD_AUTO_PAST_30DAY',
 'STD_LS_PAST_30DAY',
 'STD_RECREA_PAST_30DAY',
 'STD_ELCTRNCS_PAST_30DAY',
 'STD_REG_AMT_PAST_30DAY',
 'STD_HIGH_RECREA_PAST_30DAY',
 'STD_INTERNET_PAST_30DAY',
 'STD_LOW_AMT_PAST_30DAY',
 'STD_MANUAL_ENTRY_PAST_30DAY',
 'STD_PHONE_ORDER_PAST_30DAY',
 'STD_PARTIAL_PAST_30DAY',
 'STD_SWIPE_PAST_30DAY',
 'STD_TRAVEL_ONLY_PAST_30DAY',
 'STD_TRAVEL_AND_ENTERTAINMENT_PAST_30DAY',
 'SUM_AUTO_PAST_30DAY',
 'SUM_LS_PAST_30DAY',
 'SUM_DISCOUNT_PAST_30DAY',
 'SUM_RECREA_PAST_30DAY',
 'SUM_ELCTRNCS_PAST_30DAY',
 'SUM_REG_AMT_PAST_30DAY',
 'SUM_FASTFOOD_PAST_30DAY',
 'SUM_GAS_PAST_30DAY',
 'SUM_HIGH_AMT_PAST_30DAY',
 'SUM_HIGH_RECREA_PAST_30DAY',
 'SUM_INTERNET_PAST_30DAY',
 'SUM_INTERNATIONAL_PAST_30DAY',
 'SUM_LOW_AMT_PAST_30DAY',
 'SUM_MANUAL_ENTRY_PAST_30DAY',
 'SUM_PHONE_ORDER_PAST_30DAY',
 'SUM_PURCHASE_EXCLUDING_GAS_PAST_30DAY',
 'SUM_SWIPE_PAST_30DAY',
 'SUM_TRAVEL_ONLY_PAST_30DAY',
 'SUM_WEEKEND_PAST_30DAY',
 'COUNT_AUTO_PAST_30DAY',
 'COUNT_RECREA_PAST_30DAY',
 'COUNT_REG_AMT_PAST_30DAY',
 'COUNT_FASTFOOD_PAST_30DAY',
 'COUNT_GAS_PAST_30DAY',
 'COUNT_HIGH_AMT_PAST_30DAY',
 'COUNT_INTERNET_PAST_30DAY',
 'COUNT_LOW_AMT_PAST_30DAY',
 'COUNT_MANUAL_ENTRY_PAST_30DAY',
 'COUNT_PHONE_ORDER_PAST_30DAY',
 'COUNT_PURCHASE_EXCLUDING_GAS_PAST_30DAY',
 'COUNT_PLANNED_PAST_30DAY',
 'COUNT_SWIPE_PAST_30DAY',
 'COUNT_TRAVEL_AND_ENTERTAINMENT_PAST_30DAY',
 'COUNT_WEEKEND_PAST_30DAY',
 'PREV_M_INFLATION',
 'PREV_M_UNEMP_RATE']

# Data Analysis
Determine any distinct qualities of the data and decide if any changes are required.

### Results
After having a deep-dive into the dataset, the following conclusions can be made:
- there are three distinct data types:
    - float64 (numerical)
    - int64 (binary)
    - object (string)
The string columns need to be converted into useful numerical features or dropped. By convention, all numerical columns are converted to floats and all binary columns to ints (some counts are stored as ints in the raw data). The raw string columns are likely too granular for the model and were therefore removed.
- the magnitudes of the variables differ greatly, so normalization is required.
- imbalance of target data (97.5% to 2.5%), therefore oversampling methods need to be used.
- The columns with NAs are unnecessary:
    - CITY and USER_AGENT are NA only when FLAG_INTERNET == 0
    - so only FLAG_INTERNET is kept

In [None]:
# General description of the data (findings are summarised in the markdown above)
data.info()  # prints its own report and returns None, so it is not wrapped in display()
display(data.head())
display(data.describe())
print(f'Number of Duplicated Rows: {data.duplicated().sum()}')
# Count of repeated column labels (0 means every column name is unique)
print(f'Number of Duplicated Columns: {data.columns.duplicated().sum()}')
print(f"Proportion of Fraud vs Non-Fraud Transactions: \n {data['FRAUD_FLAG'].value_counts(normalize=True)}")
# isna().sum(axis=0) counts missing values per column, not per row
dataMissing = data.isna().sum(axis=0)
print(f'Number of NA Values per Column: \n {dataMissing[dataMissing != 0]}')
# nanmax/nanmin so that a single missing value does not turn the whole summary into NaN
numericalValues = data[NUMERICAL_COLUMNS].to_numpy()
print(f'Disparity between magnitude of Numerical Values: {np.nanmax(numericalValues)} and {np.nanmin(numericalValues)}')

# Data Pre-processing
Create new features and prepare for feature selection

In [None]:
def _is_binary_column(column):
    """Return True when ``column`` is numeric, has no missing values and holds exactly two distinct whole-number values."""
    if not pd.api.types.is_numeric_dtype(column) or column.isna().any():
        return False
    distinct = np.asarray(column.unique(), dtype='float64')
    if len(distinct) != 2 or not np.isfinite(distinct).all():
        return False
    # Guard against silently truncating a two-valued float column such as {0.5, 1.5}
    return bool((distinct == np.round(distinct)).all())


def data_preprocessing(dataframe, stringColumns=None):
    """ Pre-process the data.
            - convert dtypes (two-valued numeric columns -> int64, other int/float columns -> float64)
            - convert EVENT_TIME into seconds
            - drop the string columns
            - add engineered flag and ratio features

    The input frame is left untouched; all changes are made on a copy.

    Args:
        dataframe (DataFrame): raw pandas DataFrame containing EVENT_TIME, FLAG_INTERNATIONAL,
            FLAG_INTERNET, MEAN_INTERNATIONAL_PAST_7DAY, AVAIL_CRDT, AMOUNT and CREDIT_LIMIT
        stringColumns (list, optional): columns to drop after EVENT_TIME has been converted.
            Defaults to the module-level STRING_COLUMNS.

    Returns:
        DataFrame: a new, preprocessed pandas DataFrame
    """
    if stringColumns is None:
        stringColumns = STRING_COLUMNS

    # Work on a copy so the caller's frame is not mutated (keeps the notebook cell re-runnable)
    dataframe = dataframe.copy()

    # Convert binary variables into integers and every other int/float column into floats
    for columnName in dataframe.columns:
        column = dataframe[columnName]
        if _is_binary_column(column):
            dataframe[columnName] = column.astype('int64')
        elif (column.dtype == 'int64') or (column.dtype == 'float64'):
            dataframe[columnName] = column.astype('float64')

    # Convert the EVENT_TIME string ("HH:MM", optionally with a trailing ':') into seconds.
    # expand=True keeps the original index, so the result aligns with `dataframe` row by row.
    timeParts = (
        dataframe['EVENT_TIME']
        .str.rstrip(':')
        .str.split(':', expand=True)
        .astype('float64')
    )
    eventTimeSeconds = timeParts[0] * 60 * 60 + timeParts[1] * 60
    if 2 in timeParts.columns:
        # "HH:MM:SS" input: add the seconds field where present
        eventTimeSeconds = eventTimeSeconds + timeParts[2].fillna(0)

    # Drop the string columns now that EVENT_TIME has been converted
    dataframe = dataframe.drop(columns=list(stringColumns))
    dataframe['EVENT_TIME_IN_SECONDS'] = eventTimeSeconds

    # Add additional columns
    isInternationalInternet = (dataframe["FLAG_INTERNATIONAL"] == 1) & (dataframe["FLAG_INTERNET"] == 1)
    dataframe["FLAG_INTERNATIONAL_INTERNET"] = isInternationalInternet.astype('int64')
    dataframe["FLAG_PAST_INTERNATIONAL_PURCHASE"] = (
        isInternationalInternet & (dataframe["MEAN_INTERNATIONAL_PAST_7DAY"] > 0)
    ).astype('int64')

    # NOTE(review): a zero AMOUNT produces inf/NaN here; AMOUNT_TO_LIMIT below guards its denominator with +1
    dataframe["CRDT_TO_AMOUNT"] = dataframe['AVAIL_CRDT'] / dataframe['AMOUNT']
    dataframe["AMOUNT_TO_LIMIT"] = dataframe['AMOUNT'] / (dataframe['CREDIT_LIMIT'] + 1)

    return dataframe

In [None]:
# Register the engineered features created by data_preprocessing.
# Guarded so that re-running this cell does not append duplicate names.
for _newName in ['FLAG_INTERNATIONAL_INTERNET', 'FLAG_PAST_INTERNATIONAL_PURCHASE']:
    if _newName not in BINARY_COLUMNS:
        BINARY_COLUMNS.append(_newName)
for _newName in ['AMOUNT_TO_LIMIT', 'EVENT_TIME_IN_SECONDS', 'CRDT_TO_AMOUNT']:
    if _newName not in NUMERICAL_COLUMNS:
        NUMERICAL_COLUMNS.append(_newName)

In [None]:
# Apply the same preprocessing function to the training and the evaluation frames
dataPreprocessed = data_preprocessing(data)
dataEvalPreprocessed = data_preprocessing(evalData)

# External Data
Add any useful external features

- 2021 Canada Crime Statistic (https://www150.statcan.gc.ca/n1/en/pub/85-002-x/2022001/article/00013-eng.pdf?st=rc5-elU3)
- 2022 COVID-19 Cases (https://health-infobase.canada.ca/covid-19/)

In [None]:
# Estimated monthly reported-crime counts, read off the graph in the report linked above
CRIME_COLUMN = 'REPORTED_CRIMINAL_CODE_CRIME_EXCULDING_TRAFFIC'
extData = pd.DataFrame({
    'EVENT_MONTH': [2, 3, 4, 5, 6, 7],
    CRIME_COLUMN: [140000, 160000, 155000, 170000, 178000, 186000],
})

# Guarded so that re-running this cell does not register the column twice
if CRIME_COLUMN not in NUMERICAL_COLUMNS:
    NUMERICAL_COLUMNS.append(CRIME_COLUMN)

In [None]:
# Determine the number of covid-19 cases in Canada in 2022 using pandas
covidData = pd.read_csv('covid19-data.csv')

# Parse the date column once; EVENT_MONTH uses the same name as the transaction data for an easy join later
covidDates = pd.to_datetime(covidData['date'])
covidData['EVENT_MONTH'] = covidDates.dt.month
covidData['year'] = covidDates.dt.year

# NOTE(review): if `totalcases` is a cumulative count in the source file, summing it per month
# overstates the monthly figure -- confirm against the dataset documentation.
isCanada2022 = (covidData['prname'] == 'Canada') & (covidData['year'] == 2022)
covidDataToJoin = (
    covidData[isCanada2022]
    .groupby(['EVENT_MONTH'])['totalcases']
    .sum()
    .rename('COVID_TOTAL_CASES')
)

# Guarded so that re-running this cell does not register the column twice
if 'COVID_TOTAL_CASES' not in NUMERICAL_COLUMNS:
    NUMERICAL_COLUMNS.append('COVID_TOTAL_CASES')

In [None]:
# Join the dataframes to get new column(s)
def _add_external_features(frame, how):
    """Attach the monthly crime and COVID columns to ``frame`` by EVENT_MONTH.

    ``validate='many_to_one'`` makes pandas raise if an external table ever holds
    a month twice, which would otherwise silently duplicate transactions.
    """
    merged = frame.merge(extData, on='EVENT_MONTH', how=how, validate='many_to_one')
    return merged.merge(covidDataToJoin, on='EVENT_MONTH', how=how, validate='many_to_one')


# Training rows without external data are dropped, as before, but the loss is now reported.
dataProcessed = _add_external_features(dataPreprocessed, how='inner')
droppedTrainRows = len(dataPreprocessed) - len(dataProcessed)
if droppedTrainRows:
    print(f'Dropped {droppedTrainRows} training rows with no external data for their EVENT_MONTH')

# Evaluation rows must never be dropped: the submission pairs predictions with
# evalData['TRANSACTION_ID'] by position, so a left join keeps one row per transaction in order.
dataEvalProcessed = _add_external_features(dataEvalPreprocessed, how='left')
externalColumns = [c for c in dataEvalProcessed.columns if c not in dataEvalPreprocessed.columns]
missingEvalRows = int(dataEvalProcessed[externalColumns].isna().any(axis=1).sum())
if missingEvalRows:
    print(f'WARNING: {missingEvalRows} evaluation rows have no external data for their EVENT_MONTH')

# Feature Selection
Determine which features are important via Random Forest and PCA

In [None]:
# Normalize features to keep values in the same scale and improve accuracy/stabilize  models
def normalize(data, normalizerObject, train=True):
    """Scale ``data`` and return it as a DataFrame with the same columns and index.

    Args:
        data (DataFrame): pandas DataFrame to scale
        normalizerObject: when ``train`` is True, a normalizer class (ex. StandardScaler)
            that is instantiated and fitted on ``data``; when ``train`` is False, an
            already-fitted normalizer instance that is only applied
        train (bool): fit a new normalizer (True) or reuse the one passed in (False)

    Returns:
        DataFrame: normalized data
        Normalizer Object: the fitted normalizer
    """
    scaler = normalizerObject() if train else normalizerObject
    scaledValues = scaler.fit_transform(data) if train else scaler.transform(data)
    scaledFrame = pd.DataFrame(scaledValues, columns=data.columns, index=data.index)
    return scaledFrame, scaler

In [None]:
# Separate the target from the covariates
y = dataProcessed['FRAUD_FLAG']
X = dataProcessed.drop(columns=['FRAUD_FLAG'])
# errors='ignore': an evaluation file without the label column should not stop the notebook
xEval = dataEvalProcessed.drop(columns=['FRAUD_FLAG'], errors='ignore')

In [None]:
n_runs = 10
n_features = 100

rf = RandomForestClassifier(random_state=0, max_depth=7)
variableRank = Counter()

# Fit the forest on several stratified 30% subsamples and accumulate, per column,
# its importance rank in each run (0 = most important).
for i in range(n_runs):
    print(f'Run {i}:')
    xRF, _, yRF, _ = train_test_split(X, y, train_size=0.3, stratify=y, random_state=i)
    rf.fit(xRF, yRF)
    # Column names ordered from greatest to least importance
    rankedNames = rf.feature_names_in_[np.argsort(-rf.feature_importances_)]
    variableRank.update({name: rank for rank, name in enumerate(rankedNames)})

# most_common() is ordered by descending rank sum; reversing it puts the
# lowest sums (the consistently important columns) first.
keepColumns = [name for name, _rankSum in reversed(variableRank.most_common())][:n_features]

In [None]:
# Keep only the "important" columns
xImportant = X[keepColumns]
xEvalImportant = xEval[keepColumns]
xImportant.info()

In [None]:
# Apply PCA on the numerical features to reduce the total number of features.
# NOTE: this cell overwrites xImportant / xEvalImportant, so re-run the previous cell before re-running it.
numericalFeatures = xImportant.dtypes[xImportant.dtypes == 'float64'].index
binaryFeatures = xImportant.dtypes[xImportant.dtypes != 'float64'].index

# PCA cannot return more components than there are input features
n_components = min(50, len(numericalFeatures))
# random_state makes the (possibly randomized) SVD solver reproducible between runs
pca = PCA(n_components, random_state=0)

# Normalize using StandardScaler (before PCA); the scaler is fitted on the training data only
xNumericalNormalized, sc = normalize(xImportant[numericalFeatures], StandardScaler, train=True)
xEvalNumericalNormalized, _ = normalize(xEvalImportant[numericalFeatures], sc, train=False)

xNormalized = pd.concat([xNumericalNormalized, xImportant[binaryFeatures]], axis=1)
xEvalNormalized = pd.concat([xEvalNumericalNormalized, xEvalImportant[binaryFeatures]], axis=1)

# Fit and apply PCA transformation
xPCA = pca.fit_transform(xNumericalNormalized)
xEvalPCA = pca.transform(xEvalNumericalNormalized)

# Keep the source index so the concat below aligns row by row even for a non-default index
componentNames = [f'V{i}' for i in range(n_components)]
xContinuousImportant = pd.DataFrame(xPCA, columns=componentNames, index=xImportant.index)
xEvalContinuousImportant = pd.DataFrame(xEvalPCA, columns=componentNames, index=xEvalImportant.index)

xImportant = pd.concat([xNormalized[binaryFeatures], xContinuousImportant], axis=1)
xEvalImportant = pd.concat([xEvalNormalized[binaryFeatures], xEvalContinuousImportant], axis=1)

# Data Sampling

### Synthetic Minority Oversampling Technique (SMOTE)

#### Description:
- randomly select a point in the minority class
- calculate the euclidean distance between each point in the minority class and find the $k$ nearest points
- randomly select a point among the $k$ points
- generate new data point (randomly on the line, the weight of the equation between two points is randomly generated as well)

### SMOTE-NC (Nominal Continuous)
Extension of SMOTE. In order to include categorical features into the synthetic data generation:
- randomly select a point in the minority class
- calculate the euclidean distance between each point in the minority class and find the $k$ nearest points
    - for each nominal feature, substitute the median of the standard deviations of the continuous features into the Euclidean distance calculation
- randomly select a point among the $k$ points
- generate new data point
    - for continuous values, same as SMOTE
    - for nominal values, take the randomly selected points nominal value

### SMOTE-ENC (Encoded Nominal and Continuous)
- essentially the same as before, but this time nominal are encoded and have numerical "weights"

In [None]:
# --------------------
# SMOTE-ENC Proposed in the paper -- adjusted code since it was out of date and for our use case
# --------------------
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import rcParams
from sklearn.utils import check_array, sparsefuncs_fast, _safe_indexing, check_X_y, check_random_state
from sklearn.preprocessing import OneHotEncoder
from scipy import sparse
from sklearn.neighbors import NearestNeighbors
from sklearn.base import clone
from numbers import Integral
from sklearn.svm import SVC
from collections import Counter
import seaborn as sns
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE, SMOTENC, SVMSMOTE
import os
# import missingpy as missingpy
from pathlib import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV 
from sklearn.metrics import classification_report
from sklearn.model_selection import StratifiedKFold
import pickle
from imblearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import LabelEncoder
from scipy import stats
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_curve, auc, precision_recall_curve, f1_score, roc_auc_score, make_scorer
from sklearn.model_selection import cross_val_score, cross_validate

class MySMOTENC():
    """SMOTE-ENC style oversampler for data with categorical and continuous columns.

    Categorical columns are replaced by a numeric encoding
    ((observed - expected) / expected target rows per level) scaled by the median
    standard deviation of the continuous minority-class columns. Nearest neighbours
    are searched in that encoded space, synthetic rows are interpolated, and the
    categorical columns are decoded back to their original levels.

    Args:
        categorical_features: integer positions (or a boolean mask) of the categorical columns
        sampling_strategy (dict, optional): ``{class_label: number_of_synthetic_rows}``.
            Defaults to ``{1: 40000}``, the value previously hard-coded in ``fit_resample``.
        k_neighbors (int): number of neighbours used to build synthetic rows
        random_state: seed passed to ``check_random_state`` when sampling
    """

    def __init__(self, categorical_features, sampling_strategy=None, k_neighbors=5, random_state=42):
        self.categorical_features = categorical_features
        self.sampling_strategy = {1: 40000} if sampling_strategy is None else dict(sampling_strategy)
        self.k_neighbors = k_neighbors
        self.random_state = random_state

    def chk_neighbors(self, nn_object, additional_neighbor):
        """Return a neighbours estimator built from an int or cloned from an existing estimator.

        Raises:
            TypeError: if ``nn_object`` is neither an integer nor an object exposing ``kneighbors``
        """
        if isinstance(nn_object, Integral):
            return NearestNeighbors(n_neighbors=nn_object + additional_neighbor)
        # Duck-typed check: anything offering `kneighbors` is accepted and cloned
        if hasattr(nn_object, 'kneighbors'):
            return clone(nn_object)
        raise TypeError(
            'nn_object must be an int or an estimator exposing `kneighbors`, '
            f'got {type(nn_object).__name__}'
        )

    def generate_samples(self, X, nn_data, nn_num, rows, cols, steps, continuous_features_,):
        """Interpolate synthetic rows between ``X[rows]`` and their selected neighbours.

        Columns from position ``continuous_features_.size`` onward (the encoded
        categorical columns) are overwritten with the mode over each row's neighbours.
        """
        diffs = nn_data[nn_num[rows, cols]] - X[rows]

        if sparse.issparse(X):
            sparse_func = type(X).__name__
            steps = getattr(sparse, sparse_func)(steps)
            X_new = X[rows] + steps.multiply(diffs)
        else:
            X_new = X[rows] + steps * diffs

        X_new = (X_new.tolil() if sparse.issparse(X_new) else X_new)
        # convert to dense array since scipy.sparse doesn't handle 3D
        nn_data = (nn_data.toarray() if sparse.issparse(nn_data) else nn_data)

        all_neighbors = nn_data[nn_num[rows]]

        for idx in range(continuous_features_.size, X.shape[1]):
            mode = stats.mode(all_neighbors[:, :, idx], axis=1)[0]
            X_new[:, idx] = np.ravel(mode)
        return X_new

    def make_samples(self, X, y_dtype, y_type, nn_data, nn_num, n_samples, continuous_features_, step_size=1.0):
        """Draw ``n_samples`` (row, neighbour) pairs and build synthetic rows labelled ``y_type``."""
        random_state = check_random_state(self.random_state)
        samples_indices = random_state.randint(low=0, high=len(nn_num.flatten()), size=n_samples)
        steps = step_size * random_state.uniform(size=n_samples)[:, np.newaxis]
        # Each flat index identifies a (source row, neighbour slot) pair
        rows = np.floor_divide(samples_indices, nn_num.shape[1])
        cols = np.mod(samples_indices, nn_num.shape[1])

        X_new = self.generate_samples(X, nn_data, nn_num, rows, cols, steps, continuous_features_)
        y_new = np.full(n_samples, fill_value=y_type, dtype=y_dtype)

        return X_new, y_new

    def cat_corr_pandas(self, X, target_df, target_column, target_value):
        """Encode every categorical column of ``X`` by its association with the target.

        Args:
            X (DataFrame): the categorical columns only
            target_df (Series/DataFrame): target values, joined to ``X`` on the index
            target_column (str): name of the target column inside ``target_df``
            target_value: target value whose frequency defines the encoding

        Returns:
            DataFrame: encoded columns (levels that could not be mapped are filled with -1)
            list: one ``{level: encoded value}`` dict per column, in column order
            dict: ``{column position: row positions}`` of values that could not be mapped
        """
        categorical_columns = list(X.columns)
        X = pd.concat([X, target_df], axis=1)

        is_target = X.loc[:, target_column] == target_value
        imb_ratio = int(is_target.sum()) / len(X)

        encoded_dict_list = []
        nan_dict = {}

        for position, column in enumerate(categorical_columns):
            # A fresh dict per column: sharing one dict made every entry of
            # encoded_dict_list alias the last column's encoding, breaking the decode step.
            level_encoding = {}
            # NaN is skipped as a level; such rows stay unmapped and are tracked in nan_dict
            for level in X[column].dropna().unique():
                in_level = X[column] == level
                rows_in_level = int(in_level.sum())
                observed = int((is_target & in_level).sum())
                expected = rows_in_level * imb_ratio
                # Encoded value = (observed - expected) / expected
                level_encoding[level] = (observed - expected) / expected

            encoded_dict_list.append(level_encoding)

            encoded = X[column].map(level_encoding)
            nan_idx_array = np.flatnonzero(encoded.isna().to_numpy())
            if len(nan_idx_array) > 0:
                nan_dict[position] = nan_idx_array
            # Assign the result back; fillna(inplace=True) on a .loc slice may act on a copy
            X[column] = encoded.fillna(-1)

        X = X.drop(columns=target_column)
        return X, encoded_dict_list, nan_dict

    def fit_resample(self, X, y):
        """Oversample according to ``self.sampling_strategy``.

        Args:
            X (DataFrame): covariates; categorical columns are selected by position
            y (Series): target named 'FRAUD_FLAG', index-aligned with ``X``

        Returns:
            ndarray: original rows followed by the synthetic rows, in the input column order
            ndarray: matching labels
        """
        categorical_features = np.asarray(self.categorical_features)
        X_cat_encoded, encoded_dict_list, nan_dict = self.cat_corr_pandas(
            X.iloc[:, categorical_features], y, target_column='FRAUD_FLAG', target_value=1)
        X_cat_encoded = np.array(X_cat_encoded)
        y = np.ravel(y)
        # From here on X is a dense ndarray, so no sparse handling is needed below
        X = np.array(X)

        sampling_strategy = self.sampling_strategy
        print(sampling_strategy)

        n_features_ = X.shape[1]
        if categorical_features.dtype.name == 'bool':
            categorical_features_ = np.flatnonzero(categorical_features)
        else:
            if any(cat not in np.arange(n_features_) for cat in categorical_features):
                raise ValueError('Some of the categorical indices are out of range. Indices'
                                 ' should be between 0 and {}'.format(n_features_))
            categorical_features_ = categorical_features

        continuous_features_ = np.setdiff1d(np.arange(n_features_), categorical_features_)

        target_stats = Counter(y)
        class_minority = min(target_stats, key=target_stats.get)

        X_continuous = check_array(X[:, continuous_features_], accept_sparse=['csr', 'csc'])
        X_minority = _safe_indexing(X_continuous, np.flatnonzero(y == class_minority))
        # Scale for the encoded categoricals: median std of the continuous minority-class columns
        median_std_ = np.median(np.sqrt(X_minority.var(axis=0)))

        # Working layout: continuous columns first, categorical columns last
        X_categorical = X[:, categorical_features_]
        X_copy = np.hstack((X_continuous, X_categorical))
        X_encoded = np.hstack((X_continuous, X_cat_encoded * median_std_))

        X_resampled = X_encoded.copy()
        y_resampled = y.copy()

        for class_sample, n_samples in sampling_strategy.items():
            if n_samples == 0:
                continue
            target_class_indices = np.flatnonzero(y == class_sample)
            X_class = _safe_indexing(X_encoded, target_class_indices)
            nn_k_ = self.chk_neighbors(self.k_neighbors, 1)
            nn_k_.fit(X_class)

            # Drop the first neighbour: each point is its own nearest neighbour
            nns = nn_k_.kneighbors(X_class, return_distance=False)[:, 1:]
            X_new, y_new = self.make_samples(X_class, y.dtype, class_sample, X_class, nns,
                                             n_samples, continuous_features_, 1.0)

            X_resampled = np.vstack((X_resampled, X_new))
            y_resampled = np.hstack((y_resampled, y_new))

        # Decode the categorical columns. Matching is done against the untouched encoded
        # values so that an already decoded level can never be matched a second time.
        X_decoded = X_resampled.copy()
        first_categorical = continuous_features_.size
        for offset, encoded_dict in enumerate(encoded_dict_list):
            col = first_categorical + offset
            encoded_col = np.round(X_resampled[:, col], 4)
            for key, value in encoded_dict.items():
                X_decoded[encoded_col == np.round(value * median_std_, 4), col] = key

        # Restore the original value for rows whose level could not be encoded
        for key, value in nan_dict.items():
            X_decoded[value, first_categorical + key] = X_copy[value, first_categorical + key]

        # Put the columns back into the caller's original order
        indices_reordered = np.argsort(np.hstack((continuous_features_, categorical_features_)))
        X_resampled = X_decoded[:, indices_reordered]
        return X_resampled, y_resampled

In [None]:
def dataSampling(X, y, methods):
    """ Resample the data set by applying each method in methods, in order.

    Supported methods: 'SMOTE', 'SMOTENC', 'SMOTEENC', 'OVER_SAMPLE', 'UNDER_SAMPLE'.
    Columns with dtype int64 are treated as categorical by the SMOTE variants.

    Args:
        X (DataFrame): covariates
        y (Series): target labels
        methods (list): list of method names, applied in order

    Returns:
        (DataFrame, Series): newly sampled dataset

    Raises:
        ValueError: if a method name is not supported
    """
    xNew, yNew = X, y
    for method in methods:
        print(method)
        print(f"Original: \n{(yNew == 1).sum()}, {(yNew == 0).sum()}")

        # Positions of the categorical (int64) columns in the current frame
        categoricalIdx = np.where(xNew.dtypes == 'int64')[0]

        if method == 'SMOTE':
            # len() rather than np.any(): a single categorical column at position 0 must still count
            if len(categoricalIdx) > 0:
                sampler = SMOTENC(categorical_features=categoricalIdx, sampling_strategy=0.2, random_state=0)
            else:
                sampler = SMOTE(sampling_strategy=0.2, random_state=0)
        elif method == 'SMOTENC':
            sampler = SMOTENC(categorical_features=categoricalIdx, sampling_strategy=0.2, random_state=0)
        elif method == 'SMOTEENC':
            sampler = MySMOTENC(categorical_features=categoricalIdx)
        elif method == 'OVER_SAMPLE':
            sampler = RandomOverSampler(sampling_strategy=0.5, random_state=0)
        elif method == 'UNDER_SAMPLE':
            # Seeded like the other samplers so the notebook is reproducible
            sampler = RandomUnderSampler(sampling_strategy=0.8, random_state=0)
        else:
            # Previously an unknown name silently reused the prior sampler or hit an unbound variable
            raise ValueError(f'Unknown sampling method: {method!r}')

        xNew, yNew = sampler.fit_resample(xNew, yNew)
        print(f"New: \n{(yNew == 1).sum()}, {(yNew == 0).sum()}")
        print('---')

        # Samplers that return raw arrays are wrapped back into pandas objects
        if isinstance(xNew, np.ndarray):
            xNew = pd.DataFrame(xNew, columns=X.columns)
        if isinstance(yNew, np.ndarray):
            yNew = pd.Series(yNew, name='FRAUD_FLAG')

    return xNew, yNew

In [None]:
# Split the dataset to test accuracy on unseen data.
# random_state fixes the split so that metrics are comparable between runs.
xTrainDS, xTestDS, yTrainDS, yTestDS = train_test_split(
    xImportant, y, test_size=0.2, stratify=y, random_state=0)

# Determine the methods to apply (IN ORDER); only the training split is resampled
methods = ['SMOTEENC', 'UNDER_SAMPLE']
xTrainSampled, yTrainSampled = dataSampling(xTrainDS, yTrainDS, methods=methods)

# Modeling Step
Define helper functions for easy modeling and evaluating

In [None]:
def evaluate(y_true, y_pred):
    """ Evaluate the model. Print and return useful metrics such as:
            - confusion matrix
            - f1 score
            - precision
            - recall
        and plot the confusion matrix as a heatmap.

    Args:
        y_true (Series): the true labels
        y_pred (Series): the predicted labels

    Returns:
        tuple: (confusion matrix, f1 score, precision, recall)
    """
    # C_(i,j) = group i predicted to be in group j
    cm = confusion_matrix(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)

    print(f"F1 Score: {f1:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")

    # Rows of the matrix are the true classes, columns the predicted classes
    fig, ax = plt.subplots(figsize=(4, 4))
    sns.heatmap(cm, annot=True, fmt='.5g', ax=ax)
    ax.set(title='Confusion Matrix', xlabel='Predicted label', ylabel='True label')
    return cm, f1, precision, recall

In [None]:
def modeling(xTrain, xTest, yTrain, yTest, model):
    """ Fit ``model`` on the training split and score it on the test split.

    Args:
        xTrain (DataFrame): training sets covariates
        xTest (DataFrame): test sets covariates
        yTrain (Series): training set labels
        yTest (Series): test set labels
        model (Model Type): any sklearn model (xgboost works as well)

    Returns:
        tuple: (fitted model, confusion matrix, AUC, f1, precision, recall,
                xTrain, xTest, yTrain, yTest)
    """
    # Train, then predict hard labels for the hold-out data
    model.fit(xTrain, yTrain)
    predictedLabels = model.predict(xTest)

    # Label-based metrics come from evaluate(); AUC needs the positive-class probability
    confusion, f1Value, precisionValue, recallValue = evaluate(yTest, predictedLabels)
    positiveProbability = model.predict_proba(xTest)[:, 1]
    AUC = roc_auc_score(yTest, positiveProbability)
    print(f'AUC: {AUC}')

    return (model, confusion, AUC, f1Value, precisionValue, recallValue,
            xTrain, xTest, yTrain, yTest)

In [None]:
# Train on the resampled training split; test on the untouched hold-out split from train_test_split
xFinal, yFinal = xTrainSampled, yTrainSampled
xTestFinal, yTestFinal = xTestDS, yTestDS

# Model Selection
Test different state-of-the-art supervised binary classification models and select the "best" one.
- Logistic
- Decision Tree
- Random Forest
- XGBoost

In [None]:
# Logistic Regression
(modelLR, cm, AUC, f1, percision, recall,
 xTrain, xTest, yTrain, yTest) = modeling(
    xTrain=xFinal,
    xTest=xTestFinal,
    yTrain=yFinal,
    yTest=yTestFinal,
    model=LogisticRegression(random_state=0),
)

In [None]:
# Decision Tree
(modelDT, cm, AUC, f1, percision, recall,
 xTrain, xTest, yTrain, yTest) = modeling(
    xTrain=xFinal,
    xTest=xTestFinal,
    yTrain=yFinal,
    yTest=yTestFinal,
    model=DecisionTreeClassifier(random_state=0),
)

In [None]:
# Random Forest
(modelRF, cm, AUC, f1, percision, recall,
 xTrain, xTest, yTrain, yTest) = modeling(
    xTrain=xFinal,
    xTest=xTestFinal,
    yTrain=yFinal,
    yTest=yTestFinal,
    model=RandomForestClassifier(random_state=0),
)

In [None]:
# XGBoost (positive class weighted by scale_pos_weight=0.1)
(modelXGB, cm, AUC, f1, percision, recall,
 xTrain, xTest, yTrain, yTest) = modeling(
    xTrain=xFinal,
    xTest=xTestFinal,
    yTrain=yFinal,
    yTest=yTestFinal,
    model=xgb.XGBClassifier(random_state=0, scale_pos_weight=0.1),
)

# Evaluation Data to Submit

In [None]:
# Hard class predictions for the evaluation set from the fitted XGBoost model
yEvalPred = modelXGB.predict(xEvalImportant)

# Class probabilities; column 1 is kept as the fraud (positive class) probability
yEvalProb = modelXGB.predict_proba(xEvalImportant)
yEvalProbFraud = yEvalProb[:, 1]


In [None]:
# Predictions are paired with transaction ids by position, so the lengths must match exactly;
# an index-aligned concat would otherwise pad with NaN and silently misalign the rows.
if len(evalData) != len(yEvalPred):
    raise ValueError(
        f'evalData has {len(evalData)} rows but {len(yEvalPred)} predictions were produced; '
        'rows were lost or duplicated between loading and prediction'
    )

xToSubmit = pd.DataFrame({
    'TRANSACTION_ID': evalData['TRANSACTION_ID'].to_numpy(),
    'PREDICTION': yEvalPred,
    'PROBABILITY': yEvalProbFraud,
})

In [None]:
# Write the submission file to the working directory.
# NOTE(review): with pandas' default index=True the row index is written as an unnamed first column --
# confirm that this matches the expected submission format.
xToSubmit.to_csv("KO-WOO-REANS_prediction.csv")