In [17]:
import numpy as np
import pandas as pd
import pickle
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.metrics import f1_score, accuracy_score, classification_report, confusion_matrix, plot_precision_recall_curve
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV
# from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline, make_pipeline


# modelling
- split clean data into train/validate and test (holdout) data sets. 
- scale, encode, prep training data for machine learning
- evaluate against validation data

In [2]:
transactions = pd.read_parquet('data/data_products/suspicious_transactions_dataset_201812-202001_cleaned.parquet')
transactions.sample(5)

Unnamed: 0_level_0,Source System,CountryCodeOffice,ML Risk Rating,Flag,product_type,date,month,is_month_start,is_month_end,weekday,quarter,gbp_log_amount,currency_reduced,suspicious_client,trusted_client,client_name,new_client,frequent_client
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
7689,alpha,GB,H,1,fx_spots,2019-02-26,2,False,False,1,1,3.198872,USD,False,True,Client_20,False,True
25001,alpha,GB,L,0,fx_spots,2019-11-27,11,False,False,2,4,2.53162,EUR,False,True,Client_4,False,True
16629,alpha,GB,L,0,fx_forwards,2019-07-29,7,False,False,0,3,3.846849,USD,False,True,Client_104,False,True
22013,alpha,GB,L,0,fx_swaps,2019-10-14,10,False,False,0,4,5.166675,JPY,False,True,Client_96,False,True
8508,alpha,GB,L,0,fx_spots,2019-03-11,3,False,False,0,1,4.993812,EUR,False,True,Client_4,False,True


In [3]:
feature_columns_catg = [
    'Source System',
    'CountryCodeOffice',
    'ML Risk Rating',
    'product_type',
    'currency_reduced',
]
feature_columns_bool = [
    'suspicious_client',
    'trusted_client',
    'new_client', 
    'frequent_client', 
    'is_month_start',
    'is_month_end',
]
feature_column_cont = [
    'month',
    'weekday',
    'quarter',
    'gbp_log_amount'
]
outcome_column = 'Flag'


# collect all values of categorical variables for the purpose of one hot encoding
all_systems    = transactions['Source System'].unique()
all_offices    = transactions['CountryCodeOffice'].unique()
all_ratings    = transactions['ML Risk Rating'].unique()
all_products   = transactions['product_type'].unique()
all_currencies = transactions['currency_reduced'].unique()
# all_clients    = transactions['client_name'].unique()
# transactions['is_month_start'].unique()
# transactions['is_month_end'].unique()
# transactions['suspicious_client'].unique()
# transactions['trusted_client'].unique()

all_categories = [
    all_systems,
    all_offices,
    all_ratings,
    all_products,
    all_currencies
]

## splitting data 
- need about 80% for train/validate, 20% holdout data (for performance gauging later)
- need to ensure flagged records present in each group. 
    + start by splitting flagged records up
    + split remaining records
    + combine splits

In [4]:
train_flag, test_flag = train_test_split(transactions[transactions['Flag']==1], test_size=0.2)
train_safe, test_safe = train_test_split(transactions[transactions['Flag']==0], test_size=0.2)
training_df = pd.concat([train_flag, train_safe]).sort_values(by='date')
testing_df  = pd.concat([test_flag, test_safe]).sort_values(by='date')

print(train_flag.shape, test_flag.shape, train_safe.shape, test_safe.shape, training_df.shape, testing_df.shape)

(135, 18) (34, 18) (24092, 18) (6023, 18) (24227, 18) (6057, 18)


In [5]:
# holdout data goes on ice:
testing_df.to_parquet('data/performance/holdout_data.parquet')

the "training" data needs to be further split into training and validation data. 

In [6]:
train_flag, valid_flag = train_test_split(training_df[training_df['Flag']==1], test_size=0.2)
train_safe, valid_safe = train_test_split(training_df[training_df['Flag']==0], test_size=0.2)
training_df    = pd.concat([train_flag, train_safe]).sort_values(by='date')
validation_df  = pd.concat([valid_flag, valid_safe]).sort_values(by='date')
print(train_flag.shape, valid_flag.shape, train_safe.shape, valid_safe.shape, training_df.shape, validation_df.shape)

(108, 18) (27, 18) (19273, 18) (4819, 18) (19381, 18) (4846, 18)


In [7]:
training_df[feature_columns_catg].shape

(19381, 5)

In [8]:
# categorical
encoder_catg = OneHotEncoder(categories=all_categories)
encoder_catg.fit(training_df[feature_columns_catg])
X_catg_train = encoder_catg.transform(training_df[feature_columns_catg]).toarray()

In [9]:
# continuous variable encoding
encoder_cont = StandardScaler().fit(training_df[feature_column_cont])
X_cont_train = encoder_cont.transform(np.array(training_df[feature_column_cont]))


In [10]:
X_train = np.concatenate([X_catg_train[:],X_cont_train[:],training_df[feature_columns_bool]], axis=1)
y_train = training_df[outcome_column]
# classifier = RandomForestClassifier(criterion='entropy', n_jobs=-1, n_estimators=200, min_samples_leaf=1, max_features='auto')
classifier = ExtraTreesClassifier(criterion='entropy', n_jobs=-1, n_estimators=200, min_samples_leaf=1,  max_features='auto', oob_score=True)
# classifier = RandomForestClassifier(criterion='gini', n_jobs=-1, n_estimators=200, min_samples_leaf=1, max_features='auto')
classifier = classifier.fit(X_train, y_train)
# classifier.predict()


In [11]:
X_catg_valid = encoder_catg.transform(validation_df[feature_columns_catg]).toarray()
X_cont_valid = encoder_cont.transform(np.array(validation_df[feature_column_cont]))
X_valid = np.concatenate([X_catg_valid[:],X_cont_valid[:],validation_df[feature_columns_bool]], axis=1)
y_valid = validation_df[outcome_column]

In [12]:
confusion_matrix(y_valid, classifier.predict(X_valid))

array([[4811,    8],
       [  19,    8]])

way too many false negatives.

In [13]:
confusion_matrix(y_valid, classifier.predict(X_valid), normalize='true')

array([[0.9983399, 0.0016601],
       [0.7037037, 0.2962963]])

In [14]:
pipe = Pipeline([
    ('encoder', encoder_catg),
    ('classif', RandomForestClassifier(       
            n_estimators=200,
            min_samples_leaf=1,
            
    )
    )
])

In [19]:
scores = cross_val_score(classifier, X_train, y_train, cv=15)
print(scores, scores.mean())
scores = cross_val_score(classifier, X_train, y_train, cv=15, scoring='f1_macro')
print("f1: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

[0.99303944 0.98839009 0.9876161  0.98916409 0.99148607 0.99380805
 0.99148607 0.99767802 0.99458204 0.99767802 0.99613003 0.99226006
 0.99380805 0.9876161  0.98297214] 0.9918476243837381
f1: 0.61 (+/- 0.28)


In [16]:
pickle.dump(encoder_catg, open('model/encoder_catg.pckl', 'wb'))
pickle.dump(encoder_cont, open('model/encoder_cont.pckl', 'wb'))
pickle.dump(classifier,   open('model/random_forest_clf.pckl', 'wb'))

In [None]:
RandomizedSearchCV(classifier,)