**Simple data processing, no feature engineering, just encoding categoricals and imputing NAs.**

In [1]:
from FraudKagglePreProcessData import PreProcessData
from datetime import datetime
from datetime import datetime as dt
import pandas as pd
import numpy as np
import re
from sklearn.impute import SimpleImputer
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
import warnings
import os
from joblib import dump
from sklearn import metrics
import xgboost as xgb
warnings.filterwarnings("ignore")

<h2>Data loading and prep</h2>

In [2]:
def load_and_merge_data(transaction_csv,identity_csv,isTrain):
    """Read the transaction and identity CSVs and left-join them on TransactionID.

    Both files are read with ``TransactionID`` as their index. The join keeps
    every transaction row; identity columns are NaN for transactions that have
    no matching identity row.

    Parameters
    ----------
    transaction_csv : path of the transaction CSV.
    identity_csv : path of the identity CSV.
    isTrain : when truthy, the ``isFraud`` column is removed from the merged
        frame and returned separately as the labels.

    Returns
    -------
    (features, labels) : the merged frame, plus a one-column ``isFraud``
        DataFrame when ``isTrain`` is truthy and an empty list otherwise.
    """
    transactions = pd.read_csv(transaction_csv, index_col='TransactionID')
    identities = pd.read_csv(identity_csv, index_col='TransactionID')
    merged = transactions.merge(identities, on='TransactionID', how='left')
    # Drop the source frames as soon as the merged copy exists.
    del transactions, identities
    if not isTrain:
        return merged, []
    # pop() removes the target column from the features and hands it back.
    target = merged.pop('isFraud').to_frame()
    return merged, target


In [3]:
# Load the labelled training data and the unlabelled data to be scored,
# timing the two reads together.
start = datetime.now()
train, labels = load_and_merge_data(
    transaction_csv='./data/raw/train_transaction.csv',
    identity_csv='./data/raw/train_identity.csv',
    isTrain=True,
)
validate, vallabels = load_and_merge_data(
    transaction_csv='./data/raw/test_transaction.csv',
    identity_csv='./data/raw/test_identity.csv',
    isTrain=False,
)
finish = datetime.now()
print(f'time taken: {finish - start}')

time taken: 0:01:11.474487


In [161]:
#train.to_pickle('./data/interim/train_joined.pkl')
#test.to_pickle('./data/interim/test_joined.pkl')
#labels.to_pickle('./data/interim/labels.pkl')
#train = pd.read_pickle('./data/interim/train_joined.pkl')
#test = pd.read_pickle('./data/interim/test_joined.pkl')
#labels = pd.read_pickle('./data/interim/labels.pkl')

In [4]:
def get_lists_of_numerical_categorical(df,regex):
    """Split the column names of ``df`` into numerical and categorical lists.

    A name counts as categorical when ``regex`` matches at the start of it
    (``re.match`` semantics); every other name counts as numerical. Both lists
    keep the order in which ``df`` yields its names.

    Parameters
    ----------
    df : any iterable of column names (iterating a DataFrame yields them).
    regex : pattern identifying the categorical columns.

    Returns
    -------
    (numerical, categorical) : two lists of names.
    """
    pattern = re.compile(regex)
    names = list(df)
    categorical = [name for name in names if pattern.match(name)]
    numerical = [name for name in names if not pattern.match(name)]
    return numerical, categorical
start=datetime.now()
# Raw string literals: the pattern contains the regex escapes \d and \w, which
# are invalid escape sequences in an ordinary Python string literal. The
# adjacent literals concatenate to exactly the same pattern text as before.
cat_columns_regex = (
    r'ProductCD|card[1-6]|addr\d|\w_emaildomain|M[1-9]|time_|Device\w+|'
    r'id_12|id_13|id_14|id_15|id_16|id_17|id_18|id_19|id_20|'
    r'id_21|id_22|id_23|id_24|id_25|id_26|id_27|id_28|id_29|id_30|'
    r'id_31|id_32|id_33|id_34|id_35|id_36|id_37|id_38'
)
# Column names matching the pattern are treated as categorical, the rest as numerical.
numerical,categorical = get_lists_of_numerical_categorical(train,cat_columns_regex)
finish=datetime.now()
print('time taken: '+str(finish-start))

time taken: 0:00:00.002988


In [5]:
start=datetime.now()
def numerically_encode_string_categoricals(df, *others):
    """Label-encode the object-dtype columns of ``df`` (and ``others``) in place.

    For every object column of ``df`` a single ``LabelEncoder`` is fitted on
    the values of that column from ``df`` and from each frame in ``others``
    that has the column, and then applied to all of them. A given category
    therefore receives the same integer code in every frame. Fitting one
    encoder per frame would instead number the categories independently in
    each frame.

    Values are converted to text before encoding, so a missing value (NaN)
    becomes the category ``'nan'`` rather than staying missing.

    Parameters
    ----------
    df : DataFrame whose object columns drive the encoding; modified in place.
    *others : optional further DataFrames to encode with the same mappings;
        modified in place. Object columns of these frames that are not
        object columns of ``df`` are encoded on their own.

    Returns
    -------
    ``df`` itself (the same object that was passed in).
    """
    for col in df.columns:
        if df[col].dtype != 'object':
            continue
        # Every frame that carries this column shares one mapping.
        sharing = [df] + [other for other in others if col in other.columns]
        as_text = [frame[col].values.astype(str) for frame in sharing]
        lbl = preprocessing.LabelEncoder()
        lbl.fit(np.concatenate(as_text))
        for frame, text in zip(sharing, as_text):
            frame[col] = lbl.transform(text)
    # Shared columns are integer-coded by now; whatever is still object-typed
    # in an extra frame has no counterpart in df and is encoded independently.
    for other in others:
        for col in other.columns:
            if other[col].dtype == 'object':
                text = other[col].values.astype(str)
                other[col] = preprocessing.LabelEncoder().fit_transform(text)
    return df
# validate is encoded in place with the mappings shared with train.
train = numerically_encode_string_categoricals(train, validate)
finish=datetime.now()
print('time taken: '+str(finish-start))

time taken: 0:03:27.677484


In [6]:
# From kernel https://www.kaggle.com/gemartin/load-data-reduce-memory-usage
# WARNING! THIS CAN DAMAGE THE DATA 
def reduce_mem_usage(df, allow_float16=True):
    """Downcast the columns of ``df`` in place to smaller dtypes.

    * signed / unsigned integer columns are cast to the narrowest integer type
      of the same signedness whose range contains the column's min and max;
    * float columns are cast to the narrowest float type whose finite range
      contains the column's min and max. This check is about range only:
      float16 and float32 carry fewer significant digits, so values can be
      rounded. Pass ``allow_float16=False`` to stop at float32;
    * object columns are converted to ``category``;
    * every other dtype (bool, datetime, category, extension dtypes, ...) is
      left untouched, which also makes it safe to call the function again on
      a frame it has already processed.

    Columns are never widened, and a column whose min/max cannot be compared
    (for example an all-NaN column) keeps its dtype.

    Parameters
    ----------
    df : DataFrame to shrink; modified in place.
    allow_float16 : when True (default) float columns may be cast to float16.

    Returns
    -------
    ``df`` itself. Memory usage before and after is printed.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    signed = (np.int8, np.int16, np.int32)
    unsigned = (np.uint8, np.uint16, np.uint32)
    floats = (np.float16, np.float32) if allow_float16 else (np.float32,)

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object:
            df[col] = df[col].astype('category')
            continue

        # Only plain numpy integer/float columns are downcast. Testing the
        # dtype kind keeps bool and unsigned columns out of the float branch.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue

        if col_type.kind == 'i':
            candidates, limits = signed, np.iinfo
        elif col_type.kind == 'u':
            candidates, limits = unsigned, np.iinfo
        else:
            candidates, limits = floats, np.finfo

        c_min = df[col].min()
        c_max = df[col].max()
        for candidate in candidates:
            bounds = limits(candidate)
            # Comparisons against NaN are False, so such columns are skipped.
            if c_min >= bounds.min and c_max <= bounds.max:
                if np.dtype(candidate).itemsize < col_type.itemsize:
                    df[col] = df[col].astype(candidate)
                break

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    decrease = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(decrease))

    return df

In [7]:
# Downcast the training frame only; the same call for `validate` is left
# disabled below, so `validate` keeps its original dtypes.
start = datetime.now()
train = reduce_mem_usage(df=train)
finish = datetime.now()
print(f'time taken: {finish - start}')
#validate = reduce_mem_usage(validate)

Memory usage of dataframe is 1950.87 MB
Memory usage after optimization is: 527.14 MB
Decreased by 73.0%
time taken: 0:03:37.732491


Impute missing values on numerical features with median and categorical with mode

In [8]:
start=datetime.now()
def impute_cat_and_num(df,numerical,categorical,*others):
    """Fill missing values in place: median for numerical, mode for categorical.

    The imputers are fitted on ``df`` only and then applied to ``df`` and to
    every frame in ``others``. Extra frames are therefore filled with the
    statistics learned from ``df``, not with their own.

    Parameters
    ----------
    df : DataFrame the fill values are learned from; modified in place.
    numerical : column names filled with the column median.
    categorical : column names filled with the most frequent value.
    *others : optional further DataFrames to fill with the same values;
        modified in place. They must contain the listed columns.

    Returns
    -------
    ``df`` itself (the same object that was passed in).
    """
    plans = ((numerical, 'median'), (categorical, 'most_frequent'))
    for columns, strategy in plans:
        # Nothing to do for an empty column list.
        if len(columns) == 0:
            continue
        imputer = SimpleImputer(missing_values=np.nan, strategy=strategy)
        imputer.fit(df[columns])
        for frame in (df,) + others:
            frame[columns] = imputer.transform(frame[columns])
    return df
# validate is filled in place with the statistics learned from train.
train = impute_cat_and_num(train,numerical,categorical,validate)
finish=datetime.now()
print('time taken: '+str(finish-start))

time taken: 0:05:46.375678


In [9]:
# Downcast the training frame again now that imputation has run.
start = datetime.now()
train = reduce_mem_usage(df=train)
finish = datetime.now()
print(f'time taken: {finish - start}')

Memory usage of dataframe is 1950.87 MB
Memory usage after optimization is: 542.91 MB
Decreased by 72.2%
time taken: 0:03:17.304864


In [166]:
#train.to_pickle('./data/interim/train_joined.pkl')
#test.to_pickle('./data/interim/test_joined.pkl')
#labels.to_pickle('./data/interim/labels.pkl')
#train = pd.read_pickle('./data/interim/train_joined.pkl')
#test = pd.read_pickle('./data/interim/test_joined.pkl')
#labels = pd.read_pickle('./data/interim/labels.pkl')

Split train into train and test for validating initial model

In [10]:
# Hold out 20% of the labelled rows; the fixed seed keeps the split repeatable.
start = datetime.now()
X_train, X_test, y_train, y_test = train_test_split(
    train,
    labels,
    random_state=42,
    test_size=0.2,
)
finish = datetime.now()
print(f'time taken: {finish - start}')

time taken: 0:00:03.531045


<h3>Random Forest</h3>

In [168]:
# Restart the timer here; otherwise the elapsed time is measured from the
# `start` left behind by an earlier cell.
start=datetime.now()
rf = RandomForestClassifier(random_state=42,n_estimators = 100, max_depth=25)
# The labels are held as a one-column frame; hand the estimator a flat 1-D array.
rf.fit(X_train, y_train.values.ravel())
finish=datetime.now()
print('time taken: '+str(finish-start))
# Index the importances by the training columns (no `features` variable is
# defined anywhere in this notebook).
important_features = pd.DataFrame(rf.feature_importances_,X_train.columns).sort_values(0,ascending=False)
important_features.head(5)

time taken: 0:00:00.026927


Unnamed: 0,0
V203,0.071066
V186,0.066499
V232,0.059997
id_13,0.052245
V123,0.048959


In [169]:
# Score the random forest on the held-out split: hard predictions feed the
# accuracy, column 1 of the class probabilities feeds the ROC AUC.
y_pred_prob = rf.predict_proba(X_test)
y_pred = rf.predict(X_test)
print("Accuracy:", metrics.accuracy_score(y_true=y_test, y_pred=y_pred))
print("ROC area under:", metrics.roc_auc_score(y_true=y_test, y_score=y_pred_prob[:, 1]))

Accuracy: 0.99
ROC area under: 0.7244897959183674


In [170]:
# Results of earlier grid-search runs are optional: load them only if the file
# exists. The grid-search cell already copes with `all_results` being
# undefined, so a fresh checkout no longer fails here.
# NOTE: read_pickle can execute arbitrary code; only load files this notebook wrote.
if os.path.exists('./data/interim/grid_search_results.pkl'):
    all_results = pd.read_pickle('./data/interim/grid_search_results.pkl')

**Grid search using cross validation for hyperparameter tuning**

In [171]:
# Candidate hyper-parameters: two forest sizes at a single maximum depth.
param = {'n_estimators': [300, 500],
        'max_depth': [50]}
rf = RandomForestClassifier(random_state = 42)

# 3-fold cross-validated search scored by ROC AUC, using every available core.
gs = GridSearchCV(estimator=rf, param_grid=param, scoring='roc_auc', cv=3, n_jobs=-1, verbose=2)
gs_fit = gs.fit(X_train, y_train)

results = pd.DataFrame(gs_fit.cv_results_).sort_values('mean_test_score', ascending=False)

# Append to results loaded from earlier runs when there are any.
all_results = pd.concat([all_results,results]) if 'all_results' in locals() else results

Fitting 3 folds for each of 2 candidates, totalling 6 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   6 | elapsed:    2.1s remaining:    2.1s
[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed:    2.3s finished


In [172]:
# Best-scoring parameter sets first, showing only the columns of interest.
(
    all_results
    .sort_values('mean_test_score', ascending=False)
    [['params','mean_fit_time','mean_test_score']]
)

Unnamed: 0,params,mean_fit_time,mean_test_score
0,"{'max_depth': 50, 'n_estimators': 300}",0.462763,0.973303
0,"{'max_depth': 50, 'n_estimators': 300}",0.234372,0.973303
0,"{'max_depth': 50, 'n_estimators': 300}",0.222405,0.973303
0,"{'max_depth': 50, 'n_estimators': 300}",0.233708,0.973303
1,"{'max_depth': 50, 'n_estimators': 500}",0.728053,0.971394
1,"{'max_depth': 50, 'n_estimators': 500}",0.366038,0.971394
1,"{'max_depth': 50, 'n_estimators': 500}",0.349732,0.971394
1,"{'max_depth': 50, 'n_estimators': 500}",0.369346,0.971394


In [173]:
# Persist the accumulated grid-search results to the same file the earlier cell reads them from.
all_results.to_pickle('./data/interim/grid_search_results.pkl')

**Train new model with best hyper parameters from above**

In [174]:
%time
rf = RandomForestClassifier(random_state=42,n_estimators = 500, max_depth=25)
rf.fit(train, labels)
finish=datetime.now()

Wall time: 0 ns


**get predictions on un-labelled data:**

In [175]:
# Class probabilities from the random forest for every row of the unlabelled frame.
val_pred=rf.predict_proba(validate)

In [176]:
# Each run writes into its own timestamped folder.
datetimestring=dt.now().strftime("%d-%m-%Y_%H-%M-%S")
prediction_dir = './data/predictions/'+datetimestring
# makedirs also creates ./data/predictions when it does not exist yet;
# like mkdir, it still raises if this exact folder already exists.
os.makedirs(prediction_dir)
# Column 1 of predict_proba is used as the fraud score.
# NOTE: these two helper columns stay on `validate`; a later cell drops them
# again before `validate` is passed to another model.
validate['isFraud'] = val_pred[:,1]
validate['TransactionID']=list(validate.index)
validate[['TransactionID','isFraud']].to_csv(prediction_dir+'/prediction_results.csv',index = False)
# Store the fitted model next to its predictions.
dump(rf, prediction_dir+'/random_foest_base.joblib')

['./data/predictions/13-09-2019_12-15-36/random_foest_base.joblib']

<h3>XGBoost</h3>

In [None]:
start=datetime.now()
# This cell rebinds the name `xgb` to the fitted classifier (the following
# cells call xgb.predict_proba), which hides the module imported as `xgb`.
# Building the model through the full module name keeps the cell re-runnable:
# on a second run `xgb.XGBClassifier` would be looked up on the classifier.
import xgboost
xgb = xgboost.XGBClassifier(
        n_estimators=500,
        max_depth=9,
        learning_rate=0.05,
        subsample=0.9,
        colsample_bytree=0.9,
        tree_method='auto'
    )

xgb.fit(X_train, y_train)
finish=datetime.now()
print('time taken: '+str(finish-start))

In [178]:
# Score the XGBoost model on the held-out split: hard predictions feed the
# accuracy, column 1 of the class probabilities feeds the ROC AUC.
y_pred_prob = xgb.predict_proba(X_test)
y_pred = xgb.predict(X_test)
print("Accuracy:", metrics.accuracy_score(y_true=y_test, y_pred=y_pred))
print("ROC area under:", metrics.roc_auc_score(y_true=y_test, y_score=y_pred_prob[:, 1]))

Accuracy: 0.99
ROC area under: 0.8316326530612245


In [179]:
# Remove the helper columns added when the previous predictions were saved,
# so the model sees only feature columns. errors='ignore' keeps the cell
# re-runnable when the columns are already gone.
validate=validate.drop(['isFraud','TransactionID'], axis=1, errors='ignore')
val_pred=xgb.predict_proba(validate)

In [180]:
# Each run writes into its own timestamped folder.
datetimestring=dt.now().strftime("%d-%m-%Y_%H-%M-%S")
prediction_dir = './data/predictions/'+datetimestring
# makedirs also creates ./data/predictions when it does not exist yet;
# like mkdir, it still raises if this exact folder already exists.
os.makedirs(prediction_dir)
# Column 1 of predict_proba is used as the fraud score.
validate['isFraud'] = val_pred[:,1]
validate['TransactionID']=list(validate.index)
validate[['TransactionID','isFraud']].to_csv(prediction_dir+'/prediction_results.csv',index = False)
# Save the XGBoost model (`xgb`) under the XGBoost file name; dumping `rf`
# here would store the random forest instead.
dump(xgb, prediction_dir+'/XGBoost.joblib')

['./data/predictions/13-09-2019_12-15-38/XGBoost.joblib']