Function to use methods from FraudKagglePreProcessData module to process train or test data.  Saves result as a df and as a pickled file for easy loading later.

In [37]:
from FraudKagglePreProcessData import PreProcessData
from datetime import datetime
def process_data(transaction_path,identity_path,name,isTrain,model_feature_path=None):
    start=datetime.now()
    print(str(datetime.now())+' load_and_merge_data...')
    df,labels  = PreProcessData.load_and_merge_data(transaction_path,identity_path,isTrain=isTrain)
    print('df shape: ' + str(df.shape))
    
    print(str(datetime.now())+' process_dates...')
    df = PreProcessData.process_dates(df)
    
    print(str(datetime.now())+' process_device_names...')
    df = PreProcessData.process_device_names(df)
    
    print(str(datetime.now())+' get_lists_of_numerical_categorical...')
    numerical,categorical = PreProcessData.get_lists_of_numerical_categorical(df,'ProductCD|card[1-6]|addr\d|\w_emaildomain|M[1-9]|time_|Device\w+|id_12|id_13|id_14|id_15|id_16|id_17|id_18|id_19|id_20|id_21|id_22|id_23|id_24|id_25|id_26|id_27|id_28|id_29|id_30|id_31|id_32|id_33|id_34|id_35|id_36|id_37|id_38')
    
    print(str(datetime.now())+' convert_numerical_categorical_to_strings...')
    df = PreProcessData.convert_numerical_categorical_to_strings(df,'ProductCD|card[1-6]|addr\d|\w_emaildomain|M[1-9]|time_|Device\w+|id_12|id_13|id_14|id_15|id_16|id_17|id_18|id_19|id_20|id_21|id_22|id_23|id_24|id_25|id_26|id_27|id_28|id_29|id_30|id_31|id_32|id_33|id_34|id_35|id_36|id_37|id_38')
    
    print(str(datetime.now())+' impute_missing_values...')
    df = PreProcessData.impute_missing_values(df,numerical,categorical)
    
    print(str(datetime.now())+' reduce_columns_with_PCA...')
    df,numerical,categorical = PreProcessData.reduce_columns_with_PCA(df,'^V.*',30,numerical,categorical)
    
    print(str(datetime.now())+' assign_low_freq_values_as_other_in_df...')
    df = PreProcessData.assign_low_freq_values_as_other_in_df(df,50,categorical)
    print('df shape: ' + str(df.shape))
    
    print(str(datetime.now())+' scale_numerical_fields...')
    df = PreProcessData.scale_numerical_fields(df,numerical)
    
    print(str(datetime.now())+' one_hot_encode_and_merge_with_numerical...')
    df = PreProcessData.one_hot_encode_and_merge_with_numerical(df,numerical,categorical,labels,isTrain)
    print('df shape: ' + str(df.shape))
    
    print(str(datetime.now())+' reduce_mem_usage...')
    df,NAs = PreProcessData.reduce_mem_usage(df)
          
    if isTrain == False:
        print(str(datetime.now())+' reconcile_features_from_test_to_train...')
        df = PreProcessData.reconcile_features_from_test_to_train(df,model_feature_path)
    
    print(str(datetime.now())+' pickle_df_and_columns...')
    PreProcessData.pickle_df_and_columns(df,name,isTrain)
    finish=datetime.now()
    print('time taken: '+str(finish-start))
    return df
#train = process_data('./data/raw/train_transaction_med.csv','./data/raw/train_identity_med.csv','train_med_processed',isTrain=True)
#validate = process_data('./data/raw/test_transaction_med.csv','./data/raw/test_identity_med.csv','test_med_processed',isTrain=False,model_feature_path='./data/processed/train_med_processed_feature_names.pkl')

#train = process_data('./data/raw/train_transaction.csv','./data/raw/train_identity.csv','train__processed',isTrain=True)
validate = process_data('./data/raw/test_transaction.csv','./data/raw/test_identity.csv','test_processed',isTrain=False,model_feature_path='./data/processed/train_processed_feature_names.pkl')

2019-09-05 13:29:29.878206 load_and_merge_data...
df shape: (506691, 432)
2019-09-05 13:29:54.997928 process_dates...
2019-09-05 13:29:59.549813 process_device_names...
2019-09-05 13:30:07.063725 get_lists_of_numerical_categorical...
2019-09-05 13:30:07.064722 convert_numerical_categorical_to_strings...
2019-09-05 13:30:17.540899 impute_missing_values...




2019-09-05 13:30:49.385464 reduce_columns_with_PCA...
2019-09-05 13:31:04.011893 assign_low_freq_values_as_other_in_df...
df shape: (506691, 125)
2019-09-05 13:34:59.123403 scale_numerical_fields...
2019-09-05 13:35:05.619743 one_hot_encode_and_merge_with_numerical...
df shape: (506691, 1182)
2019-09-05 13:35:11.403272 reduce_mem_usage...
Memory usage of properties dataframe is : 841.9541463851929  MB
___MEMORY USAGE AFTER COMPLETION:___
Memory usage is:  700.8544340133667  MB
This is  83.24140180583264 % of the initial size
2019-09-05 13:35:46.299778 reconcile_features_from_test_to_train...
2019-09-05 13:36:31.761861 pickle_df_and_columns...
time taken: 0:07:06.908495


Create base model on train data

In [34]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
import pandas as pd
from sklearn import metrics
X_train, X_test, y_train, y_test = train_test_split(train.loc[:, train.columns != 'isFraud'], train['isFraud'], test_size=0.2,random_state=42)
rf = RandomForestClassifier(random_state=42,n_estimators = 10, max_depth=25)
rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=25, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

Evaluate model with test data

In [None]:
y_pred_prob=rf.predict_proba(X_test)
y_pred=rf.predict(X_test)
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
print("ROC area under:",metrics.roc_auc_score(y_test, y_pred_prob[:,1]))

Predict on validate data with model

In [35]:
y_pred=rf.predict_proba(validate)

ValueError: Number of features of the model must match the input. Model n_features is 842 and input n_features is 531 

Grid search CV on train data 

In [16]:
from sklearn.model_selection import GridSearchCV
import warnings
warnings.filterwarnings("ignore")
rf = RandomForestClassifier(random_state = 42)
param = {'n_estimators': [10,500,1000],
        'max_depth': [25,50]}

gs = GridSearchCV(rf, param, cv=5, n_jobs=-1,scoring='roc_auc')
gs_fit = gs.fit(X_train, y_train)
pd.DataFrame(gs_fit.cv_results_)[['params','mean_fit_time','mean_test_score']].sort_values('mean_test_score', ascending=False)
#grid_results=[]
#grid_results = pd.concat([grid_results,pd.DataFrame(gs_fit.cv_results_)]).sort_values('mean_test_score', ascending=False)
#grid_results = pd.DataFrame(gs_fit.cv_results_).sort_values('mean_test_score', ascending=False)
#grid_results.to_pickle("./grid_search_results.pkl")
#grid_results[['params','mean_fit_time','mean_test_score']]

Unnamed: 0,params,mean_fit_time,mean_test_score
1,"{'max_depth': 25, 'n_estimators': 500}",1.195341,0.958123
4,"{'max_depth': 50, 'n_estimators': 500}",1.144995,0.958123
2,"{'max_depth': 25, 'n_estimators': 1000}",2.388538,0.953047
5,"{'max_depth': 50, 'n_estimators': 1000}",1.940876,0.953047
0,"{'max_depth': 25, 'n_estimators': 10}",0.035347,0.791869
3,"{'max_depth': 50, 'n_estimators': 10}",0.038656,0.791869


In [14]:
import sys
def sizeof_fmt(num, suffix='B'):
    ''' By Fred Cirera, after https://stackoverflow.com/a/1094933/1870254'''
    for unit in ['','Ki','Mi','Gi','Ti','Pi','Ei','Zi']:
        if abs(num) < 1024.0:
            return "%3.1f%s%s" % (num, unit, suffix)
        num /= 1024.0
    return "%.1f%s%s" % (num, 'Yi', suffix)

for name, size in sorted(((name, sys.getsizeof(value)) for name,value in locals().items()),
                         key= lambda x: -x[1])[:10]:
    print("{:>30}: {:>8}".format(name,sizeof_fmt(size)))

                          test:   1.7MiB
                         train: 538.5KiB
                       X_train: 414.2KiB
                        X_test: 103.8KiB
                        y_pred:   7.9KiB
                       y_train:   3.5KiB
        RandomForestClassifier:   2.0KiB
                           _i2:   1.8KiB
                           _i1:   1.8KiB
                           _i6:   1.8KiB
