In [59]:
# import modules

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [60]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin

class select_and_order_cols(BaseEstimator, TransformerMixin):
    """Select a fixed set of columns from a DataFrame, in a fixed order.

    Parameters
    ----------
    columns : sequence of str, optional
        Column names (in the desired order) to keep. When omitted, the
        transformer falls back to the columns of a notebook-global
        DataFrame named ``oli`` -- the behaviour of the original
        implementation. Passing ``columns`` explicitly removes that
        hidden-state dependency.
    target : str, default 'isFraud'
        Name excluded from the selection (the label column).
    """

    def __init__(self, columns=None, target='isFraud'):
        # scikit-learn convention: store constructor arguments unmodified.
        self.columns = columns
        self.target = target

    def fit(self, X, y=None):
        """Nothing is learned; return ``self`` so the step can be chained."""
        return self

    def transform(self, X):
        """Return a new DataFrame holding only the reference columns of ``X``.

        Raises
        ------
        NameError
            If ``columns`` was not given and no global ``oli`` exists.
        KeyError
            If ``X`` lacks one of the requested columns.
        """
        if self.columns is not None:
            reference = list(self.columns)
        else:
            # Legacy path: the original code read the column order from a
            # global DataFrame `oli` that no visible cell defines.
            try:
                reference = oli.columns.to_list()
            except NameError:
                raise NameError(
                    "select_and_order_cols needs `columns=` or a global "
                    "DataFrame named `oli` to take the column order from"
                ) from None

        cols_order = [name for name in reference if name != self.target]

        # Selecting with a list already yields a new frame, so no extra
        # copy of the full input is needed.
        return X[cols_order]


class emailTransformer(BaseEstimator, TransformerMixin):
    """Bucket uncommon ``P_emaildomain`` values into a single 'other' level.

    The transformer is stateless: ``fit`` learns nothing and ``transform``
    works on a copy, leaving the caller's DataFrame untouched.
    """

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """No-op fit; returns the transformer itself."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with non-listed e-mail domains set to 'other'.

        Any value not in the whitelist below -- missing values included,
        since they are not members of the list -- becomes ``'other'``.
        """
        valid_emails = ['gmail.com', 'hotmail.com', 'anonymous.com', 'yahoo.com']

        out = X.copy()
        is_known = out['P_emaildomain'].isin(valid_emails)
        out.loc[~is_known, 'P_emaildomain'] = 'other'

        return out
        
# Names of the columns handled by the three dedicated transformers defined
# below (id_30_transformer, id_33_transformer, DeviceInfo_transformer).
# NOTE(review): this list is not referenced by any other visible cell.
specific = ['id_30','id_33','DeviceInfo']

class id_30_transformer(BaseEstimator, TransformerMixin):
    """Bucket uncommon ``id_30`` values into a single 'other' level.

    Stateless transformer: nothing is learned in ``fit`` and ``transform``
    operates on a copy of its input.
    """

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """No-op fit.

        Returns ``self`` (the original returned ``None``), which
        ``fit_transform`` and ``Pipeline.fit`` rely on to chain calls.
        """
        return self

    def transform(self, X):
        """Return a copy of ``X`` with non-listed ``id_30`` values set to 'other'.

        Values outside the whitelist -- missing values included -- are
        replaced by ``'other'``.
        """
        valid_types = ['Windows 10', 'Windows 7', 'iOS 11.2.1',
                       'iOS 11.1.2', 'Android 7.0', 'Mac OS X 10_12_6', 'Mac OS X 10_11_6']

        X_ = X.copy()
        is_known = X_['id_30'].isin(valid_types)
        X_.loc[~is_known, 'id_30'] = 'other'

        return X_
        
    
class id_33_transformer(BaseEstimator, TransformerMixin):
    """Bucket uncommon ``id_33`` values into a single 'other' level.

    Stateless transformer: nothing is learned in ``fit`` and ``transform``
    operates on a copy of its input.
    """

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """No-op fit.

        Returns ``self`` (the original returned ``None``), which
        ``fit_transform`` and ``Pipeline.fit`` rely on to chain calls.
        """
        return self

    def transform(self, X):
        """Return a copy of ``X`` with non-listed ``id_33`` values set to 'other'.

        Values outside the whitelist -- missing values included -- are
        replaced by ``'other'``.
        """
        valid_types = ['1920x1080', '1366x768', '1334x750', '2208x1242', '1440x900',
                       '1600x900', '2048x1536', '1280x800', '2560x1600', '2560x1440',
                       '2880x1800', '1280x1024', '1680x1050', '1136x640']

        X_ = X.copy()
        is_known = X_['id_33'].isin(valid_types)
        X_.loc[~is_known, 'id_33'] = 'other'

        return X_
        
        
class DeviceInfo_transformer(BaseEstimator, TransformerMixin):
    """Bucket uncommon ``DeviceInfo`` values into a single 'other' level.

    Stateless transformer: nothing is learned in ``fit`` and ``transform``
    operates on a copy of its input.
    """

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """No-op fit.

        Returns ``self`` (the original returned ``None``), which
        ``fit_transform`` and ``Pipeline.fit`` rely on to chain calls.
        """
        return self

    def transform(self, X):
        """Return a copy of ``X`` with non-listed ``DeviceInfo`` values set to 'other'.

        Values outside the whitelist -- missing values included -- are
        replaced by ``'other'``.
        """
        valid_types = ['Windows', 'iOS Device', 'MacOS', 'Trident/7.0']

        X_ = X.copy()
        is_known = X_['DeviceInfo'].isin(valid_types)
        X_.loc[~is_known, 'DeviceInfo'] = 'other'

        return X_

In [61]:
# Numeric features: replace missing values with the sentinel -9999.
# `fill_value` is only honoured with strategy='constant'; under the default
# strategy ('mean') it is ignored and the column mean would be imputed.
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value=-9999))
])

In [62]:
# Categorical features: one-hot encode, ignoring categories that were not
# seen at fit time instead of raising.
# A most-frequent imputation step, ('imp', SimpleImputer(strategy='most_frequent')),
# was left disabled by the author and is still not part of the pipeline.
cat_pipeline = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

In [63]:
from sklearn.compose import ColumnTransformer

# Column groups routed to the numeric / categorical sub-pipelines.
num_attribs = ['id_17', 'TransactionDT', 'card3']
cat_attribs = [
    'id_15', 'id_35', 'id_16', 'id_28', 'id_29',
    'id_30', 'id_33', 'DeviceInfo', 'ProductCD', 'P_emaildomain',
]

# Stage 1: collapse rare category levels into 'other', one column per step.
first_pipeline = Pipeline(steps=[
    ('email', emailTransformer()),
    ('id_30', id_30_transformer()),
    ('id_33', id_33_transformer()),
    ('devinfo', DeviceInfo_transformer()),
])

# Stage 2: per-column-group preprocessing (imputation / one-hot encoding).
final_pipeline = ColumnTransformer(transformers=[
    ('num', num_pipeline, num_attribs),
    ('cat', cat_pipeline, cat_attribs),
])


In [64]:
from sklearn.ensemble import RandomForestClassifier

# Fixed seed so that the fitted forest -- and the submission derived from
# it -- is reproducible across kernel restarts.
forest = RandomForestClassifier(bootstrap=False, random_state=42)

In [65]:
# Load the two raw training tables.
train_t = pd.read_csv('Data/initial/train_transaction.csv')
train_i = pd.read_csv('Data/initial/train_identity.csv')

# Identity columns not already present in the transaction table, plus the
# join key.
cols_to_use = [*train_i.columns.difference(train_t.columns), 'TransactionID']

# Left join: every transaction is kept, identity fields are NaN when absent.
full_train = pd.merge(train_t, train_i[cols_to_use], on='TransactionID', how='left')

# Label, key and the features used by the model.
cols = [
    'isFraud', 'TransactionID', 'id_17', 'id_35', 'TransactionDT', 'card3',
    'id_15', 'id_16', 'id_28', 'id_29', 'id_30', 'id_33', 'DeviceInfo',
    'ProductCD', 'P_emaildomain',
]

train = full_train[cols]

In [66]:
# Split label from features, then collapse rare category levels.
y = train['isFraud']

X = first_pipeline.transform(train.drop(columns='isFraud'))

X.head()

Unnamed: 0,TransactionID,id_17,id_35,TransactionDT,card3,id_15,id_16,id_28,id_29,id_30,id_33,DeviceInfo,ProductCD,P_emaildomain
0,2987000,,,86400,150.0,,,,,other,other,other,W,other
1,2987001,,,86401,150.0,,,,,other,other,other,W,gmail.com
2,2987002,,,86469,150.0,,,,,other,other,other,W,other
3,2987003,,,86499,150.0,,,,,other,other,other,W,yahoo.com
4,2987004,166.0,T,86506,150.0,New,NotFound,New,NotFound,Android 7.0,other,other,H,gmail.com


In [83]:
# Export the cleaned training table (label column included) as CSV.
#
# The export is built under its own name: the original cell rebound the
# global `X` to a frame that still contained `isFraud`, so running the
# notebook top to bottom fed the label to `forest.fit(X, y)` as a feature.
y = train['isFraud']

train_export = first_pipeline.transform(train.copy())
train_export = pd.get_dummies(train_export).fillna(-9999)

train_export.to_csv("Data/r/train.csv")

In [67]:
# One-hot encode the categorical columns and replace remaining missing
# values with the sentinel -9999.
X = pd.get_dummies(X).fillna(-9999)

X.head()

Unnamed: 0,TransactionID,id_17,TransactionDT,card3,id_35_F,id_35_T,id_15_Found,id_15_New,id_15_Unknown,id_16_Found,...,ProductCD_C,ProductCD_H,ProductCD_R,ProductCD_S,ProductCD_W,P_emaildomain_anonymous.com,P_emaildomain_gmail.com,P_emaildomain_hotmail.com,P_emaildomain_other,P_emaildomain_yahoo.com
0,2987000,-9999.0,86400,150.0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0
1,2987001,-9999.0,86401,150.0,0,0,0,0,0,0,...,0,0,0,0,1,0,1,0,0,0
2,2987002,-9999.0,86469,150.0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0
3,2987003,-9999.0,86499,150.0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,1
4,2987004,166.0,86506,150.0,0,1,0,1,0,0,...,0,1,0,0,0,0,1,0,0,0


In [68]:
# Fit the forest on the full encoded training matrix X and labels y.
forest.fit(X,y)

RandomForestClassifier(bootstrap=False)

In [69]:
# Cross-validation is run on the first half of the rows only.
L = len(X)
half = int(L * 0.5)

# Explicit positional slicing of features and labels.
subset = X.iloc[:half]
subset_y = y.iloc[:half]

# Smaller forest for a quick score; seeded so the score is reproducible.
new_for = RandomForestClassifier(bootstrap=False, max_features=8, n_estimators=20,
                                 random_state=42)

In [70]:
from sklearn.model_selection import cross_val_score

# ROC-AUC of the small forest under the default cross-validation split.
# NOTE(review): the recorded mean (~0.37) is below the 0.5 of a random
# ranking -- worth investigating before trusting the model.
scores = cross_val_score(new_for, subset, subset_y, scoring='roc_auc')

scores.mean()

0.37284037551755966

In [71]:
# Load the two raw test tables (same directory as the training CSVs).
test_identity = pd.read_csv('Data/initial/test_identity.csv')
test_transaction = pd.read_csv('Data/initial/test_transaction.csv')

In [72]:
# Identity columns not already present in the transaction table, plus the
# join key.
cols_to_use = [*test_identity.columns.difference(test_transaction.columns), 'TransactionID']

In [73]:
# Left join: keep every test transaction, attach identity fields when present.
full_test = pd.merge(test_transaction, test_identity[cols_to_use],
                     on='TransactionID', how='left')

In [74]:
# Drop the label from the column list (the test data has no `isFraud`).
# Rebinding with a filter, unlike `cols.remove(...)`, can be re-run safely:
# a second execution is a no-op rather than a ValueError.
cols = [name for name in cols if name != 'isFraud']

In [82]:
# Feature columns only; filtering here keeps the cell valid whether or not
# the label was already removed from `cols`.
test = full_test[[name for name in cols if name != 'isFraud']]

X_test = first_pipeline.transform(test)

# Same encoding and the same missing-value sentinel (-9999) as the training
# matrix; the original used -99999 here.
X_test = pd.get_dummies(X_test).fillna(-9999)

X_test.to_csv('Data/r/test.csv')

# get_dummies was run separately on train and test, so their dummy columns
# can differ. Align to the columns the forest was fitted on: levels absent
# from the test data become all-zero columns, levels unseen in training are
# dropped, and the column order matches.
train_columns = getattr(forest, 'feature_names_in_', X.columns)
X_test = X_test.reindex(columns=train_columns, fill_value=0)

y_preds = forest.predict(X_test)

In [76]:
# Sum of the predicted labels (number of rows predicted as fraud).
y_preds.sum()

4390

In [77]:
# Wrap the prediction array in a Series named after the target column.
Y_preds = pd.Series(y_preds, name = 'isFraud')

In [78]:
# Two-column frame: TransactionID next to its predicted isFraud label.
output = pd.concat((full_test['TransactionID'], Y_preds), axis='columns')

In [79]:
# Write the predictions to disk without the DataFrame index column.
output.to_csv("Data/try2.csv", index=False)