In [76]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.feature_selection import mutual_info_regression
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import OneHotEncoder
from xgboost import XGBRegressor
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from imblearn.over_sampling import SMOTE
from CustomTransformers import StringCleanTransformer, DayTimeTransformer, DropperTransformer, SignTransformer, OHTransformer

In [75]:
train = pd.read_csv("data/training.csv")
test = pd.read_csv("data/test.csv")

In [64]:
train = train.sample(frac=1).reset_index(drop=True)
train_Y = train.FraudResult
train.drop(['FraudResult'], axis=1, inplace=True)
train_transaction_id = train.TransactionId
train.drop(['TransactionId'], axis=1, inplace=True)

test_transaction_id = test.TransactionId
test.drop(['TransactionId'], axis=1, inplace=True)


In [65]:
def getDay(x):
    return float(''.join(x.split("T")[0].split("-")))

def getTime(x):
    time = x.split("T")[1].split(":")
    time[-1] = time[-1][:-1]
    return float(''.join(time))

train.head()

Unnamed: 0,BatchId,AccountId,SubscriptionId,CustomerId,CurrencyCode,CountryCode,ProviderId,ProductId,ProductCategory,ChannelId,Amount,Value,TransactionStartTime,PricingStrategy
0,BatchId_7941,AccountId_4841,SubscriptionId_3829,CustomerId_3403,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-70.0,70,2019-01-18T15:24:43Z,2
1,BatchId_77729,AccountId_4421,SubscriptionId_4038,CustomerId_4878,UGX,256,ProviderId_6,ProductId_10,airtime,ChannelId_3,300000.0,300000,2018-12-20T16:58:30Z,2
2,BatchId_49288,AccountId_4841,SubscriptionId_3829,CustomerId_3634,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-50.0,50,2018-12-12T18:16:27Z,2
3,BatchId_6696,AccountId_4841,SubscriptionId_3829,CustomerId_2651,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-20.0,20,2019-01-25T05:03:41Z,2
4,BatchId_109336,AccountId_1336,SubscriptionId_1035,CustomerId_1699,UGX,256,ProviderId_5,ProductId_10,airtime,ChannelId_3,10000.0,10000,2018-12-07T16:24:44Z,4


In [66]:
# hot_cols = ["ProductCategory", "ProviderId", "ChannelId", "Sign", "PricingStrategy"]
hot_cols = ["ProductCategory"]

preprocessor = Pipeline(steps = [
    ("shuffle", StringCleanTransformer()),
    ("day_time_separator", DayTimeTransformer()),
    ("amout to sign", SignTransformer()),
    ("One hot encoding", OHTransformer(hot_cols)),
    ("Dropper", DropperTransformer()),
])

res = preprocessor.fit_transform(train)

# res["ChannelId4"] = 0

SMOTE = SMOTE()
smote_X, smote_Y = SMOTE.fit_resample(res, train_Y)

res.head()






Unnamed: 0,AccountId,ProviderId,ChannelId,Value,TransactionStartTime,PricingStrategy,TransactionStartDay,Sign,ProductCategory0,ProductCategory1,ProductCategory2,ProductCategory3,ProductCategory4,ProductCategory5,ProductCategory6,ProductCategory7,ProductCategory8
0,4841,4,2,70,152443.0,2,20190118.0,False,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,4421,6,3,300000,165830.0,2,20181220.0,True,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4841,4,2,50,181627.0,2,20181212.0,False,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,4841,4,2,20,50341.0,2,20190125.0,False,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,1336,5,3,10000,162444.0,4,20181207.0,True,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [67]:
test = preprocessor.fit_transform(test)
# ch4 = test.ChannelId4
# test.drop(["ChannelId4"], axis=1, inplace=True)
# test["ChannelId4"] = ch4
test.head()



Unnamed: 0,AccountId,ProviderId,ChannelId,Value,TransactionStartTime,PricingStrategy,TransactionStartDay,Sign,ProductCategory0,ProductCategory1,ProductCategory2,ProductCategory3,ProductCategory4,ProductCategory5,ProductCategory6,ProductCategory7,ProductCategory8
0,2441,5,3,1000,100140.0,4,20190213.0,True,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,3439,5,3,2000,100212.0,2,20190213.0,True,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,4841,4,2,50,100230.0,2,20190213.0,False,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,2685,5,3,3000,100238.0,4,20190213.0,True,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4841,4,2,60,100258.0,2,20190213.0,False,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [68]:
#import xgboost
from xgboost import XGBClassifier
# my_model = XGBClassifier(n_estimators=500)
# #change dtype of all collumns to float
X = smote_X.astype('float32')
Y = smote_Y.astype('float32')
# my_model.fit(X,Y)

# res_xg = my_model.predict(test.astype('float32'))


# output = pd.DataFrame()
# output["TransactionId"] = test_transaction_id
# output["FraudResult"] = res_xg

# output.to_csv("wtf.csv", index=False)



In [69]:
# output.to_csv("wtf.csv", index=False)

In [70]:
from sklearn.model_selection import GridSearchCV

model = XGBClassifier()

param_grid = {'max_depth': [2, 4,40], 'n_estimators': [30,100,500], 'learning_rate': [0.1, 0.4, 0.5,0.7]}
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='f1')
grid_search.fit(X, Y)

# On print les meilleurs hyperparamètres
print(grid_search.best_params_)

# {'learning_rate': 0.4, 'max_depth': 40, 'n_estimators': 500}

{'learning_rate': 0.4, 'max_depth': 40, 'n_estimators': 500}


In [74]:
print(grid_search.best_score_)
print(grid_search.best_estimator_)
print(grid_search.cv_results_)

resgrid = grid_search.best_estimator_.predict(test.astype('float32'))

output = pd.DataFrame()
output["TransactionId"] = test_transaction_id
output["FraudResult"] = resgrid

output.to_csv("wtf.csv", index=False)



0.9998062559896506
XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.4, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=40, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=500, n_jobs=None, num_parallel_tree=None,
              predictor=None, random_state=None, ...)
{'mean_fit_time': array([  0.87044091,   3.37588496,  16.65001116,   2.14391441,
         7.02623253,  35.14794097,   4.79242835,  14.39227128,
        48.67503924,   1.26933351,   3.88614564,  25.9864603 ,
         3.00018239,   8.

In [72]:

# model = XGBClassifier(early_stopping_rounds = 10)

# # param_grid = {'max_depth': [2, 4, 6, 10, 12,20,40], 'n_estimators': [ 40, 50,60, 70,100,500], 'learning_rate': [0.1, 0.2, 0.3, 0.4, 0.5,0.7]}

# param_grid = {'max_depth': [2, 4, 6,10], 'n_estimators': [ 40,60,100,500], 'learning_rate': [ 0.3, 0.4, 0.5,0.7]}
# grid_search2 = GridSearchCV(model, param_grid, cv=5, scoring='f1')
# grid_search2.fit(X, Y)

# # On print les meilleurs hyperparamètres
# print(grid_search2.best_params_)

In [73]:
# model = XGBClassifier(,early_stopping_rounds = 100)

# # param_grid = {'max_depth': [2, 4, 6, 10, 12,20,40], 'n_estimators': [40, 50,60, 70,100,500], 'learning_rate': [0.1, 0.2, 0.3, 0.4, 0.5,0.7]}
# param_grid = {'max_depth': [2, 4, 6,10], 'n_estimators': [ 40,60,100,500], 'learning_rate': [ 0.3, 0.4, 0.5,0.7]}

# grid_search3 = GridSearchCV(model, param_grid, cv=5, scoring='f1')
# grid_search3.fit(X, Y)

# # On print les meilleurs hyperparamètres
# print(grid_search3.best_params_)

In [None]:
# split au tout debut avant feature engineering et donner le pipeline au gridshearch pour eviter leak et smote apres le split aussi
#peut etre split par jour de le semaine