In [128]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.feature_selection import mutual_info_regression
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import OneHotEncoder
from xgboost import XGBRegressor
from sklearn.pipeline import Pipeline as skPipeline
from imblearn.pipeline import Pipeline as imbPipeline
from sklearn.compose import ColumnTransformer
from imblearn.over_sampling import SMOTE
from CustomTransformers import StringCleanTransformer, DayTimeTransformer, DropperTransformer, SignTransformer, OHTransformer
from sklearn.model_selection import train_test_split

In [129]:
# Load the raw training and test sets. Paths are relative to the directory the
# notebook is run from.
TRAIN_CSV = "data/training.csv"
TEST_CSV = "data/test.csv"

train = pd.read_csv(TRAIN_CSV)
test = pd.read_csv(TEST_CSV)

In [130]:
# Separate the label and the transaction identifiers from the feature columns.
#
# The guards make this cell safe to re-run: the previous in-place drops raised on
# a second execution because the columns had already been removed from the frame.
if "FraudResult" in train.columns:
    # Shuffle the rows; the fixed seed keeps every downstream result reproducible.
    train = train.sample(frac=1, random_state=42).reset_index(drop=True)
    train_Y = train["FraudResult"]
    train_transaction_id = train["TransactionId"]
    train = train.drop(columns=["FraudResult", "TransactionId"])

if "TransactionId" in test.columns:
    # Kept aside so the submission file can be keyed by TransactionId later.
    test_transaction_id = test["TransactionId"]
    test = test.drop(columns=["TransactionId"])

# Column names of the raw features. Not referenced by the cells visible in this
# notebook; kept as a record of the input schema.
features = ["BatchId","AccountId","SubscriptionId","CustomerId","ProviderId","ProductId","ProductCategory","ChannelId","Amount","Value","TransactionStartTime","PricingStrategy"]


In [131]:
def getDay(x):
    """Return the date part of an ISO-like timestamp as a float.

    The text before the first "T" is taken, its "-" separators are removed and
    the remaining characters are parsed with ``float``; for example
    "2019-02-08T11:32:33Z" gives 20190208.0.

    Raises ValueError if the remaining characters are not a valid float literal.
    """
    date_part, _, _ = x.partition("T")
    return float(date_part.replace("-", ""))

def getTime(x):
    """Return the time part of an ISO-like timestamp as a float.

    The segment after the first "T" is taken, its last character is dropped
    unconditionally (the trailing "Z" in the data shown in this notebook), the
    ":" separators are removed and the result is parsed with ``float``; for
    example "2019-02-08T11:32:33Z" gives 113233.0.

    Raises IndexError if the input contains no "T", and ValueError if the
    remaining characters are not a valid float literal.
    """
    clock = x.split("T")[1]
    digits = clock[:-1].replace(":", "")
    return float(digits)

# Preview the first rows of the shuffled training features; the FraudResult and
# TransactionId columns were split off in the previous cell.
train.head()

Unnamed: 0,BatchId,AccountId,SubscriptionId,CustomerId,CurrencyCode,CountryCode,ProviderId,ProductId,ProductCategory,ChannelId,Amount,Value,TransactionStartTime,PricingStrategy
0,BatchId_110408,AccountId_4841,SubscriptionId_3829,CustomerId_4840,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-50.0,50,2019-02-08T11:32:33Z,2
1,BatchId_30008,AccountId_968,SubscriptionId_2414,CustomerId_1317,UGX,256,ProviderId_6,ProductId_10,airtime,ChannelId_3,5000.0,5000,2019-02-03T10:14:49Z,2
2,BatchId_31079,AccountId_318,SubscriptionId_3087,CustomerId_647,UGX,256,ProviderId_6,ProductId_3,airtime,ChannelId_3,1000.0,1000,2018-12-27T05:37:51Z,2
3,BatchId_59150,AccountId_1228,SubscriptionId_4192,CustomerId_1588,UGX,256,ProviderId_1,ProductId_15,financial_services,ChannelId_3,5000.0,5000,2018-12-03T11:30:43Z,2
4,BatchId_82899,AccountId_318,SubscriptionId_3087,CustomerId_647,UGX,256,ProviderId_6,ProductId_3,airtime,ChannelId_3,500.0,500,2019-01-05T04:39:40Z,2


In [132]:
# Columns passed to the one-hot encoding step. Only ProductCategory is encoded
# for now; the wider set tried earlier is kept for reference:
# hot_cols = ["ProductCategory", "ProviderId", "ChannelId", "Sign", "PricingStrategy"]
hot_cols = ["ProductCategory"]

# Feature-engineering pipeline built from the project's CustomTransformers module.
# The step names are runtime identifiers and are deliberately left unchanged.
preprocessor = skPipeline(steps=[
    ("shuffle", StringCleanTransformer()),
    ("day_time_separator", DayTimeTransformer()),
    ("amout to sign", SignTransformer()),
    ("One hot encoding", OHTransformer(hot_cols)),
    ("Dropper", DropperTransformer()),
])

processed_train = preprocessor.fit_transform(train)

# Hold out 20% of the rows for validation. The split is stratified on the label
# so both parts keep the class ratio that SMOTE is used to correct below, and it
# is seeded so the notebook gives the same split on every run.
X_train, X_validation, Y_train, Y_validation = train_test_split(
    processed_train, train_Y, test_size=0.20, stratify=train_Y, random_state=42
)

# Oversample the training split only; the validation split stays untouched.
# The sampler gets its own name: the previous `SMOTE = SMOTE()` rebound the
# imported class to an instance, so re-running the cell raised
# "'SMOTE' object is not callable".
smote = SMOTE(random_state=42)
smote_X, smote_Y = smote.fit_resample(X_train, Y_train)

X_validation.head()






Unnamed: 0,AccountId,ProviderId,ChannelId,Value,TransactionStartTime,PricingStrategy,TransactionStartDay,Sign,ProductCategory0,ProductCategory1,ProductCategory2,ProductCategory3,ProductCategory4,ProductCategory5,ProductCategory6,ProductCategory7,ProductCategory8
81062,4841,4,2,5000,161006.0,2,20181207.0,False,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
44326,678,6,3,1000,161350.0,2,20181129.0,True,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
43918,4841,4,2,50,43652.0,2,20190211.0,False,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
29969,4249,4,2,10000,171748.0,4,20181204.0,False,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
64351,2443,5,3,11200,164406.0,2,20181207.0,True,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [133]:
# Apply the preprocessing that was already fitted on the training data.
# Using `transform` (not `fit_transform`) keeps the test features on the same
# encoding as the features the model is trained on; re-fitting on the test set
# would let any fitted step learn a different mapping from the test rows.
# NOTE(review): this overwrites `test`, so the cell cannot be re-run without
# reloading the data first.
test = preprocessor.transform(test)
# Earlier manual column reordering, kept for reference:
# ch4 = test.ChannelId4
# test.drop(["ChannelId4"], axis=1, inplace=True)
# test["ChannelId4"] = ch4
test.head()



Unnamed: 0,AccountId,ProviderId,ChannelId,Value,TransactionStartTime,PricingStrategy,TransactionStartDay,Sign,ProductCategory0,ProductCategory1,ProductCategory2,ProductCategory3,ProductCategory4,ProductCategory5,ProductCategory6,ProductCategory7,ProductCategory8
0,2441,5,3,1000,100140.0,4,20190213.0,True,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,3439,5,3,2000,100212.0,2,20190213.0,True,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,4841,4,2,50,100230.0,2,20190213.0,False,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,2685,5,3,3000,100238.0,4,20190213.0,True,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4841,4,2,60,100258.0,2,20190213.0,False,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [134]:
from xgboost import XGBClassifier

# Cast the SMOTE-resampled training data to float32. From this cell onward,
# X_train / Y_train refer to the resampled data rather than the original split.
X_train, Y_train = smote_X.astype('float32'), smote_Y.astype('float32')

# Earlier baseline, not executed (superseded by the grid searches below):
#   my_model = XGBClassifier(n_estimators=500)
#   my_model.fit(X, Y)
#   res_xg = my_model.predict(test.astype('float32'))
#   output = pd.DataFrame()
#   output["TransactionId"] = test_transaction_id
#   output["FraudResult"] = res_xg
#   output.to_csv("wtf.csv", index=False)



In [135]:
# output.to_csv("wtf.csv", index=False)

In [145]:
from sklearn.model_selection import GridSearchCV

model = XGBClassifier()

# Wider grid tried earlier, kept for reference:
# param_grid = {'max_depth': [2, 4, 6, 10, 12,20,40], 'n_estimators': [30, 40, 50,60, 70,100,500], 'learning_rate': [0.1, 0.2, 0.3, 0.4, 0.5,0.7]}
param_grid = {'max_depth': [2, 4,40], 'n_estimators': [100,500], 'learning_rate': [0.4, 0.5]}

grid_search = GridSearchCV(model, param_grid, cv=5, scoring='f1')
# Fit on the float32, SMOTE-resampled training data prepared in the cells above.
# The previous call used `X` / `Y`, which are not defined by any live cell in this
# notebook and raise NameError on a fresh kernel.
grid_search.fit(X_train, Y_train)

# Print the best hyperparameters
print(grid_search.best_params_)

# Presumably the result of an earlier run:
# {'learning_rate': 0.5, 'max_depth': 2, 'n_estimators': 500}

KeyboardInterrupt: 

In [137]:
from sklearn.model_selection import train_test_split

In [148]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, recall_score

# Validation split handed to XGBoost as the early-stopping evaluation set.
early_stop_eval_set = [
    (X_validation.astype('float32'), Y_validation.astype('float32')),
]

# Stop boosting once the validation AUC has not improved for 10 rounds.
# Previously tried: XGBClassifier(early_stopping_rounds = 10, eval_metric='rmse')
model = XGBClassifier(eval_metric='auc', early_stopping_rounds=10)

# Grids explored earlier, kept for reference:
# param_grid = {'max_depth': [2, 4, 6, 10, 12,20,40], 'n_estimators': [ 40, 50,60, 70,100,500], 'learning_rate': [0.1, 0.2, 0.3, 0.4, 0.5,0.7]}
# param_grid = {'max_depth': [2, 4, 6,10], 'n_estimators': [ 40,300,500,600], 'learning_rate': [ 0.3,0.4,0.5,0.6]}
param_grid = {
    'max_depth': [4],
    'n_estimators': [500, 600],
    'learning_rate': [0.4, 0.5],
}

# NOTE(review): X_train / Y_train are the SMOTE-resampled data here, so the
# cross-validation folds contain synthetic samples; the F1 reported by the search
# is likely optimistic compared with unseen data - confirm on a held-out set.
grid_search2 = GridSearchCV(estimator=model, param_grid=param_grid, scoring='f1')
grid_search2.fit(X_train, Y_train, eval_set=early_stop_eval_set)

# Print the best hyperparameters
print(grid_search2.best_params_)
# Presumably the result of an earlier run:
# {'learning_rate': 0.5, 'max_depth': 4, 'n_estimators': 500}

[0]	validation_0-auc:0.99560
[1]	validation_0-auc:0.99961
[2]	validation_0-auc:0.99966
[3]	validation_0-auc:0.99967
[4]	validation_0-auc:0.99967
[5]	validation_0-auc:0.99967
[6]	validation_0-auc:0.99972
[7]	validation_0-auc:0.99991
[8]	validation_0-auc:0.99991
[9]	validation_0-auc:0.99991
[10]	validation_0-auc:0.99992
[11]	validation_0-auc:0.99992
[12]	validation_0-auc:0.99995
[13]	validation_0-auc:0.99994
[14]	validation_0-auc:0.99999
[15]	validation_0-auc:0.99999
[16]	validation_0-auc:0.99998
[17]	validation_0-auc:0.99999
[18]	validation_0-auc:0.99998
[19]	validation_0-auc:0.99998
[20]	validation_0-auc:0.99997
[21]	validation_0-auc:0.99998
[22]	validation_0-auc:0.99998
[23]	validation_0-auc:0.99997
[24]	validation_0-auc:0.99999
[25]	validation_0-auc:0.99998
[26]	validation_0-auc:0.99998
[27]	validation_0-auc:0.99999
[0]	validation_0-auc:0.99560
[1]	validation_0-auc:0.99970
[2]	validation_0-auc:0.99970
[3]	validation_0-auc:0.99970
[4]	validation_0-auc:0.99973
[5]	validation_0-auc:0.99

In [149]:
# Best cross-validated score found by the search (scoring='f1'). The search was
# fitted on the SMOTE-resampled training data, so read this number with care.
print(grid_search2.best_score_)

0.999489590883986


In [None]:
# Predict on the preprocessed test set with the best estimator found by the
# second grid search, then write the submission file.
#
# Earlier single-model variant, not executed:
#   model = XGBClassifier(early_stopping_rounds = 10,n_estimators=500,learning_rate=0.5,max_depth=4)
#   X_train, X_validation, Y_train, Y_validation = train_test_split(X, Y, test_size=0.20)
#   model.fit(X_train,Y_train,eval_set=[ (X_validation, Y_validation)])
#   res_xg = my_model.predict(test.astype('float32'))
best_model = grid_search2.best_estimator_
resgrid2 = best_model.predict(test.astype('float32'))

# One row per test transaction: its identifier and the predicted label.
output = pd.DataFrame({
    "TransactionId": test_transaction_id,
    "FraudResult": resgrid2,
})

output.to_csv("wtf.csv", index=False)

In [None]:
# model = XGBClassifier(early_stopping_rounds = 100)

# # param_grid = {'max_depth': [2, 4, 6, 10, 12,20,40], 'n_estimators': [40, 50,60, 70,100,500], 'learning_rate': [0.1, 0.2, 0.3, 0.4, 0.5,0.7]}
# # param_grid = {'max_depth': [2, 4, 6,10], 'n_estimators': [ 40,60,100,500], 'learning_rate': [ 0.3, 0.4, 0.5,0.7]}
# param_grid = {'max_depth': [2, 4, 6,10], 'n_estimators': [ 40,500,600,700], 'learning_rate': [ 0.3, 0.4, 0.5,0.7]}

# grid_search3 = GridSearchCV(model, param_grid, cv=5, scoring='f1')
# grid_search3.fit(X_train, Y_train,eval_set=[ (X_validation, Y_validation)])

# # On print les meilleurs hyperparamètres
# print(grid_search3.best_params_)

# {'learning_rate': 0.5, 'max_depth': 2, 'n_estimators': 600}