In [54]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.feature_selection import mutual_info_regression
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import OneHotEncoder
from xgboost import XGBRegressor
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from imblearn.over_sampling import SMOTE
from CustomTransformers import StringCleanTransformer, DayTimeTransformer, DropperTransformer, SignTransformer, OHTransformer

In [55]:
train = pd.read_csv("data/training.csv")
test = pd.read_csv("data/test.csv")

In [56]:
train = train.sample(frac=1).reset_index(drop=True)
train_Y = train.FraudResult
train.drop(['FraudResult'], axis=1, inplace=True)
train_transaction_id = train.TransactionId
train.drop(['TransactionId'], axis=1, inplace=True)

test_transaction_id = test.TransactionId
test.drop(['TransactionId'], axis=1, inplace=True)



features = ["BatchId","AccountId","SubscriptionId","CustomerId","ProviderId","ProductId","ProductCategory","ChannelId","Amount","Value","TransactionStartTime","PricingStrategy"]


In [57]:
def getDay(x):
    return float(''.join(x.split("T")[0].split("-")))

def getTime(x):
    time = x.split("T")[1].split(":")
    time[-1] = time[-1][:-1]
    return float(''.join(time))

train.head()

Unnamed: 0,BatchId,AccountId,SubscriptionId,CustomerId,CurrencyCode,CountryCode,ProviderId,ProductId,ProductCategory,ChannelId,Amount,Value,TransactionStartTime,PricingStrategy
0,BatchId_103696,AccountId_4841,SubscriptionId_3829,CustomerId_3638,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-5000.0,5000,2019-02-08T18:30:19Z,2
1,BatchId_15015,AccountId_3124,SubscriptionId_901,CustomerId_3551,UGX,256,ProviderId_6,ProductId_10,airtime,ChannelId_3,5000.0,5000,2018-12-27T12:03:40Z,2
2,BatchId_100777,AccountId_4841,SubscriptionId_3829,CustomerId_1275,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-5000.0,5000,2019-01-25T15:24:13Z,2
3,BatchId_2162,AccountId_4742,SubscriptionId_1107,CustomerId_5211,UGX,256,ProviderId_3,ProductId_15,financial_services,ChannelId_3,40000.0,40000,2019-01-18T08:15:48Z,2
4,BatchId_64563,AccountId_769,SubscriptionId_4636,CustomerId_1114,UGX,256,ProviderId_5,ProductId_3,airtime,ChannelId_3,1800.0,1800,2019-01-18T07:18:56Z,4


In [58]:
# hot_cols = ["ProductCategory", "ProviderId", "ChannelId", "Sign", "PricingStrategy"]
hot_cols = ["ProductCategory"]

# train["ProductCategory"].values.reshape(-1,1)

# hot_transformer = Pipeline(steps=[
#     ('onehot', OneHotEncoder(handle_unknown='ignore'))
# ])

# onehotProcessor = ColumnTransformer(transformers=[
#     ('one', hot_transformer, hot_cols)
# ])

preprocessor = Pipeline(steps = [
    ("shuffle", StringCleanTransformer()),
    ("day_time_separator", DayTimeTransformer()),
    ("amout to sign", SignTransformer()),
    ("One hot encoding", OHTransformer(hot_cols)),
    ("Dropper", DropperTransformer()),

])

res = preprocessor.fit_transform(train)

# res["ChannelId4"] = 0

SMOTE = SMOTE()
smote_X, smote_Y = SMOTE.fit_resample(res, train_Y)

res.head()

# res.to_csv("wtf.csv", index=False)




Unnamed: 0,AccountId,ProviderId,ChannelId,Value,TransactionStartTime,PricingStrategy,TransactionStartDay,Sign,ProductCategory0,ProductCategory1,ProductCategory2,ProductCategory3,ProductCategory4,ProductCategory5,ProductCategory6,ProductCategory7,ProductCategory8
0,4841,4,2,5000,183019.0,2,20190208.0,False,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,3124,6,3,5000,120340.0,2,20181227.0,True,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4841,4,2,5000,152413.0,2,20190125.0,False,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,4742,3,3,40000,81548.0,2,20190118.0,True,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,769,5,3,1800,71856.0,4,20190118.0,True,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [59]:
test = preprocessor.fit_transform(test)
# ch4 = test.ChannelId4
# test.drop(["ChannelId4"], axis=1, inplace=True)
# test["ChannelId4"] = ch4
test.head()

Unnamed: 0,AccountId,ProviderId,ChannelId,Value,TransactionStartTime,PricingStrategy,TransactionStartDay,Sign,ProductCategory0,ProductCategory1,ProductCategory2,ProductCategory3,ProductCategory4,ProductCategory5,ProductCategory6,ProductCategory7,ProductCategory8
0,2441,5,3,1000,100140.0,4,20190213.0,True,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,3439,5,3,2000,100212.0,2,20190213.0,True,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,4841,4,2,50,100230.0,2,20190213.0,False,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,2685,5,3,3000,100238.0,4,20190213.0,True,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4841,4,2,60,100258.0,2,20190213.0,False,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [60]:
#import xgboost
from xgboost import XGBClassifier
# my_model = XGBClassifier(n_estimators=500)
# #change dtype of all collumns to float
X = smote_X.astype('float32')
Y = smote_Y.astype('float32')
# my_model.fit(X,Y)

# res_xg = my_model.predict(test.astype('float32'))


# output = pd.DataFrame()
# output["TransactionId"] = test_transaction_id
# output["FraudResult"] = res_xg

# output.to_csv("wtf.csv", index=False)



In [61]:
# output.to_csv("wtf.csv", index=False)

In [62]:
from sklearn.model_selection import GridSearchCV

model = XGBClassifier()

# param_grid = {'max_depth': [2, 4, 6, 10, 12,20,40], 'n_estimators': [30, 40, 50,60, 70,100,500], 'learning_rate': [0.1, 0.2, 0.3, 0.4, 0.5,0.7]}
# grid_search = GridSearchCV(model, param_grid, cv=5, scoring='f1')
# grid_search.fit(X, Y)

# On print les meilleurs hyperparamètres
# print(grid_search.best_params_)

# {'learning_rate': 0.5, 'max_depth': 2, 'n_estimators': 500}

In [63]:
from sklearn.model_selection import train_test_split

In [64]:
from sklearn.model_selection import train_test_split

model = XGBClassifier(early_stopping_rounds = 10)

X_train, X_validation, Y_train, Y_validation = train_test_split(X, Y, test_size=0.20)

# param_grid = {'max_depth': [2, 4, 6, 10, 12,20,40], 'n_estimators': [ 40, 50,60, 70,100,500], 'learning_rate': [0.1, 0.2, 0.3, 0.4, 0.5,0.7]}

param_grid = {'max_depth': [2, 4, 6,10], 'n_estimators': [ 40,300,500,600], 'learning_rate': [ 0.3,0.4,0.5,0.6]}
grid_search2 = GridSearchCV(model, param_grid, cv=5, scoring='f1')
grid_search2.fit(X_train, Y_train,eval_set=[ (X_validation, Y_validation)])

# On print les meilleurs hyperparamètres
print(grid_search2.best_params_)
# {'learning_rate': 0.5, 'max_depth': 4, 'n_estimators': 500}

[0]	validation_0-logloss:0.44937
[1]	validation_0-logloss:0.31485
[2]	validation_0-logloss:0.23081
[3]	validation_0-logloss:0.17469
[4]	validation_0-logloss:0.13621
[5]	validation_0-logloss:0.10927
[6]	validation_0-logloss:0.09006
[7]	validation_0-logloss:0.07604
[8]	validation_0-logloss:0.06549
[9]	validation_0-logloss:0.05752
[10]	validation_0-logloss:0.05025
[11]	validation_0-logloss:0.04486
[12]	validation_0-logloss:0.04117
[13]	validation_0-logloss:0.03755
[14]	validation_0-logloss:0.03527
[15]	validation_0-logloss:0.03096
[16]	validation_0-logloss:0.02863
[17]	validation_0-logloss:0.02589
[18]	validation_0-logloss:0.02425
[19]	validation_0-logloss:0.02207
[20]	validation_0-logloss:0.02051
[21]	validation_0-logloss:0.01928
[22]	validation_0-logloss:0.01657
[23]	validation_0-logloss:0.01565
[24]	validation_0-logloss:0.01464
[25]	validation_0-logloss:0.01317
[26]	validation_0-logloss:0.01259
[27]	validation_0-logloss:0.01202
[28]	validation_0-logloss:0.01097
[29]	validation_0-loglos

In [67]:
# model = XGBClassifier(early_stopping_rounds = 10,n_estimators=500,learning_rate=0.5,max_depth=4)

# X_train, X_validation, Y_train, Y_validation = train_test_split(X, Y, test_size=0.20)
# model.fit(X_train,Y_train,eval_set=[ (X_validation, Y_validation)])

resgrid2 = grid_search2.best_estimator_.predict(test.astype('float32'))

# res_xg = my_model.predict(test.astype('float32'))


output = pd.DataFrame()
output["TransactionId"] = test_transaction_id
output["FraudResult"] = resgrid2

output.to_csv("wtf.csv", index=False)

In [66]:
# model = XGBClassifier(early_stopping_rounds = 100)

# # param_grid = {'max_depth': [2, 4, 6, 10, 12,20,40], 'n_estimators': [40, 50,60, 70,100,500], 'learning_rate': [0.1, 0.2, 0.3, 0.4, 0.5,0.7]}
# # param_grid = {'max_depth': [2, 4, 6,10], 'n_estimators': [ 40,60,100,500], 'learning_rate': [ 0.3, 0.4, 0.5,0.7]}
# param_grid = {'max_depth': [2, 4, 6,10], 'n_estimators': [ 40,500,600,700], 'learning_rate': [ 0.3, 0.4, 0.5,0.7]}

# grid_search3 = GridSearchCV(model, param_grid, cv=5, scoring='f1')
# grid_search3.fit(X_train, Y_train,eval_set=[ (X_validation, Y_validation)])

# # On print les meilleurs hyperparamètres
# print(grid_search3.best_params_)

# {'learning_rate': 0.5, 'max_depth': 2, 'n_estimators': 600}