In [32]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.feature_selection import mutual_info_regression
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import OneHotEncoder
from xgboost import XGBRegressor
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from imblearn.over_sampling import SMOTE
from CustomTransformers import StringCleanTransformer, DayTimeTransformer, DropperTransformer, SignTransformer, OHTransformer

In [33]:
train = pd.read_csv("data/training.csv")
test = pd.read_csv("data/test.csv")

In [34]:
train = train.sample(frac=1).reset_index(drop=True)
train_Y = train.FraudResult
train.drop(['FraudResult'], axis=1, inplace=True)
train_transaction_id = train.TransactionId
train.drop(['TransactionId'], axis=1, inplace=True)

test_transaction_id = test.TransactionId
test.drop(['TransactionId'], axis=1, inplace=True)



features = ["BatchId","AccountId","SubscriptionId","CustomerId","ProviderId","ProductId","ProductCategory","ChannelId","Amount","Value","TransactionStartTime","PricingStrategy"]


In [35]:
def getDay(x):
    return float(''.join(x.split("T")[0].split("-")))

def getTime(x):
    time = x.split("T")[1].split(":")
    time[-1] = time[-1][:-1]
    return float(''.join(time))

train.head()

Unnamed: 0,BatchId,AccountId,SubscriptionId,CustomerId,CurrencyCode,CountryCode,ProviderId,ProductId,ProductCategory,ChannelId,Amount,Value,TransactionStartTime,PricingStrategy
0,BatchId_1478,AccountId_4841,SubscriptionId_3829,CustomerId_2815,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-100.0,100,2019-01-01T09:25:02Z,2
1,BatchId_6942,AccountId_4841,SubscriptionId_3829,CustomerId_1709,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-75.0,75,2018-11-24T17:42:11Z,2
2,BatchId_138693,AccountId_4249,SubscriptionId_4429,CustomerId_7343,UGX,256,ProviderId_4,ProductId_10,airtime,ChannelId_2,-25000.0,25000,2019-01-08T11:00:47Z,4
3,BatchId_101346,AccountId_460,SubscriptionId_2976,CustomerId_790,UGX,256,ProviderId_3,ProductId_15,financial_services,ChannelId_3,20000.0,20000,2019-01-30T17:31:39Z,2
4,BatchId_17999,AccountId_2231,SubscriptionId_2531,CustomerId_2643,UGX,256,ProviderId_6,ProductId_3,airtime,ChannelId_3,500.0,500,2019-01-07T11:24:03Z,2


In [36]:
hot_cols = ["ProductCategory", "ProviderId", "ChannelId", "Sign", "PricingStrategy"]
# hot_cols = ["ProductCategory"]

# train["ProductCategory"].values.reshape(-1,1)

# hot_transformer = Pipeline(steps=[
#     ('onehot', OneHotEncoder(handle_unknown='ignore'))
# ])

# onehotProcessor = ColumnTransformer(transformers=[
#     ('one', hot_transformer, hot_cols)
# ])

preprocessor = Pipeline(steps = [
    ("shuffle", StringCleanTransformer()),
    ("day_time_separator", DayTimeTransformer()),
    ("amout to sign", SignTransformer()),
    ("One hot encoding", OHTransformer(hot_cols)),
    ("Dropper", DropperTransformer()),

])

res = preprocessor.fit_transform(train)

res["ChannelId4"] = 0

SMOTE = SMOTE()
smote_X, smote_Y = SMOTE.fit_resample(res, train_Y)

res.head()

# res.to_csv("wtf.csv", index=False)






Unnamed: 0,AccountId,Value,TransactionStartTime,TransactionStartDay,ProductCategory0,ProductCategory1,ProductCategory2,ProductCategory3,ProductCategory4,ProductCategory5,...,ChannelId1,ChannelId2,ChannelId3,Sign0,Sign1,PricingStrategy0,PricingStrategy1,PricingStrategy2,PricingStrategy3,ChannelId4
0,4841,100,92502.0,20190101.0,0.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0
1,4841,75,174211.0,20181124.0,0.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0
2,4249,25000,110047.0,20190108.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0
3,460,20000,173139.0,20190130.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0
4,2231,500,112403.0,20190107.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0


In [37]:
test = preprocessor.fit_transform(test)
ch4 = test.ChannelId4
test.drop(["ChannelId4"], axis=1, inplace=True)
test["ChannelId4"] = ch4
test.head()



Unnamed: 0,AccountId,Value,TransactionStartTime,TransactionStartDay,ProductCategory0,ProductCategory1,ProductCategory2,ProductCategory3,ProductCategory4,ProductCategory5,...,ChannelId1,ChannelId2,ChannelId3,Sign0,Sign1,PricingStrategy0,PricingStrategy1,PricingStrategy2,PricingStrategy3,ChannelId4
0,2441,1000,100140.0,20190213.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1,3439,2000,100212.0,20190213.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
2,4841,50,100230.0,20190213.0,0.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
3,2685,3000,100238.0,20190213.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
4,4841,60,100258.0,20190213.0,0.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0


In [38]:
#import xgboost
from xgboost import XGBClassifier
# my_model = XGBClassifier(n_estimators=500)
# #change dtype of all collumns to float
X = smote_X.astype('float32')
Y = smote_Y.astype('float32')
# my_model.fit(X,Y)

# res_xg = my_model.predict(test.astype('float32'))


# output = pd.DataFrame()
# output["TransactionId"] = test_transaction_id
# output["FraudResult"] = res_xg

# output.to_csv("wtf.csv", index=False)



In [39]:
# output.to_csv("wtf.csv", index=False)

In [40]:
from sklearn.model_selection import GridSearchCV

model = XGBClassifier()

param_grid = {'max_depth': [2, 4, 6, 10, 12,20,40], 'n_estimators': [30, 40, 50,60, 70,100,500], 'learning_rate': [0.1, 0.2, 0.3, 0.4, 0.5,0.7]}
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='f1')
grid_search.fit(X, Y)

# On print les meilleurs hyperparamètres
print(grid_search.best_params_)

{'learning_rate': 0.5, 'max_depth': 2, 'n_estimators': 500}


In [44]:

model = XGBClassifier(early_stopping_rounds = 10)

# param_grid = {'max_depth': [2, 4, 6, 10, 12,20,40], 'n_estimators': [ 40, 50,60, 70,100,500], 'learning_rate': [0.1, 0.2, 0.3, 0.4, 0.5,0.7]}

param_grid = {'max_depth': [2, 4, 6,10], 'n_estimators': [ 40,60,100,500], 'learning_rate': [ 0.3, 0.4, 0.5,0.7]}
grid_search2 = GridSearchCV(model, param_grid, cv=5, scoring='f1')
grid_search2.fit(X, Y)

# On print les meilleurs hyperparamètres
print(grid_search2.best_params_)

ValueError: 
All the 320 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
320 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Python310\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Python310\lib\site-packages\xgboost\core.py", line 620, in inner_f
    return func(**kwargs)
  File "c:\Python310\lib\site-packages\xgboost\sklearn.py", line 1490, in fit
    self._Booster = train(
  File "c:\Python310\lib\site-packages\xgboost\core.py", line 620, in inner_f
    return func(**kwargs)
  File "c:\Python310\lib\site-packages\xgboost\training.py", line 186, in train
    if cb_container.after_iteration(bst, i, dtrain, evals):
  File "c:\Python310\lib\site-packages\xgboost\callback.py", line 247, in after_iteration
    ret = any(c.after_iteration(model, epoch, self.history)
  File "c:\Python310\lib\site-packages\xgboost\callback.py", line 247, in <genexpr>
    ret = any(c.after_iteration(model, epoch, self.history)
  File "c:\Python310\lib\site-packages\xgboost\callback.py", line 412, in after_iteration
    assert len(evals_log.keys()) >= 1, msg
AssertionError: Must have at least 1 validation dataset for early stopping.


In [None]:
model = XGBClassifier(,early_stopping_rounds = 100)

# param_grid = {'max_depth': [2, 4, 6, 10, 12,20,40], 'n_estimators': [40, 50,60, 70,100,500], 'learning_rate': [0.1, 0.2, 0.3, 0.4, 0.5,0.7]}
param_grid = {'max_depth': [2, 4, 6,10], 'n_estimators': [ 40,60,100,500], 'learning_rate': [ 0.3, 0.4, 0.5,0.7]}

grid_search3 = GridSearchCV(model, param_grid, cv=5, scoring='f1')
grid_search3.fit(X, Y)

# On print les meilleurs hyperparamètres
print(grid_search3.best_params_)