In [24]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.feature_selection import mutual_info_regression
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import OneHotEncoder
from xgboost import XGBRegressor
from sklearn.pipeline import Pipeline as skPipeline
from imblearn.pipeline import Pipeline as imbPipeline
from sklearn.compose import ColumnTransformer
from imblearn.over_sampling import SMOTE, SMOTENC
from CustomTransformers import StringCleanTransformer, DayTimeTransformer, DropperTransformer, SignTransformer, OHTransformer, FloatTransformer, biningTransformer, weekdayTransformer, TotalTransformer
from sklearn.model_selection import train_test_split

from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier

from sklearn.model_selection import StratifiedKFold

In [25]:
train = pd.read_csv("data/training.csv")
test = pd.read_csv("data/test.csv")

In [26]:
train = train.sample(frac=1).reset_index(drop=True)
train =train.sort_values(by=['CustomerId', "TransactionStartTime"])
train_Y = train.FraudResult
train.drop(['FraudResult'], axis=1, inplace=True)

test = test.sort_values(by=['CustomerId', "TransactionStartTime"])

StringToClean = ["TransactionId", "BatchId","AccountId","SubscriptionId","CustomerId", "ProviderId", "ProductId", "ChannelId", "ProductCategory"]

drop_cols = ["CurrencyCode", "BatchId", "CountryCode", "CustomerId", "PricingStrategy", "Amount"]
hot_cols = ["ProductCategory"]
bin_cols = ["TransactionStartTime"]
smt  = SMOTE()

In [27]:
preprocessor = imbPipeline(steps = [
    ("clean", StringCleanTransformer()),
    ("amout to sign", SignTransformer()),
    ("total", TotalTransformer()),
    ("day_time_separator", DayTimeTransformer()),
    ("Dropper", DropperTransformer(drop_cols)),
    ("One hot encoding", OHTransformer(hot_cols)),
    # ("binning", biningTransformer(bin_cols)),
    ("weekday", weekdayTransformer()),
    ("float", FloatTransformer()),
    ("smote", smt),
    ("model", XGBClassifier(learning_rate=0.5, n_estimators=600, max_depth=4, early_stopping_rounds=20))
])

preprocessor.fit(train.copy(), train_Y.copy())

train.head()
test_res = preprocessor.predict(test.copy())

output = pd.DataFrame()
output["TransactionId"] = test["TransactionId"]
output["FraudResult"] = test_res
print(output.head(20))

#save the result to csv file
output.to_csv("submission.csv", index=False)



AssertionError: Must have at least 1 validation dataset for early stopping.

In [None]:
def OH(X, hot_cols):
    OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
    for elem in hot_cols:
        OH_cols = pd.DataFrame(OH_encoder.fit_transform(X[elem].values.reshape(-1,1)))
        OH_cols.rename(columns=lambda x: elem + str(x), inplace=True)
        OH_cols.index = X.index
        X = pd.concat([X, OH_cols], axis=1)
    return X

drop_cols_diff = ["CurrencyCode", "BatchId", "CountryCode", "CustomerId", "PricingStrategy", "Amount", "ProductCategory"]

pipelineXGB_diff= imbPipeline(steps = [
    ("clean", StringCleanTransformer()),
    ("amout to sign", SignTransformer()),
    ("day_time_separator", DayTimeTransformer()),
    ("Dropper", DropperTransformer(drop_cols_diff)),
    # ("One hot encoding", OHTransformer(hot_cols)),
    ("weekday", weekdayTransformer()),
    ("float", FloatTransformer()),
    ("smote", smt),
    ("model", XGBClassifier())
])


# param_grid = {'max_depth': [2, 4, 6,10], 'n_estimators': [ 40,300,500,600], 'learning_rate': [ 0.3,0.4,0.5,0.6]}
# param_grid = {'model__n_estimators': [10,100,500,600]}
param_grid = {'model__n_estimators': [550,600,650,700,800], 'model__max_depth': [2,3,4,6], 'model__learning_rate': [ 0.5,0.55,0.575, 0.6]}

if (False):
    grid_search = GridSearchCV(pipelineXGB_diff, param_grid, scoring='f1')
    grid_search.fit(OH(train, hot_cols), train_Y)


    print(grid_search.best_params_)

# {'model__learning_rate': 0.5, 'model__max_depth': 4, 'model__n_estimators': 500}
# {'model__learning_rate': 0.55, 'model__max_depth': 3, 'model__n_estimators': 600}
# {'model__learning_rate': 0.5, 'model__max_depth': 4, 'model__n_estimators': 600}