In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.feature_selection import mutual_info_regression
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import OneHotEncoder
from xgboost import XGBRegressor
from sklearn.pipeline import Pipeline as skPipeline
from imblearn.pipeline import Pipeline as imbPipeline
from sklearn.compose import ColumnTransformer
from imblearn.over_sampling import SMOTE, SMOTENC
from CustomTransformers import StringCleanTransformer, DayTimeTransformer, DropperTransformer, SignTransformer, OHTransformer, FloatTransformer, biningTransformer, weekdayTransformer, TotalTransformer
from sklearn.model_selection import train_test_split

from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier

from sklearn.model_selection import StratifiedKFold

In [2]:
# Single seed for every stochastic step configured in this cell, so that
# Restart & Run All reproduces the same shuffle and the same SMOTE samples.
RANDOM_STATE = 42

# Load the raw training and test tables.
train = pd.read_csv("data/training.csv")
test = pd.read_csv("data/test.csv")

# Shuffle the training rows once (seeded) and renumber the index from 0.
train = train.sample(frac=1, random_state=RANDOM_STATE).reset_index(drop=True)

# Split the target column off the feature frame. A plain reassignment is used
# instead of `inplace=True` so the cell reads as "new frame from old frame".
train_Y = train.FraudResult
train = train.drop(columns=["FraudResult"])

# Identifier-like columns.
# NOTE(review): presumably the columns StringCleanTransformer is meant to
# clean; the list is not referenced in the cells visible here — confirm.
StringToClean = ["TransactionId", "BatchId", "AccountId", "SubscriptionId", "CustomerId", "ProviderId", "ProductId", "ChannelId", "ProductCategory"]

# Column groups used when configuring the preprocessing steps.
drop_cols = ["CurrencyCode", "BatchId", "CountryCode", "CustomerId", "PricingStrategy", "Amount"]
hot_cols = ["ProductCategory"]          # columns to one-hot encode
bin_cols = ["TransactionStartTime"]     # NOTE(review): not used in the visible cells

# Oversampler shared by the pipelines; seeded so the synthetic minority
# samples are the same on every run.
smt = SMOTE(random_state=RANDOM_STATE)

In [8]:
def OH(X, hot_cols):
    """Append one-hot indicator columns for each column named in ``hot_cols``.

    Each listed column is encoded independently (the encoder is re-fitted per
    column). The indicator columns are named ``<column><k>`` where ``k`` is the
    0-based position of the category in the encoder output, and they are
    appended to the right of ``X``. The source columns themselves are kept.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame containing every column listed in ``hot_cols``. Not modified.
    hot_cols : iterable of str
        Names of the columns to encode.

    Returns
    -------
    pandas.DataFrame
        ``X`` with the indicator columns appended; ``X`` itself when
        ``hot_cols`` is empty.
    """
    # The encoder is left at its default output type and densified below,
    # instead of passing `sparse=False`: that keyword was renamed to
    # `sparse_output` in newer scikit-learn releases, so relying on neither
    # spelling keeps this function working across versions.
    encoder = OneHotEncoder(handle_unknown='ignore')

    encoded_frames = []
    for col in hot_cols:
        encoded = encoder.fit_transform(X[col].values.reshape(-1, 1))
        if hasattr(encoded, "toarray"):
            # scipy sparse matrix -> dense ndarray
            encoded = encoded.toarray()
        frame = pd.DataFrame(encoded, index=X.index)
        frame.columns = [f"{col}{k}" for k in frame.columns]
        encoded_frames.append(frame)

    if not encoded_frames:
        return X
    # Concatenate once rather than growing X inside the loop.
    return pd.concat([X, *encoded_frames], axis=1)

# Columns removed inside the pipeline. "ProductCategory" is included because
# the frame passed to the grid search has already been expanded by OH(), so
# the raw categorical column is no longer needed by the model.
drop_cols_diff = ["CurrencyCode", "BatchId", "CountryCode", "CustomerId", "PricingStrategy", "Amount", "ProductCategory"]

# Preprocessing -> SMOTE oversampling -> classifier.
# Despite the variable name, the final estimator here is an MLP, not XGBoost.
pipelineXGB_diff = imbPipeline(steps=[
    ("clean", StringCleanTransformer()),
    ("amout to sign", SignTransformer()),
    ("day_time_separator", DayTimeTransformer()),
    ("Dropper", DropperTransformer(drop_cols_diff)),
    # ("One hot encoding", OHTransformer(hot_cols)),  # encoding is done up front via OH()
    ("weekday", weekdayTransformer()),
    ("float", FloatTransformer()),
    ("smote", smt),
    # Seeded so the network's weight initialisation, and therefore the
    # selected hyper-parameters, do not change from run to run.
    ("model", MLPClassifier(random_state=42)),
])


# Grids tried earlier with other estimators:
# param_grid = {'max_depth': [2, 4, 6,10], 'n_estimators': [ 40,300,500,600], 'learning_rate': [ 0.3,0.4,0.5,0.6]}
# param_grid = {'model__n_estimators': [10,100,500,600]}
# Current grid: width of the single hidden layer of the MLP.
param_grid = {'model__hidden_layer_sizes': [16, 17]}


# Cross-validated search scored on F1.
grid_search = GridSearchCV(pipelineXGB_diff, param_grid, scoring='f1')
grid_search.fit(OH(train, hot_cols), train_Y)


print(grid_search.best_params_)

# {'model__activation': 'relu', 'model__hidden_layer_sizes': 50}
{'model__hidden_layer_sizes': 16}



{'model__hidden_layer_sizes': 16}
