In [65]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.feature_selection import mutual_info_regression
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import OneHotEncoder
from xgboost import XGBRegressor
from sklearn.pipeline import Pipeline as skPipeline
from imblearn.pipeline import Pipeline as imbPipeline
from sklearn.compose import ColumnTransformer
from imblearn.over_sampling import SMOTE, SMOTENC
from CustomTransformers import StringCleanTransformer, DayTimeTransformer, DropperTransformer, SignTransformer, OHTransformer, FloatTransformer, biningTransformer, weekdayTransformer
from sklearn.model_selection import train_test_split

from xgboost import XGBClassifier

from sklearn.model_selection import StratifiedKFold

In [66]:
# Read the training and test CSVs (paths are relative to the notebook's
# working directory) into two separate DataFrames.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ("data/training.csv", "data/test.csv")
)

In [67]:
# Single seed shared by every stochastic step in this cell, so that
# Restart & Run All reproduces the same shuffle and the same synthetic samples.
SEED = 42

# Shuffle all rows (frac=1 keeps every row) and rebuild a clean 0..n-1 index.
train = train.sample(frac=1, random_state=SEED).reset_index(drop=True)

# Split the target column off the feature frame. drop() returns a new frame,
# so no DataFrame is mutated in place.
train_Y = train["FraudResult"]
train = train.drop(columns=["FraudResult"])

# Identifier-like string columns.
# NOTE(review): not referenced by any cell visible here; presumably mirrors the
# columns StringCleanTransformer handles -- confirm before removing.
StringToClean = ["TransactionId", "BatchId","AccountId","SubscriptionId","CustomerId", "ProviderId", "ProductId", "ChannelId", "ProductCategory"]

# Column groups handed to the custom transformers in the pipeline cell.
drop_cols = ["CurrencyCode"]          # given to DropperTransformer
hot_cols = ["ProductCategory"]        # given to OHTransformer
bin_cols = ["TransactionStartTime"]   # only used by the commented-out biningTransformer step

# Oversampler used as the "smote" pipeline step; seeded so the synthetic
# samples it generates are the same on every run.
smt = SMOTE(random_state=SEED)

In [68]:
# End-to-end pipeline: custom feature transformers, then SMOTE oversampling,
# then the XGBoost classifier. Despite its name, `preprocessor` includes the
# final model, so it can be fitted and used for prediction directly.
# imblearn's Pipeline is used because it accepts a sampler step such as SMOTE.
preprocessor = imbPipeline([
    ("cleanStrings", StringCleanTransformer()),
    ("day_time_separator", DayTimeTransformer()),
    ("amout to sign", SignTransformer()),
    ("Dropper", DropperTransformer(drop_cols)),
    ("One hot encoding", OHTransformer(hot_cols)),
    # Binning step currently disabled:
    # ("binning", biningTransformer(bin_cols)),
    ("weekday", weekdayTransformer()),
    ("float", FloatTransformer()),
    ("smote", smt),
    ("model", XGBClassifier(n_estimators=500)),
])

# Train on the full labelled set, then score the held-out test file.
preprocessor.fit(X=train, y=train_Y)
test_res = preprocessor.predict(test)

# Assemble the submission table: one row per test transaction with its
# predicted label.
output = pd.DataFrame({
    "TransactionId": test["TransactionId"],
    "FraudResult": test_res,
})
print(output.head(20))

# Write the predictions to disk without the index column.
output.to_csv("submission.csv", index=False)

['BatchId' 'AccountId' 'SubscriptionId' 'CustomerId' 'CountryCode'
 'ProviderId' 'ProductId' 'ChannelId' 'Amount' 'Value'
 'TransactionStartTime' 'PricingStrategy' 'TransactionStartDay' 'Sign'
 'ProductCategory0' 'ProductCategory1' 'ProductCategory2'
 'ProductCategory3' 'ProductCategory4' 'ProductCategory5'
 'ProductCategory6' 'ProductCategory7' 'ProductCategory8']
['BatchId' 'AccountId' 'SubscriptionId' 'CustomerId' 'CountryCode'
 'ProviderId' 'ProductId' 'ChannelId' 'Amount' 'Value'
 'TransactionStartTime' 'PricingStrategy' 'TransactionStartDay' 'Sign'
 'ProductCategory0' 'ProductCategory1' 'ProductCategory2'
 'ProductCategory3' 'ProductCategory4' 'ProductCategory5'
 'ProductCategory6' 'ProductCategory7' 'ProductCategory8']
           TransactionId  FraudResult
0    TransactionId_50600            0
1    TransactionId_95109            0
2    TransactionId_47357            0
3    TransactionId_28185            0
4    TransactionId_22140            0
5   TransactionId_134338            