In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# import seaborn as sns
from imblearn.over_sampling import SMOTE
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import mutual_info_regression
from sklearn.model_selection import cross_val_score
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier, XGBRegressor

from CustomTransformers import StringCleanTransformer, DayTimeTransformer, DropperTransformer, SignTransformer, OHTransformer

In [2]:
# Read the training and test CSV files from the data/ directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ("data/training.csv", "data/test.csv")
)

In [3]:
# Shuffle the training rows. The seed is fixed so that a fresh kernel
# reproduces the same row order (and therefore the same downstream results).
train = train.sample(frac=1, random_state=42).reset_index(drop=True)

# Split the label and the row identifier off the training frame.
# DataFrame.pop returns the column and removes it from the frame in one step.
train_Y = train.pop("FraudResult")
train_transaction_id = train.pop("TransactionId")

# The test identifiers are kept aside; they are written to the submission file.
test_transaction_id = test.pop("TransactionId")

# Raw input columns of interest (not referenced by the other visible cells).
features = [
    "BatchId",
    "AccountId",
    "SubscriptionId",
    "CustomerId",
    "ProviderId",
    "ProductId",
    "ProductCategory",
    "ChannelId",
    "Amount",
    "Value",
    "TransactionStartTime",
    "PricingStrategy",
]


In [4]:
def getDay(x):
    """Return the date part of a timestamp string as a float.

    Everything before the first "T" is taken and its "-" separators are
    removed, e.g. "2019-01-26T13:49:06Z" -> 20190126.0.
    """
    date_part, _, _ = x.partition("T")
    return float(date_part.replace("-", ""))

def getTime(x):
    """Return the clock part of a timestamp string as a float.

    The segment following the first "T" is split on ":", the final character
    of the last field is dropped (the "Z" in "13:49:06Z"), and the fields are
    concatenated, e.g. "2019-01-26T13:49:06Z" -> 134906.0.
    """
    *leading_fields, last_field = x.split("T")[1].split(":")
    return float("".join(leading_fields) + last_field[:-1])

# Preview the first rows of the training frame (label and TransactionId already removed).
train.head()

Unnamed: 0,BatchId,AccountId,SubscriptionId,CustomerId,CurrencyCode,CountryCode,ProviderId,ProductId,ProductCategory,ChannelId,Amount,Value,TransactionStartTime,PricingStrategy
0,BatchId_42014,AccountId_4841,SubscriptionId_3829,CustomerId_1096,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-50.0,50,2019-01-26T13:49:06Z,2
1,BatchId_37193,AccountId_2744,SubscriptionId_1665,CustomerId_3165,UGX,256,ProviderId_6,ProductId_11,data_bundles,ChannelId_3,500.0,500,2018-12-07T07:30:48Z,2
2,BatchId_94372,AccountId_1820,SubscriptionId_1759,CustomerId_2211,UGX,256,ProviderId_6,ProductId_3,airtime,ChannelId_3,10000.0,10000,2018-12-21T13:21:51Z,2
3,BatchId_138374,AccountId_4249,SubscriptionId_4429,CustomerId_7343,UGX,256,ProviderId_4,ProductId_10,airtime,ChannelId_2,-25000.0,25000,2019-01-08T10:45:26Z,4
4,BatchId_69805,AccountId_318,SubscriptionId_3087,CustomerId_647,UGX,256,ProviderId_1,ProductId_15,financial_services,ChannelId_3,3000.0,3000,2018-12-11T05:04:59Z,2


In [5]:
# Columns handed to OHTransformer for one-hot expansion.
# NOTE(review): "Sign" is not one of the raw input columns; presumably
# SignTransformer derives it earlier in the pipeline — confirm in
# CustomTransformers.
hot_cols = ["ProductCategory", "ProviderId", "ChannelId", "Sign", "PricingStrategy"]

# Feature-engineering pipeline built from the project's custom transformers.
# The step names are left exactly as they were because they are the keys of
# preprocessor.named_steps.
preprocessor = Pipeline(steps=[
    ("shuffle", StringCleanTransformer()),
    ("day_time_separator", DayTimeTransformer()),
    ("amout to sign", SignTransformer()),
    ("One hot encoding", OHTransformer(hot_cols)),
    ("Dropper", DropperTransformer()),
])

res = preprocessor.fit_transform(train)

# Add an all-zero ChannelId4 column so the training frame carries the
# ChannelId4 column that the test-set cell expects to find and move last.
res["ChannelId4"] = 0

# Oversample with SMOTE. The sampler is bound to its own lowercase name:
# rebinding `SMOTE` would shadow the imported class and make a second run of
# this cell fail with "'SMOTE' object is not callable". The seed makes the
# synthetic samples reproducible.
smote = SMOTE(random_state=42)
smote_X, smote_Y = smote.fit_resample(res, train_Y)

res.head()






Unnamed: 0,AccountId,Value,TransactionStartTime,TransactionStartDay,ProductCategory0,ProductCategory1,ProductCategory2,ProductCategory3,ProductCategory4,ProductCategory5,...,ChannelId1,ChannelId2,ChannelId3,Sign0,Sign1,PricingStrategy0,PricingStrategy1,PricingStrategy2,PricingStrategy3,ChannelId4
0,4841,50,134906.0,20190126.0,0.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0
1,2744,500,73048.0,20181207.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0
2,1820,10000,132151.0,20181221.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0
3,4249,25000,104526.0,20190108.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0
4,318,3000,50459.0,20181211.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0


In [6]:
# Apply the preprocessing that was already fitted on the training data.
# Calling fit_transform here would re-fit every step on the test set, so any
# encoding learned from the data could differ between train and test.
# NOTE(review): assumes each custom transformer implements transform()
# independently of fit() and returns a DataFrame — confirm in
# CustomTransformers.
test = preprocessor.transform(test)

# Give the test frame exactly the training columns, in the training order.
# Columns missing from the transformed test frame are filled with 0 and
# columns the training frame does not have are dropped.
test = test.reindex(columns=res.columns, fill_value=0)
test.head()



Unnamed: 0,AccountId,Value,TransactionStartTime,TransactionStartDay,ProductCategory0,ProductCategory1,ProductCategory2,ProductCategory3,ProductCategory4,ProductCategory5,...,ChannelId1,ChannelId2,ChannelId3,Sign0,Sign1,PricingStrategy0,PricingStrategy1,PricingStrategy2,PricingStrategy3,ChannelId4
0,2441,1000,100140.0,20190213.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1,3439,2000,100212.0,20190213.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
2,4841,50,100230.0,20190213.0,0.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
3,2685,3000,100238.0,20190213.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
4,4841,60,100258.0,20190213.0,0.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0


In [7]:
# Fit a gradient-boosted classifier on the SMOTE-resampled training data.
# The seed is fixed so repeated runs build the same model.
my_model = XGBClassifier(n_estimators=500, random_state=42)

# Features are cast to float32. The labels are class ids, so they are kept
# integral instead of being cast to float; predictions then come back as
# integer classes for the submission file.
X = smote_X.astype("float32")
Y = smote_Y.astype("int32")
my_model.fit(X, Y)

res_xg = my_model.predict(test.astype("float32"))

# Build the submission frame in one step: one row per test transaction,
# pairing its id with the predicted class.
output = pd.DataFrame({
    "TransactionId": test_transaction_id,
    "FraudResult": res_xg,
})

output.to_csv("wtf.csv", index=False)



In [8]:
# Writes the submission file again; this repeats the final statement of the
# previous cell and overwrites "wtf.csv" with the same frame.
output.to_csv("wtf.csv", index=False)