In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.feature_selection import mutual_info_regression
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import OneHotEncoder
from xgboost import XGBRegressor
from sklearn.pipeline import Pipeline as skPipeline
from imblearn.pipeline import Pipeline as imbPipeline
from sklearn.compose import ColumnTransformer
from imblearn.over_sampling import SMOTE, SMOTENC
from CustomTransformers import StringCleanTransformer, DayTimeTransformer, DropperTransformer, SignTransformer, OHTransformer, FloatTransformer, biningTransformer, weekdayTransformer
from sklearn.model_selection import train_test_split

from xgboost import XGBClassifier

from sklearn.model_selection import StratifiedKFold

In [2]:
# Read the training and test CSV files from the local data/ directory
# (paths are relative to the notebook's working directory).
TRAIN_CSV, TEST_CSV = "data/training.csv", "data/test.csv"
train, test = pd.read_csv(TRAIN_CSV), pd.read_csv(TEST_CSV)

In [3]:
# Shuffle the training rows once. The seed is fixed so that a fresh
# "Restart & Run All" yields the same row order every time.
train = train.sample(frac=1, random_state=42).reset_index(drop=True)

# Separate the target and the row identifier from the feature frame.
# `pop` removes the column from `train` and returns it in a single step.
train_Y = train.pop("FraudResult")
train_transaction_id = train.pop("TransactionId")

# The test identifiers are set aside; they are needed again when the
# prediction file is written.
test_transaction_id = test.pop("TransactionId")


In [4]:
def getDay(x):
    """Return the date part of a "<date>T<time>" string as a float.

    Everything before the first "T" is taken as the date and its "-"
    separators are removed, e.g. "2018-11-15T02:18:49Z" -> 20181115.0.

    Raises:
        ValueError: if the remaining text cannot be parsed by ``float``.
    """
    date_part = x.partition("T")[0]
    return float(date_part.replace("-", ""))

def getTime(x):
    """Return the time part of a "<date>T<time>" string as a float.

    The text after the first "T" is taken as the time, an optional trailing
    "Z" is removed, and the ":" separators are dropped, e.g.
    "2018-11-15T02:18:49Z" -> 21849.0.

    Only a literal trailing "Z" is stripped. The last character used to be
    removed unconditionally, which silently truncated the seconds when the
    suffix was absent.

    Raises:
        IndexError: if ``x`` contains no "T".
        ValueError: if the remaining text cannot be parsed by ``float``.
    """
    clock = x.split("T")[1]
    if clock.endswith("Z"):
        clock = clock[:-1]
    return float(clock.replace(":", ""))

def _make_dense_encoder():
    """Build a dense-output OneHotEncoder on both old and new scikit-learn."""
    try:
        # scikit-learn >= 1.2 spells the option `sparse_output`.
        return OneHotEncoder(handle_unknown='ignore', sparse_output=False)
    except TypeError:
        # Older releases only accept the original `sparse` keyword.
        return OneHotEncoder(handle_unknown='ignore', sparse=False)


def encodeOH(X, cols):
    """Replace each column in ``cols`` by its one-hot encoded columns.

    For every name in ``cols`` a fresh encoder is fitted on that column, the
    column is removed, and the indicator columns are appended on the right.
    The new columns are named ``<col><i>``, where ``i`` is the position of the
    category in the encoder's output. The column list is printed after each
    encoded column.

    The caller's DataFrame is not modified; use the returned frame.

    NOTE(review): the encoder is fitted on whatever frame is passed in, so
    calling this separately on two frames (e.g. train and test) can yield
    different or differently ordered indicator columns when their category
    sets differ.

    Args:
        X: pandas DataFrame containing the columns listed in ``cols``.
        cols: iterable of column names to encode.

    Returns:
        A DataFrame with the listed columns replaced by indicator columns.
        If ``cols`` is empty, ``X`` itself is returned unchanged.

    Raises:
        KeyError: if a name in ``cols`` is not a column of ``X``.
    """
    for elem in cols:
        encoder = _make_dense_encoder()
        encoded = encoder.fit_transform(X[elem].values.reshape(-1, 1))
        OH_cols = pd.DataFrame(
            encoded,
            index=X.index,
            columns=[elem + str(i) for i in range(encoded.shape[1])],
        )
        # Non in-place drop: the frame passed in by the caller stays intact.
        X = pd.concat([X.drop(columns=elem), OH_cols], axis=1)

        print(X.columns.values)
    return X

# Display the first rows of the target series (the cell's rendered output).
train_Y.head()

0    0
1    0
2    0
3    0
4    0
Name: FraudResult, dtype: int64

In [5]:
# Columns to one-hot encode with encodeOH. A wider set tried earlier:
# hot_cols = ["ProductCategory", "ProviderId", "ChannelId", "Sign", "PricingStrategy"]
hot_cols = ["ProductCategory"]

# Columns handed to DropperTransformer in the preprocessing pipeline.
drop_cols = ['CurrencyCode', 'CountryCode', 'BatchId', 'CustomerId', 'SubscriptionId', 'ProductId', 'Amount', "ProviderId", "ChannelId"]

# Columns handed to biningTransformer in the preprocessing pipeline.
bin_cols = ["TransactionStartTime"]

train = encodeOH(train, hot_cols)

# Stratified 80/20 hold-out split. The seed is fixed so the split is the same
# on every run.
train, x_valid, train_Y, y_valid = train_test_split(
    train,
    train_Y,
    train_size=0.8,
    test_size=0.2,
    stratify=train_Y,
    random_state=42,
)

# Oversampler for the positive class (sampling_strategy=0.25), seeded so the
# synthetic samples are reproducible.
smt = SMOTE(sampling_strategy=0.25, random_state=42)
# smtenc = SMOTENC(categorical_features=[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14], sampling_strategy = 0.25)

# Class balance of the training split: raw counts, then the 1-to-0 ratio.
class_counts = train_Y.value_counts()
print(class_counts)
print(class_counts[1] / class_counts[0])




['BatchId' 'AccountId' 'SubscriptionId' 'CustomerId' 'CurrencyCode'
 'CountryCode' 'ProviderId' 'ProductId' 'ChannelId' 'Amount' 'Value'
 'TransactionStartTime' 'PricingStrategy' 'ProductCategory0'
 'ProductCategory1' 'ProductCategory2' 'ProductCategory3'
 'ProductCategory4' 'ProductCategory5' 'ProductCategory6'
 'ProductCategory7' 'ProductCategory8']
0    76375
1      154
Name: FraudResult, dtype: int64
0.002016366612111293


In [6]:
# Ordered feature-engineering steps built from the project's custom
# transformers. The step names are the keys used by the pipeline.
preprocessing_steps = [
    ("shuffle", StringCleanTransformer()),
    ("day_time_separator", DayTimeTransformer()),
    ("amout to sign", SignTransformer()),
    ("Dropper", DropperTransformer(drop_cols)),
    ("binning", biningTransformer(bin_cols)),
    ("weekday", weekdayTransformer()),
    ("float", FloatTransformer()),
]
# Steps currently disabled (kept for reference):
#   ("One hot encoding", OHTransformer(hot_cols))   -- after "amout to sign"
#   ("smote", smt), ("model", XGBClassifier())      -- at the end of the pipeline
preprocessor = imbPipeline(steps=preprocessing_steps)

# Earlier experiments (kept for reference):
# processed_train= preprocessor.fit(x_train, y_train)
# gen = StratifiedKFold(n_splits=5 )
# cross_val_score(preprocessor, train, train_Y, scoring='f1', cv =gen )

# Show the training frame before the pipeline is applied.
train.head()

Unnamed: 0,BatchId,AccountId,SubscriptionId,CustomerId,CurrencyCode,CountryCode,ProviderId,ProductId,ChannelId,Amount,...,PricingStrategy,ProductCategory0,ProductCategory1,ProductCategory2,ProductCategory3,ProductCategory4,ProductCategory5,ProductCategory6,ProductCategory7,ProductCategory8
48908,BatchId_81306,AccountId_928,SubscriptionId_1673,CustomerId_1277,UGX,256,ProviderId_6,ProductId_10,ChannelId_3,1000.0,...,2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
53122,BatchId_72497,AccountId_3432,SubscriptionId_841,CustomerId_3867,UGX,256,ProviderId_6,ProductId_3,ChannelId_3,3000.0,...,2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
56650,BatchId_100659,AccountId_209,SubscriptionId_4733,CustomerId_533,UGX,256,ProviderId_1,ProductId_15,ChannelId_3,10000.0,...,2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
46419,BatchId_97610,AccountId_4003,SubscriptionId_3002,CustomerId_4453,UGX,256,ProviderId_5,ProductId_15,ChannelId_3,10000.0,...,2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
58829,BatchId_121320,AccountId_4184,SubscriptionId_998,CustomerId_4636,UGX,256,ProviderId_6,ProductId_13,ChannelId_3,3000.0,...,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [7]:
# Fit the preprocessing pipeline on the training split. The return value is
# discarded, so this line only affects `train` if the custom transformers
# modify the frame in place -- NOTE(review): confirm in CustomTransformers.
preprocessor.fit_transform(train, train_Y)
# One-hot encode "TransactionStartDay". That column is not among the raw
# columns printed earlier, so presumably one of the pipeline steps creates
# it -- TODO confirm. NOTE(review): the saved output of this cell is
# "KeyError: 'ProductCategory'", which suggests it was last run against a
# different pipeline/frame state; re-check with Restart & Run All.
train = encodeOH(train, ["TransactionStartDay"])




KeyError: 'ProductCategory'

In [None]:
# Earlier experiments (kept for reference):
# preprocessor.fit_transform(train, train_Y)
# preprocessor.fit_resample(train, train_Y)
# preprocessor.fit(train, train_Y)

# Row counts of features and labels before oversampling.
print(train["AccountId"].count(), train_Y.count(), sep="\n")

# Oversample the training split with the SMOTE sampler configured earlier.
train, train_Y = smt.fit_resample(train, train_Y)

# 1-to-0 class ratio and row counts after oversampling.
resampled_counts = train_Y.value_counts()
print(resampled_counts[1] / resampled_counts[0])
print(train["AccountId"].count(), train_Y.count(), sep="\n")

# Put the test set through the same sequence of steps as the training data.
test = encodeOH(test, hot_cols)
# NOTE(review): this refits the pipeline on the test set and discards the
# returned value; it relies on the transformers changing `test` in place.
# Consider `transform` on the already-fitted pipeline -- TODO confirm.
preprocessor.fit_transform(test)
test = encodeOH(test, ["TransactionStartDay"])

# res = preprocessor.predict(test)
# print(train.AccountId.dtype)

# Show the resampled training frame.
train.head()

76529
76529
0.2499901800327332
95468
95468
['BatchId' 'AccountId' 'SubscriptionId' 'CustomerId' 'CurrencyCode'
 'CountryCode' 'ProviderId' 'ProductId' 'ChannelId' 'Amount' 'Value'
 'TransactionStartTime' 'PricingStrategy' 'ProductCategory0'
 'ProductCategory1' 'ProductCategory2' 'ProductCategory3'
 'ProductCategory4' 'ProductCategory5' 'ProductCategory6'
 'ProductCategory7' 'ProductCategory8']
['AccountId' 'Value' 'TransactionStartTime' 'PricingStrategy'
 'ProductCategory0' 'ProductCategory1' 'ProductCategory2'
 'ProductCategory3' 'ProductCategory4' 'ProductCategory5'
 'ProductCategory6' 'ProductCategory7' 'ProductCategory8' 'Sign'
 'TransactionStartDay0' 'TransactionStartDay1' 'TransactionStartDay2'
 'TransactionStartDay3' 'TransactionStartDay4' 'TransactionStartDay5'
 'TransactionStartDay6']


Unnamed: 0,AccountId,Value,TransactionStartTime,PricingStrategy,ProductCategory0,ProductCategory1,ProductCategory2,ProductCategory3,ProductCategory4,ProductCategory5,...,ProductCategory7,ProductCategory8,Sign,TransactionStartDay0,TransactionStartDay1,TransactionStartDay2,TransactionStartDay3,TransactionStartDay4,TransactionStartDay5,TransactionStartDay6
0,1301.0,3000,2,2,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,True,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,238.0,1000,0,2,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,True,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,2558.0,1000,4,2,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,True,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4841.0,5000,3,2,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,False,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,231.0,1000,0,4,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,True,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [None]:
# Classifier with 500 boosting rounds; every other hyper-parameter is left at
# its default. A tuned variant tried earlier:
# model  = XGBClassifier(learning_rate =0.5, max_depth= 4, n_estimators= 500, early_stopping_rounds=20, eval_metric='auc')
model = XGBClassifier(n_estimators=500)

# NOTE(review): the result is discarded and `x_valid` is not used below while
# the eval_set variant of `fit` stays commented out.
preprocessor.fit_transform(x_valid)

# y_valid = encodeOH(y_valid, hot_cols)
# preprocessor.fit_transform(y_valid)

# model.fit(train.astype('float32'), train_Y, eval_set=[(x_valid.astype('float32'), y_valid)], verbose=True)
model.fit(train.astype('float32'), train_Y, verbose=True)

# Predict on the prepared test frame.
res = model.predict(test.astype('float32'))

# Pair each saved test TransactionId with its prediction and write the file.
output = pd.DataFrame({"TransactionId": test_transaction_id, "FraudResult": res})
output.to_csv("wtf.csv", index=False)

In [None]:
# Caution: ChannelId 4 is not present in the training data, so handle it by dropping it from the test set rather than adding it to training (TODO: confirm intent).