In [24]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.feature_selection import mutual_info_regression
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import OneHotEncoder
from xgboost import XGBRegressor
from sklearn.pipeline import Pipeline as skPipeline
from imblearn.pipeline import Pipeline as imbPipeline
from sklearn.compose import ColumnTransformer
from imblearn.over_sampling import SMOTE
from CustomTransformers import StringCleanTransformer, DayTimeTransformer, DropperTransformer, SignTransformer, OHTransformer, FloatTransformer
from sklearn.model_selection import train_test_split

from xgboost import XGBClassifier

from sklearn.model_selection import StratifiedKFold

In [25]:
# Read the training and test CSV files from the local data directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ("data/training.csv", "data/test.csv")
)

In [26]:
# Shuffle the training rows once with a fixed seed, so that restarting the
# kernel and re-running yields the same row order (the unshuffled
# StratifiedKFold used later depends on that order).
train = train.sample(frac=1, random_state=42).reset_index(drop=True)

# Split the target and the row identifier away from the features.
# `pop` removes the column from the frame and returns it in one step.
train_Y = train.pop("FraudResult")
train_transaction_id = train.pop("TransactionId")

# Keep the test identifiers aside; they are written to the submission file.
test_transaction_id = test.pop("TransactionId")


In [27]:
def getDay(x):
    """Return the date part of a timestamp string as a float.

    Everything before the first "T" is taken, the "-" separators are removed
    and the remaining text is converted with ``float``; for example
    "2018-12-18T06:52:20Z" becomes 20181218.0.

    Raises:
        ValueError: if the remaining text is not a valid float literal.
    """
    date_part = x.partition("T")[0]
    return float(date_part.replace("-", ""))

def getTime(x):
    """Return the time-of-day part of a timestamp string as a float.

    The text after the first "T" is split on ":", a single trailing
    non-digit character (such as the "Z" designator) is removed, and the
    pieces are concatenated and converted with ``float``; for example
    "2018-12-18T06:52:20Z" becomes 65220.0.

    The trailing character is only removed when it is not a digit, so a
    timestamp without a suffix ("...T06:52:20") keeps its last seconds digit
    instead of being silently truncated.

    Raises:
        IndexError: if ``x`` contains no "T".
        ValueError: if the remaining text is not a valid float literal.
    """
    *head, tail = x.split("T")[1].split(":")
    # Strip one trailing designator character, but never a digit.
    if tail and not tail[-1].isdigit():
        tail = tail[:-1]
    return float(''.join(head) + tail)

def encodeOH(X, cols):
    """One-hot encode the columns listed in ``cols`` and return a new frame.

    For each column name in ``cols`` the encoder is fit on that column of
    ``X``, the resulting indicator columns are appended (named
    ``<column><position>``, e.g. ``ProductCategory0``) and the original
    column is removed. The current column names are printed after each
    encoded column.

    The encoder is refit on every call, so the indicator columns reflect only
    the values present in the frame passed in.
    NOTE(review): encoding train and test in separate calls can therefore
    produce different or differently-ordered indicator columns when their
    category sets differ -- confirm the two frames line up before predicting.

    Args:
        X: pandas DataFrame containing every column named in ``cols``.
        cols: iterable of column names to encode.

    Returns:
        A DataFrame with the encoded columns replaced by indicator columns.
    """
    # scikit-learn 1.2 renamed `sparse` to `sparse_output` and 1.4 removed the
    # old keyword; try the new name first and fall back for older releases.
    try:
        OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
    except TypeError:
        OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

    for elem in cols:
        encoded = OH_encoder.fit_transform(X[elem].values.reshape(-1, 1))
        OH_cols = pd.DataFrame(
            encoded,
            index=X.index,  # keep row alignment with X for the concat below
            columns=[elem + str(position) for position in range(encoded.shape[1])],
        )
        X = pd.concat([X, OH_cols], axis=1).drop(columns=elem)

        print(X.columns.values)
    return X

# Display the first rows of `train` as a quick check of the remaining feature columns.
train.head()

Unnamed: 0,BatchId,AccountId,SubscriptionId,CustomerId,CurrencyCode,CountryCode,ProviderId,ProductId,ProductCategory,ChannelId,Amount,Value,TransactionStartTime,PricingStrategy
0,BatchId_26638,AccountId_4841,SubscriptionId_3829,CustomerId_5238,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-2000.0,2000,2018-12-18T06:52:20Z,2
1,BatchId_13824,AccountId_4841,SubscriptionId_3829,CustomerId_1709,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-20.0,20,2019-01-14T08:19:46Z,2
2,BatchId_35126,AccountId_4133,SubscriptionId_2763,CustomerId_4585,UGX,256,ProviderId_6,ProductId_3,airtime,ChannelId_3,1000.0,1000,2018-11-23T15:46:26Z,2
3,BatchId_65500,AccountId_4704,SubscriptionId_2244,CustomerId_5172,UGX,256,ProviderId_5,ProductId_10,airtime,ChannelId_3,10000.0,10000,2018-11-30T12:22:58Z,4
4,BatchId_116190,AccountId_1555,SubscriptionId_1194,CustomerId_1929,UGX,256,ProviderId_5,ProductId_15,financial_services,ChannelId_3,50000.0,50000,2019-02-01T16:50:38Z,2


In [28]:
# Columns to one-hot encode. A wider set was tried earlier:
# ["ProductCategory", "ProviderId", "ChannelId", "Sign", "PricingStrategy"]
hot_cols = ["ProductCategory"]

# Stratified 80/20 hold-out split; the seed makes the split reproducible.
# NOTE(review): the split is taken before one-hot encoding, so x_train/x_valid
# still hold the raw "ProductCategory" column while `train` below does not.
x_train, x_valid, y_train, y_valid = train_test_split(
    train,
    train_Y,
    train_size=0.8,
    test_size=0.2,
    stratify=train_Y,
    random_state=42,
)

# Seeded oversampler, so the synthetic minority samples are reproducible.
smt = SMOTE(random_state=42)

# Replaces `train` with its encoded version. Re-running this cell without
# re-running the earlier ones fails, because the encoded columns are gone.
train = encodeOH(train, hot_cols)







['BatchId' 'AccountId' 'SubscriptionId' 'CustomerId' 'CurrencyCode'
 'CountryCode' 'ProviderId' 'ProductId' 'ChannelId' 'Amount' 'Value'
 'TransactionStartTime' 'PricingStrategy' 'ProductCategory0'
 'ProductCategory1' 'ProductCategory2' 'ProductCategory3'
 'ProductCategory4' 'ProductCategory5' 'ProductCategory6'
 'ProductCategory7' 'ProductCategory8']




In [29]:
# Full estimator: the custom cleaning transformers, then SMOTE, then the
# XGBoost classifier. imblearn's Pipeline is used because the chain contains
# a sampler step. One-hot encoding is not a pipeline step here (the
# OHTransformer(hot_cols) step is disabled); it is done beforehand by encodeOH.
preprocessor = imbPipeline(
    [
        ("shuffle", StringCleanTransformer()),
        ("day_time_separator", DayTimeTransformer()),
        ("amout to sign", SignTransformer()),
        ("Dropper", DropperTransformer()),
        ("float", FloatTransformer()),
        ("smote", smt),
        ("model", XGBClassifier()),
    ]
)

# Five stratified folds. The pipeline is evaluated on the full encoded
# training frame rather than fitted on the x_train/y_train hold-out split.
gen = StratifiedKFold(n_splits=5)

# Per-fold F1 scores; left as the last expression so the notebook displays them.
cross_val_score(estimator=preprocessor, X=train, y=train_Y, scoring="f1", cv=gen)

array([0.87804878, 0.88888889, 0.95      , 0.88      , 0.91566265])

In [30]:
# Refit the whole pipeline on the full training set before predicting.
preprocessor.fit(train, train_Y)

# NOTE(review): encodeOH fits a fresh encoder on the test frame, so its
# indicator columns follow the categories present in test -- confirm they
# match the columns the pipeline was fitted on.
test = encodeOH(test, hot_cols)
res = preprocessor.predict(test)

# Submission frame: the saved test identifiers next to the predicted labels.
output = pd.DataFrame(
    {
        "TransactionId": test_transaction_id,
        "FraudResult": res,
    }
)
output.to_csv("wtf.csv", index=False)



['BatchId' 'AccountId' 'SubscriptionId' 'CustomerId' 'CurrencyCode'
 'CountryCode' 'ProviderId' 'ProductId' 'ChannelId' 'Amount' 'Value'
 'TransactionStartTime' 'PricingStrategy' 'ProductCategory0'
 'ProductCategory1' 'ProductCategory2' 'ProductCategory3'
 'ProductCategory4' 'ProductCategory5' 'ProductCategory6'
 'ProductCategory7' 'ProductCategory8']


In [None]:
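# Caution: encoding ChannelId adds a "ChannelId 4" column that does not exist in the training set, so it is not dropped in test; prefer dropping it from test over adding it to training. (Translated from French -- confirm the intended meaning.)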