Imports

In [52]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.feature_selection import mutual_info_regression
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import OneHotEncoder
from xgboost import XGBRegressor

Data import

In [53]:
# Read the labelled training set and the unlabelled test set from ./data.
train, test = (
    pd.read_csv(csv_path) for csv_path in ("data/training.csv", "data/test.csv")
)




In [54]:
def make_mi_scores(X, y, discrete_target=False):
    """Rank the columns of ``X`` by their mutual information with ``y``.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table. It is copied first, so the caller's frame is not
        modified. ``object`` / ``category`` columns are replaced by the
        integer codes returned by ``factorize``.
    y : array-like
        Target values, one per row of ``X``.
    discrete_target : bool, default False
        ``False`` keeps the original behaviour and scores with
        ``mutual_info_regression``. Pass ``True`` when ``y`` holds class
        labels to score with ``mutual_info_classif`` instead.

    Returns
    -------
    pandas.Series
        Scores named "MI Scores", indexed by column name and sorted from the
        highest score to the lowest.
    """
    X = X.copy()
    for colname in X.select_dtypes(["object", "category"]):
        X[colname], _ = X[colname].factorize()
    # Integer columns (including the codes produced above) are discrete.
    # Boolean flags are discrete too, but they are not integer dtypes, so they
    # have to be tested for explicitly or they would be scored as continuous.
    discrete_features = [
        pd.api.types.is_integer_dtype(t) or pd.api.types.is_bool_dtype(t)
        for t in X.dtypes
    ]
    if discrete_target:
        # Imported here so the function does not depend on an edit to the
        # notebook's import cell.
        from sklearn.feature_selection import mutual_info_classif
        estimate_mi = mutual_info_classif
    else:
        estimate_mi = mutual_info_regression
    mi_scores = estimate_mi(X, y, discrete_features=discrete_features, random_state=0)
    mi_scores = pd.Series(mi_scores, name="MI Scores", index=X.columns)
    mi_scores = mi_scores.sort_values(ascending=False)
    return mi_scores

    # f1_scores = cross_val_score(XGBRegressor(), X, y, scoring="f1", cv=5)
    # f1_scores = pd.Series(f1_scores, name="F1 Scores")

Exploring the dataset

In [22]:

# Preview the first five rows of the raw training frame.
train.head(n=5)


Unnamed: 0,TransactionId,BatchId,AccountId,SubscriptionId,CustomerId,CurrencyCode,CountryCode,ProviderId,ProductId,ProductCategory,ChannelId,Amount,Value,TransactionStartTime,PricingStrategy,FraudResult
0,TransactionId_76871,BatchId_36123,AccountId_3957,SubscriptionId_887,CustomerId_4406,UGX,256,ProviderId_6,ProductId_10,airtime,ChannelId_3,1000.0,1000,2018-11-15T02:18:49Z,2,0
1,TransactionId_73770,BatchId_15642,AccountId_4841,SubscriptionId_3829,CustomerId_4406,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-20.0,20,2018-11-15T02:19:08Z,2,0
2,TransactionId_26203,BatchId_53941,AccountId_4229,SubscriptionId_222,CustomerId_4683,UGX,256,ProviderId_6,ProductId_1,airtime,ChannelId_3,500.0,500,2018-11-15T02:44:21Z,2,0
3,TransactionId_380,BatchId_102363,AccountId_648,SubscriptionId_2185,CustomerId_988,UGX,256,ProviderId_1,ProductId_21,utility_bill,ChannelId_3,20000.0,21800,2018-11-15T03:32:55Z,2,0
4,TransactionId_28195,BatchId_38780,AccountId_4841,SubscriptionId_3829,CustomerId_988,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-644.0,644,2018-11-15T03:34:21Z,2,0


In [23]:
# Number of distinct (non-null) values in each training column.
train.nunique(axis=0, dropna=True)

TransactionId           95662
BatchId                 94809
AccountId                3633
SubscriptionId           3627
CustomerId               3742
CurrencyCode                1
CountryCode                 1
ProviderId                  6
ProductId                  23
ProductCategory             9
ChannelId                   4
Amount                   1676
Value                    1517
TransactionStartTime    94556
PricingStrategy             4
FraudResult                 2
dtype: int64

Feature encoding and engineering

In [55]:
# Shuffle the rows once. The seed is fixed so that the row order, and with it
# the composition of the cross-validation folds used further down, is the same
# after every kernel restart.
train = train.sample(frac=1, random_state=0).reset_index(drop=True)

# Separate the target from the predictors. `train` must not keep FraudResult:
# the preprocessing cell copies every remaining column into the feature matrix.
train_Y = train["FraudResult"]
train = train.drop(columns=["FraudResult"])

# Candidate predictor columns (not referenced by the cells visible below).
features = ["BatchId","AccountId","SubscriptionId","CustomerId","ProviderId","ProductId","ProductCategory","ChannelId","Amount","Value","TransactionStartTime","PricingStrategy"]

# CurrencyCode and CountryCode each hold a single value in the training data.
# They are deliberately left in place here; the preprocessing cells drop
# CurrencyCode themselves.
# train.drop(['CurrencyCode','CountryCode'], axis=1, inplace=True)
# test.drop(['CurrencyCode','CountryCode'], axis=1, inplace=True)


In [56]:
def getDay(x):
    """Encode the date part of a timestamp string as a float.

    Everything before the first "T" is taken, the dashes are removed and the
    remaining digits are converted, e.g. "2018-11-15T02:18:49Z" -> 20181115.0.
    """
    date_part = x.split("T", 1)[0]
    return float(date_part.replace("-", ""))

def getTime(x):
    """Encode the time-of-day part of a timestamp string as a float.

    The text after the first "T" is split on ":", the final character of the
    last field is discarded (the trailing "Z" in the timestamps shown in this
    notebook) and the digits are concatenated, e.g.
    "2018-11-15T02:18:49Z" -> 21849.0 (leading zeros vanish in the float).
    """
    *leading, seconds = x.split("T")[1].split(":")
    return float(''.join(leading) + seconds[:-1])

In [62]:
# Columns whose values are reduced to the text after their last underscore
# (e.g. "ProviderId_6" -> "6").
# NOTE(review): ProductCategory is in the list as well, so a value such as
# "financial_services" becomes "services". The test-set cell does the same, and
# the two must stay in step.
StringToClean = ["TransactionId", "BatchId","AccountId","SubscriptionId","CustomerId", "ProviderId", "ProductId", "ChannelId", "ProductCategory"]

first_data = train.copy()
for col in StringToClean:
    first_data[col] = first_data[col].map(lambda value: value.split("_")[-1])

# Split the timestamp into two numeric columns: the date goes into a new
# column, the time of day overwrites the original one.
raw_timestamps = first_data["TransactionStartTime"]
first_data["TransactionStartDay"] = raw_timestamps.map(getDay)
first_data["TransactionStartTime"] = raw_timestamps.map(getTime)

first_data = first_data.set_index("TransactionId").drop(columns=["CurrencyCode"])

# One-hot encode ProductCategory; the resulting columns are named
# ProductCategory_0, ProductCategory_1, ...
# NOTE(review): newer scikit-learn releases rename `sparse` to `sparse_output`;
# confirm against the installed version.
OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
category_matrix = OH_encoder.fit_transform(first_data[["ProductCategory"]].to_numpy())
# The encoder returns a bare array, so the TransactionId index is attached again.
OH_cols_train = pd.DataFrame(category_matrix, index=first_data.index).add_prefix("ProductCategory_")

# Everything except the raw categorical column ...
num_X_train = first_data.drop(columns=["ProductCategory"])

# ... followed by its one-hot replacement.
OH_X_train = pd.concat([num_X_train, OH_cols_train], axis=1)

# Independent copy kept for the feature-analysis cells at the end.
train_clean = OH_X_train.copy()



Creating different models


In [29]:
# Candidate classifiers. All three draw random numbers during training, so each
# gets a fixed seed; otherwise the scores reported below change on every run.
decisionTree = DecisionTreeClassifier(random_state=0)

randomForest = RandomForestClassifier(random_state=0)

multiLayerPerceptron = MLPClassifier(activation='tanh', random_state=0)

# The models are not fitted here: the cross-validation cells fit their own
# clones, and the final fits happen in the prediction cells further down.


In [30]:
# 5-fold cross-validated F1 of the MLP on the (unbalanced) training matrix.
f1_scores_mlp = cross_val_score(
    estimator=multiLayerPerceptron,
    X=OH_X_train,
    y=train_Y,
    scoring="f1",
    cv=5,
)
print(f1_scores_mlp)

[0.375      0.42857143 0.54545455 0.61261261 0.34042553]


In [None]:
# 5-fold cross-validated F1 of the decision tree on the training matrix.
f1_scores_DT = cross_val_score(
    estimator=decisionTree,
    X=OH_X_train,
    y=train_Y,
    scoring="f1",
    cv=5,
)
print(f1_scores_DT)

[0.77333333 0.93506494 0.86363636 0.94444444 0.8974359 ]


In [None]:
# 5-fold cross-validated F1 of the random forest on the training matrix.
f1_scores_rf = cross_val_score(
    estimator=randomForest,
    X=OH_X_train,
    y=train_Y,
    scoring="f1",
    cv=5,
)
print(f1_scores_rf)

[0.84210526 0.94871795 0.91139241 0.95890411 0.92105263]


In [31]:
# SMOTE: oversample the minority (fraud) class so that both classes have the
# same number of rows.
from imblearn.over_sampling import SMOTE

# The sampler gets its own name. `SMOTE = SMOTE()` rebound the class name to an
# instance, so any later `SMOTE(...)` call failed unless the import was re-run.
# The seed makes the synthetic rows reproducible.
smote_sampler = SMOTE(random_state=0)
smote_X, smote_Y = smote_sampler.fit_resample(OH_X_train, train_Y)

# NOTE(review): cross-validating on the resampled data lets synthetic rows
# derived from the same minority samples land in both the training and the
# validation folds, which inflates the score. Resample inside each fold if
# this experiment is revived.
# f1_scores_mlp = cross_val_score(multiLayerPerceptron, smote_X, smote_Y, scoring="f1", cv=5)
# print(f1_scores_mlp)

[0.98449451 0.98729212 0.9829031  0.97954422 0.98781513]


In [50]:
# Labels present after resampling, then the class counts after and before it.
print(smote_Y.unique())
for label_counts in (smote_Y.value_counts(), train_Y.value_counts()):
    print(label_counts)

[0 1]
0    95469
1    95469
Name: FraudResult, dtype: int64
0    95469
1      193
Name: FraudResult, dtype: int64


In [51]:
# Apply to the test set the same cleaning that was applied to the training set.
StringToClean = ["TransactionId", "BatchId","AccountId","SubscriptionId","CustomerId", "ProviderId", "ProductId", "ChannelId", "ProductCategory"]

test_data = test.copy()
for col in StringToClean:
    # Keep only the text after the last underscore ("ProviderId_6" -> "6").
    test_data[col] = test_data[col].apply(lambda x : x.split("_")[-1])

test_data["TransactionStartDay"]  = test_data["TransactionStartTime"].apply(getDay)
test_data["TransactionStartTime"] = test_data["TransactionStartTime"].apply(getTime)
test_data = test_data.set_index("TransactionId")
test_data = test_data.drop(columns=["CurrencyCode"])

# Reuse the encoder that was fitted on the training data and only transform
# here. Fitting a second encoder on the test set would derive the column order
# from the test categories, so ProductCategory_<i> could stand for a different
# category (or the column count could differ) from what the models were trained
# on. With handle_unknown='ignore', a category unseen in training encodes as
# all zeros.
OH_cols_test = pd.DataFrame(
    OH_encoder.transform(test_data["ProductCategory"].values.reshape(-1, 1)),
    index=test_data.index,  # the encoder returns a bare array; restore the index
)
OH_cols_test = OH_cols_test.rename(columns=lambda x: "ProductCategory_" + str(x))

# Remove the raw categorical column (replaced by its one-hot encoding)
num_X_test = test_data.drop(["ProductCategory"], axis=1)

# Add the one-hot encoded columns to the remaining features
OH_X_test = pd.concat([num_X_test, OH_cols_test], axis=1)

# The prediction cells further down read the test matrix from `OH_X_train`,
# so that name is kept bound to it (as the original cell did).
OH_X_train = OH_X_test

OH_X_test.head()



Unnamed: 0_level_0,BatchId,AccountId,SubscriptionId,CustomerId,CountryCode,ProviderId,ProductId,ChannelId,Amount,Value,...,TransactionStartDay,ProductCategory_0,ProductCategory_1,ProductCategory_2,ProductCategory_3,ProductCategory_4,ProductCategory_5,ProductCategory_6,ProductCategory_7,ProductCategory_8
TransactionId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
50600,35028,2441,4426,2857,256,5,3,3,1000.0,1000,...,20190213.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
95109,45139,3439,2643,3874,256,5,15,3,2000.0,2000,...,20190213.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
47357,74887,4841,3829,2857,256,4,6,2,-50.0,50,...,20190213.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
28185,11025,2685,4626,3105,256,5,10,3,3000.0,3000,...,20190213.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
22140,29804,4841,3829,3105,256,4,6,2,-60.0,60,...,20190213.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [33]:
# Train the MLP on the balanced data and predict the test matrix, which is
# bound to `OH_X_train` at this point. Every prediction cell overwrites
# `test_res`; the submission cell writes whichever one ran last.
test_res = multiLayerPerceptron.fit(smote_X, smote_Y).predict(OH_X_train)

In [None]:
# Train the decision tree on the balanced data and predict the test matrix
# (overwrites `test_res`).
test_res = decisionTree.fit(smote_X, smote_Y).predict(OH_X_train)

In [None]:
# Train the random forest on the balanced data and predict the test matrix
# (overwrites `test_res`).
test_res = randomForest.fit(smote_X, smote_Y).predict(OH_X_train)

In [36]:
# Gradient-boosted trees (XGBoost) trained on the balanced data.
from xgboost import XGBClassifier

my_model = XGBClassifier(n_estimators=500)

# Cast every column, and the labels, to float32 before handing them to XGBoost.
X, Y = smote_X.astype('float32'), smote_Y.astype('float32')
my_model.fit(X, Y)

# Predict the test matrix with the same dtype (overwrites `test_res`).
test_res = my_model.predict(OH_X_train.astype('float32'))

In [37]:
# Compare how many test rows each model assigns to each class.
a = my_model.predict(OH_X_train.astype('float32'))   # XGBoost
b = multiLayerPerceptron.predict(OH_X_train)          # MLP

for predicted in (a, b):
    print(np.unique(predicted, return_counts=True))

(array([0, 1]), array([44936,    83], dtype=int64))
(array([0, 1], dtype=int64), array([44338,   681], dtype=int64))


In [None]:
print(test_res)

# Pair every prediction with the corresponding transaction id. `test_res` is
# positional, so it must follow the row order of `test`.
output = pd.DataFrame(
    {
        "TransactionId": test["TransactionId"],
        "FraudResult": test_res,
    }
)
print(output.head(20))

# Save the result to a CSV file.
output.to_csv("submission.csv", index=False)

[0 0 0 ... 0 0 0]
           TransactionId  FraudResult
0    TransactionId_50600            0
1    TransactionId_95109            0
2    TransactionId_47357            0
3    TransactionId_28185            0
4    TransactionId_22140            0
5   TransactionId_134338            0
6   TransactionId_109096            0
7    TransactionId_14249            0
8    TransactionId_69896            0
9    TransactionId_91468            0
10   TransactionId_18862            0
11   TransactionId_29342            0
12  TransactionId_116873            0
13   TransactionId_81197            0
14   TransactionId_83120            0
15   TransactionId_40882            0
16   TransactionId_89297            0
17  TransactionId_112716            0
18   TransactionId_61794            0
19  TransactionId_124957            0


In [58]:
# Preview the engineered training features.
train_clean.head(n=5)

Unnamed: 0_level_0,BatchId,AccountId,SubscriptionId,CustomerId,CountryCode,ProviderId,ProductId,ChannelId,Amount,Value,...,TransactionStartDay,ProductCategory_0,ProductCategory_1,ProductCategory_2,ProductCategory_3,ProductCategory_4,ProductCategory_5,ProductCategory_6,ProductCategory_7,ProductCategory_8
TransactionId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
130102,33496,4841,3829,2528,256,4,6,2,-20.0,20,...,20181231.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
133204,16672,4389,4791,4846,256,6,3,3,500.0,500,...,20181126.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
104728,137854,4249,4429,7343,256,4,10,2,-10000.0,10000,...,20190108.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
90921,68518,769,4636,1114,256,5,3,3,1800.0,1800,...,20181220.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
56303,67763,2441,4426,2857,256,5,3,3,3000.0,3000,...,20190112.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [72]:
# Rank the engineered training features by mutual information with the target.
# make_mi_scores factorizes the object-typed columns on its own copy, so
# `train_clean` is passed as it is.
scores = make_mi_scores(X=train_clean, y=train_Y)

In [73]:
# Show the mutual-information ranking computed by make_mi_scores (highest first).
print(scores)

BatchId                 0.112173
Value                   0.016750
CustomerId              0.010575
SubscriptionId          0.007633
AccountId               0.007132
ProductCategory_8       0.004976
ProviderId              0.003615
ProductId               0.003456
ProductCategory_0       0.003270
ChannelId               0.001853
ProductCategory_5       0.001193
TransactionStartDay     0.001054
ProductCategory_6       0.000858
TransactionStartTime    0.000628
PricingStrategy         0.000560
ProductCategory_2       0.000525
CountryCode             0.000010
ProductCategory_1       0.000000
ProductCategory_3       0.000000
ProductCategory_4       0.000000
ProductCategory_7       0.000000
Sign                    0.000000
Name: MI Scores, dtype: float64


In [70]:
import math

# Keep only the direction of each transaction: Sign is True when Amount is
# non-negative. In the sample rows, Value appears to carry the magnitude.
train_clean["Sign"] = train_clean["Amount"].ge(0)

# Amount is replaced by Sign, and CountryCode holds a single value, so both
# columns are dropped.
train_clean.drop(columns=["Amount", "CountryCode"], inplace=True)

In [75]:
# Confirm that CountryCode is constant. The raw `train` frame is used because
# the previous cell already dropped the column from `train_clean`, so reading
# it there raises KeyError on a top-to-bottom run.
train["CountryCode"].unique()

array([256], dtype=int64)