In [1]:
import pandas as pd
train_df = pd.read_csv("preprocessed_train.csv")

train_df.head()

Unnamed: 0,individualnumber,category_number,hakkedis_amt,odul_amt,response,gender,city_code,dateofbirth,cardnumber,number_of_transactions,total_amount_spent
0,94230288,9000,21.0,2.0,0,E,7.0,1983.0,787714605562415,64.0,6188.54
1,4684087,9000,17.0,1.0,0,E,19.0,1951.0,9087854623560519,30.0,6756.74
2,92472145,9058,24.0,3.0,0,K,35.0,1982.0,4087494610563715,,
3,88026681,9030,22.0,2.0,0,K,,1961.0,7087024687560515,40.0,7376.14
4,98127795,9001,38.0,3.0,0,E,9.0,1984.0,9687474690567815,4.0,256.8


In [3]:
# correlation matrix

import seaborn as sns

corr = train_df.corr()
# sort by correlation to the response
corr = corr.sort_values(by="response", ascending=False)
print(corr.response)

response                  1.000000
number_of_transactions    0.170261
total_amount_spent        0.161810
odul_amt                  0.061195
hakkedis_amt              0.025207
city_code                -0.012122
dateofbirth              -0.013445
cardnumber               -0.018488
category_number          -0.025876
individualnumber         -0.105455
Name: response, dtype: float64


  corr = train_df.corr()


In [15]:
# model time and use f1 score
# logreg
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score

X = train_df.drop(["response", "individualnumber", "cardnumber"], axis=1)
y = train_df.response

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# categorical features
category_cols = [
    "gender",
    "city_code",
]

# one hot encode categorical features
X_train = pd.get_dummies(X_train, columns=category_cols)
X_test = pd.get_dummies(X_test, columns=category_cols)

# fill missing values with mean
X_train = X_train.fillna(X_train.mean())
X_test = X_test.fillna(X_test.mean())

# find missing columns in test set
missing_cols = set(X_train.columns) - set(X_test.columns)

# add a missing column in test set with default value equal to 0
for c in missing_cols:
    X_test[c] = 0

# ensure the order of column in the test set is in the same order than in train set
X_test = X_test[X_train.columns]


logreg = LogisticRegression()
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)
print("logreg accuracy: ", accuracy_score(y_test, y_pred))
print("logreg f1 score: ", f1_score(y_test, y_pred))

# random forest
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier()
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
print("rf accuracy: ", accuracy_score(y_test, y_pred))
print("rf f1 score: ", f1_score(y_test, y_pred))

# xgboost
from xgboost import XGBClassifier

xgb = XGBClassifier()
xgb.fit(X_train, y_train)
y_pred = xgb.predict(X_test)
print("xgb accuracy: ", accuracy_score(y_test, y_pred))
print("xgb f1 score: ", f1_score(y_test, y_pred))

# lightgbm
from lightgbm import LGBMClassifier

lgbm = LGBMClassifier()
lgbm.fit(X_train, y_train)
y_pred = lgbm.predict(X_test)
print("lgbm accuracy: ", accuracy_score(y_test, y_pred))
print("lgbm f1 score: ", f1_score(y_test, y_pred))

lgbm_pred = y_pred

# catboost
from catboost import CatBoostClassifier

cat = CatBoostClassifier()
cat.fit(X_train, y_train)
y_pred = cat.predict(X_test)
print("cat accuracy: ", accuracy_score(y_test, y_pred))
print("cat f1 score: ", f1_score(y_test, y_pred))



logreg accuracy:  0.9824628288219596
logreg f1 score:  0.04166666666666667
rf accuracy:  0.9832253145253527
rf f1 score:  0.0
xgb accuracy:  0.9817003431185666
xgb f1 score:  0.0
lgbm accuracy:  0.9832253145253527
lgbm f1 score:  0.04347826086956522
Learning rate set to 0.028109
0:	learn: 0.6357118	total: 6.13ms	remaining: 6.13s
1:	learn: 0.5820884	total: 8.84ms	remaining: 4.41s
2:	learn: 0.5339566	total: 11.1ms	remaining: 3.68s
3:	learn: 0.4906581	total: 13.2ms	remaining: 3.3s
4:	learn: 0.4478387	total: 15.9ms	remaining: 3.16s
5:	learn: 0.4108726	total: 22.4ms	remaining: 3.72s
6:	learn: 0.3805454	total: 25.2ms	remaining: 3.58s
7:	learn: 0.3532438	total: 28.6ms	remaining: 3.55s
8:	learn: 0.3229174	total: 31.4ms	remaining: 3.46s
9:	learn: 0.3002188	total: 36.7ms	remaining: 3.63s
10:	learn: 0.2762995	total: 40.4ms	remaining: 3.63s
11:	learn: 0.2563919	total: 43.1ms	remaining: 3.55s
12:	learn: 0.2409689	total: 45.2ms	remaining: 3.43s
13:	learn: 0.2264780	total: 48ms	remaining: 3.38s
14:	l

In [21]:
import numpy as np
# import ensemble classifier
from sklearn.ensemble import VotingClassifier

# create the sub models
estimators = []
model1 = LogisticRegression()
estimators.append(("logreg", model1))
model2 = RandomForestClassifier()
estimators.append(("rf", model2))
model3 = XGBClassifier()
estimators.append(("xgb", model3))
model4 = LGBMClassifier()
estimators.append(("lgbm", model4))
model5 = CatBoostClassifier()
estimators.append(("cat", model5))

# create the ensemble model
ensemble = VotingClassifier(estimators)
ensemble.fit(X_train, y_train)

y_pred = ensemble.predict(X_test)
print("ensemble accuracy: ", accuracy_score(y_test, y_pred))
print("ensemble f1 score: ", f1_score(y_test, y_pred))


Learning rate set to 0.028109
0:	learn: 0.6357118	total: 5.52ms	remaining: 5.51s
1:	learn: 0.5820884	total: 8.68ms	remaining: 4.33s
2:	learn: 0.5339566	total: 10.8ms	remaining: 3.58s
3:	learn: 0.4906581	total: 13.6ms	remaining: 3.39s
4:	learn: 0.4478387	total: 15.6ms	remaining: 3.11s
5:	learn: 0.4108726	total: 18.6ms	remaining: 3.09s
6:	learn: 0.3805454	total: 23.6ms	remaining: 3.35s
7:	learn: 0.3532438	total: 25.7ms	remaining: 3.19s
8:	learn: 0.3229174	total: 28.9ms	remaining: 3.18s
9:	learn: 0.3002188	total: 31.1ms	remaining: 3.08s
10:	learn: 0.2762995	total: 33.2ms	remaining: 2.98s
11:	learn: 0.2563919	total: 38ms	remaining: 3.13s
12:	learn: 0.2409689	total: 40.4ms	remaining: 3.07s
13:	learn: 0.2264780	total: 42.4ms	remaining: 2.99s
14:	learn: 0.2133420	total: 44.2ms	remaining: 2.9s
15:	learn: 0.2012654	total: 45.7ms	remaining: 2.81s
16:	learn: 0.1888231	total: 48.5ms	remaining: 2.8s
17:	learn: 0.1785809	total: 53.8ms	remaining: 2.94s
18:	learn: 0.1700020	total: 56.3ms	remaining: 2.

In [24]:
test_df = pd.read_csv("preprocessed_test.csv")

X = test_df.drop(["individualnumber", "cardnumber"], axis=1)

# categorical features
category_cols = [
    "gender",
    "city_code",
]

# one hot encode categorical features
X = pd.get_dummies(X, columns=category_cols)

# fill missing values with mean
X = X.fillna(X.mean())


# find missing columns in test set
missing_cols = set(X_train.columns) - set(X.columns)

# add a missing column in test set with default value equal to 0
for c in missing_cols:
    X[c] = 0

# ensure the order of column in the test set is in the same order than in train set
X = X[X_train.columns]

lgbm_pred = lgbm.predict(X)

test_df["response"] = lgbm_pred
test_df = test_df[["individualnumber", "response"]]
test_df.to_csv("submission.csv", index=False)


#lgbm_pred