In [1]:
import pandas as pd
import numpy as np
train_df = pd.read_csv("preprocessed_train.csv")
test_df = pd.read_csv("preprocessed_test.csv")

train_df.head()

Unnamed: 0,individualnumber,category_number,hakkedis_amt,odul_amt,response,gender,city_code,dateofbirth,cardnumber,number_of_transactions,total_amount_spent
0,96182791,9017,31.0,3.0,0,E,,1971.0,1887714669561615,113.0,9679.42
1,101697269,9046,85.0,8.0,0,K,42.0,1994.0,3387944681569715,2.0,30.65
2,102073044,9046,85.0,8.0,0,E,35.0,1946.0,6987834668561915,2.0,138.95
3,89946741,9035,43.0,4.0,0,K,16.0,1991.0,87164665567815,10.0,437.67
4,96747602,9036,9.0,1.0,0,K,38.0,1998.0,7587424699565515,11.0,418.05


In [2]:
# model time and use f1 score
# logreg
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score

X = train_df.drop(["response", "individualnumber", "cardnumber"], axis=1)
y = train_df.response

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# categorical features
category_cols = [
    "gender",
    "city_code",
]

# one hot encode categorical features
X_train = pd.get_dummies(X_train, columns=category_cols)
X_test = pd.get_dummies(X_test, columns=category_cols)

# fill missing values with mean
X_train = X_train.fillna(X_train.mean())
X_test = X_test.fillna(X_test.mean())

# find missing columns in test set
missing_cols = set(X_train.columns) - set(X_test.columns)

# add a missing column in test set with default value equal to 0
for c in missing_cols:
    X_test[c] = 0

# ensure the order of column in the test set is in the same order than in train set
X_test = X_test[X_train.columns]


logreg = LogisticRegression()
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)
print("logreg accuracy: ", accuracy_score(y_test, y_pred))
print("logreg f1 score: ", f1_score(y_test, y_pred))

# random forest
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier()
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
print("rf accuracy: ", accuracy_score(y_test, y_pred))
print("rf f1 score: ", f1_score(y_test, y_pred))

# xgboost
from xgboost import XGBClassifier

xgb = XGBClassifier()
xgb.fit(X_train, y_train)
y_pred = xgb.predict(X_test)
print("xgb accuracy: ", accuracy_score(y_test, y_pred))
print("xgb f1 score: ", f1_score(y_test, y_pred))

# lightgbm
from lightgbm import LGBMClassifier

lgbm = LGBMClassifier()
lgbm.fit(X_train, y_train)
y_pred = lgbm.predict(X_test)
print("lgbm accuracy: ", accuracy_score(y_test, y_pred))
print("lgbm f1 score: ", f1_score(y_test, y_pred))

lgbm_pred = y_pred

# catboost
from catboost import CatBoostClassifier

cat = CatBoostClassifier()
cat.fit(X_train, y_train)
y_pred = cat.predict(X_test)
print("cat accuracy: ", accuracy_score(y_test, y_pred))
print("cat f1 score: ", f1_score(y_test, y_pred))



logreg accuracy:  0.7380952380952381
logreg f1 score:  0.7105263157894737
rf accuracy:  0.75
rf f1 score:  0.7692307692307692
xgb accuracy:  0.6666666666666666
xgb f1 score:  0.6500000000000001
lgbm accuracy:  0.6666666666666666
lgbm f1 score:  0.6585365853658537
Learning rate set to 0.00645
0:	learn: 0.6905150	total: 50ms	remaining: 49.9s
1:	learn: 0.6878663	total: 51.6ms	remaining: 25.8s
2:	learn: 0.6856103	total: 53.3ms	remaining: 17.7s
3:	learn: 0.6832943	total: 55ms	remaining: 13.7s
4:	learn: 0.6805713	total: 56.7ms	remaining: 11.3s
5:	learn: 0.6780749	total: 58.3ms	remaining: 9.65s
6:	learn: 0.6756189	total: 60ms	remaining: 8.51s
7:	learn: 0.6726602	total: 63.5ms	remaining: 7.87s
8:	learn: 0.6702611	total: 65.6ms	remaining: 7.22s
9:	learn: 0.6680157	total: 67.2ms	remaining: 6.65s
10:	learn: 0.6646979	total: 68.7ms	remaining: 6.18s
11:	learn: 0.6625787	total: 69.9ms	remaining: 5.76s
12:	learn: 0.6595619	total: 71.5ms	remaining: 5.43s
13:	learn: 0.6575844	total: 73ms	remaining: 5.1

In [3]:

# import ensemble classifier
from sklearn.ensemble import VotingClassifier

# create the sub models
estimators = []
model1 = LogisticRegression()
estimators.append(("logreg", model1))
model2 = RandomForestClassifier()
estimators.append(("rf", model2))
model3 = XGBClassifier()
estimators.append(("xgb", model3))
model4 = LGBMClassifier()
estimators.append(("lgbm", model4))
model5 = CatBoostClassifier()
estimators.append(("cat", model5))

# create the ensemble model
ensemble = VotingClassifier(estimators)
ensemble.fit(X_train, y_train)

y_pred = ensemble.predict(X_test)
print("ensemble accuracy: ", accuracy_score(y_test, y_pred))
print("ensemble f1 score: ", f1_score(y_test, y_pred))


Learning rate set to 0.00645
0:	learn: 0.6905150	total: 3.98ms	remaining: 3.97s
1:	learn: 0.6878663	total: 4.64ms	remaining: 2.31s
2:	learn: 0.6856103	total: 5.27ms	remaining: 1.75s
3:	learn: 0.6832943	total: 5.83ms	remaining: 1.45s
4:	learn: 0.6805713	total: 6.4ms	remaining: 1.27s
5:	learn: 0.6780749	total: 6.94ms	remaining: 1.15s
6:	learn: 0.6756189	total: 7.49ms	remaining: 1.06s
7:	learn: 0.6726602	total: 8.05ms	remaining: 999ms
8:	learn: 0.6702611	total: 8.6ms	remaining: 947ms
9:	learn: 0.6680157	total: 9.17ms	remaining: 908ms
10:	learn: 0.6646979	total: 9.71ms	remaining: 873ms
11:	learn: 0.6625787	total: 10.2ms	remaining: 840ms
12:	learn: 0.6595619	total: 10.9ms	remaining: 829ms
13:	learn: 0.6575844	total: 12.4ms	remaining: 874ms
14:	learn: 0.6553755	total: 13.1ms	remaining: 859ms
15:	learn: 0.6533172	total: 13.8ms	remaining: 850ms
16:	learn: 0.6508897	total: 14.6ms	remaining: 842ms
17:	learn: 0.6487142	total: 15.2ms	remaining: 831ms
18:	learn: 0.6460158	total: 15.9ms	remaining: 8

In [4]:


X = test_df.drop(["individualnumber", "cardnumber"], axis=1)

# categorical features
category_cols = [
    "gender",
    "city_code",
]

# one hot encode categorical features
X = pd.get_dummies(X, columns=category_cols)

# fill missing values with mean
X = X.fillna(X.mean())


# find missing columns in test set
missing_cols = set(X_train.columns) - set(X.columns)

# add a missing column in test set with default value equal to 0
for c in missing_cols:
    X[c] = 0

# ensure the order of column in the test set is in the same order than in train set
X = X[X_train.columns]

lgbm_pred = ensemble.predict(X)

test_df["response"] = lgbm_pred
test_df = test_df[["individualnumber", "response"]]
test_df.to_csv("submission.csv", index=False)

#lgbm_pred