In [1]:
# Import libraries
import os
import warnings
import numpy as np
import pandas as pd

import gc  # Garbage collector


warnings.filterwarnings('ignore')

In [2]:
from sklearn.model_selection import train_test_split
from sklearn import metrics

from xgboost import XGBClassifier

In [3]:
# load the model
import joblib
xgb_classifier = joblib.load("xgb_classifier_v1.h5")

In [4]:
oe = joblib.load("oe.h5")


In [5]:
test = pd.read_feather('test_data.ftr')
test.shape

(11363762, 190)

In [6]:
test.head()

Unnamed: 0,customer_ID,S_2,P_2,D_39,B_1,B_2,R_1,S_3,D_41,B_3,...,D_136,D_137,D_138,D_139,D_140,D_141,D_142,D_143,D_144,D_145
0,00000469ba478561f23a92a868bd366de6f6527a684c9a...,2019-02-19,0.631348,0.001912,0.010727,0.814453,0.007545,0.168701,0.009972,0.002348,...,,,,,0.004669,,,,0.008278,
1,00000469ba478561f23a92a868bd366de6f6527a684c9a...,2019-03-25,0.586914,0.005276,0.011024,0.811035,0.001817,0.241333,0.000166,0.009132,...,,,,0.000142,0.00494,0.009018,,0.003695,0.003754,0.00146
2,00000469ba478561f23a92a868bd366de6f6527a684c9a...,2019-04-25,0.608887,0.003326,0.016388,1.004883,0.000114,0.26709,0.004196,0.004192,...,,,,7.4e-05,0.002113,0.004658,,0.003155,0.002155,0.006481
3,00000469ba478561f23a92a868bd366de6f6527a684c9a...,2019-05-20,0.614746,0.009064,0.021667,0.816406,0.00972,0.188965,0.004124,0.015327,...,,,,0.004742,0.006393,0.00289,,0.006042,0.005207,0.007858
4,00000469ba478561f23a92a868bd366de6f6527a684c9a...,2019-06-15,0.591797,0.23877,0.01593,0.810547,0.002026,0.180054,0.000731,0.011284,...,,,,0.008133,0.00433,0.008385,,0.001008,0.00742,0.009468


In [7]:
test["S_2_day"] = test["S_2"].dt.day
test["S_2_month"] = test["S_2"].dt.month
test["S_2_year"] = test["S_2"].dt.year

In [8]:
test.shape

(11363762, 193)

In [9]:


# converting pandas "categorical" dtype to numeric
cols = ["D_68", "B_30", "B_38", "D_114", "D_116", "D_117", "D_120", "D_126"]
test[cols] = test[cols].apply(pd.to_numeric, errors='coerce')

In [10]:
drop_cols = ['D_87', 'D_88', 'D_108', 'D_111', 'D_110', 'B_39', 'D_73', 'B_42', 'D_136',
 'D_138', 'D_137', 'D_135', 'D_134', 'R_9', 'B_29', 'D_106', 'D_132', 'D_49',
 'R_26', 'D_76', 'D_66', 'D_42', 'D_142', 'D_53', 'D_82','S_2']

test.drop(columns = drop_cols,axis=1, inplace=True)

In [11]:
categorical_columns = ["D_63","D_64"]
test_enc = oe.transform(test[categorical_columns])
test[categorical_columns] = test_enc

In [12]:
test.shape

(11363762, 167)

In [13]:
_ = gc.collect()

In [14]:
# considering only one data point per customer (latest one) as time series is not being used
test = test.groupby(['customer_ID']).nth(-1).reset_index(drop=True)

In [15]:
test=test.drop("customer_ID",axis=1)
test.shape


(924621, 166)

In [16]:
test.isna().sum()

P_2          4784
D_39            0
B_1             0
B_2            43
R_1             0
             ... 
D_144           0
D_145        5050
S_2_day         0
S_2_month       0
S_2_year        0
Length: 166, dtype: int64

In [17]:
# For numeric columns
numeric_columns = test.select_dtypes(np.number).columns
test[numeric_columns] = test[numeric_columns].fillna(test[numeric_columns].mean())

In [18]:
test.isna().sum()

P_2          4784
D_39            0
B_1             0
B_2            43
R_1             0
             ... 
D_144           0
D_145           0
S_2_day         0
S_2_month       0
S_2_year        0
Length: 166, dtype: int64

In [19]:
test.shape 

(924621, 166)

In [20]:
# Score up the test dataset
test_preds = xgb_classifier.predict(test)
# test_preds.view()

In [22]:
# Make submission
sub_data = pd.read_csv('sample_submission.csv')
sub_data.head()

Unnamed: 0,customer_ID,prediction
0,00000469ba478561f23a92a868bd366de6f6527a684c9a...,0
1,00001bf2e77ff879fab36aa4fac689b9ba411dae63ae39...,0
2,0000210045da4f81e5f122c6bde5c2a617d03eef67f82c...,0
3,00003b41e58ede33b8daf61ab56d9952f17c9ad1c3976c...,0
4,00004b22eaeeeb0ec976890c1d9bfc14fd9427e98c4ee9...,0


In [23]:
sub_data.shape

(924621, 2)

In [24]:
sub_data['prediction'] = test_preds
sub_data.head()

Unnamed: 0,customer_ID,prediction
0,00000469ba478561f23a92a868bd366de6f6527a684c9a...,0
1,00001bf2e77ff879fab36aa4fac689b9ba411dae63ae39...,0
2,0000210045da4f81e5f122c6bde5c2a617d03eef67f82c...,0
3,00003b41e58ede33b8daf61ab56d9952f17c9ad1c3976c...,0
4,00004b22eaeeeb0ec976890c1d9bfc14fd9427e98c4ee9...,1


In [25]:
# Submission file
sub_data.to_csv('submission.csv', index=False)