In this notebook we use data at `2016-05` to predict product purchase for `2016-06`.

The best MAP@7 in private leader board is 0.03140. The worst is 0.00448.

In [60]:
import warnings
warnings.filterwarnings('ignore')

import os
import copy
import time
import pandas as pd
import numpy as np


from sklearn.base import BaseEstimator, TransformerMixin

from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score

import ml_metrics

from xgboost import XGBClassifier

from lightgbm import LGBMClassifier

from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope

In [2]:
def change_dtype_ser(ser):
    
    if ser.dtype == int:
        return ser.astype(np.int32)
    
    if ser.dtype == float:
        return ser.astype(np.float32)
    
    if ser.dtype == np.object:
        return ser.astype("category")
    
    return ser
    

def change_dtype_df(df):
    """
    change types of columns to reduce memory size
    :param df: dataframe
    :return df: dataframe
    """
    memory = df.memory_usage().sum() / 10**6
    print("Memory usage before changing types %0.2f MB" % memory)

    for col in df.columns:
        df[col] = change_dtype_ser(df[col])

    memory = df.memory_usage().sum() / 10 ** 6
    print("Memory usage after changing types %0.2f MB" % memory)
    return df


def load_csv(filename):
    df = pd.read_csv(filename)
    df = change_dtype_df(df)
    return df

In [3]:
class NumImputer(BaseEstimator, TransformerMixin):
    def __init__(self, method="mean"):
        self._method = method
    
    def fit(self, df_train):
        num_cols = df_train.select_dtypes(["number"]).columns.to_list()
        self._train_cols = df_train.columns.to_list()
        
        self._impute_values = {}
        for col in num_cols:
            self._impute_values[col] = df_train[col].agg(self._method)
        return self
    
    def transform(self, df):
        df = df.copy()
        cols = df.columns.to_list()
        assert set(cols) == set(self._train_cols), "Do not have the same set of cols as train"
        
        for col, val in self._impute_values.items():
            if df[col].isnull().sum() > 0:
                df[col] = df[col].fillna(val)
        
        # align columns
        df = df[self._train_cols]
        return df
    

class CatImputer(BaseEstimator, TransformerMixin):
    def __init__(self, val="MISSING"):
        self._val = val
    
    def fit(self, df_train):
        cat_cols = df_train.select_dtypes(["object", "category", "bool"]).columns.to_list()
        self._train_cols = df_train.columns.to_list()
        
        self._impute_values = {}
        for col in cat_cols:
            self._impute_values[col] = self._val
        return self
    
    def transform(self, df):
        df = df.copy()
        cols = df.columns.to_list()
        assert set(cols) == set(self._train_cols), "Do not have the same set of cols as train"
        
        for col, val in self._impute_values.items():
            if df[col].isnull().sum() > 0:
                df[col] = df[col].astype("object").fillna(val).astype("category")
                
        # align columns
        df = df[self._train_cols]
        return df

In [4]:
class OneHotEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, max_classes=20, to_array=False):
        self._to_array = to_array
        self._max_classes = max_classes
        
    def fit(self, train_df):
        self._cols_before = train_df.columns.to_list()
        
        df_cat = train_df.select_dtypes(["object", "category"])
        self._cat_cols = df_cat.columns.to_list()
        
        self._cat_cols = [col for col in self._cat_cols if train_df[col].nunique() <= self._max_classes]
        #print("Columns to one-hot encode:", self._cat_cols)
        df_cat = train_df[self._cat_cols]
        
        if len(self._cat_cols) > 0:
            self._cat_cols_ohe = pd.get_dummies(df_cat, drop_first=True).columns.to_list()
        else:
            self._cat_cols_ohe = []
        
        num_cols = [col for col in train_df.columns if col not in self._cat_cols]
        self._cols_after = num_cols + self._cat_cols_ohe
        
        return self
    
    def transform(self, df):
        df = df.copy()
        
        cols_before = df.columns.to_list()
        assert set(cols_before) == set(self._cols_before), "Do not have the same columns as train before transformed"
        
        if len(self._cat_cols) == 0:
            print("No cat cols in df_train, so do nothing.")
            return df[self._cols_after]
        
        df_cat = df[self._cat_cols]
        #print("df_cat.columns", df_cat.columns)
        
        # one-hot encode
        df_cat = pd.get_dummies(df_cat)
        # drop cols that are present in test_df but absent in train_df
        cols_to_drop = [col for col in df_cat.columns if col not in self._cat_cols_ohe]
        #print("cols_to_drop:", cols_to_drop)
        df_cat = df_cat.drop(cols_to_drop, axis="columns")
        
        # change to float32
        for col in df_cat.columns:
            df_cat[col] = df_cat[col].astype("float32")
        
        # if some some colums are absent in test but present in train, make them all zero 
        cat_cols_ohe = df_cat.columns.to_list()
        for col in self._cat_cols_ohe:
            if col not in cat_cols_ohe:
                df_cat[col] = 0
                df_cat[col] = df_cat[col].astype(np.uint8)
        
        num_cols = [col for col in df.columns if col not in self._cat_cols]
        cols_after = num_cols + df_cat.columns.to_list()
        assert set(cols_after) == set(self._cols_after), "Do not have the same columns as train after transformed"
        
        df_num = df[num_cols]
        
        df = pd.concat([df_num, df_cat], axis="columns")
        # align columns
        df = df[self._cols_after]
        self._features = df.columns.to_list()
        
        if self._to_array:
            return df.values.astype(np.float32)
        else:
            return df


class LabelEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, to_array=False):
        self._to_array = to_array
    
    def fit(self, df_train):
        self._train_cols = df_train.columns.to_list()
        cat_cols = df_train.select_dtypes(["category", "object"]).columns.to_list()
        
        self._cat_col_idx = [i for i, col in enumerate(self._train_cols) if col in cat_cols]
        
        self._label_maps = {}
        self._missing_imputers = {}
        for col in cat_cols:
            label = df_train[col].unique()
            self._label_maps[col] = {c: n for n, c in enumerate(label)}
            
            mode_label = df_train[col].mode().iloc[0]
            self._missing_imputers[col] = self._label_maps[col][mode_label]
        
        #print("Cols to label encode:", list(self._label_maps.keys()))
        return self
    
    def transform(self, df):
        df = df.copy()
        cols = df.columns.to_list()
        assert set(cols) == set(self._train_cols), "do not have the same set of columns as train"
        
        for col, label_map in self._label_maps.items():
            df[col] = df[col].map(label_map).astype(np.float32)
            if df[col].isnull().any():
                df[col] = df[col].astype(np.float32).fillna(self._missing_imputers[col])
        
        # align columns
        df = df[self._train_cols]
        
        self._features = df.columns.to_list()
        if self._to_array:
            return df.values.astype(np.float32)
        else:
            return df
        
    def get_cat_cols(self):
        return self._cat_col_idx

In [5]:
def check_same_cols(df1, df2):
    cols1 = df1.columns
    cols2 = df2.columns
    for c1, c2 in zip(cols1, cols2):
        if c1 != c2:
            print(c1, c2)
    return None

def col_align(df1, df2, to_array=False):
    cols1 = df1.columns.to_list()
    cols2 = df2.columns.to_list()
    assert set(cols1) == set(cols2), "df1 and df2 do not have the same set of columns"
    
    if to_array:
        return df1.values.astype(np.float32), df2[cols1].values.astype(np.float32)
    else:
        return df1, df2[cols1]

In [6]:
# mean average precision at k
def mapk(y, y_prob, k=7):
    y = y[:, np.newaxis]
    # ascending
    y_pred = np.argsort(y_prob, axis=1)
    # descending
    y_pred = y_pred[:, ::-1]
    
    return ml_metrics.mapk(y, y_pred, k=k)

In [7]:
def write_submit(y_prob, target_labels, ncodpers, filepath, k=7):
    # ascending
    y_pred = np.argsort(y_prob, axis=1)
    # descending
    y_pred = y_pred[:, ::-1]
    # cut a k
    y_pred = y_pred[:, :k]
    
    added_prods = target_labels[y_pred]
    added_prods = [" ".join(line) for line in added_prods]
    
    sub_df = pd.DataFrame(ncodpers)
    sub_df["added_products"] = added_prods
    
    sub_df.to_csv(filepath, index=False)
    return None

In [31]:
def whole_to_int(a_dict):
    new_dict = copy.deepcopy(a_dict)
    for k, v in new_dict.items():
        if np.isclose(np.round(v), v):
            new_dict[k] = int(new_dict[k])
    return new_dict


def run_hyperopt(classifier,
                 params_tuned, 
                 X_train, y_train,
                 X_val, y_val,
                 num_eval,
                 metric,
                 params_fixed=None,
                 rstate=None):
    assert metric in ["map7", "acc"]
    
    time_start = time.time()
    if params_fixed is None:
        params_fixed = {"n_jobs": 20, "n_estimators": 100}
    
    def objective_map7(params):
        classifier.set_params(**params_fixed, **params)
        classifier.fit(X_train, y_train)
        
        y_val_prob = classifier.predict_proba(X_val)
        map7 = mapk(y_val, y_val_prob, k=7)
        
        return {"loss": -map7, "status": STATUS_OK}
    
    def objective_acc(params):
        classifier.set_params(**params_fixed, **params)
        classifier.fit(X_train, y_train)
        
        y_val_pred = classifier.predict(X_val)
        acc = accuracy_score(y_val, y_val_pred)
        
        return {"loss": -acc, "status": STATUS_OK}
    
    if metric == "map7":
        print("Use map7")
        objective = objective_map7
    else:
        print("Use acc")
        objective = objective_acc
    
    if rstate is not None:
        rstate = np.random.RandomState(rstate)
        
    trials = Trials()
    best_params = fmin(objective, 
                      params_tuned, 
                      algo=tpe.suggest, 
                      max_evals=num_eval, 
                      trials=trials,
                      rstate=rstate)
    
    best_params = whole_to_int(best_params)
    
    time_end = time.time()
    time_elapse = time_end - time_start
    print("Time elapsed: %0.5f s" % time_elapse)
    
    return trials, best_params

In [9]:
INP_DIR = "data/data1_"
SUB_DIR = "data/submit_"

In [10]:
X_y_2016_04 = load_csv(os.path.join(INP_DIR, "X_y_2016_04.csv"))

X_y_2016_04.shape

  if (await self.run_code(code, result,  async_=asy)):


Memory usage before changing types 92.58 MB
Memory usage after changing types 42.72 MB


(33005, 355)

In [11]:
X_y_2016_05 = load_csv(os.path.join(INP_DIR, "X_y_2016_05.csv"))

X_y_2016_05.shape

Memory usage before changing types 100.60 MB
Memory usage after changing types 46.42 MB


(35865, 355)

In [12]:
X_2016_06 = load_csv(os.path.join(INP_DIR, "X_2016_06.csv"))

X_2016_06.shape

  if (await self.run_code(code, result,  async_=asy)):


Memory usage before changing types 2587.12 MB
Memory usage after changing types 1202.94 MB


(929615, 354)

In [13]:
TARGET_LABELS = np.array(X_y_2016_05["TARGET"].unique())
TARGET_LABELS

array(['ind_tjcr_fin_ult1', 'ind_recibo_ult1', 'ind_nom_pens_ult1',
       'ind_nomina_ult1', 'ind_ctop_fin_ult1', 'ind_cno_fin_ult1',
       'ind_ecue_fin_ult1', 'ind_ctpp_fin_ult1', 'ind_cco_fin_ult1',
       'ind_valo_fin_ult1', 'ind_fond_fin_ult1', 'ind_reca_fin_ult1',
       'ind_plan_fin_ult1', 'ind_ctma_fin_ult1', 'ind_dela_fin_ult1',
       'ind_ctju_fin_ult1'], dtype=object)

In [14]:
LABEL_MAP = {l: n for n, l in enumerate(TARGET_LABELS)}
LABEL_MAP 

{'ind_tjcr_fin_ult1': 0,
 'ind_recibo_ult1': 1,
 'ind_nom_pens_ult1': 2,
 'ind_nomina_ult1': 3,
 'ind_ctop_fin_ult1': 4,
 'ind_cno_fin_ult1': 5,
 'ind_ecue_fin_ult1': 6,
 'ind_ctpp_fin_ult1': 7,
 'ind_cco_fin_ult1': 8,
 'ind_valo_fin_ult1': 9,
 'ind_fond_fin_ult1': 10,
 'ind_reca_fin_ult1': 11,
 'ind_plan_fin_ult1': 12,
 'ind_ctma_fin_ult1': 13,
 'ind_dela_fin_ult1': 14,
 'ind_ctju_fin_ult1': 15}

In [15]:
def target_label_encode(y_labels, label_map):
    y_encoded = y_labels.map(label_map)
    assert y_encoded.isnull().sum() == 0
    return np.array(y_encoded.values)

# Logistic Regression

In [16]:
X_train = X_y_2016_04.drop(["ncodpers", "TARGET"], axis=1)
print("X_train.shape", X_train.shape)
y_train = target_label_encode(X_y_2016_04["TARGET"], LABEL_MAP)
print("y_train.shape", y_train.shape)

X_val = X_y_2016_05.drop(["ncodpers", "TARGET"], axis=1)
print("X_val.shape", X_val.shape)
y_val = target_label_encode(X_y_2016_05["TARGET"], LABEL_MAP)
print("y_val.shape", y_val.shape)


num_imputer = NumImputer()
num_imputer.fit(X_train)
X_train = num_imputer.transform(X_train)
X_val = num_imputer.transform(X_val)


cat_imputer = CatImputer()
cat_imputer.fit(X_train)
X_train = cat_imputer.transform(X_train)
X_val = cat_imputer.transform(X_val)


ohe = OneHotEncoder()
ohe.fit(X_train)
X_train = ohe.transform(X_train)
X_val = ohe.transform(X_val)


le = LabelEncoder()
le.fit(X_train)
X_train = le.transform(X_train)
X_val = le.transform(X_val)


scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_val = scaler.transform(X_val)

X_train.shape, X_val.shape

X_train.shape (33005, 353)
y_train.shape (33005,)
X_val.shape (35865, 353)
y_val.shape (35865,)


((33005, 403), (35865, 403))

In [17]:
lr = LogisticRegression()
lr.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression()

In [18]:
y_train_pred = lr.predict(X_train)
y_train_prob = lr.predict_proba(X_train)

y_val_pred = lr.predict(X_val)
y_val_prob = lr.predict_proba(X_val)

acc_train = accuracy_score(y_train, y_train_pred)
print("Train acc: %0.4f" %acc_train)
acc_val = accuracy_score(y_val, y_val_pred)
print("Validation acc: %0.4f" %acc_val)

map7_train = mapk(y_train, y_train_prob, k=7)
print("Train MAP@7: %0.4f" %map7_train)
map7_val = mapk(y_val, y_val_prob, k=7)
print("Validation MAP@7: %0.4f" %map7_val)

Train acc: 0.6695
Validation acc: 0.6350
Train MAP@7: 0.8154
Validation MAP@7: 0.7968


## Predict for `2016-06`

In [19]:
X_train = X_y_2016_05.drop(["ncodpers", "TARGET"], axis=1)
print("X_train.shape", X_train.shape)
y_train = target_label_encode(X_y_2016_05["TARGET"], LABEL_MAP)
print("y_train.shape", y_train.shape)

X_test = X_2016_06.drop(["ncodpers"], axis=1)
print("X_test.shape", X_test.shape)
ncodpers_test = X_2016_06["ncodpers"]


num_imputer = NumImputer()
num_imputer.fit(X_train)
X_train = num_imputer.transform(X_train)
X_test = num_imputer.transform(X_test)


cat_imputer = CatImputer()
cat_imputer.fit(X_train)
X_train = cat_imputer.transform(X_train)
X_test = cat_imputer.transform(X_test)


ohe = OneHotEncoder()
ohe.fit(X_train)
X_train = ohe.transform(X_train)
X_test = ohe.transform(X_test)


le = LabelEncoder()
le.fit(X_train)
X_train = le.transform(X_train)
X_test = le.transform(X_test)


scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

X_train.shape, X_test.shape

X_train.shape (35865, 353)
y_train.shape (35865,)
X_test.shape (929615, 353)


((35865, 403), (929615, 403))

In [21]:
lr = LogisticRegression()
lr.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression()

In [22]:
y_train_pred = lr.predict(X_train)
y_train_prob = lr.predict_proba(X_train)

y_test_pred = lr.predict(X_test)
y_test_prob = lr.predict_proba(X_test)

acc_train = accuracy_score(y_train, y_train_pred)
print("Train acc: %0.4f" %acc_train)

map7_train = mapk(y_train, y_train_prob, k=7)
print("Train MAP@7: %0.4f" %map7_train)

Train acc: 0.6527
Train MAP@7: 0.8107


In [26]:
# submiting this gives MAP@7 = 0.02400 for public and MAP@7 = 0.02438 for private score.
write_submit(y_test_prob, TARGET_LABELS, ncodpers_test, 
             os.path.join(SUB_DIR, "lr_d1_2016_05.csv"), k=7)

# Random Forest

## Optimizer validation MAP@7

In [28]:
X_train = X_y_2016_04.drop(["ncodpers", "TARGET"], axis=1)
print("X_train.shape", X_train.shape)
y_train = target_label_encode(X_y_2016_04["TARGET"], LABEL_MAP)
print("y_train.shape", y_train.shape)

X_val = X_y_2016_05.drop(["ncodpers", "TARGET"], axis=1)
print("X_val.shape", X_val.shape)
y_val = target_label_encode(X_y_2016_05["TARGET"], LABEL_MAP)
print("y_val.shape", y_val.shape)


num_imputer = NumImputer()
num_imputer.fit(X_train)
X_train = num_imputer.transform(X_train)
X_val = num_imputer.transform(X_val)


cat_imputer = CatImputer()
cat_imputer.fit(X_train)
X_train = cat_imputer.transform(X_train)
X_val = cat_imputer.transform(X_val)


ohe = OneHotEncoder()
ohe.fit(X_train)
X_train = ohe.transform(X_train)
X_val = ohe.transform(X_val)


le = LabelEncoder(to_array=True)
le.fit(X_train)
X_train = le.transform(X_train)
X_val = le.transform(X_val)

X_train.shape, X_val.shape

X_train.shape (33005, 353)
y_train.shape (33005,)
X_val.shape (35865, 353)
y_val.shape (35865,)


((33005, 403), (35865, 403))

In [32]:
params = {
    "max_depth": scope.int(hp.quniform("max_depth", 2, 30, 2)),
    "min_samples_split": scope.int(hp.quniform("min_samples_split", 10, 500, 20)),
    "min_samples_leaf": scope.int(hp.quniform("min_samples_leaf", 5, 200, 10)), 
    "max_features": scope.int(hp.quniform("max_features", 10, 400, 10)),
}

params_fixed = {
    "n_jobs": 20,
    "n_estimators": 500
}


num_eval = 100
metric = "map7"
rf = RandomForestClassifier()

trials, best_params = run_hyperopt(rf, params, 
                                   X_train, y_train, X_val, y_val, 
                                   num_eval, metric,
                                   params_fixed=params_fixed)
best_params

Use map7
100%|██████████| 100/100 [38:18<00:00, 22.99s/trial, best loss: -0.793821406995811]
Time elapsed: 2298.85239 s


{'max_depth': 20,
 'max_features': 170,
 'min_samples_leaf': 10,
 'min_samples_split': 80}

In [33]:
rf = RandomForestClassifier(**params_fixed, **best_params)
rf.fit(X_train, y_train)

y_train_pred = rf.predict(X_train)
y_train_prob = rf.predict_proba(X_train)

y_val_pred = rf.predict(X_val)
y_val_prob = rf.predict_proba(X_val)

acc_train = accuracy_score(y_train, y_train_pred)
print("Train acc: %0.4f" %acc_train)
acc_val = accuracy_score(y_val, y_val_pred)
print("Validation acc: %0.4f" %acc_val)

map7_train = mapk(y_train, y_train_prob, k=7)
print("Train MAP@7: %0.4f" %map7_train)
map7_val = mapk(y_val, y_val_prob, k=7)
print("Validation MAP@7: %0.4f" %map7_val)

Train acc: 0.6661
Validation acc: 0.6337
Train MAP@7: 0.8134
Validation MAP@7: 0.7940


### Predict for `2016-06`

In [35]:
X_train = X_y_2016_05.drop(["ncodpers", "TARGET"], axis=1)
print("X_train.shape", X_train.shape)
y_train = target_label_encode(X_y_2016_05["TARGET"], LABEL_MAP)
print("y_train.shape", y_train.shape)

X_test = X_2016_06.drop(["ncodpers"], axis=1)
print("X_test.shape", X_test.shape)
ncodpers_test = X_2016_06["ncodpers"]


num_imputer = NumImputer()
num_imputer.fit(X_train)
X_train = num_imputer.transform(X_train)
X_test = num_imputer.transform(X_test)


cat_imputer = CatImputer()
cat_imputer.fit(X_train)
X_train = cat_imputer.transform(X_train)
X_test = cat_imputer.transform(X_test)


ohe = OneHotEncoder()
ohe.fit(X_train)
X_train = ohe.transform(X_train)
X_test = ohe.transform(X_test)


le = LabelEncoder(to_array=True)
le.fit(X_train)
X_train = le.transform(X_train)
X_test = le.transform(X_test)


X_train.shape, X_test.shape

X_train.shape (35865, 353)
y_train.shape (35865,)
X_test.shape (929615, 353)


((35865, 403), (929615, 403))

In [36]:
rf = RandomForestClassifier(**params_fixed, **best_params)
rf.fit(X_train, y_train)

RandomForestClassifier(max_depth=20, max_features=170, min_samples_leaf=10,
                       min_samples_split=80, n_estimators=500, n_jobs=20)

In [37]:
y_train_pred = rf.predict(X_train)
y_train_prob = rf.predict_proba(X_train)

y_test_pred = rf.predict(X_test)
y_test_prob = rf.predict_proba(X_test)

acc_train = accuracy_score(y_train, y_train_pred)
print("Train acc: %0.4f" %acc_train)

map7_train = mapk(y_train, y_train_prob, k=7)
print("Train MAP@7: %0.4f" %map7_train)

Train acc: 0.6490
Train MAP@7: 0.8092


In [39]:
# submiting this gives MAP@7 = 0.02418 for public and MAP@7 = 0.02447 for private score.
write_submit(y_test_prob, TARGET_LABELS, ncodpers_test, 
             os.path.join(SUB_DIR, "rf_d1_2016_05_map7.csv"), k=7)

## Optimizer validation Accuracy

In [40]:
X_train = X_y_2016_04.drop(["ncodpers", "TARGET"], axis=1)
print("X_train.shape", X_train.shape)
y_train = target_label_encode(X_y_2016_04["TARGET"], LABEL_MAP)
print("y_train.shape", y_train.shape)

X_val = X_y_2016_05.drop(["ncodpers", "TARGET"], axis=1)
print("X_val.shape", X_val.shape)
y_val = target_label_encode(X_y_2016_05["TARGET"], LABEL_MAP)
print("y_val.shape", y_val.shape)


num_imputer = NumImputer()
num_imputer.fit(X_train)
X_train = num_imputer.transform(X_train)
X_val = num_imputer.transform(X_val)


cat_imputer = CatImputer()
cat_imputer.fit(X_train)
X_train = cat_imputer.transform(X_train)
X_val = cat_imputer.transform(X_val)


ohe = OneHotEncoder()
ohe.fit(X_train)
X_train = ohe.transform(X_train)
X_val = ohe.transform(X_val)


le = LabelEncoder(to_array=True)
le.fit(X_train)
X_train = le.transform(X_train)
X_val = le.transform(X_val)

X_train.shape, X_val.shape

X_train.shape (33005, 353)
y_train.shape (33005,)
X_val.shape (35865, 353)
y_val.shape (35865,)


((33005, 403), (35865, 403))

In [41]:
params = {
    "max_depth": scope.int(hp.quniform("max_depth", 2, 30, 2)),
    "min_samples_split": scope.int(hp.quniform("min_samples_split", 10, 500, 20)),
    "min_samples_leaf": scope.int(hp.quniform("min_samples_leaf", 5, 200, 10)), 
    "max_features": scope.int(hp.quniform("max_features", 10, 400, 10)),
}

params_fixed = {
    "n_jobs": 20,
    "n_estimators": 500
}


num_eval = 100
metric = "acc"
rf = RandomForestClassifier()

trials, best_params = run_hyperopt(rf, params, 
                                   X_train, y_train, X_val, y_val, 
                                   num_eval, metric,
                                   params_fixed=params_fixed)
best_params

Use acc
100%|██████████| 100/100 [44:43<00:00, 26.84s/trial, best loss: -0.633096333472745]
Time elapsed: 2683.95816 s


{'max_depth': 24,
 'max_features': 300,
 'min_samples_leaf': 10,
 'min_samples_split': 40}

In [42]:
rf = RandomForestClassifier(**params_fixed, **best_params)
rf.fit(X_train, y_train)

y_train_pred = rf.predict(X_train)
y_train_prob = rf.predict_proba(X_train)

y_val_pred = rf.predict(X_val)
y_val_prob = rf.predict_proba(X_val)

acc_train = accuracy_score(y_train, y_train_pred)
print("Train acc: %0.4f" %acc_train)
acc_val = accuracy_score(y_val, y_val_pred)
print("Validation acc: %0.4f" %acc_val)

map7_train = mapk(y_train, y_train_prob, k=7)
print("Train MAP@7: %0.4f" %map7_train)
map7_val = mapk(y_val, y_val_prob, k=7)
print("Validation MAP@7: %0.4f" %map7_val)

Train acc: 0.6764
Validation acc: 0.6326
Train MAP@7: 0.8222
Validation MAP@7: 0.7927


### Predict for `2016-06`

In [43]:
X_train = X_y_2016_05.drop(["ncodpers", "TARGET"], axis=1)
print("X_train.shape", X_train.shape)
y_train = target_label_encode(X_y_2016_05["TARGET"], LABEL_MAP)
print("y_train.shape", y_train.shape)

X_test = X_2016_06.drop(["ncodpers"], axis=1)
print("X_test.shape", X_test.shape)
ncodpers_test = X_2016_06["ncodpers"]


num_imputer = NumImputer()
num_imputer.fit(X_train)
X_train = num_imputer.transform(X_train)
X_test = num_imputer.transform(X_test)


cat_imputer = CatImputer()
cat_imputer.fit(X_train)
X_train = cat_imputer.transform(X_train)
X_test = cat_imputer.transform(X_test)


ohe = OneHotEncoder()
ohe.fit(X_train)
X_train = ohe.transform(X_train)
X_test = ohe.transform(X_test)


le = LabelEncoder(to_array=True)
le.fit(X_train)
X_train = le.transform(X_train)
X_test = le.transform(X_test)


X_train.shape, X_test.shape

X_train.shape (35865, 353)
y_train.shape (35865,)
X_test.shape (929615, 353)


((35865, 403), (929615, 403))

In [44]:
rf = RandomForestClassifier(**params_fixed, **best_params)
rf.fit(X_train, y_train)


y_train_pred = rf.predict(X_train)
y_train_prob = rf.predict_proba(X_train)

y_test_pred = rf.predict(X_test)
y_test_prob = rf.predict_proba(X_test)

acc_train = accuracy_score(y_train, y_train_pred)
print("Train acc: %0.4f" %acc_train)

map7_train = mapk(y_train, y_train_prob, k=7)
print("Train MAP@7: %0.4f" %map7_train)


# submiting this gives MAP@7 = 0.02459 for public and MAP@7 = 0.02428 for private score.
write_submit(y_test_prob, TARGET_LABELS, ncodpers_test, 
             os.path.join(SUB_DIR, "rf_d1_2016_05_acc.csv"), k=7)

Train acc: 0.6565
Train MAP@7: 0.8161


# XGBoost

## Optimize validation MAP@7

In [45]:
X_train = X_y_2016_04.drop(["ncodpers", "TARGET"], axis=1)
print("X_train.shape", X_train.shape)
y_train = target_label_encode(X_y_2016_04["TARGET"], LABEL_MAP)
print("y_train.shape", y_train.shape)

X_val = X_y_2016_05.drop(["ncodpers", "TARGET"], axis=1)
print("X_val.shape", X_val.shape)
y_val = target_label_encode(X_y_2016_05["TARGET"], LABEL_MAP)
print("y_val.shape", y_val.shape)


num_imputer = NumImputer()
num_imputer.fit(X_train)
X_train = num_imputer.transform(X_train)
X_val = num_imputer.transform(X_val)


cat_imputer = CatImputer()
cat_imputer.fit(X_train)
X_train = cat_imputer.transform(X_train)
X_val = cat_imputer.transform(X_val)


ohe = OneHotEncoder()
ohe.fit(X_train)
X_train = ohe.transform(X_train)
X_val = ohe.transform(X_val)


le = LabelEncoder(to_array=True)
le.fit(X_train)
X_train = le.transform(X_train)
X_val = le.transform(X_val)

X_train.shape, X_val.shape

X_train.shape (33005, 353)
y_train.shape (33005,)
X_val.shape (35865, 353)
y_val.shape (35865,)


((33005, 403), (35865, 403))

In [46]:
params = {
    "max_depth": scope.int(hp.quniform("max_depth", 2, 10, 1)),
    "min_child_weight": scope.int(hp.quniform("min_child_weight", 1, 14, 1)), 
    "subsample": hp.uniform("subsample", 0.4, 1.0),
    "colsample_bytree": hp.uniform("colsample_bytree", 0.4, 1.0),
    "reg_lambda": hp.loguniform("reg_lambda", np.log(0.00001), np.log(100)),
    #"reg_alpha": hp.loguniform("reg_alpha", np.log(0.001), np.log(1000)),
    "learning_rate": hp.loguniform("learning_rate", np.log(0.001), np.log(1.)),
    #"gamma": hp.uniform("gamma", 0., 5.),
}

params_fixed = {
    "objective": "multi:softmax",
    "booster": "gbtree",
    "tree_method": "gpu_hist",
    "predictor": "gpu_predictor",
    "n_estimators": 500
}

num_eval = 200
metric = "map7"
xgb = XGBClassifier()
trials, best_params = run_hyperopt(xgb, params, 
                                   X_train, y_train, X_val, y_val, 
                                   num_eval, metric,
                                   params_fixed=params_fixed)
best_params

Use map7
100%|██████████| 200/200 [5:41:22<00:00, 102.41s/trial, best loss: -0.8054192640390884]  
Time elapsed: 20482.07485 s


{'colsample_bytree': 0.4773572245198071,
 'learning_rate': 0.03713275404431288,
 'max_depth': 4,
 'min_child_weight': 13,
 'reg_lambda': 0.0005825016743861352,
 'subsample': 0.8192814156358021}

In [None]:
{'colsample_bytree': 0.4773572245198071,
 'learning_rate': 0.03713275404431288,
 'max_depth': 4,
 'min_child_weight': 13,
 'reg_lambda': 0.0005825016743861352,
 'subsample': 0.8192814156358021}

In [47]:
xgb = XGBClassifier(**params_fixed, **best_params)
xgb.fit(X_train, y_train)

y_train_pred = xgb.predict(X_train)
y_train_prob = xgb.predict_proba(X_train)

y_val_pred = xgb.predict(X_val)
y_val_prob = xgb.predict_proba(X_val)

acc_train = accuracy_score(y_train, y_train_pred)
print("Train acc: %0.4f" %acc_train)
acc_val = accuracy_score(y_val, y_val_pred)
print("Validation acc: %0.4f" %acc_val)

map7_train = mapk(y_train, y_train_prob, k=7)
print("Train MAP@7: %0.4f" %map7_train)
map7_val = mapk(y_val, y_val_prob, k=7)
print("Validation MAP@7: %0.4f" %map7_val)

Train acc: 0.6849
Validation acc: 0.6495
Train MAP@7: 0.8245
Validation MAP@7: 0.8054


### Predict for `2016-06`

In [48]:
X_train = X_y_2016_05.drop(["ncodpers", "TARGET"], axis=1)
print("X_train.shape", X_train.shape)
y_train = target_label_encode(X_y_2016_05["TARGET"], LABEL_MAP)
print("y_train.shape", y_train.shape)

X_test = X_2016_06.drop(["ncodpers"], axis=1)
print("X_test.shape", X_test.shape)
ncodpers_test = X_2016_06["ncodpers"]


num_imputer = NumImputer()
num_imputer.fit(X_train)
X_train = num_imputer.transform(X_train)
X_test = num_imputer.transform(X_test)


cat_imputer = CatImputer()
cat_imputer.fit(X_train)
X_train = cat_imputer.transform(X_train)
X_test = cat_imputer.transform(X_test)


ohe = OneHotEncoder()
ohe.fit(X_train)
X_train = ohe.transform(X_train)
X_test = ohe.transform(X_test)


le = LabelEncoder(to_array=True)
le.fit(X_train)
X_train = le.transform(X_train)
X_test = le.transform(X_test)


X_train.shape, X_test.shape

X_train.shape (35865, 353)
y_train.shape (35865,)
X_test.shape (929615, 353)


((35865, 403), (929615, 403))

In [50]:
xgb = XGBClassifier(**params_fixed, **best_params)
xgb.fit(X_train, y_train)

y_train_pred = xgb.predict(X_train)
y_train_prob = xgb.predict_proba(X_train)

y_test_pred = xgb.predict(X_test)
y_test_prob = xgb.predict_proba(X_test)

acc_train = accuracy_score(y_train, y_train_pred)
print("Train acc: %0.4f" %acc_train)

map7_train = mapk(y_train, y_train_prob, k=7)
print("Train MAP@7: %0.4f" %map7_train)


# submiting this gives MAP@7 = 0.02520 for public and MAP@7 = 0.02556 for private score.
write_submit(y_test_prob, TARGET_LABELS, ncodpers_test, 
             os.path.join(SUB_DIR, "xgb_d1_2016_05_map7.csv"), k=7)

Train acc: 0.6654
Train MAP@7: 0.8180


## Optimize validation Accuracy

In [51]:
X_train = X_y_2016_04.drop(["ncodpers", "TARGET"], axis=1)
print("X_train.shape", X_train.shape)
y_train = target_label_encode(X_y_2016_04["TARGET"], LABEL_MAP)
print("y_train.shape", y_train.shape)

X_val = X_y_2016_05.drop(["ncodpers", "TARGET"], axis=1)
print("X_val.shape", X_val.shape)
y_val = target_label_encode(X_y_2016_05["TARGET"], LABEL_MAP)
print("y_val.shape", y_val.shape)


num_imputer = NumImputer()
num_imputer.fit(X_train)
X_train = num_imputer.transform(X_train)
X_val = num_imputer.transform(X_val)


cat_imputer = CatImputer()
cat_imputer.fit(X_train)
X_train = cat_imputer.transform(X_train)
X_val = cat_imputer.transform(X_val)


ohe = OneHotEncoder()
ohe.fit(X_train)
X_train = ohe.transform(X_train)
X_val = ohe.transform(X_val)


le = LabelEncoder(to_array=True)
le.fit(X_train)
X_train = le.transform(X_train)
X_val = le.transform(X_val)

X_train.shape, X_val.shape

X_train.shape (33005, 353)
y_train.shape (33005,)
X_val.shape (35865, 353)
y_val.shape (35865,)


((33005, 403), (35865, 403))

In [52]:
params = {
    "max_depth": scope.int(hp.quniform("max_depth", 2, 10, 1)),
    "min_child_weight": scope.int(hp.quniform("min_child_weight", 1, 14, 1)), 
    "subsample": hp.uniform("subsample", 0.4, 1.0),
    "colsample_bytree": hp.uniform("colsample_bytree", 0.4, 1.0),
    "reg_lambda": hp.loguniform("reg_lambda", np.log(0.00001), np.log(100)),
    #"reg_alpha": hp.loguniform("reg_alpha", np.log(0.001), np.log(1000)),
    "learning_rate": hp.loguniform("learning_rate", np.log(0.001), np.log(1.)),
    #"gamma": hp.uniform("gamma", 0., 5.),
}

params_fixed = {
    "objective": "multi:softmax",
    "booster": "gbtree",
    "tree_method": "gpu_hist",
    "predictor": "gpu_predictor",
    "n_estimators": 500
}

num_eval = 200
metric = "acc"
xgb = XGBClassifier()
trials, best_params = run_hyperopt(xgb, params, 
                                   X_train, y_train, X_val, y_val, 
                                   num_eval, metric,
                                   params_fixed=params_fixed)
best_params

Use acc
100%|██████████| 200/200 [8:24:05<00:00, 151.23s/trial, best loss: -0.6502997351178029]  
Time elapsed: 30245.36318 s


{'colsample_bytree': 0.848734212760469,
 'learning_rate': 0.017592291895543413,
 'max_depth': 7,
 'min_child_weight': 4,
 'reg_lambda': 0.01604385893880587,
 'subsample': 0.4298776738034783}

In [54]:
{'colsample_bytree': 0.848734212760469,
 'learning_rate': 0.017592291895543413,
 'max_depth': 7,
 'min_child_weight': 4,
 'reg_lambda': 0.01604385893880587,
 'subsample': 0.4298776738034783}

{'colsample_bytree': 0.848734212760469,
 'learning_rate': 0.017592291895543413,
 'max_depth': 7,
 'min_child_weight': 4,
 'reg_lambda': 0.01604385893880587,
 'subsample': 0.4298776738034783}

In [53]:
xgb = XGBClassifier(**params_fixed, **best_params)
xgb.fit(X_train, y_train)

y_train_pred = xgb.predict(X_train)
y_train_prob = xgb.predict_proba(X_train)

y_val_pred = xgb.predict(X_val)
y_val_prob = xgb.predict_proba(X_val)

acc_train = accuracy_score(y_train, y_train_pred)
print("Train acc: %0.4f" %acc_train)
acc_val = accuracy_score(y_val, y_val_pred)
print("Validation acc: %0.4f" %acc_val)

map7_train = mapk(y_train, y_train_prob, k=7)
print("Train MAP@7: %0.4f" %map7_train)
map7_val = mapk(y_val, y_val_prob, k=7)
print("Validation MAP@7: %0.4f" %map7_val)

Train acc: 0.6977
Validation acc: 0.6503
Train MAP@7: 0.8334
Validation MAP@7: 0.8051


### Predict for `2016-06`

In [55]:
X_train = X_y_2016_05.drop(["ncodpers", "TARGET"], axis=1)
print("X_train.shape", X_train.shape)
y_train = target_label_encode(X_y_2016_05["TARGET"], LABEL_MAP)
print("y_train.shape", y_train.shape)

X_test = X_2016_06.drop(["ncodpers"], axis=1)
print("X_test.shape", X_test.shape)
ncodpers_test = X_2016_06["ncodpers"]


num_imputer = NumImputer()
num_imputer.fit(X_train)
X_train = num_imputer.transform(X_train)
X_test = num_imputer.transform(X_test)


cat_imputer = CatImputer()
cat_imputer.fit(X_train)
X_train = cat_imputer.transform(X_train)
X_test = cat_imputer.transform(X_test)


ohe = OneHotEncoder()
ohe.fit(X_train)
X_train = ohe.transform(X_train)
X_test = ohe.transform(X_test)


le = LabelEncoder(to_array=True)
le.fit(X_train)
X_train = le.transform(X_train)
X_test = le.transform(X_test)


X_train.shape, X_test.shape

X_train.shape (35865, 353)
y_train.shape (35865,)
X_test.shape (929615, 353)


((35865, 403), (929615, 403))

In [57]:
xgb = XGBClassifier(**params_fixed, **best_params)
xgb.fit(X_train, y_train)

y_train_pred = xgb.predict(X_train)
y_train_prob = xgb.predict_proba(X_train)

y_test_pred = xgb.predict(X_test)
y_test_prob = xgb.predict_proba(X_test)

acc_train = accuracy_score(y_train, y_train_pred)
print("Train acc: %0.4f" %acc_train)

map7_train = mapk(y_train, y_train_prob, k=7)
print("Train MAP@7: %0.4f" %map7_train)


# submit this gives MAP@7 = 0.02522 for public and MAP@7 = 0.02560 for private score.
write_submit(y_test_prob, TARGET_LABELS, ncodpers_test,
             os.path.join(SUB_DIR, "xgb_d1_2016_05_acc.csv"), k=7)

Train acc: 0.6748
Train MAP@7: 0.8238


# LightGBM

In [58]:
X_train = X_y_2016_04.drop(["ncodpers", "TARGET"], axis=1)
print("X_train.shape", X_train.shape)
y_train = target_label_encode(X_y_2016_04["TARGET"], LABEL_MAP)
print("y_train.shape", y_train.shape)

X_val = X_y_2016_05.drop(["ncodpers", "TARGET"], axis=1)
print("X_val.shape", X_val.shape)
y_val = target_label_encode(X_y_2016_05["TARGET"], LABEL_MAP)
print("y_val.shape", y_val.shape)


num_imputer = NumImputer()
num_imputer.fit(X_train)
X_train = num_imputer.transform(X_train)
X_val = num_imputer.transform(X_val)


cat_imputer = CatImputer()
cat_imputer.fit(X_train)
X_train = cat_imputer.transform(X_train)
X_val = cat_imputer.transform(X_val)


ohe = OneHotEncoder()
ohe.fit(X_train)
X_train = ohe.transform(X_train)
X_val = ohe.transform(X_val)


le = LabelEncoder(to_array=True)
le.fit(X_train)
X_train = le.transform(X_train)
X_val = le.transform(X_val)

cat_col_idx = le.get_cat_cols()

X_train.shape, X_val.shape

X_train.shape (33005, 353)
y_train.shape (33005,)
X_val.shape (35865, 353)
y_val.shape (35865,)


((33005, 403), (35865, 403))

In [62]:
params = {
    "max_depth": scope.int(hp.quniform("max_depth", 2, 10, 1)),
    "num_leaves": scope.int(hp.quniform("num_leaves", 10, 300, 5)),
    "min_child_samples": scope.int(hp.quniform("min_child_samples", 10, 200, 10)), 
    "subsample": hp.uniform("subsample", 0.4, 1.0),
    "colsample_bytree": hp.uniform("colsample_bytree", 0.4, 1.0),
    "reg_lambda": hp.loguniform("reg_lambda", np.log(0.0001), np.log(1000)),
    #"reg_alpha": hp.loguniform("reg_alpha", np.log(0.001), np.log(1000)),
    "learning_rate": hp.loguniform("learning_rate", np.log(0.005), np.log(1)),
}

# categorical_feature
params_fixed = {
    "objective": "multiclass",
    "boosting_type": "gbdt",
    "device": "gpu" ,
    "n_estimators": 500,
    "categorical_feature": cat_col_idx
}

num_eval = 200

metric = "map7"
lgbm = LGBMClassifier()
trials, best_params = run_hyperopt(lgbm, params, 
                                   X_train, y_train, X_val, y_val, 
                                   num_eval, metric,
                                   params_fixed=params_fixed)
best_params

Use map7
100%|██████████| 200/200 [2:35:26<00:00, 46.63s/trial, best loss: -0.8052270750765105]  
Time elapsed: 9326.61213 s


{'colsample_bytree': 0.6007753108549476,
 'learning_rate': 0.016213998507390654,
 'max_depth': 5,
 'min_child_samples': 60,
 'num_leaves': 175,
 'reg_lambda': 0.06376226621032771,
 'subsample': 0.483251255994689}

In [63]:
lgbm = LGBMClassifier(**params_fixed, **best_params)
lgbm.fit(X_train, y_train)

y_train_pred = lgbm.predict(X_train)
y_train_prob = lgbm.predict_proba(X_train)

y_val_pred = lgbm.predict(X_val)
y_val_prob = lgbm.predict_proba(X_val)

acc_train = accuracy_score(y_train, y_train_pred)
print("Train acc: %0.4f" %acc_train)
acc_val = accuracy_score(y_val, y_val_pred)
print("Validation acc: %0.4f" %acc_val)

map7_train = mapk(y_train, y_train_prob, k=7)
print("Train MAP@7: %0.4f" %map7_train)
map7_val = mapk(y_val, y_val_prob, k=7)
print("Validation MAP@7: %0.4f" %map7_val)

Train acc: 0.6945
Validation acc: 0.6488
Train MAP@7: 0.8325
Validation MAP@7: 0.8051


### Predict for `2016-06`

In [64]:
X_train = X_y_2016_05.drop(["ncodpers", "TARGET"], axis=1)
print("X_train.shape", X_train.shape)
y_train = target_label_encode(X_y_2016_05["TARGET"], LABEL_MAP)
print("y_train.shape", y_train.shape)

X_test = X_2016_06.drop(["ncodpers"], axis=1)
print("X_test.shape", X_test.shape)
ncodpers_test = X_2016_06["ncodpers"]


num_imputer = NumImputer()
num_imputer.fit(X_train)
X_train = num_imputer.transform(X_train)
X_test = num_imputer.transform(X_test)


cat_imputer = CatImputer()
cat_imputer.fit(X_train)
X_train = cat_imputer.transform(X_train)
X_test = cat_imputer.transform(X_test)


ohe = OneHotEncoder()
ohe.fit(X_train)
X_train = ohe.transform(X_train)
X_test = ohe.transform(X_test)


le = LabelEncoder(to_array=True)
le.fit(X_train)
X_train = le.transform(X_train)
X_test = le.transform(X_test)

cat_col_idx = le.get_cat_cols()

X_train.shape, X_test.shape

X_train.shape (35865, 353)
y_train.shape (35865,)
X_test.shape (929615, 353)


((35865, 403), (929615, 403))

In [65]:
lgbm = LGBMClassifier(**params_fixed, **best_params)
lgbm.fit(X_train, y_train)

y_train_pred = lgbm.predict(X_train)
y_train_prob = lgbm.predict_proba(X_train)

y_test_pred = lgbm.predict(X_test)
y_test_prob = lgbm.predict_proba(X_test)

acc_train = accuracy_score(y_train, y_train_pred)
print("Train acc: %0.4f" %acc_train)

map7_train = mapk(y_train, y_train_prob, k=7)
print("Train MAP@7: %0.4f" %map7_train)


# submit this gives MAP@7 = 0.02520 for public and MAP@7 = 0.02555 for private score.
write_submit(y_test_prob, TARGET_LABELS, ncodpers_test,
             os.path.join(SUB_DIR, "lgbm_d1_2016_05_map7.csv"), k=7)

Train acc: 0.6749
Train MAP@7: 0.8256
