In this notebook we use data at `2016-05` to predict product purchase for `2016-06`.

The best MAP@7 in private leader board is 0.03140. The worst is 0.00448.

In [1]:
import os
import copy
import time
import pandas as pd
import numpy as np


from sklearn.base import BaseEstimator, TransformerMixin

from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score

import ml_metrics

from xgboost import XGBClassifier

from lightgbm import LGBMClassifier

from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope

In [2]:
INP_DIR = "data/data1_"
SUB_DIR = "data/submit_"

In [3]:
def change_dtype_ser(ser):
    
    if ser.dtype == int:
        return ser.astype(np.int32)
    
    if ser.dtype == float:
        return ser.astype(np.float32)
    
    if ser.dtype == np.object:
        return ser.astype("category")
    
    return ser
    

def change_dtype_df(df):
    """
    change types of columns to reduce memory size
    :param df: dataframe
    :return df: dataframe
    """
    memory = df.memory_usage().sum() / 10**6
    print("Memory usage before changing types %0.2f MB" % memory)

    for col in df.columns:
        df[col] = change_dtype_ser(df[col])

    memory = df.memory_usage().sum() / 10 ** 6
    print("Memory usage after changing types %0.2f MB" % memory)
    return df


def load_csv(filename):
    df = pd.read_csv(filename)
    df = change_dtype_df(df)
    return df

In [4]:
X_2016_04 = load_csv(os.path.join(INP_DIR, "X_2016_04.csv"))
y_2016_04 = load_csv(os.path.join(INP_DIR, "y_2016_04.csv"))

X_2016_04 = X_2016_04.drop(["ncodpers"], axis=1)
y_2016_04 = y_2016_04.drop(["ncodpers"], axis=1)

X_2016_04.shape, y_2016_04.shape

  if (await self.run_code(code, result,  async_=asy)):


Memory usage before changing types 87.79 MB
Memory usage after changing types 38.67 MB
Memory usage before changing types 5.36 MB
Memory usage after changing types 2.68 MB


((26791, 413), (26791, 24))

In [5]:
X_2016_04.head()

Unnamed: 0,pais_residencia,sexo,age,fecha_alta,ind_nuevo,antiguedad,indrel,indrel_1mes,tiprel_1mes,indresi,...,ind_recibo_ult1_LAG12,fecha_alta_LAG12,ind_nuevo_LAG12,antiguedad_LAG12,indrel_LAG12,tiprel_1mes_LAG12,ind_actividad_cliente_LAG12,renta_LAG12,segmento_LAG12,TOTAL_PRODS_LAG12
0,ES,V,63.0,-7773,False,255.0,A,1,A,True,...,0.0,-7407.0,False,246.0,A,A,True,42831.691406,02 - PARTICULARES,1.0
1,ES,V,56.0,-7773,False,253.0,A,1,A,True,...,1.0,-7407.0,False,244.0,A,A,True,128376.242188,02 - PARTICULARES,0.0
2,ES,V,61.0,-7773,False,255.0,A,1,A,True,...,1.0,-7407.0,False,246.0,A,A,True,141979.265625,02 - PARTICULARES,0.0
3,ES,H,52.0,-7773,False,255.0,A,1,A,True,...,1.0,-7407.0,False,246.0,A,A,True,44353.171875,01 - TOP,1.0
4,ES,H,56.0,-7773,False,255.0,A,1,A,True,...,1.0,-7407.0,False,246.0,A,A,True,771167.1875,02 - PARTICULARES,0.0


In [6]:
y_2016_04.head()

Unnamed: 0,ind_recibo_ult1,ind_nom_pens_ult1,ind_nomina_ult1,ind_cco_fin_ult1,ind_tjcr_fin_ult1,ind_cno_fin_ult1,ind_ecue_fin_ult1,ind_dela_fin_ult1,ind_reca_fin_ult1,ind_ctma_fin_ult1,...,ind_ctpp_fin_ult1,ind_plan_fin_ult1,ind_ctju_fin_ult1,ind_deme_fin_ult1,ind_pres_fin_ult1,ind_cder_fin_ult1,ind_hip_fin_ult1,ind_viv_fin_ult1,ind_aval_fin_ult1,ind_ahor_fin_ult1
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
X_2016_05 = load_csv(os.path.join(INP_DIR, "X_2016_05.csv"))
y_2016_05 = load_csv(os.path.join(INP_DIR, "y_2016_05.csv"))

X_2016_05 = X_2016_05.drop(["ncodpers"], axis=1)
y_2016_05 = y_2016_05.drop(["ncodpers"], axis=1)

X_2016_05.shape, y_2016_05.shape

  if (await self.run_code(code, result,  async_=asy)):


Memory usage before changing types 91.48 MB
Memory usage after changing types 40.30 MB
Memory usage before changing types 5.58 MB
Memory usage after changing types 2.79 MB


((27916, 413), (27916, 24))

In [8]:
X_2016_05.head()

Unnamed: 0,pais_residencia,sexo,age,fecha_alta,ind_nuevo,antiguedad,indrel,indrel_1mes,tiprel_1mes,indresi,...,ind_recibo_ult1_LAG12,fecha_alta_LAG12,ind_nuevo_LAG12,antiguedad_LAG12,indrel_LAG12,tiprel_1mes_LAG12,ind_actividad_cliente_LAG12,renta_LAG12,segmento_LAG12,TOTAL_PRODS_LAG12
0,ES,V,56.0,-7803,False,255.0,A,1,A,True,...,0.0,-7437.0,False,245.0,A,A,True,326124.90625,01 - TOP,1.0
1,ES,H,71.0,-7803,False,256.0,A,1,A,True,...,1.0,-7437.0,False,246.0,A,A,True,289565.0625,02 - PARTICULARES,1.0
2,ES,V,46.0,-7803,False,256.0,A,1,A,True,...,1.0,-7437.0,False,246.0,A,A,True,297068.125,02 - PARTICULARES,0.0
3,ES,V,63.0,-7803,False,256.0,A,1,A,True,...,0.0,-7437.0,False,246.0,A,A,True,353419.375,02 - PARTICULARES,0.0
4,ES,V,55.0,-7803,False,250.0,A,1,A,True,...,1.0,-7437.0,False,240.0,A,A,True,139070.96875,01 - TOP,0.0


In [9]:
y_2016_05.head()

Unnamed: 0,ind_recibo_ult1,ind_nom_pens_ult1,ind_nomina_ult1,ind_cco_fin_ult1,ind_tjcr_fin_ult1,ind_cno_fin_ult1,ind_ecue_fin_ult1,ind_dela_fin_ult1,ind_reca_fin_ult1,ind_ctma_fin_ult1,...,ind_ctpp_fin_ult1,ind_plan_fin_ult1,ind_ctju_fin_ult1,ind_deme_fin_ult1,ind_pres_fin_ult1,ind_cder_fin_ult1,ind_hip_fin_ult1,ind_viv_fin_ult1,ind_aval_fin_ult1,ind_ahor_fin_ult1
0,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
X_2016_06 = load_csv(os.path.join(INP_DIR, "X_2016_06.csv"))

ncodpers_test = X_2016_06["ncodpers"]
X_2016_06 = X_2016_06.drop(["ncodpers"], axis=1)

X_2016_06.shape

  if (await self.run_code(code, result,  async_=asy)):


Memory usage before changing types 3033.33 MB
Memory usage after changing types 1342.38 MB


(929615, 413)

In [11]:
X_2016_06.head()

Unnamed: 0,pais_residencia,sexo,age,fecha_alta,ind_nuevo,antiguedad,indrel,indrel_1mes,tiprel_1mes,indresi,...,ind_recibo_ult1_LAG12,fecha_alta_LAG12,ind_nuevo_LAG12,antiguedad_LAG12,indrel_LAG12,tiprel_1mes_LAG12,ind_actividad_cliente_LAG12,renta_LAG12,segmento_LAG12,TOTAL_PRODS_LAG12
0,ES,V,56.0,-7834,False,256.0,A,1,A,True,...,0.0,-7468.0,False,245.0,A,A,True,326124.90625,01 - TOP,0.0
1,ES,H,36.0,-1035,False,34.0,A,1,I,True,...,0.0,-669.0,False,23.0,A,A,False,67526.28125,02 - PARTICULARES,0.0
2,ES,V,22.0,-1035,False,34.0,A,1,A,True,...,0.0,-669.0,False,23.0,A,A,False,97689.296875,03 - UNIVERSITARIO,0.0
3,ES,H,22.0,-1035,False,34.0,A,1,I,True,...,0.0,-669.0,False,23.0,A,I,False,148402.984375,03 - UNIVERSITARIO,0.0
4,ES,H,22.0,-1035,False,34.0,A,1,I,True,...,0.0,-669.0,False,23.0,A,I,False,106885.796875,03 - UNIVERSITARIO,0.0


In [13]:
class NumImputer(BaseEstimator, TransformerMixin):
    def __init__(self, method="mean"):
        self._method = method
    
    def fit(self, df_train):
        num_cols = df_train.select_dtypes(["number"]).columns.to_list()
        
        self._impute_values = {}
        for col in num_cols:
            self._impute_values[col] = df_train[col].agg(self._method)
        return self
    
    def transform(self, df):
        df = df.copy()
        for col, val in self._impute_values.items():
            if df[col].isnull().sum() > 0:
                df[col] = df[col].fillna(val)
        return df
    

class CatImputer(BaseEstimator, TransformerMixin):
    def __init__(self, val="MISSING"):
        self._val = val
    
    def fit(self, df_train):
        cat_cols = df_train.select_dtypes(["object", "category", "bool"]).columns.to_list()
        
        self._impute_values = {}
        for col in cat_cols:
            self._impute_values[col] = self._val
        return self
    
    
    def transform(self, df):
        df = df.copy()
        for col, val in self._impute_values.items():
            if df[col].isnull().sum() > 0:
                df[col] = df[col].astype("object").fillna(val).astype("category")
        return df

In [14]:
class OneHotEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, max_classes=20, to_array=False):
        self._to_array = to_array
        self._max_classes = max_classes
        
        
    def fit(self, train_df):
        df_cat = train_df.select_dtypes(["object", "category"])
        self._cat_cols = df_cat.columns.to_list()
        
        self._cat_cols = [col for col in self._cat_cols if train_df[col].nunique() <= self._max_classes]
        print("Columns to one-hot encode:", self._cat_cols)
        df_cat = train_df[self._cat_cols]
        
        if len(self._cat_cols) > 0:
            self._cat_cols_ohe = pd.get_dummies(df_cat, drop_first=True).columns.to_list()
        else:
            self._cat_cols_ohe = []
        return self
    
    def transform(self, df):
        df = df.copy()
        if len(self._cat_cols) == 0:
            print("No cat cols in df_train, so do nothing.")
            return df
        
        df_cat = df[self._cat_cols]
        print("df_cat.columns", df_cat.columns)
        
        # one-hot encode
        df_cat = pd.get_dummies(df_cat)
        # drop cols that are present in test_df but absent in train_df
        cols_to_drop = [col for col in df_cat.columns if col not in self._cat_cols_ohe]
        print("cols_to_drop:", cols_to_drop)
        df_cat = df_cat.drop(cols_to_drop, axis="columns")
        
        # change to float32
        for col in df_cat.columns:
            df_cat[col] = df_cat[col].astype("float32")
        
        # if some some colums are absent in test but present in train, make them all zero 
        cat_cols_ohe = df_cat.columns.to_list()
        for col in self._cat_cols_ohe:
            if col not in cat_cols_ohe:
                df_cat[col] = 0
                df_cat[col] = df_cat[col].astype(np.uint8)
        
        num_cols = [col for col in df.columns if col not in self._cat_cols]
        df_num = df[num_cols]
        
        df = pd.concat([df_num, df_cat], axis="columns")
        self._features = df.columns.to_list()
        if self._to_array:
            return df.values.astype(np.float32)
        else:
            return df


class LabelEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, to_array=False):
        self._to_array = to_array
    
    def fit(self, df_train):
        all_cols = df_train.columns.to_list()
        cat_cols = df_train.select_dtypes(["category", "object"]).columns.to_list()
        
        self._cat_col_idx = [i for i, col in enumerate(all_cols) if col in cat_cols]
        
        self._label_maps = {}
        self._missing_imputers = {}
        for col in cat_cols:
            label = df_train[col].unique()
            self._label_maps[col] = {c: n for n, c in enumerate(label)}
            
            mode_label = df_train[col].mode().iloc[0]
            self._missing_imputers[col] = self._label_maps[col][mode_label]
        
        print("Cols to label encode:", list(self._label_maps.keys()))
        return self
    
    def transform(self, df):
        df = df.copy()
        for col, label_map in self._label_maps.items():
            df[col] = df[col].map(label_map).astype(np.float32)
            if df[col].isnull().any():
                df[col] = df[col].astype(np.float32).fillna(self._missing_imputers[col])
                
        self._features = df.columns.to_list()
        if self._to_array:
            return df.values.astype(np.float32)
        else:
            return df
        
    def get_cat_cols(self):
        return self._cat_col_idx

In [15]:
# mean average precision at k
def mapk(y, y_prob, k=7):
    y = y[:, np.newaxis]
    # ascending
    y_pred = np.argsort(y_prob, axis=1)
    # descending
    y_pred = y_pred[:, ::-1]
    
    return ml_metrics.mapk(y, y_pred, k=k)

In [16]:
def write_submit(y_prob, target_labels, ncodpers, filepath, k=7):
    # ascending
    y_pred = np.argsort(y_prob, axis=1)
    # descending
    y_pred = y_pred[:, ::-1]
    # cut a k
    y_pred = y_pred[:, :k]
    
    added_prods = target_labels[y_pred]
    added_prods = [" ".join(line) for line in added_prods]
    
    sub_df = pd.DataFrame(ncodpers)
    sub_df["added_products"] = added_prods
    
    sub_df.to_csv(filepath, index=False)
    return None

In [17]:
def whole_to_int(a_dict):
    new_dict = copy.deepcopy(a_dict)
    for k, v in new_dict.items():
        if np.isclose(np.round(v), v):
            new_dict[k] = int(new_dict[k])
    return new_dict


def run_hyperopt(classifier,
                 params_tuned, 
                 X_train, y_train,
                 X_val, y_val,
                 num_eval,
                 params_fixed=None,
                 rstate=None):
    
    time_start = time.time()
    if params_fixed is None:
        params_fixed = {"n_jobs": 20, "n_estimators": 100}
    
    def objective(params):
        classifier.set_params(**params_fixed, **params)
        classifier.fit(X_train, y_train)
        
        y_val_prob = classifier.predict_proba(X_val)
        map7 = mapk(y_val, y_val_prob, k=7)
        
        return {"loss": -map7, "status": STATUS_OK}
    
    if rstate is not None:
        rstate = np.random.RandomState(rstate)
        
    trials = Trials()
    best_params = fmin(objective, 
                      params_tuned, 
                      algo=tpe.suggest, 
                      max_evals=num_eval, 
                      trials=trials,
                      rstate=rstate)
    
    best_params = whole_to_int(best_params)
    
    time_end = time.time()
    time_elapse = time_end - time_start
    print("Time elapsed: %0.5f s" % time_elapse)
    
    return trials, best_params

# Logistic Regression

In [18]:
num_imputer = NumImputer()
num_imputer.fit(X_2016_04)
X_train = num_imputer.transform(X_2016_04)
X_val = num_imputer.transform(X_2016_05)


cat_imputer = CatImputer()
cat_imputer.fit(X_train)
X_train = cat_imputer.transform(X_train)
X_val = cat_imputer.transform(X_val)


ohe = OneHotEncoder()
ohe.fit(X_train)
X_train = ohe.transform(X_train)
X_val = ohe.transform(X_val)


le = LabelEncoder()
le.fit(X_train)
X_train = le.transform(X_train)
X_val = le.transform(X_val)


scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_val = scaler.transform(X_val)

target_labels = y_2016_04.columns.values
y_train = np.argmax(y_2016_04.values, axis=1)
y_val = np.argmax(y_2016_05.values, axis=1)

Columns to one-hot encode: ['sexo', 'indrel', 'tiprel_1mes', 'segmento', 'ind_nuevo_LAG1', 'indrel_LAG1', 'tiprel_1mes_LAG1', 'ind_actividad_cliente_LAG1', 'segmento_LAG1', 'ind_nuevo_LAG2', 'indrel_LAG2', 'tiprel_1mes_LAG2', 'ind_actividad_cliente_LAG2', 'segmento_LAG2', 'ind_nuevo_LAG3', 'indrel_LAG3', 'tiprel_1mes_LAG3', 'ind_actividad_cliente_LAG3', 'segmento_LAG3', 'ind_nuevo_LAG4', 'indrel_LAG4', 'tiprel_1mes_LAG4', 'ind_actividad_cliente_LAG4', 'segmento_LAG4', 'ind_nuevo_LAG5', 'indrel_LAG5', 'tiprel_1mes_LAG5', 'ind_actividad_cliente_LAG5', 'segmento_LAG5', 'ind_nuevo_LAG6', 'indrel_LAG6', 'tiprel_1mes_LAG6', 'ind_actividad_cliente_LAG6', 'segmento_LAG6', 'ind_nuevo_LAG7', 'indrel_LAG7', 'tiprel_1mes_LAG7', 'ind_actividad_cliente_LAG7', 'segmento_LAG7', 'ind_nuevo_LAG8', 'indrel_LAG8', 'tiprel_1mes_LAG8', 'ind_actividad_cliente_LAG8', 'segmento_LAG8', 'ind_nuevo_LAG9', 'indrel_LAG9', 'tiprel_1mes_LAG9', 'ind_actividad_cliente_LAG9', 'segmento_LAG9', 'ind_nuevo_LAG10', 'indrel_

In [19]:
X_train.shape, X_val.shape

((26791, 507), (27916, 507))

In [20]:
lr = LogisticRegression()
lr.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression()

In [21]:
y_train_pred = lr.predict(X_train)
y_train_prob = lr.predict_proba(X_train)

y_val_pred = lr.predict(X_val)
y_val_prob = lr.predict_proba(X_val)

acc_train = accuracy_score(y_train, y_train_pred)
print("Train acc: %0.4f" %acc_train)
acc_val = accuracy_score(y_val, y_val_pred)
print("Validation acc: %0.4f" %acc_val)

map7_train = mapk(y_train, y_train_prob, k=7)
print("Train MAP@7: %0.4f" %map7_train)
map7_val = mapk(y_val, y_val_prob, k=7)
print("Validation MAP@7: %0.4f" %map7_val)

Train acc: 0.7730
Validation acc: 0.2093
Train MAP@7: 0.8674
Validation MAP@7: 0.6841


## Predict for `2016-06`

In [22]:
num_imputer = NumImputer()
num_imputer.fit(X_2016_05)
X_train = num_imputer.transform(X_2016_05)
X_test = num_imputer.transform(X_2016_06)


cat_imputer = CatImputer()
cat_imputer.fit(X_train)
X_train = cat_imputer.transform(X_train)
X_test = cat_imputer.transform(X_test)


ohe = OneHotEncoder()
ohe.fit(X_train)
X_train = ohe.transform(X_train)
X_test = ohe.transform(X_test)


le = LabelEncoder()
le.fit(X_train)
X_train = le.transform(X_train)
X_test = le.transform(X_test)


scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

target_labels = y_2016_05.columns.values
y_train = np.argmax(y_2016_05.values, axis=1)

Columns to one-hot encode: ['sexo', 'indrel', 'tiprel_1mes', 'segmento', 'ind_nuevo_LAG1', 'indrel_LAG1', 'tiprel_1mes_LAG1', 'ind_actividad_cliente_LAG1', 'segmento_LAG1', 'ind_nuevo_LAG2', 'indrel_LAG2', 'tiprel_1mes_LAG2', 'ind_actividad_cliente_LAG2', 'segmento_LAG2', 'ind_nuevo_LAG3', 'indrel_LAG3', 'tiprel_1mes_LAG3', 'ind_actividad_cliente_LAG3', 'segmento_LAG3', 'ind_nuevo_LAG4', 'indrel_LAG4', 'tiprel_1mes_LAG4', 'ind_actividad_cliente_LAG4', 'segmento_LAG4', 'ind_nuevo_LAG5', 'indrel_LAG5', 'tiprel_1mes_LAG5', 'ind_actividad_cliente_LAG5', 'segmento_LAG5', 'ind_nuevo_LAG6', 'indrel_LAG6', 'tiprel_1mes_LAG6', 'ind_actividad_cliente_LAG6', 'segmento_LAG6', 'ind_nuevo_LAG7', 'indrel_LAG7', 'tiprel_1mes_LAG7', 'ind_actividad_cliente_LAG7', 'segmento_LAG7', 'ind_nuevo_LAG8', 'indrel_LAG8', 'tiprel_1mes_LAG8', 'ind_actividad_cliente_LAG8', 'segmento_LAG8', 'ind_nuevo_LAG9', 'indrel_LAG9', 'tiprel_1mes_LAG9', 'ind_actividad_cliente_LAG9', 'segmento_LAG9', 'ind_nuevo_LAG10', 'indrel_

In [23]:
X_train.shape, X_test.shape

((27916, 507), (929615, 507))

In [24]:
lr = LogisticRegression()
lr.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression()

In [25]:
y_train_pred = lr.predict(X_train)
y_train_prob = lr.predict_proba(X_train)

y_test_pred = lr.predict(X_test)
y_test_prob = lr.predict_proba(X_test)

acc_train = accuracy_score(y_train, y_train_pred)
print("Train acc: %0.4f" %acc_train)

map7_train = mapk(y_train, y_train_prob, k=7)
print("Train MAP@7: %0.4f" %map7_train)

Train acc: 0.7662
Train MAP@7: 0.8668


In [26]:
# submit this gives MAP@7 = 0.01855 for public and MAP@7 = 0.01859 for private score.
write_submit(y_test_prob, target_labels, ncodpers_test, 
             os.path.join(SUB_DIR, "lr_2016_05.csv"), k=7)

# Random Forest

In [27]:
num_imputer = NumImputer()
num_imputer.fit(X_2016_04)
X_train = num_imputer.transform(X_2016_04)
X_val = num_imputer.transform(X_2016_05)


cat_imputer = CatImputer()
cat_imputer.fit(X_train)
X_train = cat_imputer.transform(X_train)
X_val = cat_imputer.transform(X_val)


ohe = OneHotEncoder()
ohe.fit(X_train)
X_train = ohe.transform(X_train)
X_val = ohe.transform(X_val)


le = LabelEncoder()
le.fit(X_train)
X_train = le.transform(X_train)
X_val = le.transform(X_val)

target_labels = y_2016_04.columns.values
y_train = np.argmax(y_2016_04.values, axis=1)
y_val = np.argmax(y_2016_05.values, axis=1)

Columns to one-hot encode: ['sexo', 'indrel', 'tiprel_1mes', 'segmento', 'ind_nuevo_LAG1', 'indrel_LAG1', 'tiprel_1mes_LAG1', 'ind_actividad_cliente_LAG1', 'segmento_LAG1', 'ind_nuevo_LAG2', 'indrel_LAG2', 'tiprel_1mes_LAG2', 'ind_actividad_cliente_LAG2', 'segmento_LAG2', 'ind_nuevo_LAG3', 'indrel_LAG3', 'tiprel_1mes_LAG3', 'ind_actividad_cliente_LAG3', 'segmento_LAG3', 'ind_nuevo_LAG4', 'indrel_LAG4', 'tiprel_1mes_LAG4', 'ind_actividad_cliente_LAG4', 'segmento_LAG4', 'ind_nuevo_LAG5', 'indrel_LAG5', 'tiprel_1mes_LAG5', 'ind_actividad_cliente_LAG5', 'segmento_LAG5', 'ind_nuevo_LAG6', 'indrel_LAG6', 'tiprel_1mes_LAG6', 'ind_actividad_cliente_LAG6', 'segmento_LAG6', 'ind_nuevo_LAG7', 'indrel_LAG7', 'tiprel_1mes_LAG7', 'ind_actividad_cliente_LAG7', 'segmento_LAG7', 'ind_nuevo_LAG8', 'indrel_LAG8', 'tiprel_1mes_LAG8', 'ind_actividad_cliente_LAG8', 'segmento_LAG8', 'ind_nuevo_LAG9', 'indrel_LAG9', 'tiprel_1mes_LAG9', 'ind_actividad_cliente_LAG9', 'segmento_LAG9', 'ind_nuevo_LAG10', 'indrel_

In [28]:
params = {
    "max_depth": scope.int(hp.quniform("max_depth", 2, 20, 1)),
    #"min_samples_split": scope.int(hp.quniform("min_samples_split", 20, 400, 10)),
    "min_samples_leaf": scope.int(hp.quniform("min_samples_leaf", 20, 200, 10)), 
    "max_features": scope.int(hp.quniform("max_features", 5, 100, 5)),
}

params_fixed = {
    "n_jobs": 16,
    "n_estimators": 100
}


num_eval = 100
rf = RandomForestClassifier()
trials, best_params = run_hyperopt(rf, params, 
                                   X_train, y_train, X_val, y_val, 
                                   num_eval,
                                   params_fixed=params_fixed)
best_params

100%|██████████| 100/100 [09:20<00:00,  5.60s/trial, best loss: -0.8235821409807653]
Time elapsed: 560.90793 s


{'max_depth': 18, 'max_features': 90, 'min_samples_leaf': 20}

In [29]:
rf = RandomForestClassifier(**params_fixed, **best_params)
rf.fit(X_train, y_train)

RandomForestClassifier(max_depth=18, max_features=90, min_samples_leaf=20,
                       n_jobs=16)

In [30]:
y_train_pred = rf.predict(X_train)
y_train_prob = rf.predict_proba(X_train)

y_val_pred = rf.predict(X_val)
y_val_prob = rf.predict_proba(X_val)

acc_train = accuracy_score(y_train, y_train_pred)
print("Train acc: %0.4f" %acc_train)
acc_val = accuracy_score(y_val, y_val_pred)
print("Validation acc: %0.4f" %acc_val)

map7_train = mapk(y_train, y_train_prob, k=7)
print("Train MAP@7: %0.4f" %map7_train)
map7_val = mapk(y_val, y_val_prob, k=7)
print("Validation MAP@7: %0.4f" %map7_val)

Train acc: 0.7630
Validation acc: 0.7100
Train MAP@7: 0.8601
Validation MAP@7: 0.8243


# Predict for `2016-06`

In [31]:
num_imputer = NumImputer()
num_imputer.fit(X_2016_05)
X_train = num_imputer.transform(X_2016_05)
X_test = num_imputer.transform(X_2016_06)


cat_imputer = CatImputer()
cat_imputer.fit(X_train)
X_train = cat_imputer.transform(X_train)
X_test = cat_imputer.transform(X_test)


ohe = OneHotEncoder()
ohe.fit(X_train)
X_train = ohe.transform(X_train)
X_test = ohe.transform(X_test)


le = LabelEncoder()
le.fit(X_train)
X_train = le.transform(X_train)
X_test = le.transform(X_test)


target_labels = y_2016_05.columns.values
y_train = np.argmax(y_2016_05.values, axis=1)

Columns to one-hot encode: ['sexo', 'indrel', 'tiprel_1mes', 'segmento', 'ind_nuevo_LAG1', 'indrel_LAG1', 'tiprel_1mes_LAG1', 'ind_actividad_cliente_LAG1', 'segmento_LAG1', 'ind_nuevo_LAG2', 'indrel_LAG2', 'tiprel_1mes_LAG2', 'ind_actividad_cliente_LAG2', 'segmento_LAG2', 'ind_nuevo_LAG3', 'indrel_LAG3', 'tiprel_1mes_LAG3', 'ind_actividad_cliente_LAG3', 'segmento_LAG3', 'ind_nuevo_LAG4', 'indrel_LAG4', 'tiprel_1mes_LAG4', 'ind_actividad_cliente_LAG4', 'segmento_LAG4', 'ind_nuevo_LAG5', 'indrel_LAG5', 'tiprel_1mes_LAG5', 'ind_actividad_cliente_LAG5', 'segmento_LAG5', 'ind_nuevo_LAG6', 'indrel_LAG6', 'tiprel_1mes_LAG6', 'ind_actividad_cliente_LAG6', 'segmento_LAG6', 'ind_nuevo_LAG7', 'indrel_LAG7', 'tiprel_1mes_LAG7', 'ind_actividad_cliente_LAG7', 'segmento_LAG7', 'ind_nuevo_LAG8', 'indrel_LAG8', 'tiprel_1mes_LAG8', 'ind_actividad_cliente_LAG8', 'segmento_LAG8', 'ind_nuevo_LAG9', 'indrel_LAG9', 'tiprel_1mes_LAG9', 'ind_actividad_cliente_LAG9', 'segmento_LAG9', 'ind_nuevo_LAG10', 'indrel_

In [32]:
rf = RandomForestClassifier(**params_fixed, **best_params)
rf.fit(X_train, y_train)

RandomForestClassifier(max_depth=18, max_features=90, min_samples_leaf=20,
                       n_jobs=16)

In [33]:
y_train_pred = rf.predict(X_train)
y_train_prob = rf.predict_proba(X_train)

y_test_pred = rf.predict(X_test)
y_test_prob = rf.predict_proba(X_test)

acc_train = accuracy_score(y_train, y_train_pred)
print("Train acc: %0.4f" %acc_train)

map7_train = mapk(y_train, y_train_prob, k=7)
print("Train MAP@7: %0.4f" %map7_train)

Train acc: 0.7725
Train MAP@7: 0.8673


In [34]:
# submit this gives MAP@7 = 0.02387 for public and MAP@7 = 0.02423 for private score.
write_submit(y_test_prob, target_labels, ncodpers_test, 
             os.path.join(SUB_DIR, "rf_2016_05.csv"), k=7)