In [104]:
import numpy as np
import pandas as pd 
import math
import pdb
import matplotlib.pyplot as plt
import xgboost as xgb
import tqdm
import pickle
import utils as utils

%matplotlib inline

In [None]:
df_train = pd.read_csv('../data/train_ver2.csv')
df_test = pd.read_csv('../data/test_ver2.csv')

In [201]:
products = [
    "ind_ahor_fin_ult1",
    "ind_aval_fin_ult1",
    "ind_cco_fin_ult1" ,
    "ind_cder_fin_ult1",
    "ind_cno_fin_ult1" ,
    "ind_ctju_fin_ult1",
    "ind_ctma_fin_ult1",
    "ind_ctop_fin_ult1",
    "ind_ctpp_fin_ult1",
    "ind_deco_fin_ult1",
    "ind_deme_fin_ult1",
    "ind_dela_fin_ult1",
    "ind_ecue_fin_ult1",
    "ind_fond_fin_ult1",
    "ind_hip_fin_ult1" ,
    "ind_plan_fin_ult1",
    "ind_pres_fin_ult1",
    "ind_reca_fin_ult1",
    "ind_tjcr_fin_ult1",
    "ind_valo_fin_ult1",
    "ind_viv_fin_ult1" ,
    "ind_nomina_ult1"  ,
    "ind_nom_pens_ult1",
    "ind_recibo_ult1"  ,
]

In [210]:
current_products = products[2:]

# Cleaning

In [None]:
for col in df_train:
    if df_train[col].dtypes == object:
        df_train[col] = df_train[col].str.strip()
        
for col in df_test:
    if df_test[col].dtypes == object:
        df_test[col] = df_test[col].str.strip()

In [None]:
for prod in products:
    df_train[prod] = df_train[prod].fillna(0).astype(np.int8)
    df_test[prod] = 0
    df_test[prod] = df_test[prod].astype(np.int8)

In [None]:
df_train[products].isnull().any()

# Feature Engineering

## Lag features

In [21]:
def date_to_int(str_date):
    Y, M, D = [int(a) for a in str_date.strip().split("-")] # "2016-05-28"
    int_date = (int(Y) - 2015) * 12 + int(M)
    assert 1 <= int_date <= 12 + 6
    return int_date

In [8]:
df_train["int_date"] = df_train["fecha_dato"].map(date_to_int).astype(np.int8)
df_test["int_date"] = df_test["fecha_dato"].map(date_to_int).astype(np.int8)

In [9]:
def make_prev_df(train_df, step):
    prev_df = pd.DataFrame()
    prev_df["ncodpers"] = train_df["ncodpers"]
    prev_df["int_date"] = train_df["int_date"].map(lambda x: x+step).astype(np.int8)
    prod_features = ["%s_prev%s" % (prod, step) for prod in products]
    for prod, prev in zip(products, prod_features):
        prev_df[prev] = train_df[prod]
    return prev_df, prod_features

In [None]:
prev_dfs = []
prod_features = []
for step in range(1,6):
    print(step)
    prev1_train_df, prod1_features = make_prev_df(df_train, step)
    prev_dfs.append(prev1_train_df)
    prod_features.append(prod1_features)
    if step == 1:
        prod_features = prod1_features

In [11]:
prev_dfs[1].head()

Unnamed: 0,ncodpers,int_date,ind_ahor_fin_ult1_prev2,ind_aval_fin_ult1_prev2,ind_cco_fin_ult1_prev2,ind_cder_fin_ult1_prev2,ind_cno_fin_ult1_prev2,ind_ctju_fin_ult1_prev2,ind_ctma_fin_ult1_prev2,ind_ctop_fin_ult1_prev2,...,ind_hip_fin_ult1_prev2,ind_plan_fin_ult1_prev2,ind_pres_fin_ult1_prev2,ind_reca_fin_ult1_prev2,ind_tjcr_fin_ult1_prev2,ind_valo_fin_ult1_prev2,ind_viv_fin_ult1_prev2,ind_nomina_ult1_prev2,ind_nom_pens_ult1_prev2,ind_recibo_ult1_prev2
0,1375586,3,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1050611,3,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1050612,3,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1050613,3,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1050614,3,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
def join_with_prev(df, prev_df, how):
    print("before join", len(df))
    df = df.merge(prev_df, on=["ncodpers", "int_date"], how=how)
    for f in set(prev_df.columns.values.tolist()) - set(["ncodpers", "int_date"]):
        df[f] = df[f].astype(np.float16)
    print("after join", len(df))
    return df

In [13]:
for i, prev_df in enumerate(prev_dfs):
#     how = "inner" if i == 0 else "left"
    how = 'left'
    df_train = join_with_prev(df_train, prev_df, how=how)

before join 13647309
after join 13647309
before join 13647309
after join 13647309
before join 13647309
after join 13647309
before join 13647309
after join 13647309
before join 13647309
after join 13647309


In [14]:
for i, prev_df in enumerate(prev_dfs):
    how = "inner" if i == 0 else "left"
    df_test = join_with_prev(df_test, prev_df, how=how)

before join 929615
after join 929615
before join 929615
after join 929615
before join 929615
after join 929615
before join 929615
after join 929615
before join 929615
after join 929615


In [None]:
df_train.to_hdf('../data/train_cleaned2.h5', 'df')
df_test.to_hdf('../data/test_cleaned2.h5', 'df')

In [3]:
df_train = pd.read_hdf('../data/train_cleaned2.h5', 'df')
df_test = pd.read_hdf('../data/test_cleaned2.h5', 'df')

## canal_entrada (Label)

In [4]:
from sklearn.preprocessing import LabelEncoder

In [5]:
name = 'canal_entrada'
transformer = LabelEncoder()

df_train[name] = df_train[name].astype('str')
df_train[name] = transformer.fit_transform(df_train[name])
df_test[name] = df_test[name].astype('str')
df_test[name] = transformer.transform(df_test[name])

In [6]:
np.array(sorted(pd.unique(df_train['canal_entrada'])))

array([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
        13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,
        26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,
        39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,
        52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,
        65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,
        78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,
        91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103,
       104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116,
       117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129,
       130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142,
       143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155,
       156, 157, 158, 159, 160, 161, 162])

## pais_residencia

In [7]:
name = 'pais_residencia'
transformer = LabelEncoder()

df_train[name] = df_train[name].astype('str')
df_train[name] = transformer.fit_transform(df_train[name])
df_test[name] = df_test[name].astype('str')
df_test[name] = transformer.transform(df_test[name])

## Others

In [8]:
df_train["age"] = df_train["age"].replace('NA', np.nan)
len(df_train[df_train["age"] == 'NA'])

0

In [9]:
df_train["age"] = pd.to_numeric(df_train["age"], errors='coerce')

In [10]:
df_train["age"] = df_train["age"].fillna(0.0).astype(np.int16)
df_test["age"] = df_test["age"].fillna(0.0).astype(np.int16)

In [11]:
df_train["tipodom"] = df_train["tipodom"].fillna(0.0).astype(np.int8)
df_test["tipodom"] = df_test["tipodom"].fillna(0.0).astype(np.int8)

In [12]:
df_train["cod_prov"] = df_train["cod_prov"].fillna(0.0).astype(np.int8)
df_test["cod_prov"] = df_test["cod_prov"].fillna(0.0).astype(np.int8)

In [13]:
df_train["antiguedad"] = df_train["antiguedad"].replace('NA', np.nan)
len(df_train[df_train["antiguedad"] == 'NA'])

0

In [14]:
df_train["antiguedad"] = pd.to_numeric(df_train["antiguedad"], errors='coerce')

In [15]:
onehot = lambda x: 0.0 if x < 0 or math.isnan(x) else x+1.0

df_train["antiguedad"] = df_train["antiguedad"].map(onehot).astype(np.int16)
df_test["antiguedad"] = df_test["antiguedad"].map(onehot).astype(np.int16)

In [16]:
df_test["renta"]   = pd.to_numeric(df_test["renta"], errors="coerce")
unique_prov = df_test[df_test.cod_prov.notnull()].cod_prov.unique()
grouped = df_test.groupby("cod_prov")["renta"].median()

def impute_renta(df):
    df["renta"]   = pd.to_numeric(df["renta"], errors="coerce")       
    for cod in unique_prov:
        df.loc[df['cod_prov']==cod,['renta']] = df.loc[df['cod_prov']==cod,['renta']].fillna({'renta':grouped[cod]}).values
    df.renta.fillna(df_test["renta"].median(), inplace=True)
    
impute_renta(df_train)
impute_renta(df_test)

In [17]:
f_dato_m = lambda x: int(x.split("-")[1])
f_dato_y = lambda x: float(x.split("-")[0])

df_train["fecha_dato_month"] = df_train["fecha_dato"].map(f_dato_m).astype(np.int8)
df_train["fecha_dato_year"] = df_train["fecha_dato"].map(f_dato_y).astype(np.int16)

df_test["fecha_dato_month"] = df_test["fecha_dato"].map(f_dato_m).astype(np.int8)
df_test["fecha_dato_year"] = df_test["fecha_dato"].map(f_dato_y).astype(np.int16)

In [18]:
f_alta_m = lambda x: 0.0 if x.__class__ is float else float(x.split("-")[1])
f_alta_y = lambda x: 0.0 if x.__class__ is float else float(x.split("-")[0])

df_train["fecha_alta_month"] = df_train["fecha_alta"].map(f_alta_m).astype(np.int8)
df_train["fecha_alta_year"] = df_train["fecha_alta"].map(f_alta_y).astype(np.int16)

df_test["fecha_alta_month"] = df_test["fecha_alta"].map(f_alta_m).astype(np.int8)
df_test["fecha_alta_year"] = df_test["fecha_alta"].map(f_alta_y).astype(np.int16)

In [19]:
# "2016-05-28" or "" or nan
def date_to_float(str_date):
    if str_date.__class__ is float and math.isnan(str_date) or str_date == "":
        return np.nan
    Y, M, D = [int(a) for a in str_date.strip().split("-")]
    float_date = float(Y) * 12 + float(M)
    return float_date

In [23]:
df_train["fecha_dato_float"] = df_train["fecha_dato"].map(date_to_float)
df_train["fecha_alta_float"] = df_train["fecha_alta"].map(date_to_float)
df_train["dato_minus_alta"] = df_train["fecha_dato_float"] - df_train["fecha_alta_float"]

df_test["fecha_dato_float"] = df_test["fecha_dato"].map(date_to_float)
df_test["fecha_alta_float"] = df_test["fecha_alta"].map(date_to_float)
df_test["dato_minus_alta"] = df_test["fecha_dato_float"] - df_test["fecha_alta_float"]

In [24]:
def custom_one_hot(df, name, names, dtype=np.int8, check=False):
    for n, val in names.items():
        new_name = "%s_%s" % (name, n)
        print(name, new_name)
        df[new_name] = df[name].map(lambda x: 1 if x == val else 0).astype(dtype)

In [25]:
onehot_columns = {}
onehot_columns["indresi"] = {"n":"N"}
onehot_columns["indext"] = {"s":"S"}
onehot_columns["conyuemp"] = {"n":"N"}
onehot_columns["sexo"] = {"h":"H", "v":"V"}
onehot_columns["ind_empleado"] = {"a":"A", "b":"B", "f":"F", "n":"N"}
onehot_columns["ind_nuevo"] = {"new":1}
onehot_columns["segmento"] = {"top":"01 - TOP", "particulares":"02 - PARTICULARES", "universitario":"03 - UNIVERSITARIO"}
onehot_columns["indfall"] = {"s":"S"}
onehot_columns["indrel"] = {"1":1, "99":99}
onehot_columns["tiprel_1mes"] = {"a":"A", "i":"I", "p":"P", "r":"R"}

In [26]:
multi_custom_columns = ["indresi", "indext", "conyuemp", "sexo", "ind_empleado", "ind_nuevo", "segmento", 
 "indfall", "indrel", "tiprel_1mes"]

In [27]:
def multi_custom_one_hot(df):
    for col in multi_custom_columns:
        custom_one_hot(df, col, onehot_columns[col])

In [28]:
multi_custom_one_hot(df_train)
multi_custom_one_hot(df_test)

indresi indresi_n
indext indext_s
conyuemp conyuemp_n
sexo sexo_h
sexo sexo_v
ind_empleado ind_empleado_a
ind_empleado ind_empleado_b
ind_empleado ind_empleado_f
ind_empleado ind_empleado_n
ind_nuevo ind_nuevo_new
segmento segmento_top
segmento segmento_particulares
segmento segmento_universitario
indfall indfall_s
indrel indrel_1
indrel indrel_99
tiprel_1mes tiprel_1mes_a
tiprel_1mes tiprel_1mes_i
tiprel_1mes tiprel_1mes_p
tiprel_1mes tiprel_1mes_r
indresi indresi_n
indext indext_s
conyuemp conyuemp_n
sexo sexo_h
sexo sexo_v
ind_empleado ind_empleado_a
ind_empleado ind_empleado_b
ind_empleado ind_empleado_f
ind_empleado ind_empleado_n
ind_nuevo ind_nuevo_new
segmento segmento_top
segmento segmento_particulares
segmento segmento_universitario
indfall indfall_s
indrel indrel_1
indrel indrel_99
tiprel_1mes tiprel_1mes_a
tiprel_1mes tiprel_1mes_i
tiprel_1mes tiprel_1mes_p
tiprel_1mes tiprel_1mes_r


In [29]:
fill_ind_actividad_cliente = lambda x: 0.0 if math.isnan(x) else x+1.0

df_train["ind_actividad_cliente"] = df_train["ind_actividad_cliente"].map(fill_ind_actividad_cliente).astype(np.int8)
df_test["ind_actividad_cliente"] = df_test["ind_actividad_cliente"].map(fill_ind_actividad_cliente).astype(np.int8)

In [30]:
fill_indrel = lambda x: 5.0 if x == "P" else x

df_train["indrel_1mes"] = df_train["indrel_1mes"].map(fill_indrel).astype(float).fillna(0.0).astype(np.int8)
df_test["indrel_1mes"] = df_test["indrel_1mes"].map(fill_indrel).astype(float).fillna(0.0).astype(np.int8)

In [32]:
df_train.to_hdf('../data/train_featured2.h5', 'df')
df_test.to_hdf('../data/test_featured2.h5', 'df')

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block5_values] [items->['fecha_dato', 'ind_empleado', 'sexo', 'fecha_alta', 'ult_fec_cli_1t', 'tiprel_1mes', 'indresi', 'indext', 'conyuemp', 'indfall', 'nomprov', 'segmento']]

  return pytables.to_hdf(path_or_buf, key, self, **kwargs)


# Data from one month only

In [260]:
x_train = df_train[(df_train['int_date'] == 5)]
len_x_train = len(x_train)
print((len_x_train, pd.unique(x_train['fecha_dato'])))

(631957, array(['2015-05-28'], dtype=object))


In [261]:
x_train = utils.create_train_with_target(x_train, current_products)
x_train['target'].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[k1] = value[k2]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


21.0    7183
0.0     5355
16.0    4191
19.0    3430
20.0    3409
2.0     1987
10.0    1480
9.0     1005
11.0     438
7.0      435
15.0     343
4.0      308
5.0      298
17.0     213
6.0      174
3.0       84
8.0       25
13.0      24
14.0      13
12.0      11
1.0        7
18.0       4
Name: target, dtype: int64

# Model

In [262]:
def xgboost(X, Y, num_round, param):
    progress = dict()
    plst = list(param.items())
    xgtrain = xgb.DMatrix(X, label=Y)
    watchlist  = [(xgtrain,'train')]
    model = xgb.train(plst, xgtrain, num_round, watchlist, evals_result=progress)
    return (model, progress)

In [338]:
param = {
    'objective': 'multi:softprob',
    'eta': 0.05,
    'max_depth': 8,
    'silent': 1,
    'eval_metric': "mlogloss",
    'min_child_weight': 1,
    'subsample': 0.7,
    'colsample_bytree': 0.7,
    'seed': 0,
    'num_class': len(current_products),
}

In [302]:
drop_products = []
for step in range(1,6):
    for product in products[:2]:
        drop_products.append(product + '_prev' + str(step))

In [303]:
drop_columns = ['target','ncodpers', 'fecha_dato', 'ult_fec_cli_1t', 'nomprov', 'int_date',
          'indresi', 'indext', 'conyuemp', 'indfall', 'tiprel_1mes', 'ind_nuevo', 'fecha_alta', 
          'sexo', 'ind_empleado', 'segmento'
        ]

In [304]:
features = list(x_train.drop(drop_columns + products + drop_products, 1).columns.values)
np.array(features)

array(['pais_residencia', 'age', 'antiguedad', 'indrel', 'indrel_1mes',
       'canal_entrada', 'tipodom', 'cod_prov', 'ind_actividad_cliente',
       'renta', 'ind_cco_fin_ult1_prev1', 'ind_cder_fin_ult1_prev1',
       'ind_cno_fin_ult1_prev1', 'ind_ctju_fin_ult1_prev1',
       'ind_ctma_fin_ult1_prev1', 'ind_ctop_fin_ult1_prev1',
       'ind_ctpp_fin_ult1_prev1', 'ind_deco_fin_ult1_prev1',
       'ind_deme_fin_ult1_prev1', 'ind_dela_fin_ult1_prev1',
       'ind_ecue_fin_ult1_prev1', 'ind_fond_fin_ult1_prev1',
       'ind_hip_fin_ult1_prev1', 'ind_plan_fin_ult1_prev1',
       'ind_pres_fin_ult1_prev1', 'ind_reca_fin_ult1_prev1',
       'ind_tjcr_fin_ult1_prev1', 'ind_valo_fin_ult1_prev1',
       'ind_viv_fin_ult1_prev1', 'ind_nomina_ult1_prev1',
       'ind_nom_pens_ult1_prev1', 'ind_recibo_ult1_prev1',
       'ind_cco_fin_ult1_prev2', 'ind_cder_fin_ult1_prev2',
       'ind_cno_fin_ult1_prev2', 'ind_ctju_fin_ult1_prev2',
       'ind_ctma_fin_ult1_prev2', 'ind_ctop_fin_ult1_prev2',
   

In [305]:
Y = x_train['target']
X = x_train[features]

In [306]:
x_train = x_train.sample(frac=1).reset_index(drop=True) 

In [339]:
(model, progress) = xgboost(X, Y, 150, param)

[0]	train-mlogloss:2.86583
[1]	train-mlogloss:2.69872
[2]	train-mlogloss:2.55617
[3]	train-mlogloss:2.43484
[4]	train-mlogloss:2.33412
[5]	train-mlogloss:2.24588
[6]	train-mlogloss:2.16706
[7]	train-mlogloss:2.10306
[8]	train-mlogloss:2.04054
[9]	train-mlogloss:1.98642
[10]	train-mlogloss:1.93287
[11]	train-mlogloss:1.8809
[12]	train-mlogloss:1.83288
[13]	train-mlogloss:1.78937
[14]	train-mlogloss:1.74945
[15]	train-mlogloss:1.71159
[16]	train-mlogloss:1.67634
[17]	train-mlogloss:1.64425
[18]	train-mlogloss:1.6127
[19]	train-mlogloss:1.5817
[20]	train-mlogloss:1.5523
[21]	train-mlogloss:1.52471
[22]	train-mlogloss:1.49904
[23]	train-mlogloss:1.47365
[24]	train-mlogloss:1.44913
[25]	train-mlogloss:1.42709
[26]	train-mlogloss:1.40722
[27]	train-mlogloss:1.38681
[28]	train-mlogloss:1.36687
[29]	train-mlogloss:1.34832
[30]	train-mlogloss:1.33035
[31]	train-mlogloss:1.31295
[32]	train-mlogloss:1.2973
[33]	train-mlogloss:1.28182
[34]	train-mlogloss:1.26662
[35]	train-mlogloss:1.25324
[36]	tr

# Create Validation Set

In [277]:
x_valid = df_train[(df_train['int_date'] == 17)]
len_x_valid = len(x_valid)
print((len_x_valid, pd.unique(x_valid['fecha_dato'])))

(931453, array(['2016-05-28'], dtype=object))


In [278]:
x_valid = utils.create_train_with_target(x_valid, current_products)
x_valid['target'].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[k1] = value[k2]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


21.0    10220
20.0     5541
19.0     5516
0.0      5511
16.0     4252
10.0     2723
2.0      2369
4.0       713
15.0      283
5.0       226
17.0      183
6.0       131
3.0        77
11.0       61
9.0        46
13.0       22
18.0        7
14.0        7
1.0         5
12.0        3
Name: target, dtype: int64

# Prediction

In [370]:
x_eval = x_valid

id_preds = {}
ids = x_eval['ncodpers'].values

x_test = x_eval[features]
x_test = x_test.fillna(0) # check this
        
xg_test = xgb.DMatrix(x_test)
p_test = model.predict(xg_test)

for i, p in zip(ids, p_test):
    id_preds[i] = list(p)

In [371]:
# number of predicted products should equal to number of available products
assert len(current_products) == len(id_preds[658081])

In [372]:
products_prev = [product + '_prev1' for product in current_products]

In [373]:
df_sm_test = x_eval.loc[:,['ncodpers'] + products_prev]
df_sm_test['ncodpers'] = df_sm_test['ncodpers'].astype(str)

In [374]:
already_active = {}
for row in df_sm_test.values:
    row = list(row)
    _id = row.pop(0)
    active = [c[0] for c in zip(tuple(products_prev), row) if c[1] > 0]
    already_active[_id] = active

In [375]:
# add 7 products(that user don't have yet), higher probability first -> train_pred
train_preds = {}

for id_pred, p in id_preds.items():
    preds = [i[0] for i in sorted([i for i in zip(tuple(current_products), p) if i[0] not in already_active[str(id_pred)]],
                                  key=lambda i:i [1],
                                  reverse=True)[:7]]
    train_preds[id_pred] = preds

# MAP@7

### MAP code

In [285]:
def apk(actual, predicted, k=10, default=1.0):
    if len(predicted)>k:
        predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0

    for i,p in enumerate(predicted):
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i+1.0)

    if not actual:
        return default

    return score / min(len(actual), k)

def mapk(actual, predicted, k=10, default=1.0):
    return np.mean([apk(a,p,k,default) for a,p in zip(actual, predicted)])

### Evaluation

In [286]:
from collections import defaultdict

In [376]:
true_labels = defaultdict(list)
for i, row in x_eval.iterrows():
    true_labels[row['ncodpers']].append(current_products[int(row['target'])])

In [377]:
actuals = []
predicts = []
for k,v in true_labels.items():
    actuals.append(v)
    predicts.append(train_preds[k])

In [378]:
mapk(actuals, predicts)

0.67991010134030727

# Submission

In [232]:
x_train = df_train[(df_train['int_date'] == 6)]
x_train = utils.create_train_with_target(x_train, current_products)
x_train['target'].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[k1] = value[k2]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


0.0     9457
21.0    9131
20.0    8229
19.0    5161
16.0    4755
15.0    2942
2.0     1934
10.0    1219
9.0     1085
7.0      503
4.0      349
11.0     246
5.0      222
17.0     159
6.0      154
3.0       55
8.0       33
13.0      21
1.0        9
14.0       8
12.0       4
18.0       3
Name: target, dtype: int64

In [233]:
Y = x_train['target']
X = x_train[features]
x_train = x_train.sample(frac=1).reset_index(drop=True)

In [234]:
(model, progress) = xgboost(X, Y, 200, param)

[0]	train-mlogloss:2.85452
[1]	train-mlogloss:2.67662
[2]	train-mlogloss:2.55013
[3]	train-mlogloss:2.43776
[4]	train-mlogloss:2.33574
[5]	train-mlogloss:2.248
[6]	train-mlogloss:2.16981
[7]	train-mlogloss:2.09844
[8]	train-mlogloss:2.03293
[9]	train-mlogloss:1.97035
[10]	train-mlogloss:1.91962
[11]	train-mlogloss:1.87039
[12]	train-mlogloss:1.82406
[13]	train-mlogloss:1.77844
[14]	train-mlogloss:1.73832
[15]	train-mlogloss:1.70093
[16]	train-mlogloss:1.66391
[17]	train-mlogloss:1.62872
[18]	train-mlogloss:1.59668
[19]	train-mlogloss:1.56645
[20]	train-mlogloss:1.53818
[21]	train-mlogloss:1.51011
[22]	train-mlogloss:1.48361
[23]	train-mlogloss:1.45974
[24]	train-mlogloss:1.43699
[25]	train-mlogloss:1.41557
[26]	train-mlogloss:1.39491
[27]	train-mlogloss:1.37554
[28]	train-mlogloss:1.35725
[29]	train-mlogloss:1.33908
[30]	train-mlogloss:1.32086
[31]	train-mlogloss:1.30365
[32]	train-mlogloss:1.28728
[33]	train-mlogloss:1.27117
[34]	train-mlogloss:1.25601
[35]	train-mlogloss:1.24154
[36]

In [235]:
id_preds = {}
ids = df_test['ncodpers'].values

x_test = df_test[features]
x_test = x_test.fillna(0) # check this
        
xg_test = xgb.DMatrix(x_test)
p_test = model.predict(xg_test)

for i, p in zip(ids, p_test):
    id_preds[i] = list(p)

In [236]:
sample = pd.read_csv('../data/sample_submission.csv')

In [237]:
products_prev = [product + '_prev1' for product in current_products]

In [248]:
df_sm_test = df_test.loc[:,['ncodpers'] + products_prev]
df_sm_test['ncodpers'] = df_sm_test['ncodpers'].astype(str)

In [249]:
already_active = {}
for row in df_sm_test.values:
    row = list(row)
    _id = row.pop(0)
    active = [c[0] for c in zip(tuple(products_prev), row) if c[1] > 0]
    already_active[_id] = active

In [250]:
# add 7 products(that user don't have yet), higher probability first -> train_pred
train_preds = {}

for id_pred, p in id_preds.items():
    preds = [i[0] for i in sorted([i for i in zip(tuple(current_products), p) if i[0] not in already_active[str(id_pred)]],
                                  key=lambda i:i [1],
                                  reverse=True)[:7]]
    train_preds[id_pred] = preds

In [251]:
test_preds = []
for row in sample.values:
    _id = row[0]
    p = train_preds[_id]
    test_preds.append(' '.join(p))

sample['added_products'] = test_preds

In [252]:
sample.shape

(929615, 2)

In [253]:
sample['added_products'] = test_preds
sample.to_csv('../data/submission.csv', index=False)