In [1]:
import datetime
import numpy as np
import pandas as pd

In [2]:
def apk(actual, predicted, k=10):
    """Compute the average precision at k for a single ranked prediction.

    Parameters
    ----------
    actual : sized container or None
        The relevant items. Any container that supports ``len`` and ``in``
        works (list, set, tuple, NumPy array). ``None`` or an empty container
        gives a score of 0.0.
    predicted : sequence
        Predicted items ordered from most to least confident. Only the first
        ``k`` entries are scored.
    k : int, optional
        Cut-off rank (default 10).

    Returns
    -------
    float
        Sum of precision@i over every rank ``i`` at which a not-yet-seen
        relevant item appears, divided by ``min(len(actual), k)``.
    """
    # Guard first and via len(): `not actual` is ambiguous for NumPy arrays
    # with more than one element, and there is nothing to score without
    # relevant items anyway.
    if actual is None or len(actual) == 0:
        return 0.0

    if len(predicted) > k:
        predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0

    for i, p in enumerate(predicted):
        # A repeated prediction is only credited the first time it appears.
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i + 1.0)

    return score / min(len(actual), k)


def mapk(actual, predicted, k=10):
    """Return the mean of ``apk`` over paired entries of ``actual`` and ``predicted``.

    ``actual[i]`` holds the relevant items and ``predicted[i]`` the ranked
    predictions for the same query; pairs are formed with ``zip``, so any
    surplus entries in the longer input are ignored.
    """
    per_query_scores = []
    for relevant, ranking in zip(actual, predicted):
        per_query_scores.append(apk(relevant, ranking, k))
    return np.mean(per_query_scores)

In [3]:
# dtype overrides passed to pd.read_csv: the customer id is read as int32 and
# every other listed column as int8.
# NOTE(review): int8 only holds -128..127 - confirm that every encoded column
# in the preprocessed CSVs stays inside that range.
data_types = {'ncodpers': np.int32}

# Non-product columns read as int8.
data_types.update(dict.fromkeys([
    'conyuemp',
    'indfall',
    'tipodom',
    'indext',
    'indresi',
    'pais_residencia',
    'segmento',
    'canal_entrada',
    'indrel_1mes',
    'sexo',
    'ind_empleado',
    'nomprov',
    'tiprel_1mes',
], np.int8))

# The `ind_*_ult1` product columns, also read as int8.
data_types.update(dict.fromkeys([
    'ind_ahor_fin_ult1',
    'ind_aval_fin_ult1',
    'ind_cco_fin_ult1',
    'ind_cder_fin_ult1',
    'ind_cno_fin_ult1',
    'ind_ctju_fin_ult1',
    'ind_ctma_fin_ult1',
    'ind_ctop_fin_ult1',
    'ind_ctpp_fin_ult1',
    'ind_deco_fin_ult1',
    'ind_deme_fin_ult1',
    'ind_dela_fin_ult1',
    'ind_ecue_fin_ult1',
    'ind_fond_fin_ult1',
    'ind_hip_fin_ult1',
    'ind_plan_fin_ult1',
    'ind_pres_fin_ult1',
    'ind_reca_fin_ult1',
    'ind_tjcr_fin_ult1',
    'ind_valo_fin_ult1',
    'ind_nomina_ult1',
    'ind_nom_pens_ult1',
    'ind_viv_fin_ult1',
    'ind_recibo_ult1',
], np.int8))

# Columns handed to read_csv's `parse_dates`.
dates = ['fecha_dato', 'fecha_alta', 'ult_fec_cli_1t']

In [4]:
# Load the preprocessed training table with the compact dtypes and parsed
# date columns declared earlier in the notebook.
data = pd.read_csv(
    'data/train_preprocessed.csv',
    dtype=data_types,
    parse_dates=dates,
)

In [5]:
# Load the preprocessed test table using the same dtype and date settings as
# the training table.
data_te = pd.read_csv(
    'data/test_preprocessed.csv',
    dtype=data_types,
    parse_dates=dates,
)

In [6]:
# Product columns to predict. The order is significant: the popularity ranking
# and the prediction lists built later in the notebook refer to products by
# their position in this list.
target_cols = [
    'ind_ahor_fin_ult1',
    'ind_aval_fin_ult1',
    'ind_cco_fin_ult1',
    'ind_cder_fin_ult1',
    'ind_cno_fin_ult1',
    'ind_ctju_fin_ult1',
    'ind_ctma_fin_ult1',
    'ind_ctop_fin_ult1',
    'ind_ctpp_fin_ult1',
    'ind_deco_fin_ult1',
    'ind_deme_fin_ult1',
    'ind_dela_fin_ult1',
    'ind_ecue_fin_ult1',
    'ind_fond_fin_ult1',
    'ind_hip_fin_ult1',
    'ind_plan_fin_ult1',
    'ind_pres_fin_ult1',
    'ind_reca_fin_ult1',
    'ind_tjcr_fin_ult1',
    'ind_valo_fin_ult1',
    'ind_viv_fin_ult1',
    'ind_nomina_ult1',
    'ind_nom_pens_ult1',
    'ind_recibo_ult1'
]


# Everything that is neither a target, the customer id, nor a date column.
# Filtering `data.columns` in place (rather than list(set(...))) keeps the
# DataFrame's column order, so the feature order is the same on every run.
_non_feature_cols = set(target_cols) | {'ncodpers'} | set(dates)
feature_cols = [col for col in data.columns if col not in _non_feature_cols]

In [6]:
# List the distinct snapshot dates present in the training table.
data['fecha_dato'].unique()

array(['2015-01-28T00:00:00.000000000', '2015-02-28T00:00:00.000000000',
       '2015-03-28T00:00:00.000000000', '2015-04-28T00:00:00.000000000',
       '2015-05-28T00:00:00.000000000', '2015-06-28T00:00:00.000000000',
       '2015-07-28T00:00:00.000000000', '2015-08-28T00:00:00.000000000',
       '2015-09-28T00:00:00.000000000', '2015-10-28T00:00:00.000000000',
       '2015-11-28T00:00:00.000000000', '2015-12-28T00:00:00.000000000',
       '2016-01-28T00:00:00.000000000', '2016-02-28T00:00:00.000000000',
       '2016-03-28T00:00:00.000000000', '2016-04-28T00:00:00.000000000',
       '2016-05-28T00:00:00.000000000'], dtype='datetime64[ns]')

In [20]:
# # valid
# train_index = np.array(data.fecha_dato != datetime.datetime(2016, 5, 28))

# valid_index = np.array(data.fecha_dato == datetime.datetime(2016, 5, 28))

# last_month_index = np.array(data.fecha_dato == datetime.datetime(2016, 4, 28))

In [7]:
# Boolean mask over the rows of `data` whose snapshot date is 2016-05-28,
# the latest `fecha_dato` value present in the training table.
last_snapshot_date = datetime.datetime(2016, 5, 28)
last_month_index = (data['fecha_dato'] == last_snapshot_date).values

In [8]:
# Popularity score per product: the column sum of each target column over all
# rows of `data`, in `target_cols` order.
# `.loc` replaces `.ix`, which was deprecated and later removed from pandas.
rating = data.loc[:, target_cols].sum(axis=0).values

In [9]:
# Product positions ordered from the highest popularity score to the lowest
# (ascending argsort, then reversed).
ascending_order = np.argsort(rating)
sorted_rating = np.flipud(ascending_order)

In [72]:
# preds = []
# trues = []
# all_prods = set(np.arange(rating.size))
# for target in data.ix[last_month_index, target_cols].values:
#     used_prods = np.where(target == 1)[0]
#     pred = []
#     for p in sorted_rating:
#         if p not in used_prods:
#             pred += [p]
            
#         if len(pred) == 7:
#             break
            
#     preds += [pred]
    
# # for target in data.ix[valid_index, target_cols].values:
# #     trues += [set(np.where(target == 1)[0])]

In [72]:
# For every test customer, recommend the 7 most popular products that the
# customer does not already hold in the latest training snapshot. Customers
# without a row in that snapshot get the overall top 7.
#
# NOTE(review): `last_data` is not assigned anywhere else in the visible
# notebook. It is taken here to be the rows selected by `last_month_index`
# (the commented-out validation cell reads `data.ix[last_month_index,
# target_cols]`) - confirm this matches the intended snapshot.
last_data = data.loc[last_month_index]

# Index the snapshot once instead of running a full boolean scan of it for
# every test customer.
owned_flags = last_data.loc[:, target_cols].values
first_row_of = {}
for row, user_id in enumerate(last_data['ncodpers'].values):
    # Keep the first row per customer, matching the original `ld[0]` lookup.
    first_row_of.setdefault(user_id, row)

preds = []
for user_id in data_te.ncodpers.values:
    row = first_row_of.get(user_id)
    if row is None:
        # Unknown customer: fall back to overall popularity. Converted to a
        # list so every entry of `preds` has the same type.
        pred = list(sorted_rating[:7])
    else:
        used_prods = set(np.where(owned_flags[row] == 1)[0])
        pred = []
        for p in sorted_rating:
            if p not in used_prods:
                pred.append(p)
                if len(pred) == 7:
                    break

    preds.append(pred)

# for target in data.ix[valid_index, target_cols].values:
#     trues += [set(np.where(target == 1)[0])]

In [78]:
# The product columns again, this time as a NumPy array so that a list of
# integer positions can select several names at once (`target_cols[pred]`).
target_cols = np.array("""
    ind_ahor_fin_ult1
    ind_aval_fin_ult1
    ind_cco_fin_ult1
    ind_cder_fin_ult1
    ind_cno_fin_ult1
    ind_ctju_fin_ult1
    ind_ctma_fin_ult1
    ind_ctop_fin_ult1
    ind_ctpp_fin_ult1
    ind_deco_fin_ult1
    ind_deme_fin_ult1
    ind_dela_fin_ult1
    ind_ecue_fin_ult1
    ind_fond_fin_ult1
    ind_hip_fin_ult1
    ind_plan_fin_ult1
    ind_pres_fin_ult1
    ind_reca_fin_ult1
    ind_tjcr_fin_ult1
    ind_valo_fin_ult1
    ind_viv_fin_ult1
    ind_nomina_ult1
    ind_nom_pens_ult1
    ind_recibo_ult1
""".split())

In [80]:
# Turn each list of product positions into a space-separated string of product
# column names, one string per test customer.
# np.asarray makes the positional lookup work whether `target_cols` is
# currently a list or an ndarray, so this cell does not depend on which of the
# two definitions ran last.
product_names = np.asarray(target_cols)
final_preds = [' '.join(product_names[pred]) for pred in preds]

# Explicit `columns` pins the output column order, which otherwise depends on
# the dict-ordering behaviour of the installed Python/pandas versions.
out = pd.DataFrame(
    {'ncodpers': data_te.ncodpers.values, 'added_products': final_preds},
    columns=['ncodpers', 'added_products'],
)

In [82]:
# Write the predictions to disk, leaving out the DataFrame's index column.
submission_path = 'data/submission.csv'
out.to_csv(submission_path, index=False)

In [91]:
# Sanity check: print the length of every prediction list that does not
# contain exactly 7 entries (no output means all lists have 7).
for size in map(len, preds):
    if size == 7:
        continue
    print(size)

In [74]:
# Show the first 20 entries of `trues`.
# NOTE(review): in this notebook `trues` is only assigned by commented-out
# validation code and by the final, unexecuted cell, so on a fresh kernel this
# cell raises NameError unless one of those is run first.
trues[:20]

[set(),
 {2},
 {2},
 {4, 12, 18, 21, 22, 23},
 {2, 11},
 set(),
 {2, 8, 11, 18},
 {2},
 {2},
 set(),
 {11},
 {2},
 {4, 8, 17},
 {8},
 {2},
 {2},
 {2},
 set(),
 set(),
 {2, 7}]

In [76]:
# Mean average precision at 7 of `preds` measured against `trues`.
mapk(actual=trues, predicted=preds, k=7)

0.27507033043946871

In [None]:
# Replace the per-customer scores in `y_pred` with product positions ranked
# from highest score to lowest (ascending argsort, then flipped left-right).
# NOTE(review): this overwrites `y_pred`, so running the cell a second time
# would rank the ranks rather than the original scores.
y_pred = np.fliplr(np.argsort(y_pred, axis=1))

preds, trues = [], []
for row, ranking in enumerate(y_pred):
    # Products recorded for this customer in `last_products`; an empty
    # container when the customer id is not a key there.
    used_products = last_products.get(ncodpers[row], {})

    # Top 7 ranked products that are not among the customer's used products.
    pred_top_products = []
    for product_id in ranking:
        if product_id in used_products:
            continue
        pred_top_products.append(product_id)
        if len(pred_top_products) == 7:
            break

    # Positions flagged in y_valid for this row, minus the used products.
    held_mask = y_valid[row].astype(bool)
    flagged = np.arange(0, len(y_valid[row]))[held_mask]
    true_top_products = [
        product_id for product_id in flagged if product_id not in used_products
    ]

    preds.append(pred_top_products)
    trues.append(true_top_products)