In [1]:
import datetime
import numpy as np
import pandas as pd

In [2]:
def apk(actual, predicted, k=10):

    if len(predicted)>k:
        predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0

    for i,p in enumerate(predicted):
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i+1.0)

    if not actual:
        return 0.0

    return score / min(len(actual), k)


def mapk(actual, predicted, k=10):
    return np.mean([apk(a,p,k) for a,p in zip(actual, predicted)])

In [3]:
data_types = {
    'ncodpers': np.int32, 
    'conyuemp': np.int8,
    'indfall': np.int8,
    'tipodom': np.int8,
    'indext': np.int8,
    'indresi': np.int8,
    'pais_residencia': np.int8,
    'segmento': np.int8,
    'canal_entrada': np.int8,
    'indrel_1mes': np.int8,
    'sexo': np.int8,
    'ind_empleado': np.int8,
    'nomprov': np.int8,
    'tiprel_1mes': np.int8,
    'ind_ahor_fin_ult1': np.int8, 
    'ind_aval_fin_ult1': np.int8, 
    'ind_cco_fin_ult1': np.int8, 
    'ind_cder_fin_ult1': np.int8, 
    'ind_cno_fin_ult1': np.int8, 
    'ind_ctju_fin_ult1': np.int8, 
    'ind_ctma_fin_ult1': np.int8, 
    'ind_ctop_fin_ult1': np.int8, 
    'ind_ctpp_fin_ult1': np.int8, 
    'ind_deco_fin_ult1': np.int8, 
    'ind_deme_fin_ult1': np.int8, 
    'ind_dela_fin_ult1': np.int8, 
    'ind_ecue_fin_ult1': np.int8, 
    'ind_fond_fin_ult1': np.int8, 
    'ind_hip_fin_ult1': np.int8, 
    'ind_plan_fin_ult1': np.int8, 
    'ind_pres_fin_ult1': np.int8, 
    'ind_reca_fin_ult1': np.int8, 
    'ind_tjcr_fin_ult1': np.int8, 
    'ind_valo_fin_ult1': np.int8, 
    'ind_nomina_ult1': np.int8,
    'ind_nom_pens_ult1': np.int8,
    'ind_viv_fin_ult1': np.int8, 
    'ind_recibo_ult1': np.int8
}

dates = ['fecha_dato', 'fecha_alta', 'ult_fec_cli_1t']

In [4]:
data = pd.read_csv('data/train_preprocessed.csv', parse_dates=dates, dtype=data_types)

In [7]:
data_test = pd.read_csv('data/test_preprocessed.csv', parse_dates=dates, dtype=data_types)

In [5]:
target_cols = np.array([
    'ind_ahor_fin_ult1',
    'ind_aval_fin_ult1',
    'ind_cco_fin_ult1',
    'ind_cder_fin_ult1',
    'ind_cno_fin_ult1',
    'ind_ctju_fin_ult1',
    'ind_ctma_fin_ult1',
    'ind_ctop_fin_ult1',
    'ind_ctpp_fin_ult1',
    'ind_deco_fin_ult1',
    'ind_deme_fin_ult1',
    'ind_dela_fin_ult1',
    'ind_ecue_fin_ult1',
    'ind_fond_fin_ult1',
    'ind_hip_fin_ult1',
    'ind_plan_fin_ult1',
    'ind_pres_fin_ult1',
    'ind_reca_fin_ult1',
    'ind_tjcr_fin_ult1',
    'ind_valo_fin_ult1',
    'ind_viv_fin_ult1',
    'ind_nomina_ult1',
    'ind_nom_pens_ult1',
    'ind_recibo_ult1'
])

feature_cols = list(set(data.columns) - set(target_cols) - set(['ncodpers']) - set(dates))

In [6]:
data.fecha_dato.unique()

array(['2015-01-28T00:00:00.000000000', '2015-02-28T00:00:00.000000000',
       '2015-03-28T00:00:00.000000000', '2015-04-28T00:00:00.000000000',
       '2015-05-28T00:00:00.000000000', '2015-06-28T00:00:00.000000000',
       '2015-07-28T00:00:00.000000000', '2015-08-28T00:00:00.000000000',
       '2015-09-28T00:00:00.000000000', '2015-10-28T00:00:00.000000000',
       '2015-11-28T00:00:00.000000000', '2015-12-28T00:00:00.000000000',
       '2016-01-28T00:00:00.000000000', '2016-02-28T00:00:00.000000000',
       '2016-03-28T00:00:00.000000000', '2016-04-28T00:00:00.000000000',
       '2016-05-28T00:00:00.000000000'], dtype='datetime64[ns]')

In [86]:
# valid
train_index = np.array((data.fecha_dato == datetime.datetime(2015, 4, 28)))
train_index_2 = np.array((data.fecha_dato == datetime.datetime(2015, 5, 28)))

last_month_index = np.array(data.fecha_dato == datetime.datetime(2016, 4, 28))

valid_index = np.array((data.fecha_dato == datetime.datetime(2016, 5, 28)))


ncodpers = data.ix[valid_index].ncodpers.values
last_month_target = np.array(data.ix[last_month_index, ['ncodpers'] + list(target_cols)])

In [52]:
from sklearn.ensemble import RandomForestClassifier

In [6]:
import xgboost as xgb



In [87]:
may = data.ix[train_index]
june = data.ix[train_index_2]

customers = set(may.ncodpers).intersection(set(june.ncodpers))

X_train = pd.merge(
    may.ix[may.ncodpers.isin(customers), ['ncodpers'] + target_cols.tolist()],
    june.ix[june.ncodpers.isin(customers), ['ncodpers'] + feature_cols],
    on='ncodpers'
).ix[:, target_cols.tolist() + feature_cols].values


y_train = june.ix[june.ncodpers.isin(customers), target_cols].values


may_test = data.ix[last_month_index]
june_test = data.ix[valid_index]

customers = set(may_test.ncodpers).intersection(set(june_test.ncodpers))

X_test = pd.merge(
    may_test.ix[may_test.ncodpers.isin(customers), ['ncodpers'] + target_cols.tolist()],
    june_test.ix[june_test.ncodpers.isin(customers), ['ncodpers'] + feature_cols],
    on='ncodpers'
).ix[:, target_cols.tolist() + feature_cols].values

In [89]:
clf = xgb.XGBClassifier() #n_estimators=50, max_depth=5)
# # clf = RandomForestClassifier(n_estimators=30, max_depth=20, criterion='entropy', n_jobs=-1, random_state=5)
preds = []
for i in range(y_train.shape[1]):
    clf.fit(X_train, y_train[:, i])
    preds += [clf.predict_proba(X_test)]

In [90]:
# y_pred = np.array(clf.predict_proba(data.ix[valid_index, feature_cols]))[:, :, 1].T
y_pred = np.zeros((X_test.shape[0], y_train.shape[1]))
for i, pred in enumerate(preds):
    y_pred[:, i] = pred[:, 1]

In [78]:
# y_pred_bu_bu = y_pred.copy()

In [23]:
# y_pred_bu = y_pred.copy()

In [79]:
X_train_1 = data.ix[last_month_index, feature_cols].values
y_train_1 = data.ix[last_month_index, target_cols].values

X_test_1 = data_test.ix[:, feature_cols].values

In [80]:
clf = xgb.XGBClassifier() # n_estimators=50, max_depth=5)
preds = []
for i in range(y_train_1.shape[1]):
    clf.fit(X_train_1, y_train_1[:, i])
    preds += [clf.predict_proba(X_test_1)]

In [81]:
y_pred_1 = np.zeros((X_test_1.shape[0], y_train_1.shape[1]))
for i, pred in enumerate(preds):
    y_pred_1[:, i] = pred[:, 1]

In [82]:
y_pred_1_bu_bu = y_pred_1.copy()

In [29]:
# y_pred_1_bu = y_pred_1.copy()

In [91]:
y_pred_fin = y_pred.copy() # + y_pred_1.copy()

In [92]:
last_products = {}
for i in range(last_month_target.shape[0]):
    row = last_month_target[i]
    cust_id = row[0]
    used_products = set(np.where(row[1:] == 1)[0])
    last_products[cust_id] = used_products

In [85]:
y_pred = np.argsort(y_pred_fin, axis=1)
y_pred = np.fliplr(y_pred)

preds = []
for i in range(y_pred.shape[0]):
    cust_id = ncodpers[i]
    used_products = last_products.get(cust_id, {})
    
    pred_top_products = []
    for product_id in y_pred[i]:
        if product_id not in used_products:
            pred_top_products.append(product_id)
        if len(pred_top_products) == 7:
            break 
   
    preds.append(np.array(pred_top_products))
    
final_preds = [' '.join(list(target_cols[pred])) for pred in preds]
out = pd.DataFrame({'ncodpers': ncodpers, 'added_products': final_preds})
out.to_csv('xgb_last_month.csv', index=False)

In [93]:
y_valid = data.ix[valid_index, target_cols].values

In [94]:
y_pred = np.argsort(y_pred_fin, axis=1)
y_pred = np.fliplr(y_pred)

preds = []
trues = []
for i in range(y_pred.shape[0]):
    cust_id = ncodpers[i]
    used_products = last_products.get(cust_id, {})
    
    pred_top_products = []
    for product_id in y_pred[i]:
        if product_id not in used_products:
            pred_top_products.append(product_id)
        if len(pred_top_products) == 7:
            break

    products = np.arange(0, len(y_valid[i]))[y_valid[i].astype(bool)]
    true_top_products = []
    for product_id in products:
        if product_id not in used_products:
            true_top_products.append(product_id)
    
    preds.append(pred_top_products)
    trues.append(true_top_products)

In [95]:
mapk(trues, preds, k=7)

0.017250230640686506

In [51]:
mapk(trues, preds, k=7)

0.33580655739295079

In [6]:
# test
last_month_index = np.array(data.fecha_dato == datetime.datetime(2016, 5, 28))

ncodpers = data_test.ncodpers.values
last_month_target = data.ix[last_month_index, ['ncodpers'] + list(target_cols)].values

In [8]:
X_train = data.ix[:, feature_cols].values
y_train = data.ix[:, target_cols].values

X_test = data_test[feature_cols].values

In [9]:
np.save('data/X_train.npy', X_train)
np.save('data/y_train.npy', y_train)

np.save('data/X_test.npy', X_test)

np.save('data/ncodpers.npy', ncodpers)
np.save('data/last_month_target.npy', last_month_target)

In [1]:
import datetime
import numpy as np
import pandas as pd

In [2]:
X_train = np.load('data/X_train.npy')
y_train = np.load('data/y_train.npy')

X_test = np.load('data/X_test.npy')

ncodpers = np.load('data/ncodpers.npy')
last_month_target = np.load('data/last_month_target.npy')

In [14]:
ncodpers.shape

(13619575,)

In [3]:
from sklearn.ensemble import RandomForestClassifier

In [4]:
clf = RandomForestClassifier(n_estimators=30, criterion='entropy', n_jobs=60)
clf.fit(X_train, y_train)
# y_pred = np.array(clf.predict_proba(data.ix[valid_index, feature_cols].values))[:, :, 1].T

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=30, n_jobs=60, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [5]:
y_pred = np.array(clf.predict_proba(X_test))[:, :, 1].T

In [17]:
# np.save('data/y_test_30est.npy', y_pred)
y_pred = np.load('data/y_test_30est.npy')

In [10]:
target_cols = np.array([
    'ind_ahor_fin_ult1',
    'ind_aval_fin_ult1',
    'ind_cco_fin_ult1',
    'ind_cder_fin_ult1',
    'ind_cno_fin_ult1',
    'ind_ctju_fin_ult1',
    'ind_ctma_fin_ult1',
    'ind_ctop_fin_ult1',
    'ind_ctpp_fin_ult1',
    'ind_deco_fin_ult1',
    'ind_deme_fin_ult1',
    'ind_dela_fin_ult1',
    'ind_ecue_fin_ult1',
    'ind_fond_fin_ult1',
    'ind_hip_fin_ult1',
    'ind_plan_fin_ult1',
    'ind_pres_fin_ult1',
    'ind_reca_fin_ult1',
    'ind_tjcr_fin_ult1',
    'ind_valo_fin_ult1',
    'ind_viv_fin_ult1',
    'ind_nomina_ult1',
    'ind_nom_pens_ult1',
    'ind_recibo_ult1'
])

In [19]:
last_products = {}
for i in range(last_month_target.shape[0]):
    row = last_month_target[i]
    cust_id = row[0]
    used_products = set(np.where(row[1:] == 1)[0])
    last_products[cust_id] = used_products

In [18]:
data_test = pd.read_csv('data/test_preprocessed.csv', parse_dates=dates, dtype=data_types)
ncodpers = data_test.ncodpers

In [20]:
y_pred = np.argsort(y_pred, axis=1)
y_pred = np.fliplr(y_pred)

preds = []
for i in range(y_pred.shape[0]):
    cust_id = ncodpers[i]
    used_products = last_products.get(cust_id, {})
    
    pred_top_products = []
    for product_id in y_pred[i]:
        if product_id not in used_products:
            pred_top_products.append(product_id)
        if len(pred_top_products) == 7:
            break 
   
    preds.append(np.array(pred_top_products))
    
final_preds = [' '.join(list(target_cols[pred])) for pred in preds]
out = pd.DataFrame({'ncodpers': ncodpers, 'added_products': final_preds})
out.to_csv('submission.csv', index=False)


In [12]:
y_pred = np.argsort(y_pred, axis=1)
y_pred = np.fliplr(y_pred)

preds = []
trues = []
for i in range(y_pred.shape[0]):
    cust_id = ncodpers[i]
    used_products = last_products.get(cust_id, {})
    
    pred_top_products = []
    for product_id in y_pred[i]:
        if product_id not in used_products:
            pred_top_products.append(product_id)
        if len(pred_top_products) == 7:
            break
    
    products = np.arange(0, len(y_valid[i]))[y_valid[i].astype(bool)]
    true_top_products = []
    for product_id in products:
        if product_id not in used_products:
            true_top_products.append(product_id)
    
    preds.append(pred_top_products)
    trues.append(true_top_products)

NameError: name 'y_valid' is not defined