In [69]:
import cudf
import lightgbm as lgb
import os
import numpy as np
import pandas as pd
from tqdm import tqdm

In [70]:
base_dir = 'storage/output/220321_baseline/'

****Prepare Dataset****

In [71]:
%%time 
transactions = cudf.read_csv('storage/transactions_train.csv')
articles = cudf.read_csv('storage/articles.csv')
customers = cudf.read_csv('storage/customers.csv')

CPU times: user 666 ms, sys: 989 ms, total: 1.66 s
Wall time: 2.35 s


In [72]:
customers['FN'].fillna(0.,inplace=True)
customers['Active'].fillna(0.,inplace=True)
customers['club_member_status'].fillna('None',inplace=True)
customers['age'] = customers['age'] / 10
customers['age'] = customers['age'].astype(int)
customers['fashion_news_frequency'] = customers['fashion_news_frequency'].str.lower().fillna('none')

In [73]:
transactions['t_dat'] = cudf.to_datetime(transactions['t_dat'])

In [125]:
def past_purchase_feature(df,transactions):
    transactions['count'] = 1
    
    time_elapsed_last_purchase = transactions['t_dat'].max()-transactions[['customer_id','article_id','t_dat']].groupby(['customer_id','article_id'])['t_dat'].max()
    time_elapsed_last_purchase = time_elapsed_last_purchase.dt.days
    df = df.merge(time_elapsed_last_purchase,on=['article_id','customer_id'],how='left')
    df = df.rename(columns={'t_dat':'time_elapsed_last_purchase'})
    df['time_elapsed_last_purchase'].fillna(1e6,inplace=True)
    
    time_elapsed_first_release = transactions[['customer_id','article_id','t_dat']].groupby(['customer_id','article_id'])['t_dat'].min()-cudf.to_datetime('2018-09-01')
    time_elapsed_first_release = time_elapsed_first_release.dt.days
    df = df.merge(time_elapsed_first_release,on=['article_id','customer_id'],how='left')
    df = df.rename(columns={'t_dat':'time_elapsed_first_release'})
    df['time_elapsed_first_release'].fillna(1e6,inplace=True)
    
    past_purchase_prob = transactions[['customer_id','article_id','count']].groupby(['customer_id','article_id'])['count'].count().reset_index()
    norm = transactions[['customer_id','article_id']].groupby('customer_id').count().reset_index().rename(columns={'article_id':'norm'})
    past_purchase_prob = past_purchase_prob.merge(norm,on='customer_id')
    past_purchase_prob['count'] = past_purchase_prob['count'] / past_purchase_prob['norm']
    past_purchase_prob.drop(columns=['norm'],inplace=True)
    df = df.merge(past_purchase_prob,on=['article_id','customer_id'],how='left')
    df = df.rename(columns={'count':'past_purchase_prob'})
    df['past_purchase_prob'].fillna(0.,inplace=True)
    
    total_purchase = transactions[['article_id','count']].groupby('article_id')['count'].count().reset_index().rename(columns={'count':'total_purchase'})
    norm = transactions['count'].sum()
    total_purchase['total_purchase'] = total_purchase['total_purchase'] / norm
    df = df.merge(total_purchase,on='article_id',how='left')
    df['total_purchase'].fillna(0.,inplace=True)
    
    number_of_purchase = transactions[['customer_id','count']].groupby('customer_id')['count'].count().reset_index().rename(columns={'count':'number_of_purchase'})
    df = df.merge(number_of_purchase,on='customer_id',how='left')
    df['number_of_purchase'].fillna(0.,inplace=True)
    
    repeated_purchase = transactions[['customer_id','article_id','count']].groupby(['customer_id','article_id'])['count'].count().reset_index().rename(columns={'count':'repeated_purchase'})
    df = df.merge(repeated_purchase,on=['customer_id','article_id'],how='left')

    return df
    
def article_feature_prob_vector(df,transactions,articles,article_features,postfix='_prob'):
    transactions['count'] = 1
    for article_feature in article_features:
        transactions = transactions.merge(articles[['article_id',article_feature]],on='article_id',how='left')
        norm = transactions.groupby(['customer_id'])['count'].count().reset_index()
        norm.rename(columns={'count':'norm'},inplace=True)
        count = transactions.groupby(['customer_id',article_feature])['count'].count().reset_index()
        count = count.merge(norm,on='customer_id')
        count['count'] = count['count'] / count['norm']
        count = count.rename(columns={'count':article_feature+postfix})
        count = count[['customer_id',article_feature,article_feature+postfix]]
        del(norm)
        df = df.merge(articles[['article_id',article_feature]],on='article_id',how='left')
        df = df.merge(count,on=['customer_id',article_feature],how='left')
    return df

def customer_feature_prob_vector(df,transactions,customers,customer_features,postfix='_prob'):
    transactions['count'] = 1
    for customer_feature in customer_features:
        transactions = transactions.merge(customers[['customer_id',customer_feature]],on='customer_id',how='left')
        norm = transactions.groupby(['article_id'])['count'].count().reset_index()
        norm.rename(columns={'count':'norm'},inplace=True)
        count = transactions.groupby(['article_id',customer_feature])['count'].count().reset_index()
        count = count.merge(norm,on='article_id')
        count['count'] = count['count'] / count['norm']
        count = count.rename(columns={'count':customer_feature+postfix})
        count = count[['article_id',customer_feature,customer_feature+postfix]]
        del(norm)
        df = df.merge(customers[['customer_id',customer_feature]],on='customer_id',how='left')
        df = df.merge(count,on=['article_id',customer_feature],how='left')
    return df

def construct_feature_df(
        df,transactions,
        article_features,
        articles,
        customer_features,
        customers,
        general_features=['article_id','customer_id'],
    ):
    df = article_feature_prob_vector(df,transactions,articles,article_features)
    df = customer_feature_prob_vector(df,transactions,customers,customer_features)
    df = past_purchase_feature(df,transactions)
    df = df[
            general_features+[f for f in df.columns if '_prob' in f] + 
            ['total_purchase','time_elapsed_last_purchase','past_purchase_prob','number_of_purchase','time_elapsed_first_release','repeated_purchase']
        ]
    return df

def construct_candidate_dict(transactions_3w):
    purchase_dict_3w = {}
    for i,x in enumerate(zip(transactions_3w['customer_id'], transactions_3w['article_id'])):
        cust_id, art_id = x
        if cust_id not in purchase_dict_3w:
            purchase_dict_3w[cust_id] = {}
        if art_id not in purchase_dict_3w[cust_id]:
            purchase_dict_3w[cust_id][art_id] = 0
        purchase_dict_3w[cust_id][art_id] += 1
    dummy_list_3w = list((transactions_3w['article_id'].value_counts()).index)[:12]
    return purchase_dict_3w,dummy_list_3w

def get_week(purchase_dicts,article_id):
    for i,purchase_dict in enumerate(purchase_dicts):
        if article_id in purchase_dict: return i
    return 1e6

def construct_candidate_df(test_df,transactions,add_random_samples=False):
    
    bool_1w = transactions.t_dat>transactions.t_dat.max()-pd.Timedelta(7,unit='day')
    bool_2w = (transactions.t_dat>transactions.t_dat.max()-2*pd.Timedelta(7,unit='day'))&(transactions.t_dat<=transactions.t_dat.max()-pd.Timedelta(7,unit='day'))
    bool_3w = (transactions.t_dat>transactions.t_dat.max()-3*pd.Timedelta(7,unit='day'))&(transactions.t_dat<=transactions.t_dat.max()-2*pd.Timedelta(7,unit='day'))
    bool_4w = (transactions.t_dat>transactions.t_dat.max()-4*pd.Timedelta(7,unit='day'))&(transactions.t_dat<=transactions.t_dat.max()-3*pd.Timedelta(7,unit='day'))
    bool_5w = (transactions.t_dat>transactions.t_dat.max()-5*pd.Timedelta(7,unit='day'))&(transactions.t_dat<=transactions.t_dat.max()-4*pd.Timedelta(7,unit='day'))
    bool_6w = (transactions.t_dat>transactions.t_dat.max()-6*pd.Timedelta(7,unit='day'))&(transactions.t_dat<=transactions.t_dat.max()-5*pd.Timedelta(7,unit='day'))
    bool_7w = (transactions.t_dat>transactions.t_dat.max()-7*pd.Timedelta(7,unit='day'))&(transactions.t_dat<=transactions.t_dat.max()-6*pd.Timedelta(7,unit='day'))
    
    transactions_7w = transactions[bool_7w]
    transactions_6w = transactions[bool_6w]
    transactions_5w = transactions[bool_5w]
    transactions_4w = transactions[bool_4w]
    transactions_3w = transactions[bool_3w]
    transactions_2w = transactions[bool_2w]
    transactions_1w = transactions[bool_1w]
    
    transactions_1w = transactions[bool_1w].to_pandas()
    transactions_2w = transactions[bool_2w].to_pandas()
    transactions_3w = transactions[bool_3w].to_pandas()
    transactions_4w = transactions[bool_4w].to_pandas()
    transactions_5w = transactions[bool_5w].to_pandas()
    transactions_6w = transactions[bool_6w].to_pandas()
    transactions_7w = transactions[bool_7w].to_pandas()
    
    purchase_dict_1w,dummy_list_1w = construct_candidate_dict(transactions_1w)
    purchase_dict_2w,_ = construct_candidate_dict(transactions_2w)
    purchase_dict_3w,_ = construct_candidate_dict(transactions_3w)
    purchase_dict_4w,_ = construct_candidate_dict(transactions_4w)
    purchase_dict_5w,_ = construct_candidate_dict(transactions_5w)
    purchase_dict_6w,_ = construct_candidate_dict(transactions_6w)
    purchase_dict_7w,_ = construct_candidate_dict(transactions_7w)
    
    pred_df = pd.DataFrame()
    pred_df['customer_id'] = test_df['customer_id'].unique()
    
    prediction_list = []
    week_list = []
    
    if add_random_samples:
        dummy_pred = transactions['article_id'].sample(frac=1.).to_arrow().to_pylist()[:50]
    else:
        dummy_pred = (transactions_1w['article_id'].value_counts()).index.tolist()[:12]
        dummy_pred += (transactions_2w['article_id'].value_counts()).index.tolist()[:12]
        dummy_pred += (transactions_3w['article_id'].value_counts()).index.tolist()[:12]
        dummy_pred += (transactions_4w['article_id'].value_counts()).index.tolist()[:12]
        dummy_pred += (transactions_5w['article_id'].value_counts()).index.tolist()[:12]
        dummy_pred += (transactions_6w['article_id'].value_counts()).index.tolist()[:12]
        dummy_pred += (transactions_7w['article_id'].value_counts()).index.tolist()[:12]
        #dummy_pred += (transactions['article_id'].value_counts()).index.to_arrow().to_pylist()[:12]
    
    for i, cust_id in enumerate(pred_df['customer_id']):
        s = []
        purchase_dicts = []
        if cust_id in purchase_dict_1w:
            l = sorted((purchase_dict_1w[cust_id]).items(), key=lambda x: x[1], reverse=True)
            l = [y[0] for y in l]
            s += l
            purchase_dicts.append(purchase_dict_1w[cust_id])
            
        if cust_id in purchase_dict_2w:
            l = sorted((purchase_dict_2w[cust_id]).items(), key=lambda x: x[1], reverse=True)
            l = [y[0] for y in l]
            s += l
            purchase_dicts.append(purchase_dict_2w[cust_id])

        if cust_id in purchase_dict_3w:
            l = sorted((purchase_dict_3w[cust_id]).items(), key=lambda x: x[1], reverse=True)
            l = [y[0] for y in l]
            s += l
            purchase_dicts.append(purchase_dict_3w[cust_id])

        if cust_id in purchase_dict_4w:
            l = sorted((purchase_dict_4w[cust_id]).items(), key=lambda x: x[1], reverse=True)
            l = [y[0] for y in l]
            s += l
            purchase_dicts.append(purchase_dict_4w[cust_id])

        if cust_id in purchase_dict_5w:
            l = sorted((purchase_dict_5w[cust_id]).items(), key=lambda x: x[1], reverse=True)
            l = [y[0] for y in l]
            s += l
            purchase_dicts.append(purchase_dict_5w[cust_id])

        if cust_id in purchase_dict_6w:
            l = sorted((purchase_dict_6w[cust_id]).items(), key=lambda x: x[1], reverse=True)
            l = [y[0] for y in l]
            s += l
            purchase_dicts.append(purchase_dict_6w[cust_id])

        if cust_id in purchase_dict_7w:
            l = sorted((purchase_dict_7w[cust_id]).items(), key=lambda x: x[1], reverse=True)
            l = [y[0] for y in l]
            s += l
            purchase_dicts.append(purchase_dict_7w[cust_id])

        s += dummy_pred
        
        s = list(set(s))
        prediction_list.append(s)
        
        week_list.append([get_week(purchase_dicts,aid) for aid in s])
        
    pred_df['article_id'] = prediction_list
    pred_df['week'] = week_list
    
    return pred_df
    
def construct_val_df(test_df,transactions,article_features,articles,customer_features,customers,how='outer',add_random_samples=False):
    pos_df = test_df.groupby('customer_id')['article_id'].unique().to_frame().reset_index().explode('article_id')
    pos_df['label'] = 1
    test_df = construct_candidate_df(test_df.to_pandas(),transactions,add_random_samples=add_random_samples).explode('article_id').reset_index(drop=True)
    test_df = test_df.merge(pos_df.to_pandas(),on=['article_id','customer_id'],how=how)
    test_df['label'].fillna(0,inplace=True)
    test_df = cudf.from_pandas(test_df)
    test_df = construct_feature_df(test_df,transactions,article_features,articles,customer_features,customers,general_features=['article_id','customer_id','label'])
    test_df = test_df.fillna(0.)
    test_df['article_id'] = test_df['article_id'].astype(int)
    test_df = test_df.sort_values(['customer_id','article_id']).reset_index(drop=True)
    return test_df

def construct_test_df(test_df,transactions,article_features,articles,customer_features,customers,how='outer',add_random_samples=False):
    test_df = construct_candidate_df(test_df.to_pandas(),transactions,add_random_samples=add_random_samples).explode(['article_id','week']).reset_index(drop=True)
    test_df = cudf.from_pandas(test_df)
    test_df = construct_feature_df(test_df,transactions,article_features,articles,customer_features,customers,general_features=['article_id','customer_id'])
    test_df = test_df.fillna(0.)
    test_df['article_id'] = test_df['article_id'].astype(int)
    test_df = test_df.sort_values(['customer_id','article_id']).reset_index(drop=True)
    return test_df

def construct_gt_df(test_transactions):
    gt_df = test_transactions.to_pandas().groupby('customer_id')['article_id'].agg(lambda x: x.tolist()).reset_index()
    gt_df.columns = ['customer_id','ground_truth']
    return gt_df
    
def construct_dataset(
        transactions,
        articles,customers,
        trn_start_time='2020-08-31',trn_end_time='2020-09-08',
        val_start_time='2020-09-08',val_end_time='2020-09-15',
        test_start_time='2020-09-08',test_end_time='2020-09-15',
        article_features=[
            'product_group_name', 'product_type_name', 
            'graphical_appearance_name', 'perceived_colour_value_name', 'colour_group_code', 
            'index_name', 'index_group_name', 
            'section_name', 'department_name',
        ],
        customer_features=[
            'FN','Active','club_member_status','age','fashion_news_frequency',
        ],
    ):
    
    trn_start_time = cudf.to_datetime(trn_start_time)
    trn_end_time = cudf.to_datetime(trn_end_time)
    val_start_time = cudf.to_datetime(val_start_time)
    val_end_time = cudf.to_datetime(val_end_time)
    test_start_time = cudf.to_datetime(test_start_time)
    test_end_time = cudf.to_datetime(test_end_time)
    
    trn_transactions = transactions[(transactions.t_dat > trn_start_time) & (transactions.t_dat <= trn_end_time)]
    val_transactions = transactions[(transactions.t_dat > val_start_time) & (transactions.t_dat <= val_end_time)]
    test_transactions = transactions[(transactions.t_dat > test_start_time) & (transactions.t_dat <= test_end_time)]
    gt_df = construct_gt_df(test_transactions)
    
    trn_df = construct_test_df(val_transactions,trn_transactions,article_features,articles,customer_features,customers,how='left')
    pos_label = val_transactions[['article_id','customer_id']]
    pos_label['label'] = 1
    trn_df = trn_df.merge(pos_label,on=['article_id','customer_id'],how='left')
    trn_df['label'].fillna(0.,inplace=True)
    
    trn_df = trn_df.merge(trn_df.groupby('customer_id').size().to_frame().rename(columns={0:'group_size'}),on='customer_id')
    test_df = construct_test_df(test_transactions,val_transactions,article_features,articles,customer_features,customers,how='left')
    
    return trn_df.reset_index(drop=True),test_df.reset_index(drop=True),gt_df.reset_index(drop=True)

In [126]:
class LightGBMCollator(object):
    def __init__(self,dfs,features,label,k=12):
        self.dfs = dfs
        self.features = features
        self.label = label
        self.k = k
        
    def x_y_group(self,data,features,target,only_x=False,verbose=False):
        group = data.groupby('customer_id').size().to_frame('size')['size']
        data = data.sort_values('customer_id').reset_index()
        return data[features],data[target],group
        
    def get_train_dataset_by_index(self,index):
        trn_x,trn_y,trn_grp = self.x_y_group(self.dfs[index][0],features,label)
        return trn_x,trn_y,trn_grp
        
    def get_ground_truth_dataset_by_index(self,index):
        return self.dfs[index][-1]
    
    def get_test_dataset_by_index(self,index):
        return self.dfs[index][1].to_pandas()
    
    def construct_eval_dataset(self):
        self.gt_df = self.get_ground_truth_dataset_by_index(0)
        self.test_df = self.get_test_dataset_by_index(0)
    
    def evaluate_score(self,pred_df,gt_df,k=12,verbose=True,group_name='customer_id'):
        from metric import mapk
        eval_df = gt_df.merge(pred_df,on=group_name,how='left')
        score = mapk(eval_df['ground_truth'].tolist(),eval_df['prediction'].tolist())
        if verbose: print('map@'+str(k),score)
        return score
    
    def feval(self,preds,eval_dataset):
        pred_df = pd.DataFrame()
        pred_df['customer_id'] = self.test_df['customer_id']
        pred_df['article_id'] = self.test_df['article_id']
        pred_df['prediction'] = preds
        pred_df = pred_df.groupby('customer_id') \
                        .apply(lambda x: x.sort_values('prediction',ascending=False)['article_id'].tolist()[:self.k]) \
                        .reset_index()
        pred_df.columns = ['customer_id','prediction']
        score = self.evaluate_score(pred_df,self.gt_df,group_name='customer_id',verbose=False)
        return 'MAP@'+str(self.k), score, True

In [127]:
def x_y_group(data,features,target,only_x=False,verbose=False):
    group = data.groupby('customer_id').size().to_frame('size')['size']
    data = data.sort_values('customer_id').reset_index()
    return data[features],data[target],group

def make_prediction(model,test_df,features,label,k=12,group_name='customer_id'):
    test_x = test_df[features]
    test_pred = model.predict(test_x)
    test_x[group_name] = test_df[group_name]
    test_x['article_id'] = test_df['article_id']
    test_x['prediction'] = test_pred
    pred_df = test_x.groupby(group_name) \
                    .apply(lambda x: x.sort_values('prediction',ascending=False)['article_id'].tolist()[:k]) \
                    .reset_index()
    pred_df.columns = [group_name,'prediction']
    return pred_df

def evaluate_score(pred_df,gt_df,k=12,verbose=True,group_name='customer_id'):
    from metric import mapk
    eval_df = gt_df.merge(pred_df,on=group_name,how='left')
    score = mapk(eval_df['ground_truth'].tolist(),eval_df['prediction'].tolist())
    if verbose: print('map@'+str(k),score)
    return score

****Cross validation****

In [None]:
%%time

label = 'label'
scores = []

for i,(t1,t2,t3,t4) in enumerate([
        ('2020-01-01','2020-08-01','2020-09-15','2020-09-22'),
        #('2020-01-01','2020-07-24','2020-09-07','2020-09-15'),
        #('2020-01-01','2020-07-17','2020-09-01','2020-09-07'),
    ]):
    
    trn_df,test_df,gt_df = construct_dataset(
        transactions,
        articles,customers,
        trn_start_time=t1,trn_end_time=t2,
        val_start_time=t2,val_end_time=t3,
        test_start_time=t3,test_end_time=t4,
    )
    
    features = [c for c in trn_df.columns if c not in ['article_id','customer_id','label','index','group_size']]
    collator = LightGBMCollator([[trn_df,test_df,gt_df]],features,label)
    trn_x,trn_y,trn_grp = collator.get_train_dataset_by_index(0)
    collator.construct_eval_dataset()

    trn_dataset = lgb.Dataset(trn_x.to_pandas(),trn_y.to_pandas(),group=trn_grp.to_pandas())
    val_dataset = lgb.Dataset(collator.test_df[collator.features])
    param = dict(
        objective='lambdarank',
        metric='map@12',
        keep_training_booster=True,
        early_stopping_round=5,
        seed=0,
        learning_rate=0.1,
    )
    num_round = 1000
    bst = lgb.train(
        param,
        trn_dataset, 
        num_round,
        feval=collator.feval,
        valid_sets=[val_dataset],
    )
    
    pred_df = make_prediction(bst,test_df.to_pandas(),features,label)
    score = evaluate_score(pred_df,gt_df)
    scores.append(score)
print('score: ',np.mean(scores))

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4652
[LightGBM] [Info] Number of data points in the train set: 19322727, number of used features: 20
[1]	valid_0's MAP@12: 0.019677
Training until validation scores don't improve for 5 rounds
[2]	valid_0's MAP@12: 0.0207268
[3]	valid_0's MAP@12: 0.0210634
[4]	valid_0's MAP@12: 0.0209037
[5]	valid_0's MAP@12: 0.0212307
[6]	valid_0's MAP@12: 0.0211498
[7]	valid_0's MAP@12: 0.0214056
[8]	valid_0's MAP@12: 0.021484
[9]	valid_0's MAP@12: 0.0214266
[10]	valid_0's MAP@12: 0.0214021
[11]	valid_0's MAP@12: 0.0215277
[12]	valid_0's MAP@12: 0.021463
[13]	valid_0's MAP@12: 0.0216083
[14]	valid_0's MAP@12: 0.0217508
[15]	valid_0's MAP@12: 0.0218504
[16]	valid_0's MAP@12: 0.0216948
[17]	valid_0's MAP@12: 0.0214625


****Training****

In [13]:
%%time
dfs = []
for i,(t1,t2,t3,t4) in enumerate([
        ('2020-01-01','2020-08-01','2020-09-15','2020-09-22'),
        ('2020-01-01','2020-08-01','2020-09-15','2020-09-22'),
    ]):
    trn_tmp,test_tmp,gt_tmp = construct_dataset(
        transactions,
        articles,customers,
        trn_start_time=t1,trn_end_time=t2,
        val_start_time=t2,val_end_time=t3,
        test_start_time=t3,test_end_time=t4,
    )
    dfs.append((trn_tmp,test_tmp,gt_tmp))

CPU times: user 13.6 s, sys: 6.05 s, total: 19.7 s
Wall time: 19.8 s


In [16]:
%%time
features = [c for c in dfs[0][0].columns if c not in ['article_id','customer_id','label','index','group_size']]
label = 'label'

collator = LightGBMCollator(dfs,features,label)
trn_x,trn_y,trn_grp = collator.get_train_dataset_by_index(0)
collator.construct_eval_dataset()

trn_dataset = lgb.Dataset(trn_x.to_pandas(),trn_y.to_pandas(),group=trn_grp.to_pandas())
val_dataset = lgb.Dataset(collator.test_df[collator.features])
param = dict(
    objective='lambdarank',
    metric='map@12',
    keep_training_booster=True,
    early_stopping_round=5,
    seed=0,
)
num_round = 1000
bst = lgb.train(
    param,
    trn_dataset, 
    num_round,
    feval=collator.feval,
    valid_sets=[val_dataset],
)



You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4718
[LightGBM] [Info] Number of data points in the train set: 7832995, number of used features: 20
[1]	valid_0's MAP@12: 0.0202667
Training until validation scores don't improve for 5 rounds
[2]	valid_0's MAP@12: 0.0214225
[3]	valid_0's MAP@12: 0.0214367
[4]	valid_0's MAP@12: 0.0215855
[5]	valid_0's MAP@12: 0.0218864
[6]	valid_0's MAP@12: 0.0217541
[7]	valid_0's MAP@12: 0.0214959
[8]	valid_0's MAP@12: 0.0218041
[9]	valid_0's MAP@12: 0.0219306
[10]	valid_0's MAP@12: 0.0218384
[11]	valid_0's MAP@12: 0.0221501
[12]	valid_0's MAP@12: 0.0218704
[13]	valid_0's MAP@12: 0.0218787
[14]	valid_0's MAP@12: 0.0219306
[15]	valid_0's MAP@12: 0.0219266
[16]	valid_0's MAP@12: 0.0218623
Early stopping, best iteration is:
[11]	valid_0's MAP@12: 0.0221501
CPU times: user 4min 9s, sys: 3.79 s, total: 4min 13s
Wall time: 3min 37s


In [11]:
import matplotlib.pyplot as plt

{k:v for k,v in zip(bst.feature_name(),bst.feature_importance('gain'))}

{'week': 0.0,
 'product_group_name_prob': 0.0,
 'product_type_name_prob': 2859.975051879883,
 'graphical_appearance_name_prob': 690.2089996337891,
 'perceived_colour_value_name_prob': 168.3333969116211,
 'colour_group_code_prob': 774.8486862182617,
 'index_name_prob': 2202.320114135742,
 'index_group_name_prob': 685.1419982910156,
 'section_name_prob': 1906.414535522461,
 'department_name_prob': 9431.715065002441,
 'FN_prob': 0.0,
 'Active_prob': 562.3703994750977,
 'club_member_status_prob': 8235.779876708984,
 'age_prob': 4028.970016479492,
 'fashion_news_frequency_prob': 0.0,
 'past_purchase_prob': 12402.088165283203,
 'total_purchase': 13010.16780090332,
 'time_elapsed_last_purchase': 14486.576126098633,
 'number_of_purchase': 1768.5429992675781,
 'time_elapsed_first_release': 2697.0369720458984}

****Local CV****

In [16]:
%%time
idx = 0
pred_df = make_prediction(bst,dfs[idx][1].to_pandas(),features,label,k=12)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_x[group_name] = test_df[group_name]


CPU times: user 14.8 s, sys: 574 ms, total: 15.4 s
Wall time: 14.6 s


In [17]:
evaluate_score(
    pred_df,
    dfs[idx][-1],
)

map@12 0.0223621690029079


0.0223621690029079

****Submission****

In [18]:
%%time
article_features=[
    'product_group_name', 'product_type_name', 
    'graphical_appearance_name', 'perceived_colour_value_name', 'colour_group_code', 
    'index_name', 'index_group_name', 
    'section_name', 'department_name',
]
customer_features=[
    'FN','Active','club_member_status','age','fashion_news_frequency',
    ]
submission_df = cudf.read_csv('storage/sample_submission.csv')
submission_df = construct_test_df(
    submission_df[['customer_id']],
    transactions[(transactions.t_dat > cudf.to_datetime('2020-09-07')) & (transactions.t_dat <= cudf.to_datetime('2020-09-22'))],
    article_features,articles,customer_features,customers,
    how='left',
)
submission_df = make_prediction(bst,submission_df.to_pandas(),features,label,k=12)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_x[group_name] = test_df[group_name]


CPU times: user 5min 8s, sys: 19.7 s, total: 5min 28s
Wall time: 5min 17s


In [19]:
submission_df['prediction'] = submission_df['prediction'].apply(lambda x: ' '.join(['0'+str(i) for i in x]))

In [20]:
os.makedirs(base_dir,exist_ok=True)
submission_df.to_csv(os.path.join(base_dir,'submission.csv'),index=False)

In [21]:
submission_df

Unnamed: 0,customer_id,prediction
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0865799006 0673677002 0850917001 0751471001 09...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0865799006 0751471043 0924243001 0923758001 09...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0794321007 0918292001 0751471043 0865799006 07...
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,0924243002 0863646001 0924243001 0923758001 07...
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,0924243002 0863646001 0924243001 0923758001 07...
...,...,...
1371975,ffffbbf78b6eaac697a8a5dfbfd2bfa8113ee5b403e474...,0865799006 0751471043 0924243001 0923758001 09...
1371976,ffffcd5046a6143d29a04fb8c424ce494a76e5cdf4fab5...,0865799006 0751471043 0924243001 0923758001 09...
1371977,ffffcf35913a0bee60e8741cb2b4e78b8a98ee5ff2e6a1...,0762846027 0884081001 0673677002 0689365050 08...
1371978,ffffd7744cebcf3aca44ae7049d2a94b87074c3d4ffe38...,0865799006 0706016001 0850917001 0918522001 09...
