In [2]:
import cudf
import lightgbm as lgb
import os
import numpy as np
import pandas as pd
from tqdm import tqdm

pd.set_option('display.max_columns', None)

In [3]:
%%time 
transactions = cudf.read_csv('storage/transactions_train.csv')
articles = cudf.read_csv('storage/articles.csv')
customers = cudf.read_csv('storage/customers.csv')

customers['FN'].fillna(0.,inplace=True)
customers['Active'].fillna(0.,inplace=True)
customers['club_member_status'].fillna('None',inplace=True)
customers['age'] = customers['age'] / 10
customers['age'] = customers['age'].astype(int)
customers['fashion_news_frequency'] = customers['fashion_news_frequency'].str.lower().fillna('none')

transactions['t_dat'] = cudf.to_datetime(transactions['t_dat'])

CPU times: user 1.2 s, sys: 1.79 s, total: 2.99 s
Wall time: 9.21 s


In [4]:
def past_purchase_feature(df,transactions):
    transactions['count'] = 1
    
    time_elapsed_last_purchase = transactions['t_dat'].max()-transactions[['customer_id','article_id','t_dat']].groupby(['customer_id','article_id'])['t_dat'].max()
    time_elapsed_last_purchase = time_elapsed_last_purchase.dt.days
    df = df.merge(time_elapsed_last_purchase,on=['article_id','customer_id'],how='left')
    df = df.rename(columns={'t_dat':'time_elapsed_last_purchase'})
    df['time_elapsed_last_purchase'].fillna(1e6,inplace=True)
    
    time_elapsed_first_release = transactions[['customer_id','article_id','t_dat']].groupby(['customer_id','article_id'])['t_dat'].min()-cudf.to_datetime('2018-09-01')
    time_elapsed_first_release = time_elapsed_first_release.dt.days
    df = df.merge(time_elapsed_first_release,on=['article_id','customer_id'],how='left')
    df = df.rename(columns={'t_dat':'time_elapsed_first_release'})
    df['time_elapsed_first_release'].fillna(1e6,inplace=True)
    
    past_purchase_prob = transactions[['customer_id','article_id','count']].groupby(['customer_id','article_id'])['count'].count().reset_index()
    norm = transactions[['customer_id','article_id']].groupby('customer_id').count().reset_index().rename(columns={'article_id':'norm'})
    past_purchase_prob = past_purchase_prob.merge(norm,on='customer_id')
    past_purchase_prob['count'] = past_purchase_prob['count'] / past_purchase_prob['norm']
    past_purchase_prob.drop(columns=['norm'],inplace=True)
    df = df.merge(past_purchase_prob,on=['article_id','customer_id'],how='left')
    df = df.rename(columns={'count':'past_purchase_prob'})
    df['past_purchase_prob'].fillna(0.,inplace=True)
    
    total_purchase = transactions[['article_id','count']].groupby('article_id')['count'].count().reset_index().rename(columns={'count':'total_purchase'})
    norm = transactions['count'].sum()
    total_purchase['total_purchase'] = total_purchase['total_purchase'] / norm
    df = df.merge(total_purchase,on='article_id',how='left')
    df['total_purchase'].fillna(0.,inplace=True)
    
    number_of_purchase = transactions[['customer_id','count']].groupby('customer_id')['count'].count().reset_index().rename(columns={'count':'number_of_purchase'})
    df = df.merge(number_of_purchase,on='customer_id',how='left')
    df['number_of_purchase'].fillna(0.,inplace=True)
    
    repeated_purchase = transactions[['customer_id','article_id','count']].groupby(['customer_id','article_id'])['count'].count().reset_index().rename(columns={'count':'repeated_purchase'})
    df = df.merge(repeated_purchase,on=['customer_id','article_id'],how='left')
    
    min_dat_purchase = transactions.groupby(['article_id'])['t_dat'].min()
    max_dat_purchase = transactions.groupby(['article_id'])['t_dat'].max()
    sale_duration = (max_dat_purchase - min_dat_purchase).to_frame().reset_index().rename(columns={'t_dat':'duration'})
    sale_duration['duration'] = sale_duration['duration'].dt.days
    sale_count = transactions.groupby(['article_id'])['t_dat'].count().to_frame().reset_index().rename(columns={'t_dat':'count'})
    sale_rate = sale_duration.merge(sale_count,on='article_id')
    sale_rate = sale_rate.loc[sale_rate['duration']!=0]
    sale_rate['sale_rate'] = sale_rate['count'] / sale_rate['duration']
    df = df.merge(sale_rate[['article_id','sale_rate']],on='article_id',how='left')

    return df
    
def article_feature_prob_vector(df,transactions,articles,article_features,postfix='_prob',customer_group_name='customer_id'):
    transactions['count'] = 1
    if customer_group_name != 'customer_id':
        df = df.merge(customers[['customer_id',customer_group_name]],on='customer_id',how='left')
        transactions = transactions.merge(customers[['customer_id',customer_group_name]],on='customer_id',how='left')
    for article_feature in article_features:
        transactions = transactions.merge(articles[['article_id',article_feature]],on='article_id',how='left')
        norm = transactions.groupby([customer_group_name])['count'].count().reset_index()
        norm.rename(columns={'count':'norm'},inplace=True)
        count = transactions.groupby([customer_group_name,article_feature])['count'].count().reset_index()
        count = count.merge(norm,on=customer_group_name)
        count['count'] = count['count'] / count['norm']
        count = count.rename(columns={'count':article_feature+postfix})
        count = count[[customer_group_name,article_feature,article_feature+postfix]]
        del(norm)
        df = df.merge(articles[['article_id',article_feature]],on='article_id',how='left')
        df = df.merge(count,on=[customer_group_name,article_feature],how='left')
    return df

def customer_feature_prob_vector(df,transactions,customers,customer_features,postfix='_prob'):
    transactions['count'] = 1
    for customer_feature in customer_features:
        transactions = transactions.merge(customers[['customer_id',customer_feature]],on='customer_id',how='left')
        norm = transactions.groupby(['article_id'])['count'].count().reset_index()
        norm.rename(columns={'count':'norm'},inplace=True)
        count = transactions.groupby(['article_id',customer_feature])['count'].count().reset_index()
        count = count.merge(norm,on='article_id')
        count['count'] = count['count'] / count['norm']
        count = count.rename(columns={'count':customer_feature+postfix})
        count = count[['article_id',customer_feature,customer_feature+postfix]]
        del(norm)
        df = df.merge(customers[['customer_id',customer_feature]],on='customer_id',how='left')
        df = df.merge(count,on=['article_id',customer_feature],how='left')
    return df

def construct_feature_df(
        df,transactions,
        article_features,
        articles,
        customer_features,
        customers,
        general_features=['article_id','customer_id'],
    ):
    df = article_feature_prob_vector(df,transactions,articles,article_features)
    df = customer_feature_prob_vector(df,transactions,customers,customer_features)
    df = past_purchase_feature(df,transactions)
    df = df[
            general_features+[f for f in df.columns if '_prob' in f] + 
            ['total_purchase','time_elapsed_last_purchase','past_purchase_prob','number_of_purchase','time_elapsed_first_release','repeated_purchase']
        ]
    return df

def construct_candidate_dict(transactions_3w):
    purchase_dict_3w = {}
    for i,x in enumerate(zip(transactions_3w['customer_id'], transactions_3w['article_id'])):
        cust_id, art_id = x
        if cust_id not in purchase_dict_3w:
            purchase_dict_3w[cust_id] = {}
        if art_id not in purchase_dict_3w[cust_id]:
            purchase_dict_3w[cust_id][art_id] = 0
        purchase_dict_3w[cust_id][art_id] += 1
    return purchase_dict_3w

def construct_candidate_df(
        test_df,transactions,
        nweek=8,
        n_popular_item=50,
        n_total_item=None,
    ):
    
    recent_transactions = {}
    purchase_dict = {}
    for i in range(1,nweek+1):
        recent_transactions[i] = transactions[(transactions.t_dat>transactions.t_dat.max()-i*pd.Timedelta(7,unit='day'))&(transactions.t_dat<=transactions.t_dat.max()-(i-1)*pd.Timedelta(7,unit='day'))].to_pandas()
        purchase_dict[i] = construct_candidate_dict(recent_transactions[i])
    
    if 1 in recent_transactions:
        most_popular_items_1w_all = list((recent_transactions[1]['article_id'].value_counts()).index)[:n_popular_item]
    else:
        most_popular_items_1w_all = list(transactions['article_id'].value_counts().index.to_arrow().to_pylist())[:n_popular_item]
    
    pred_df = pd.DataFrame()
    pred_df['customer_id'] = test_df['customer_id'].unique()
    
    prediction_list = []
    
    for i, cust_id in enumerate(pred_df['customer_id']):
        s = []
        total_purchase_dict = {}
        
        for i,purchase_dict_week in purchase_dict.items():
            if cust_id in purchase_dict_week:
                l = sorted((purchase_dict_week[cust_id]).items(), key=lambda x: x[1], reverse=True)
                l = [y[0] for y in l]
                for aid in l:
                    if aid not in total_purchase_dict:
                        total_purchase_dict[aid] = 1
                    else:
                        total_purchase_dict[aid] += 1

        for aid in most_popular_items_1w_all[:n_popular_item]:
            if aid not in total_purchase_dict:
                total_purchase_dict[aid] = 1
            else:
                total_purchase_dict[aid] += 1

        if n_total_item is not None:
            total_purchase_dict = {k: v for k, v in sorted(total_purchase_dict.items(), key=lambda item: item[1], reverse=True)}
            s = list(total_purchase_dict.keys())[:n_total_item]
        else:
            s = list(total_purchase_dict.keys())
        
        prediction_list.append(s)
        
    pred_df['article_id'] = prediction_list
    
    return pred_df

def construct_test_df(test_df,transactions,article_features,articles,customer_features,customers,how='outer',n_popular_item=10):
    test_df = construct_candidate_df(test_df,transactions,n_popular_item=n_popular_item).explode(['article_id']).reset_index(drop=True)
    test_df = cudf.from_pandas(test_df)
    test_df = construct_feature_df(test_df,transactions,article_features,articles,customer_features,customers,general_features=['article_id','customer_id'])
    test_df = test_df.fillna(0.)
    test_df['article_id'] = test_df['article_id'].astype(int)
    test_df = test_df.sort_values(['customer_id','article_id']).reset_index(drop=True)
    return test_df

def construct_recent_purchase_df(
        test_df,transactions,
        nweek=3,feature_name='recent_purchase',
    ):
    
    recent_transactions = {}
    purchase_dict = {}
    for i in range(1,nweek+1):
        recent_transactions[i] = transactions[(transactions.t_dat>transactions.t_dat.max()-i*pd.Timedelta(7,unit='day'))&(transactions.t_dat<=transactions.t_dat.max()-(i-1)*pd.Timedelta(7,unit='day'))]
        purchase_dict[i] = construct_candidate_dict(recent_transactions[i])

    most_popular_items_1w = list((recent_transactions[1]['article_id'].value_counts()).index)[:12]
        
    pred_df = test_df[['customer_id']]
    article_id_list,recent_purchase_score_list = [],[]
    
    for i, cust_id in enumerate(
        test_df['customer_id'].values.reshape((-1,))
    ):
        s = []
        total_purchase_dict = {}
        
        for i,purchase_dict_week in purchase_dict.items():
            if cust_id in purchase_dict_week:
                l = sorted((purchase_dict_week[cust_id]).items(), key=lambda x: x[1], reverse=True)
                l = [y[0] for y in l]
                for aid in l:
                    if aid not in total_purchase_dict:
                        total_purchase_dict[aid] = 1
                    else:
                        total_purchase_dict[aid] += 1
                        
#         for aid in most_popular_items_1w[:12]:
#             if aid not in total_purchase_dict:
#                 total_purchase_dict[aid] = 1
#             else:
#                 total_purchase_dict[aid] += 1

        total_purchase_dict = {k: v for k, v in sorted(total_purchase_dict.items(), key=lambda item: item[1], reverse=True)}
        article_ids = list(total_purchase_dict.keys())
        recent_purchase_scores = list(total_purchase_dict.values())
        
        article_id_list.append(article_ids)
        recent_purchase_score_list.append(recent_purchase_scores)
        
    pred_df['recent_purchase_article_id'] = article_id_list
    pred_df['recent_purchase_score'] = recent_purchase_score_list
    
    return pred_df

def construct_time_period_purchase_df(
        test_df,recent_transactions,
        feature_name,
        nitem=50,
    ):
    
    purchase_dict = construct_candidate_dict(recent_transactions)

    pred_df = test_df[['customer_id']]
    article_id_list,score_list = [],[]
    
    for i, cust_id in enumerate(
        test_df['customer_id'].values.reshape((-1,))
    ):
        s = []
        total_purchase_dict = {}

        if cust_id in purchase_dict:
            l = sorted((purchase_dict[cust_id]).items(), key=lambda x: x[1], reverse=True)
            l = [y[0] for y in l]
            for aid in l:
                if aid not in total_purchase_dict:
                    total_purchase_dict[aid] = 1
                else:
                    total_purchase_dict[aid] += 1

        total_purchase_dict = {k: v for k, v in sorted(total_purchase_dict.items(), key=lambda item: item[1], reverse=True)}
        article_ids = list(total_purchase_dict.keys())[:nitem]
        scores = list(total_purchase_dict.values())[:nitem]
        
        article_id_list.append(article_ids)
        score_list.append(scores)
        
    pred_df[feature_name+'_article_id'] = article_id_list
    pred_df[feature_name+'_score'] = score_list
    
    return pred_df

def construct_popular_purchase_df(
        test_df,transactions,
        feature_name='popular_purchase',
        nitem=200,
    ):
        
    most_popular_items = list((transactions['article_id'].value_counts()).index)[:nitem] 
    most_popular_scores = list((transactions['article_id'].value_counts()))[:nitem]

    pred_df = test_df[['customer_id']]
        
    pred_df[feature_name+'_article_id'] = [most_popular_items]*len(pred_df)
    pred_df[feature_name+'_score'] = [most_popular_scores]*len(pred_df)
    
    return pred_df

def construct_gt_df(test_transactions):
    gt_df = test_transactions.to_pandas().groupby('customer_id')['article_id'].agg(lambda x: x.tolist()).reset_index()
    gt_df.columns = ['customer_id','ground_truth']
    return gt_df

def make_prediction(model,test_df,features,label,k=100,group_name='customer_id'):
    test_x = test_df[features]
    test_pred = model.predict(test_x)
    test_x[group_name] = test_df[group_name]
    test_x['past_purchase_article_id'] = test_df['article_id']
    test_x['past_purchase_score'] = test_pred
    pred_df = test_x.groupby('customer_id') \
                .apply(lambda x: x.sort_values('past_purchase_score',ascending=False)['past_purchase_article_id'].tolist()) \
                .reset_index()
    pred_df.columns = ['customer_id','past_purchase_article_id']
    past_purchase_score = test_x.groupby('customer_id') \
            .apply(lambda x: x.sort_values('past_purchase_score',ascending=False)['past_purchase_score'].tolist()).reset_index()
    past_purchase_score.columns = ['customer_id','past_purchase_score']
    pred_df = pred_df.merge(past_purchase_score,on='customer_id')
    return pred_df[['customer_id','past_purchase_article_id','past_purchase_score']]
 

In [5]:
def construct_rerank_df(
    customer_df,past_transactions,
    articles,customers,
    past_purchase_model_path,
    label='label',
    gt_df=None,
    verbose=True,
    article_features=[
        'product_type_name','product_group_name',
        'graphical_appearance_name','colour_group_name',
        'perceived_colour_value_name','perceived_colour_master_name',
        'department_name', 'index_name',
        'index_group_name','section_name',
        'garment_group_name',
    ],
    customer_features=[
        'FN','Active','club_member_status','age','fashion_news_frequency',
    ],
):
    past_purchase_df = construct_test_df(
        customer_df,past_transactions,
        article_features,articles,
        customer_features,customers,
        how='outer',n_popular_item=90
    )    
    bst = lgb.Booster(model_file=past_purchase_model_path)
    features = [c for c in past_purchase_df.columns if c not in ['article_id','customer_id','label','index','group_size']]
    past_purchase_df = make_prediction(bst,past_purchase_df.to_pandas(),features,label)
    rerank_df = past_purchase_df.explode(['past_purchase_article_id','past_purchase_score']).rename(columns={'past_purchase_article_id':'article_id'})

    for iweek in range(1,4):
        feature_name = 'recent_purchase_{:d}w'.format(iweek)
        recent_transactions = past_transactions[
            (past_transactions.t_dat>past_transactions.t_dat.max()-iweek*pd.Timedelta(7,unit='day'))&
            (past_transactions.t_dat<=past_transactions.t_dat.max()-(iweek-1)*pd.Timedelta(7,unit='day'))
        ]
        recent_purchase_df = construct_time_period_purchase_df(
           customer_df,recent_transactions.to_pandas(),feature_name,
        )
        recent_purchase_df = recent_purchase_df.explode([feature_name+'_article_id',feature_name+'_score']).rename(columns={feature_name+'_article_id':'article_id'})
        rerank_df = rerank_df.merge(recent_purchase_df,on=['customer_id','article_id'],how='outer')
    
    recent_purchase_df = construct_recent_purchase_df(
       customer_df,past_transactions.to_pandas(),
    )
    recent_purchase_df = recent_purchase_df.explode(['recent_purchase_article_id','recent_purchase_score']).rename(columns={'recent_purchase_article_id':'article_id'})
    rerank_df = rerank_df.merge(recent_purchase_df,on=['customer_id','article_id'],how='outer')
    
    for iweek in range(1,4):
        feature_name = 'popular_purchase_{:d}w'.format(iweek)
        recent_transactions = past_transactions[
            (past_transactions.t_dat>past_transactions.t_dat.max()-iweek*pd.Timedelta(7,unit='day'))&
            (past_transactions.t_dat<=past_transactions.t_dat.max()-(iweek-1)*pd.Timedelta(7,unit='day'))
        ]
        popular_purchase_df = construct_popular_purchase_df(
            customer_df,recent_transactions.to_pandas(),feature_name,
        )
        popular_purchase_df = popular_purchase_df.explode([feature_name+'_article_id',feature_name+'_score']).rename(columns={feature_name+'_article_id':'article_id'})
        rerank_df = rerank_df.merge(popular_purchase_df,on=['customer_id','article_id'],how='outer')
        
    feature_name = 'popular_purchase_123w'
    recent_transactions = past_transactions[
        (past_transactions.t_dat>past_transactions.t_dat.max()-4*pd.Timedelta(7,unit='day'))&
        (past_transactions.t_dat<=past_transactions.t_dat.max()-pd.Timedelta(7,unit='day'))
    ]
    popular_purchase_df = construct_popular_purchase_df(
        customer_df,recent_transactions.to_pandas(),feature_name,
    )
    popular_purchase_df = popular_purchase_df.explode([feature_name+'_article_id',feature_name+'_score']).rename(columns={feature_name+'_article_id':'article_id'})
    rerank_df = rerank_df.merge(popular_purchase_df,on=['customer_id','article_id'],how='outer')
    
    if gt_df is not None:
        tmp_gt_df = gt_df.copy()
        tmp_gt_df['label'] = 1
        tmp_gt_df = tmp_gt_df.rename(columns={'ground_truth':'article_id'})
        rerank_df = rerank_df.merge(tmp_gt_df[['customer_id','article_id','label']].explode('article_id'),on=['customer_id','article_id'],how='outer')
    
    rerank_df['past_purchase_score'].fillna(rerank_df['past_purchase_score'].min()-1.,inplace=True)
    rerank_df.fillna(0.,inplace=True)
    
    return rerank_df

def evaluate_score(pred_df,gt_df,k=12,verbose=True,group_name='customer_id',pred_name='prediction'):
    from metric import mapk
    eval_df = gt_df.merge(pred_df,on=group_name,how='left')
    score = mapk(eval_df['ground_truth'].tolist(),eval_df[pred_name].tolist())
    if verbose: print('map@'+str(k),score)
    return score

def construct_sub_df(test_df,preds,k=12):
    pred_df = pd.DataFrame()
    pred_df['customer_id'] = test_df['customer_id']
    pred_df['article_id'] = test_df['article_id']
    pred_df['prediction'] = preds
    pred_df = pred_df.groupby('customer_id') \
                    .apply(lambda x: x.sort_values(['customer_id','prediction'],ascending=False)['article_id'].tolist()[:k]) \
                    .reset_index()
    pred_df.columns = ['customer_id','prediction']
    return pred_df

def feval(preds,test_df,test_gt_df,k=12):
    pred_df = pd.DataFrame()
    pred_df['customer_id'] = test_df['customer_id']
    pred_df['article_id'] = test_df['article_id']
    pred_df['prediction'] = preds
    pred_df = pred_df.groupby('customer_id') \
                    .apply(lambda x: x.sort_values(['customer_id','prediction'],ascending=False)['article_id'].tolist()[:k]) \
                    .reset_index()
    pred_df.columns = ['customer_id','prediction']
    score = evaluate_score(pred_df,test_gt_df,group_name='customer_id',verbose=False)
    return 'MAP@'+str(k), score, True

In [6]:
trn_start_time = '2020-09-07'
trn_end_time = '2020-09-15'
test_start_time = '2020-09-15'
test_end_time = '2020-09-22'

trn_start_time = cudf.to_datetime(trn_start_time)
trn_end_time = cudf.to_datetime(trn_end_time)
test_start_time = cudf.to_datetime(test_start_time)
test_end_time = cudf.to_datetime(test_end_time)

past_transactions = transactions[(transactions.t_dat > cudf.to_datetime('2020-01-01')) & (transactions.t_dat <= trn_end_time)]
trn_transactions = transactions[(transactions.t_dat > trn_start_time) & (transactions.t_dat <= trn_end_time)]
test_transactions = transactions[(transactions.t_dat > test_start_time) & (transactions.t_dat <= test_end_time)]

trn_gt_df = construct_gt_df(trn_transactions)
test_gt_df = construct_gt_df(test_transactions)

In [7]:
%%time
trn_df = construct_rerank_df(
    trn_gt_df[['customer_id']],past_transactions,
    articles,customers,
    'storage/output/220325_lightgbm_training/220325_dataset_2020-06-01_2020-08-01_2020-09-15_2020-09-22_score:0.0230037_objective:lambdarank_metric:map@12_boosting:dart_seed:0_learning_rate:0.03_num_threads:8_num_iterations:15_early_stopping_round:None.bin',
    label='label',
    gt_df=trn_gt_df,
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_x[group_name] = test_df[group_name]


CPU times: user 3min 5s, sys: 35.3 s, total: 3min 40s
Wall time: 3min 41s


In [8]:
%%time
test_df = construct_rerank_df(
    test_gt_df[['customer_id']],past_transactions,
    articles,customers,
    'storage/output/220325_lightgbm_training/220325_dataset_2020-06-01_2020-08-01_2020-09-15_2020-09-22_score:0.0230037_objective:lambdarank_metric:map@12_boosting:dart_seed:0_learning_rate:0.03_num_threads:8_num_iterations:15_early_stopping_round:None.bin',
    label='label',
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_x[group_name] = test_df[group_name]


CPU times: user 2min 32s, sys: 30.8 s, total: 3min 3s
Wall time: 2min 57s


****Reranking with Logistic Regression****

In [9]:
from pprint import pprint
pprint(trn_df.columns.tolist())

['customer_id',
 'article_id',
 'past_purchase_score',
 'recent_purchase_1w_score',
 'recent_purchase_2w_score',
 'recent_purchase_3w_score',
 'recent_purchase_score',
 'popular_purchase_1w_score',
 'popular_purchase_2w_score',
 'popular_purchase_3w_score',
 'popular_purchase_123w_score',
 'label']


In [10]:
features = [
    'past_purchase_score',
    'recent_purchase_1w_score',
    'recent_purchase_2w_score',
    'recent_purchase_3w_score',
    #'recent_purchase_score',
    'popular_purchase_1w_score',
     'popular_purchase_2w_score',
     'popular_purchase_3w_score',
#     'popular_purchase_4w_score',
#     'popular_purchase_5w_score',
#     'popular_purchase_6w_score',
#     'popular_purchase_7w_score',
#     'popular_purchase_8w_score',
#     'popular_purchase_9w_score',
#     'popular_purchase_10w_score',
#     'popular_purchase_11w_score',
#     'popular_purchase_123w_score',
]
target = 'label'
print(' '.join(features))

past_purchase_score recent_purchase_1w_score recent_purchase_2w_score recent_purchase_3w_score popular_purchase_1w_score popular_purchase_2w_score popular_purchase_3w_score


In [11]:
trn_df[features]

Unnamed: 0,past_purchase_score,recent_purchase_1w_score,recent_purchase_2w_score,recent_purchase_3w_score,popular_purchase_1w_score,popular_purchase_2w_score,popular_purchase_3w_score
0,0.232283,1.0,0.0,0.0,0.0,0.0,0.0
1,0.035045,0.0,0.0,0.0,266.0,377.0,407.0
2,0.029059,0.0,0.0,0.0,607.0,724.0,846.0
3,0.023665,0.0,0.0,0.0,453.0,683.0,701.0
4,0.020612,0.0,0.0,0.0,609.0,529.0,297.0
...,...,...,...,...,...,...,...
30374491,-1.386569,0.0,0.0,0.0,0.0,0.0,0.0
30374492,-1.386569,0.0,0.0,0.0,0.0,0.0,0.0
30374493,-1.386569,0.0,0.0,0.0,0.0,0.0,0.0
30374494,-1.386569,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
from sklearn.linear_model import LogisticRegression

In [15]:
%%time
clf = LogisticRegression(max_iter=1000,random_state=1).fit(trn_df[features],trn_df[target])
print(clf.coef_, clf.intercept_)

[[ 1.10032804e+01  1.93351856e+01  1.07022535e+01 -1.16369597e+00
   1.91058088e-03 -5.22323259e-04 -8.89565558e-04]] [-12.10194635]
CPU times: user 13min 6s, sys: 4min 51s, total: 17min 57s
Wall time: 11min 5s


In [16]:
%%time
preds = clf.predict_proba(test_df[features])
_,score,_ = feval(preds[:,1],test_df,test_gt_df,k=12)
print(score)

0.02409009606820875
CPU times: user 52.9 s, sys: 3.08 s, total: 56 s
Wall time: 55.5 s


****Construct ground truth vs prediction dataframe for model evalution****

In [17]:
eval_df = construct_sub_df(test_df,preds[:,1],k=12)

In [18]:
eval_df = eval_df.merge(test_gt_df,on='customer_id')

In [19]:
eval_df['gt_pred_overlap'] = eval_df.apply(lambda x: set(x['prediction']).intersection(x['ground_truth']),axis=1)

In [20]:
eval_df['n_gt_pred_overlap'] = eval_df['gt_pred_overlap'].apply(len)

In [22]:
eval_df.to_csv('eval_df.csv',index=False)

****Cross validation****

In [33]:
%%time
features = [
    'past_purchase_score',
    #'recent_purchase_1w_score',
    #'recent_purchase_2w_score',
    #'recent_purchase_3w_score',
    'recent_purchase_score',
    #'popular_purchase_1w_score',
    'popular_purchase_2w_score',
    'popular_purchase_3w_score',
    #'popular_purchase_123w_score',
]
target = 'label'
print(' '.join(features))

for i,(trn_start_time,trn_end_time,test_start_time,test_end_time,model_path) in enumerate([
        (
            '2020-08-24','2020-08-31','2020-08-31','2020-09-07',
            'storage/output/220325_lightgbm_training/220325_dataset_2020-05-17_2020-07-17_2020-09-01_2020-09-07_score:0.022701_objective:lambdarank_metric:map@12_boosting:dart_seed:0_learning_rate:0.03_num_threads:8_num_iterations:15_early_stopping_round:None.bin',
        ),
        (
            '2020-08-31','2020-09-07','2020-09-07','2020-09-15',
            'storage/output/220325_lightgbm_training/220325_dataset_2020-05-24_2020-07-24_2020-09-07_2020-09-15_score:0.020357_objective:lambdarank_metric:map@12_boosting:dart_seed:0_learning_rate:0.03_num_threads:8_num_iterations:15_early_stopping_round:None.bin',
        ),
        (
            '2020-09-07','2020-09-15','2020-09-15','2020-09-22',
            'storage/output/220325_lightgbm_training/220325_dataset_2020-06-01_2020-08-01_2020-09-15_2020-09-22_score:0.0230037_objective:lambdarank_metric:map@12_boosting:dart_seed:0_learning_rate:0.03_num_threads:8_num_iterations:15_early_stopping_round:None.bin',
        ),
    ]):
    
    print('-'*100)
    
    trn_start_time = cudf.to_datetime(trn_start_time)
    trn_end_time = cudf.to_datetime(trn_end_time)
    test_start_time = cudf.to_datetime(test_start_time)
    test_end_time = cudf.to_datetime(test_end_time)

    past_transactions = transactions[(transactions.t_dat > cudf.to_datetime('2020-01-01')) & (transactions.t_dat <= trn_end_time)]
    trn_transactions = transactions[(transactions.t_dat > trn_start_time) & (transactions.t_dat <= trn_end_time)]
    test_transactions = transactions[(transactions.t_dat > test_start_time) & (transactions.t_dat <= test_end_time)]

    print('Construct trn_df,test_df,gt_df')
    
    trn_gt_df = construct_gt_df(trn_transactions)
    test_gt_df = construct_gt_df(test_transactions)
    
    trn_df = construct_rerank_df(
        trn_gt_df[['customer_id']],past_transactions,
        articles,customers,
        model_path,
        label='label',
        gt_df=trn_gt_df,
    )
    
    test_df = construct_rerank_df(
        test_gt_df[['customer_id']],past_transactions,
        articles,customers,
        model_path,
        label='label',
    )
    
    print('Evaluate individual score')
    pred_df = test_df.groupby('customer_id') \
            .apply(lambda x: x.sort_values(['customer_id','past_purchase_score'],ascending=False)['article_id'].tolist()) \
            .reset_index()
    pred_df.columns = ['customer_id','prediction']
    past_purchase_score = evaluate_score(pred_df,test_gt_df)
    print('Past purchase score: ',past_purchase_score)
    
    pred_df = test_df.groupby('customer_id') \
            .apply(lambda x: x.sort_values(['customer_id','recent_purchase_score'],ascending=False)['article_id'].tolist()) \
            .reset_index()
    pred_df.columns = ['customer_id','prediction']
    recent_purchase_score = evaluate_score(pred_df,test_gt_df)
    print('Recent purchase score: ',recent_purchase_score)
    
    print('Fit logistic regression')
    clf = LogisticRegression(random_state=1).fit(trn_df[features],trn_df[target])
    print(clf.coef_, clf.intercept_)
    
    print('Evaluate score')
    preds = clf.predict_proba(test_df[features])
    _,score,_ = feval(preds[:,1],test_df,test_gt_df,k=12)
    print('Combined score',score)
    

past_purchase_score recent_purchase_score popular_purchase_2w_score popular_purchase_3w_score
----------------------------------------------------------------------------------------------------
Construct trn_df,test_df,gt_df


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_x[group_name] = test_df[group_name]


Evaluate individual score
map@12 0.0218197403088478
Past purchase score:  0.0218197403088478
map@12 0.0224274351400841
Rast purchase score:  0.0224274351400841
Fit logistic regression
[[ 5.35871593e+01  3.33788170e+00 -4.01850858e-03 -1.45003312e-03]] [-7.68883237]
Evaluate score
Combined score 0.02300798395803935
----------------------------------------------------------------------------------------------------
Construct trn_df,test_df,gt_df


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_x[group_name] = test_df[group_name]


Evaluate individual score
map@12 0.02120180749419716
Past purchase score:  0.02120180749419716
map@12 0.02081457478481509
Rast purchase score:  0.02081457478481509
Fit logistic regression
[[ 6.74606785e+01  9.70919996e-01 -1.45067508e-03 -4.34159804e-04]] [-9.70567644]
Evaluate score
Combined score 0.02118735517461465
----------------------------------------------------------------------------------------------------
Construct trn_df,test_df,gt_df


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_x[group_name] = test_df[group_name]


Evaluate individual score
map@12 0.022834035692751553
Past purchase score:  0.022834035692751553
map@12 0.023001215145791547
Rast purchase score:  0.023001215145791547
Fit logistic regression
[[ 3.96098123e+01  3.46879209e+00 -3.65995529e-03 -4.95104468e-04]] [-7.192397]
Evaluate score
Combined score 0.024440467490615764
CPU times: user 21min, sys: 5min 47s, total: 26min 47s
Wall time: 22min 24s


****Submission****

In [5]:
%%time
features = [
    'past_purchase_score',
    #'recent_purchase_1w_score',
    #'recent_purchase_2w_score',
    #'recent_purchase_3w_score',
    'recent_purchase_score',
    #'popular_purchase_1w_score',
    'popular_purchase_2w_score',
    'popular_purchase_3w_score',
    #'popular_purchase_123w_score',
]
target = 'label'
print(' '.join(features))

trn_start_time = '2020-09-15'
trn_end_time = '2020-09-22'
model_path = 'storage/output/220325_lightgbm_training/220325_dataset_2020-06-01_2020-08-01_2020-09-15_2020-09-22_score:0.0230037_objective:lambdarank_metric:map@12_boosting:dart_seed:0_learning_rate:0.03_num_threads:8_num_iterations:15_early_stopping_round:None.bin'
    
print('-'*100)

trn_start_time = cudf.to_datetime(trn_start_time)
trn_end_time = cudf.to_datetime(trn_end_time)

past_transactions = transactions[(transactions.t_dat > cudf.to_datetime('2020-01-01')) & (transactions.t_dat <= trn_end_time)]
trn_transactions = transactions[(transactions.t_dat > trn_start_time) & (transactions.t_dat <= trn_end_time)]

print('Construct trn_df')

trn_gt_df = construct_gt_df(trn_transactions)

trn_df = construct_rerank_df(
    trn_gt_df[['customer_id']],past_transactions,
    articles,customers,
    model_path,
    label='label',
    gt_df=trn_gt_df,
)

from sklearn.linear_model import LogisticRegression
print('Fit logistic regression')
clf = LogisticRegression(random_state=1).fit(trn_df[features],trn_df[target])
print(clf.coef_, clf.intercept_)

sub_df_reader = pd.read_csv('storage/sample_submission.csv',chunksize=int(2e5))
final_sub_df = None
for sub_df in sub_df_reader:
    test_df = construct_rerank_df(
        sub_df,past_transactions,
        articles,customers,
        model_path,
        label='label',
    )
    preds = clf.predict_proba(test_df[features])
    sub_df = construct_sub_df(test_df,preds[:,1],k=12)
    if final_sub_df is None:
        final_sub_df = sub_df
    else:
        final_sub_df = pd.concat([final_sub_df,sub_df])
final_sub_df
    

past_purchase_score recent_purchase_score popular_purchase_2w_score popular_purchase_3w_score
----------------------------------------------------------------------------------------------------
Construct trn_df


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_x[group_name] = test_df[group_name]


Fit logistic regression
[[ 3.92763092e+01  2.63342250e+00 -2.88394032e-03 -2.33594154e-03]] [-7.3545712]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_x[group_name] = test_df[group_name]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_x[group_name] = test_df[group_name]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_x[group_name] = test_df[group_name]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[

CPU times: user 41min 23s, sys: 6min 20s, total: 47min 43s
Wall time: 45min 1s


Unnamed: 0,customer_id,prediction
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,"[568601043.0, 783346001.0, 915529003.0, 924243..."
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,"[714790020.0, 866731001.0, 372860001.0, 706016..."
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,"[794321007.0, 915529005.0, 714790020.0, 783346..."
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,"[714790020.0, 573085043.0, 751471001.0, 783346..."
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,"[866731001.0, 915529003.0, 783346001.0, 924243..."
...,...,...
171975,ffffbbf78b6eaac697a8a5dfbfd2bfa8113ee5b403e474...,"[557599022.0, 720125039.0, 713997002.0, 791587..."
171976,ffffcd5046a6143d29a04fb8c424ce494a76e5cdf4fab5...,"[714790020.0, 762846027.0, 866731001.0, 706016..."
171977,ffffcf35913a0bee60e8741cb2b4e78b8a98ee5ff2e6a1...,"[762846027.0, 794819001.0, 689365050.0, 884081..."
171978,ffffd7744cebcf3aca44ae7049d2a94b87074c3d4ffe38...,"[714790020.0, 372860002.0, 448509014.0, 788575..."


In [6]:
final_sub_df.to_csv('submission_df.csv',index=False)

****Reranking with LightGBM****

In [152]:
def x_y_group(data,features,target,only_x=False,verbose=False):
    data = data.sort_values('customer_id').reset_index()
    group = data.groupby('customer_id').size().to_frame('size')['size']
    return data[features],data[target],group

In [182]:
trn_x,trn_y,trn_grp = x_y_group(trn_df,features,target)
trn_dataset = lgb.Dataset(trn_x,label=trn_y,group=trn_grp)

test_dataset = lgb.Dataset(test_df[features])

In [167]:
%%time
param = dict(
    objective='lambdarank',
    metric='map@12',
    boosting='dart',
    num_round = 1,
    seed=0,
    learning_rate=0.1,
    num_threads=8,
)
bst = lgb.train(
    param,
    trn_dataset,
    feval=feval,
    valid_sets=[test_dataset],
)

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 259
[LightGBM] [Info] Number of data points in the train set: 7835273, number of used features: 2
[1]	valid_0's MAP@12: 0.0206276
CPU times: user 15.8 s, sys: 339 ms, total: 16.2 s
Wall time: 14.6 s
