In [28]:
import cudf
import os
import numpy as np
import pandas as pd
from tqdm import tqdm

pd.set_option('display.max_columns', None)

In [29]:
%%time 
transactions = cudf.read_csv('storage/transactions_train.csv')
articles = cudf.read_csv('storage/articles.csv')
customers = cudf.read_csv('storage/customers.csv')

CPU times: user 772 ms, sys: 1.09 s, total: 1.86 s
Wall time: 2.18 s


In [30]:
transactions['t_dat'] = cudf.to_datetime(transactions['t_dat'])
customers['FN'].fillna(0.,inplace=True)
customers['Active'].fillna(0.,inplace=True)
customers['club_member_status'].fillna('None',inplace=True)
customers['age'] = customers['age'] / 10
customers['age'] = customers['age'].astype(int)
customers['fashion_news_frequency'] = customers['fashion_news_frequency'].str.lower().fillna('none')

In [31]:
transactions = transactions.merge(customers[['customer_id','postal_code']],on='customer_id',how='left')

In [32]:
def construct_candidate_dict(transactions,key='customer_id'):
    purchase_dict = {}
    for i,x in enumerate(zip(transactions[key], transactions['article_id'])):
        key_id, art_id = x
        if key_id not in purchase_dict:
            purchase_dict[key_id] = {}
        if art_id not in purchase_dict[key_id]:
            purchase_dict[key_id][art_id] = 0
        purchase_dict[key_id][art_id] += 1
    dummy_list = list((transactions['article_id'].value_counts()).index)[:12]
    return purchase_dict,dummy_list

def construct_pred_df(test_df,transactions,add_random_samples=False):
    
    bool_1w = transactions.t_dat>transactions.t_dat.max()-pd.Timedelta(7,unit='day')
    bool_2w = (transactions.t_dat>transactions.t_dat.max()-2*pd.Timedelta(7,unit='day'))&(transactions.t_dat<=transactions.t_dat.max()-pd.Timedelta(7,unit='day'))
    bool_3w = (transactions.t_dat>transactions.t_dat.max()-3*pd.Timedelta(7,unit='day'))&(transactions.t_dat<=transactions.t_dat.max()-2*pd.Timedelta(7,unit='day'))
    bool_4w = (transactions.t_dat>transactions.t_dat.max()-4*pd.Timedelta(7,unit='day'))&(transactions.t_dat<=transactions.t_dat.max()-3*pd.Timedelta(7,unit='day'))
    bool_5w = (transactions.t_dat>transactions.t_dat.max()-5*pd.Timedelta(7,unit='day'))&(transactions.t_dat<=transactions.t_dat.max()-4*pd.Timedelta(7,unit='day'))
    bool_6w = (transactions.t_dat>transactions.t_dat.max()-6*pd.Timedelta(7,unit='day'))&(transactions.t_dat<=transactions.t_dat.max()-5*pd.Timedelta(7,unit='day'))
    bool_7w = (transactions.t_dat>transactions.t_dat.max()-7*pd.Timedelta(7,unit='day'))&(transactions.t_dat<=transactions.t_dat.max()-6*pd.Timedelta(7,unit='day'))
    
    transactions_7w = transactions[bool_7w]
    transactions_6w = transactions[bool_6w]
    transactions_5w = transactions[bool_5w]
    transactions_4w = transactions[bool_4w]
    transactions_3w = transactions[bool_3w]
    transactions_2w = transactions[bool_2w]
    transactions_1w = transactions[bool_1w]
    
    purchase_dict_1w,dummy_list_1w = construct_candidate_dict(transactions_1w)
    purchase_dict_2w,_ = construct_candidate_dict(transactions_2w)
    purchase_dict_3w,_ = construct_candidate_dict(transactions_3w)
    purchase_dict_4w,_ = construct_candidate_dict(transactions_4w)
    purchase_dict_5w,_ = construct_candidate_dict(transactions_5w)
    purchase_dict_6w,_ = construct_candidate_dict(transactions_6w)
    purchase_dict_7w,_ = construct_candidate_dict(transactions_7w)
    
    pred_df = test_df[['customer_id']]
    prediction_list = []
    
    for i, cust_id in enumerate(
            test_df['customer_id'].values.reshape((-1,)),
    ):
        s = []
        if cust_id in purchase_dict_1w:
            l = sorted((purchase_dict_1w[cust_id]).items(), key=lambda x: x[1], reverse=True)
            l = [y[0] for y in l]
            if len(l)>12:
                s += l[:12]
            else:
                s += l
        if cust_id in purchase_dict_2w:
            l = sorted((purchase_dict_2w[cust_id]).items(), key=lambda x: x[1], reverse=True)
            l = [y[0] for y in l]
            if len(l)>12:
                s += l[:12]
            else:
                s += l
        if cust_id in purchase_dict_3w:
            l = sorted((purchase_dict_3w[cust_id]).items(), key=lambda x: x[1], reverse=True)
            l = [y[0] for y in l]
            if len(l)>12:
                s += l[:12]
            else:
                s += l
                
        if cust_id in purchase_dict_4w:
            l = sorted((purchase_dict_4w[cust_id]).items(), key=lambda x: x[1], reverse=True)
            l = [y[0] for y in l]
            if len(l)>12:
                s += l[:12]
            else:
                s += l
                
        if cust_id in purchase_dict_5w:
            l = sorted((purchase_dict_5w[cust_id]).items(), key=lambda x: x[1], reverse=True)
            l = [y[0] for y in l]
            if len(l)>12:
                s += l[:12]
            else:
                s += l
                
        if cust_id in purchase_dict_6w:
            l = sorted((purchase_dict_6w[cust_id]).items(), key=lambda x: x[1], reverse=True)
            l = [y[0] for y in l]
            if len(l)>12:
                s += l[:12]
            else:
                s += l
                
        if cust_id in purchase_dict_7w:
            l = sorted((purchase_dict_7w[cust_id]).items(), key=lambda x: x[1], reverse=True)
            l = [y[0] for y in l]
            if len(l)>12:
                s += l[:12]
            else:
                s += l
        
        #s += new_items
        s += dummy_list_1w
        s = s[:12]
        
        prediction_list.append(s)
    pred_df['prediction'] = prediction_list
    return pred_df

def construct_gt_df(test_transactions):
    gt_df = test_transactions.to_pandas().groupby('customer_id')['article_id'].agg(lambda x: x.tolist()).reset_index()
    gt_df.columns = ['customer_id','ground_truth']
    return gt_df

def evaluate_score(pred_df,gt_df,k=12,verbose=True,group_name='customer_id'):
    from metric import mapk
    eval_df = gt_df.merge(pred_df,on=group_name,how='left')
    score = mapk(eval_df['ground_truth'].tolist(),eval_df['prediction'].tolist())
    if verbose: print('map@'+str(k),score)
    return score

In [34]:
scores = []
for i,(trn_start_time,trn_end_time,test_start_time,test_end_time) in enumerate([
        ('2020-06-01','2020-09-01','2020-09-01','2020-09-07'),
        ('2020-06-01','2020-09-07','2020-09-07','2020-09-15'),
        ('2020-06-01','2020-09-15','2020-09-15','2020-09-22'),
    ]):
    trn_transactions = transactions[(transactions.t_dat >= cudf.to_datetime(trn_start_time)) & (transactions.t_dat < cudf.to_datetime(trn_end_time))]
    test_transactions = transactions[(transactions.t_dat >= cudf.to_datetime(test_start_time)) & (transactions.t_dat < cudf.to_datetime(test_end_time))]
    gt_df = construct_gt_df(test_transactions)
    test_df = pd.DataFrame()
    test_df['customer_id'] = test_transactions['customer_id'].unique().to_arrow().to_pylist()
    test_df = test_df.merge(customers[['customer_id','postal_code']].to_pandas(),on='customer_id',how='left')
    pred_df = construct_pred_df(test_df,trn_transactions.to_pandas())
    score = evaluate_score(
        pred_df,
        gt_df,
    )
    scores.append(score)
print('score: ',np.mean(scores))

map@12 0.022679230277484523
map@12 0.01929862909143858
map@12 0.024192256497146344
score:  0.022056705288689818


****Submission****

In [166]:
out_dir = 'storage/output/220320_candidate_generation/'

In [167]:
submission_df = pd.read_csv('storage/sample_submission.csv')
pred_df = construct_pred_df(submission_df,transactions.to_pandas())

In [168]:
pred_df['prediction'] = pred_df['prediction'].apply(lambda x: ' '.join(['0'+str(i) for i in x]))

In [169]:
os.makedirs(out_dir,exist_ok=True)
pred_df.to_csv(os.path.join(out_dir,'submission.csv'),index=False)

In [38]:
pred_df

Unnamed: 0,customer_id,prediction
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0568601043 0924243001 0924243002 0918522001 09...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0924243001 0924243002 0918522001 0923758001 08...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0794321007 0924243001 0924243002 0918522001 09...
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,0924243001 0924243002 0918522001 0923758001 08...
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,0924243001 0924243002 0918522001 0923758001 08...
...,...,...
1371975,ffffbbf78b6eaac697a8a5dfbfd2bfa8113ee5b403e474...,0713997002 0720125039 0740922009 0791587007 08...
1371976,ffffcd5046a6143d29a04fb8c424ce494a76e5cdf4fab5...,0924243001 0924243002 0918522001 0923758001 08...
1371977,ffffcf35913a0bee60e8741cb2b4e78b8a98ee5ff2e6a1...,0762846027 0689365050 0884081001 0794819001 09...
1371978,ffffd7744cebcf3aca44ae7049d2a94b87074c3d4ffe38...,0924243001 0924243002 0918522001 0923758001 08...


****Candidate analysis****

In [35]:
pred_df = pred_df.merge(gt_df,on='customer_id')

In [41]:
pred_df['n'] = pred_df['prediction'].apply(len)
pred_df['intersection'] = pred_df.apply(lambda x: set(x['prediction']).intersection(x['ground_truth']),axis=1)
pred_df['n_intersection'] = pred_df['intersection'].apply(len)

In [51]:
tmp = pred_df.loc[pred_df['n_intersection']==0]['ground_truth'].explode('ground_truth').value_counts()
tmp = cudf.from_pandas(tmp)

In [52]:
df_nopred = tmp.to_frame().reset_index().rename(columns={'index':'article_id','ground_truth':'count'}).merge(articles,on='article_id',how='left')

In [61]:
df_nopred.groupby('product_code')['count'].sum().sort_values(ascending=False).iloc[:50]

product_code
706016    1274
573085    1140
884319    1091
762846    1029
685814    1009
915529     968
751471     911
456163     851
685813     847
873279     802
865929     749
863595     724
714790     711
914441     703
610776     692
803757     660
910601     626
919273     615
924243     597
673677     587
568601     582
717490     574
806388     573
562245     564
685816     560
827968     553
896169     552
791587     549
799365     546
372860     544
788575     542
889550     522
872537     512
804992     512
874754     511
871517     510
783346     508
867969     492
911870     478
579541     477
909059     477
866731     466
677930     463
850917     461
863646     460
678942     455
767423     447
770315     438
893432     434
536139     431
Name: count, dtype: int64

In [56]:
df_nopred.columns

Index(['article_id', 'count', 'product_code', 'prod_name', 'product_type_no',
       'product_type_name', 'product_group_name', 'graphical_appearance_no',
       'graphical_appearance_name', 'colour_group_code', 'colour_group_name',
       'perceived_colour_value_id', 'perceived_colour_value_name',
       'perceived_colour_master_id', 'perceived_colour_master_name',
       'department_no', 'department_name', 'index_code', 'index_name',
       'index_group_no', 'index_group_name', 'section_no', 'section_name',
       'garment_group_no', 'garment_group_name', 'detail_desc'],
      dtype='object')

In [234]:
df_nopred = pred_df.loc[pred_df['n']==0]['ground_truth'].explode('ground_truth').value_counts()

In [231]:
max_dat = transactions.groupby('article_id')['t_dat'].max().to_frame().reset_index()
min_dat = transactions.groupby('article_id')['t_dat'].min().to_frame().reset_index()

In [236]:
df_nopred = df_nopred.to_frame().reset_index().rename(columns={'index':'article_id'}).merge(max_dat.to_pandas(),on='article_id',how='left')
df_nopred = df_nopred.merge(min_dat.to_pandas(),on='article_id',how='left')

In [62]:
pred_df

Unnamed: 0,customer_id,prediction,ground_truth,n,intersection,n_intersection
0,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,"[909370001, 865799006, 918522001, 448509014, 7...",[794321007],12,{},0
1,00039306476aaf41a07fed942884f16b30abfa83a2a8be...,"[909370001, 865799006, 918522001, 448509014, 7...",[624486001],12,{},0
2,00040239317e877c77ac6e79df42eb2633ad38fcac09fc...,"[875272011, 875272012, 909370001, 865799006, 9...","[875272011, 875272012]",12,"{875272011, 875272012}",2
3,000493dd9fc463df1acc2081450c9e75ef8e87d5dd17ed...,"[399136061, 732842014, 556255001, 852219003, 7...","[757926001, 788575004, 640021019]",12,{},0
4,000749135ee9aa3a24c2316ea5ae4f495b39c1653c5612...,"[909370001, 865799006, 918522001, 448509014, 7...","[800691007, 800691007, 800691008, 800691008]",12,{},0
...,...,...,...,...,...,...
67139,fff98edc27fc5d64c3027bf0e3702510143d1a79c3dc9a...,"[909370001, 865799006, 918522001, 448509014, 7...","[898918002, 855198005, 902163001, 913688001, 8...",12,{},0
67140,fffa67737587e52ff1afa9c7c6490b5eb7acbc439fe82b...,"[909370001, 865799006, 918522001, 448509014, 7...","[874816003, 911870004]",12,{},0
67141,fffa7d7799eb390a76308454cbdd76e473d65b1497fbe4...,"[909370001, 865799006, 918522001, 448509014, 7...","[861803014, 849886010]",12,{},0
67142,fffae8eb3a282d8c43c77dd2ca0621703b71e90904dfde...,"[914441004, 914441005, 881244001, 882612004, 4...","[396135007, 817472007, 715624050, 817472003, 8...",12,{797892001},1


In [27]:
articles

Unnamed: 0,article_id,product_code,prod_name,product_type_no,product_type_name,product_group_name,graphical_appearance_no,graphical_appearance_name,colour_group_code,colour_group_name,perceived_colour_value_id,perceived_colour_value_name,perceived_colour_master_id,perceived_colour_master_name,department_no,department_name,index_code,index_name,index_group_no,index_group_name,section_no,section_name,garment_group_no,garment_group_name,detail_desc
0,108775015,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,9,Black,4,Dark,5,Black,1676,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
1,108775044,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,10,White,3,Light,9,White,1676,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
2,108775051,108775,Strap top (1),253,Vest top,Garment Upper body,1010017,Stripe,11,Off White,1,Dusty Light,9,White,1676,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
3,110065001,110065,OP T-shirt (Idro),306,Bra,Underwear,1010016,Solid,9,Black,4,Dark,5,Black,1339,Clean Lingerie,B,Lingeries/Tights,1,Ladieswear,61,Womens Lingerie,1017,"Under-, Nightwear","Microfibre T-shirt bra with underwired, moulde..."
4,110065002,110065,OP T-shirt (Idro),306,Bra,Underwear,1010016,Solid,10,White,3,Light,9,White,1339,Clean Lingerie,B,Lingeries/Tights,1,Ladieswear,61,Womens Lingerie,1017,"Under-, Nightwear","Microfibre T-shirt bra with underwired, moulde..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
105537,953450001,953450,5pk regular Placement1,302,Socks,Socks & Tights,1010014,Placement print,9,Black,4,Dark,5,Black,7188,Socks Bin,F,Menswear,3,Menswear,26,Men Underwear,1021,Socks and Tights,Socks in a fine-knit cotton blend with a small...
105538,953763001,953763,SPORT Malaga tank,253,Vest top,Garment Upper body,1010016,Solid,9,Black,4,Dark,5,Black,1919,Jersey,A,Ladieswear,1,Ladieswear,2,H&M+,1005,Jersey Fancy,Loose-fitting sports vest top in ribbed fast-d...
105539,956217002,956217,Cartwheel dress,265,Dress,Garment Full body,1010016,Solid,9,Black,4,Dark,5,Black,1641,Jersey,A,Ladieswear,1,Ladieswear,18,Womens Trend,1005,Jersey Fancy,"Short, A-line dress in jersey with a round nec..."
105540,957375001,957375,CLAIRE HAIR CLAW,72,Hair clip,Accessories,1010016,Solid,9,Black,4,Dark,5,Black,3946,Small Accessories,D,Divided,2,Divided,52,Divided Accessories,1019,Accessories,Large plastic hair claw.
