In [1]:
import numpy as np
import pandas as pd
import math
import os
os.listdir('../input')

['train.csv', 'submission_popular.csv', 'item_metadata.csv', 'test.csv']

In [2]:
# Key columns used by calc_recommendation to select, sort and group rows.
GR_COLS = ['user_id','session_id','timestamp','step']

In [3]:
# Load the input files from ../input.
train = pd.read_csv('../input/train.csv')
test = pd.read_csv('../input/test.csv')
# NOTE(review): `sub` is not used in any cell shown here - presumably kept as a
# reference for the submission format; confirm or remove.
sub = pd.read_csv('../input/submission_popular.csv')

## get popular items(train)
- Get the number of clicks that each item received in the df

In [4]:
def get_popularity(df) :
    """Count how many clickouts each item received.

    Keeps only the rows whose ``action_type`` is ``'clickout item'``, groups
    them by ``reference`` and counts the rows of each group.

    :param df: Data frame with ``action_type`` and ``reference`` columns
    :return: Data frame with the integer columns ``reference`` and
        ``n_clicks``, one row per clicked reference
    :raises ValueError: If a clickout reference cannot be converted to int
    """
    mask = df['action_type'] == 'clickout item'
    df_click = df[mask]

    # Number of clickouts per reference.
    df_item_clicks = (
        df_click
        .groupby('reference')
        .size()
        .reset_index(name = 'n_clicks')
        # Cast both columns to int directly; a per-column transform(lambda)
        # is equivalent for non-empty data but is skipped by pandas on an
        # empty frame, which would leave `reference` uncast.
        .astype(int)
    )

    return df_item_clicks

In [5]:
#reference별 click수
df_popular = get_popularity(train)
df_popular.head()

Unnamed: 0,reference,n_clicks
0,100000,1
1,1000005,2
2,100001,1
3,10000234,1
4,1000029,3


## get_submission_target(test)
- Identify the target rows: clickout actions whose reference is missing

In [8]:
def get_submission_target(df) :
    """Return the rows that have to be predicted.

    A target row is a ``'clickout item'`` action whose ``reference`` is
    missing.

    :param df: Data frame with ``reference`` and ``action_type`` columns
    :return: The matching rows of ``df`` with their original index
    """
    has_no_reference = df.reference.isnull()
    is_clickout = df.action_type == 'clickout item'

    return df[has_no_reference & is_clickout]

In [9]:
# Test rows that are clickouts with a missing reference
df_target = get_submission_target(test)
df_target.head()

Unnamed: 0,user_id,session_id,timestamp,step,action_type,reference,platform,city,device,current_filters,impressions,prices
9,000324D9BBUC,89643988fdbfb,1541593942,10,clickout item,,BG,"Budapest, Hungary",desktop,,923407|1729121|1050792|97171|353141|106315|218...,47|78|76|76|77|79|55|75|78|69|77|56|72|61|47|6...
10,0004Q49X39PY,9de47d9a66494,1541641157,1,clickout item,,PH,"Iloilo City, Philippines",mobile,,2213014|3184842|10213134|4504242|4486372|38120...,53|40|112|57|76|29|42|37|66|66|26|43|28|46|28|...
11,0004Q49X39PY,beea5c27030cb,1541561202,1,clickout item,,PH,"Iloilo City, Philippines",mobile,,3812004|3505150|3202894|2292254|1984229|222789...,29|26|22|15|28|28|21|29|14|20|17|22|21|21|26|2...
13,00071784XQ6B,9617600e1ba7c,1541630328,2,clickout item,,IT,"Turin, Italy",desktop,,22721|3067559|16121|22727|22854|22819|22764|14...,157|289|131|162|60|84|104|102|133|185|222|236|...
19,0008BO33KUQ0,2d0e2102ee0dc,1541636411,6,clickout item,,BR,"Canela, Brazil",mobile,,507861|2176280|1669587|502066|1352530|4342348|...,70|71|49|149|198|143|123|123|90|63|134|112|213...


## explode(df_target, 'impressions')

### string_to_array

In [25]:
def string_to_array(s) :
    """Split a pipe-separated string into a list of its parts.

    :param s: Pipe-separated string (e.g. ``'1|2|3'``) or a missing value
        (``None`` or NaN)
    :return: List of substrings; an empty list for a missing value or an
        empty string
    :raises ValueError: If ``s`` is neither a string nor a missing value
    """
    if isinstance(s, str) :
        # ''.split('|') would give [''], i.e. one bogus empty item.
        return s.split('|') if s else []

    try :
        is_missing = s is None or math.isnan(s)
    except TypeError :
        # math.isnan rejects non-numeric objects; they are reported through
        # the ValueError below instead of leaking a TypeError.
        is_missing = False

    if is_missing :
        return []

    raise ValueError('Value must be either string or nan')

In [27]:
# Raw impressions value of the first target row: a single pipe-separated string
df_target.impressions.iloc[0]

'923407|1729121|1050792|97171|353141|106315|2182130|904851|1838901|106307|924795|927351|903037|1000915|4234718|2358690|2094950|1033140|97170|2431104|147907|1241375|101758|152107|119494'

In [29]:
# Splitting on '|' gives the individual item ids as strings
df_target.impressions.iloc[0].split('|')

['923407',
 '1729121',
 '1050792',
 '97171',
 '353141',
 '106315',
 '2182130',
 '904851',
 '1838901',
 '106307',
 '924795',
 '927351',
 '903037',
 '1000915',
 '4234718',
 '2358690',
 '2094950',
 '1033140',
 '97170',
 '2431104',
 '147907',
 '1241375',
 '101758',
 '152107',
 '119494']

In [32]:
def explode(df_in, col_expl) :
    """Expand a pipe-separated column so that every item gets its own row.

    Each input row is repeated once per item found in ``col_expl``; all other
    columns are copied unchanged and ``col_expl`` holds a single integer item
    per output row. Rows whose ``col_expl`` value is missing produce no
    output rows. The input frame is not modified.

    :param df_in: Data frame containing the column to expand
    :param col_expl: Name of the column holding pipe-separated integer ids
    :return: New data frame with a fresh 0..n-1 index, the remaining columns
        in their original order followed by ``col_expl`` as int64
    :raises ValueError: If a value of ``col_expl`` is neither a string nor
        missing, or if an item cannot be converted to int
    """
    # One list of item strings per input row.
    item_lists = [string_to_array(value) for value in df_in[col_expl]]

    # Explicit integer dtype: an empty Python list would otherwise become a
    # float array, which np.repeat refuses to use as repeat counts.
    n_items = np.array([len(items) for items in item_lists], dtype = np.int64)

    # Repeat every other column's value once per item of its row.
    df_out = pd.DataFrame({
        col : np.repeat(df_in[col].values, n_items)
        for col in df_in.columns.drop(col_expl)
    })

    # Flatten in row order and convert to int in one step. Assigning a typed
    # array with df[col] = ... guarantees an integer column; writing through
    # .loc[:, col] can keep the previous dtype in recent pandas versions, and
    # np.concatenate fails when there is nothing to concatenate.
    df_out[col_expl] = np.array(
        [int(item) for items in item_lists for item in items],
        dtype = np.int64
    )

    return df_out

In [33]:
# One row per (target row, impressed item) pair
df_expl = explode(df_target, 'impressions')
df_expl.head()

Unnamed: 0,action_type,city,current_filters,device,platform,prices,reference,session_id,step,timestamp,user_id,impressions
0,clickout item,"Budapest, Hungary",,desktop,BG,47|78|76|76|77|79|55|75|78|69|77|56|72|61|47|6...,,89643988fdbfb,10,1541593942,000324D9BBUC,923407
1,clickout item,"Budapest, Hungary",,desktop,BG,47|78|76|76|77|79|55|75|78|69|77|56|72|61|47|6...,,89643988fdbfb,10,1541593942,000324D9BBUC,1729121
2,clickout item,"Budapest, Hungary",,desktop,BG,47|78|76|76|77|79|55|75|78|69|77|56|72|61|47|6...,,89643988fdbfb,10,1541593942,000324D9BBUC,1050792
3,clickout item,"Budapest, Hungary",,desktop,BG,47|78|76|76|77|79|55|75|78|69|77|56|72|61|47|6...,,89643988fdbfb,10,1541593942,000324D9BBUC,97171
4,clickout item,"Budapest, Hungary",,desktop,BG,47|78|76|76|77|79|55|75|78|69|77|56|72|61|47|6...,,89643988fdbfb,10,1541593942,000324D9BBUC,353141


## calc_recommendation(df_expl, df_popular)

Calculate recommendations based on popularity of items.

    The final data frame will have an impression list sorted according to the number of clicks per item in a reference data frame.

    :param df_expl: Data frame with exploded impression list
    :param df_pop: Data frame with items and number of clicks
    :return: Data frame with sorted impression list according to popularity in df_pop


In [34]:
# Key columns used for selecting, sorting and grouping below
GR_COLS

['user_id', 'session_id', 'timestamp', 'step']

In [37]:
# Left-hand side of the merge: key columns plus the exploded item id
df_expl[GR_COLS + ['impressions']].head()

Unnamed: 0,user_id,session_id,timestamp,step,impressions
0,000324D9BBUC,89643988fdbfb,1541593942,10,923407
1,000324D9BBUC,89643988fdbfb,1541593942,10,1729121
2,000324D9BBUC,89643988fdbfb,1541593942,10,1050792
3,000324D9BBUC,89643988fdbfb,1541593942,10,97171
4,000324D9BBUC,89643988fdbfb,1541593942,10,353141


In [38]:
# Right-hand side of the merge: click counts per reference
df_popular.head()

Unnamed: 0,reference,n_clicks
0,100000,1
1,1000005,2
2,100001,1
3,10000234,1
4,1000029,3


In [41]:
# Left merge: every exploded row is kept; items without a match in df_popular
# get NaN in `reference` and `n_clicks`.
df_expl_clicks = df_expl[GR_COLS + ['impressions']].merge(df_popular, left_on = 'impressions', right_on = 'reference', how = 'left')
df_expl_clicks.head()

Unnamed: 0,user_id,session_id,timestamp,step,impressions,reference,n_clicks
0,000324D9BBUC,89643988fdbfb,1541593942,10,923407,923407.0,3.0
1,000324D9BBUC,89643988fdbfb,1541593942,10,1729121,,
2,000324D9BBUC,89643988fdbfb,1541593942,10,1050792,1050792.0,5.0
3,000324D9BBUC,89643988fdbfb,1541593942,10,97171,97171.0,5.0
4,000324D9BBUC,89643988fdbfb,1541593942,10,353141,353141.0,4.0


In [48]:
def group_concat(df, gr_cols, col_concat) :
    """Join the strings of one column into a single string per group.

    :param df: Data frame to aggregate
    :param gr_cols: Column name(s) to group by
    :param col_concat: Name of the string column whose values are joined
    :return: Data frame with the grouping columns and ``col_concat`` holding
        the space-separated values of each group, in their row order
    """
    def _join_with_space(values) :
        return ' '.join(values)

    joined = df.groupby(gr_cols)[col_concat].apply(_join_with_space)

    # Turn the group keys from the index back into ordinary columns.
    return joined.to_frame().reset_index()

In [49]:
def calc_recommendation(df_expl, df_pop) :
    """Calculate recommendations based on popularity of items.

    The impressions of every target row (identified by ``GR_COLS``) are
    ordered by the number of clicks the item has in ``df_pop``, most clicked
    first, and joined into one space-separated string.

    :param df_expl: Data frame with exploded impression list (one item per
        row in ``impressions``, plus the ``GR_COLS`` columns)
    :param df_pop: Data frame with items (``reference``) and number of clicks
        (``n_clicks``)
    :return: Data frame with the ``GR_COLS`` columns and
        ``item_recommendations``, the impression list sorted by popularity
    """
    # Left merge keeps every impression; items unknown to df_pop get NaN clicks.
    df_expl_clicks = df_expl[GR_COLS + ['impressions']].merge(
        df_pop, left_on = 'impressions', right_on = 'reference', how = 'left'
    )

    # Ascending for each grouping column, descending for n_clicks. Derived
    # from GR_COLS so the sort keeps working if the key columns change.
    sort_order = [True] * len(GR_COLS) + [False]

    df_sorted = (
        df_expl_clicks
        # group_concat joins strings, so the item ids are converted first.
        .assign(impressions = lambda x : x['impressions'].apply(str))
        .sort_values(GR_COLS + ['n_clicks'], ascending = sort_order)
    )

    df_out = group_concat(df_sorted, GR_COLS, 'impressions')

    return df_out.rename(columns = {'impressions' : 'item_recommendations'})

In [61]:
# Full pipeline: click counts from train, target rows from test, then the
# popularity-sorted impression list for every target row.
print("Get popular items...")
df_popular = get_popularity(train)

print("Identify target rows...")
df_target = get_submission_target(test)

print("Get recommendations...")
df_expl = explode(df_target, "impressions")
df_out = calc_recommendation(df_expl, df_popular)

Get popular items...
Identify target rows...
Get recommendations...


In [62]:
# One row per target row with its space-separated recommendation list
df_out.head()

Unnamed: 0,user_id,session_id,timestamp,step,item_recommendations
0,000324D9BBUC,89643988fdbfb,1541593942,10,924795 106315 1033140 119494 101758 903037 105...
1,0004Q49X39PY,9de47d9a66494,1541641157,1,3505150 3812004 2227896 2292254 3184842 222702...
2,0004Q49X39PY,beea5c27030cb,1541561202,1,4476010 3505150 3812004 2227896 2292254 222702...
3,00071784XQ6B,9617600e1ba7c,1541630328,2,22854 3067559 22721 22713 16121 22772 22727 22...
4,0008BO33KUQ0,2d0e2102ee0dc,1541636411,6,9857656 5849628 655716 1352530 502066 1405084 ...
