In [1]:
import os

import numpy as np
import pandas as pd

In [2]:
def load_data(path):
    """Load the user, item and behavior tables and build time-weighted behavior scores.

    Parameters
    ----------
    path : str
        Directory that contains the header-less files ``user.csv``,
        ``item.csv`` and ``user_behavior.csv``.  A trailing path separator is
        optional: file names are joined with ``os.path.join``.

    Returns
    -------
    (user, item, data)
        ``user`` has columns ``userID, sex, age, ability``; ``item`` has
        columns ``itemID, category, shop, brand``; ``data`` is the behavior
        log with the derived ``day``, ``hour`` and ``day_hour`` columns, the
        re-weighted ``behavior`` score, and the item and user attributes
        left-joined on ``itemID`` / ``userID``.
    """
    user = pd.read_csv(os.path.join(path, 'user.csv'), header=None)
    item = pd.read_csv(os.path.join(path, 'item.csv'), header=None)
    data = pd.read_csv(os.path.join(path, 'user_behavior.csv'), header=None)

    data.columns = ['userID', 'itemID', 'behavior', 'timestamp']
    # Day index and hour of day; the divisors imply the timestamp is in seconds.
    data['day'] = data['timestamp'] // 86400
    data['hour'] = data['timestamp'] // 3600 % 24

    # Fractional day, used below for the recency weighting.
    data['day_hour'] = data['day'] + data['hour'] / 24.0

    # Hold the column as plain Python objects so the string labels can be
    # overwritten in place with their numeric weights.
    data['behavior'] = data['behavior'].astype(object)
    data.loc[data['behavior'] == 'pv', 'behavior'] = 1
    data.loc[data['behavior'] == 'fav', 'behavior'] = 2
    data.loc[data['behavior'] == 'cart', 'behavior'] = 3
    # 'buy' is weighted 1 (kept from the original author's setting).
    data.loc[data['behavior'] == 'buy', 'behavior'] = 1

    max_day = data['day'].max()
    min_day = data['day'].min()

    # Recency weighting: algebraically this factor equals
    # (day_hour - min_day) / (max_day - min_day + 2), so later events score higher.
    recency = 1 - (max_day - data['day_hour'] + 2) / (max_day - min_day + 2)
    data['behavior'] = recency * data['behavior']

    item.columns = ['itemID', 'category', 'shop', 'brand']
    user.columns = ['userID', 'sex', 'age', 'ability']

    data = pd.merge(left=data, right=item, on='itemID', how='left')
    data = pd.merge(left=data, right=user, on='userID', how='left')

    return user, item, data
    
def get_unique_inorder(x, k=50):
    """Return at most ``k`` entries of ``x`` with distinct first elements, in order.

    Parameters
    ----------
    x : iterable of indexable entries
        Entries are compared by ``entry[0]``; the first occurrence of each key
        is kept and later duplicates are dropped.
    k : int, default 50
        Maximum number of entries to return.

    Returns
    -------
    list
        The kept entries, in their original relative order.
    """
    result = []
    seen = set()
    for entry in x:
        # Stop as soon as k entries are collected, so the result never holds k + 1.
        if len(result) >= k:
            break
        key = entry[0]
        if key not in seen:
            seen.add(key)
            result.append(entry)
    return result

def get_recall_list(train, targetDay, k=300):
    """Build a per-user recall list from behavior weights and item similarities.

    Relies on three module-level objects that must be defined before calling:
    ``upward_map`` (itemID -> item code), ``downward_map`` (item code ->
    itemID) and ``item_dict`` (item code -> iterable of
    ``(similar item code, similarity)`` pairs).

    Parameters
    ----------
    train : pandas.DataFrame
        Behavior log with at least ``userID``, ``itemID``, ``behavior`` and
        ``day`` columns.
    targetDay : int
        Day to recall for.  If it is later than every day in ``train``, all
        rows and all users are used.  Otherwise only rows before ``targetDay``
        are used, restricted to users that appear on ``targetDay``.
    k : int, default 300
        Requested length of each user's recall list; forwarded to
        ``get_unique_inorder``.

    Returns
    -------
    dict
        ``{userID: [(itemID, score), ...]}`` sorted by descending score, with
        items the user already interacted with removed.
    """
    if targetDay > max(train['day']):
        # Target day is beyond the data: keep every user and every row.
        target_users = None
    else:
        target_users = set(train.loc[train['day'] == targetDay, 'userID'])
        # Only history strictly before the target day may be used.
        train = train[train['day'] < targetDay]

    # user -> {item code -> largest behavior score seen for that item}
    train_logs = dict()
    for user_id, item_id, score in train[['userID', 'itemID', 'behavior']].values:
        if target_users is not None and user_id not in target_users:
            continue
        user_log = train_logs.setdefault(user_id, dict())
        if item_id in upward_map:
            code = upward_map[item_id]
            user_log[code] = max(user_log.get(code, 0), score)

    # Normalise each user's behavior weights so they sum to 1.
    for user_log in train_logs.values():
        total = sum(user_log.values())
        if total > 0:
            for code in user_log:
                user_log[code] /= total

    result_logs = dict()
    for user_id, user_log in train_logs.items():
        # Candidate score = normalised behavior weight * item similarity.
        candidates = []
        for code in set(user_log.keys()):
            if code in item_dict:
                weight = user_log[code]
                candidates += [(x[0], weight * x[1]) for x in item_dict[code]]

        ranked = sorted(candidates, key=lambda x: x[1], reverse=True)
        # Drop items the user already interacted with and map codes back to itemIDs.
        unseen = [(downward_map[x[0]], x[1]) for x in ranked if x[0] not in user_log]
        # Honour the caller-supplied k instead of a hard-coded 300.
        result_logs[user_id] = get_unique_inorder(unseen, k=k)

    return result_logs


def generate_pairs(recall):
    """Flatten a ``{user: [(item, score), ...]}`` mapping into a list of rows.

    Each row is a three-element list ``[user, item, score]``; rows follow the
    mapping's iteration order and, within a user, the order of that user's
    entries.  Users with no entries contribute no rows.
    """
    return [
        [user_id, entry[0], entry[1]]
        for user_id in recall
        for entry in recall[user_id]
    ]

def reshape_recall_to_dataframe(recall):
    """Convert a recall mapping into a DataFrame with one row per (user, item) pair.

    Parameters
    ----------
    recall : dict
        ``{userID: [(itemID, score), ...]}`` as accepted by ``generate_pairs``.

    Returns
    -------
    pandas.DataFrame
        Columns ``userID``, ``itemID`` and ``apriori`` (the score).  An empty
        ``recall`` yields an empty frame that still has these three columns.
    """
    rows = generate_pairs(recall)
    # Naming the columns in the constructor keeps the empty case valid; assigning
    # .columns afterwards fails on a frame that has zero columns.
    return pd.DataFrame(rows, columns=['userID', 'itemID', 'apriori'])


In [None]:
# Directory holding user.csv / item.csv / user_behavior.csv.
# NOTE(review): the active value is a machine-specific absolute path; point it
# at your own data folder before running.
#path = './'
# path = '../ECommAI_EUIR_round2_train_20190816/'
path=r'C:\Users\NewtScamander\github\CIKM-2019-AnalytiCup\data\ECommAI_EUIR_round1_testA_20190701'
# Make sure the directory ends with a path separator, so that appending a file
# name to it always yields a valid path.
path = os.path.join(path, '')


## The target date (16 means online, 15 means offline test, 14 means offline train)
targetday = 15

## The length of the recall list; the default is 300
lenth = 300

## The name of the generated recall file
# NOTE(review): the file name says "round2" while `path` points at round1 data - confirm.
name = 'recall_list_round2_%dday_%dlenth.csv'%(targetday, lenth)


user, item, data = load_data(path = path)

#tempory_path = './tempory_file/'
# Kept for reference; the three files below are opened relative to the
# current working directory and do not use this variable.
tempory_path = './'

# SECURITY NOTE: eval() executes whatever Python expression the file contains.
# Only run this on files you generated yourself; if they hold plain dict/list
# literals, ast.literal_eval would be a safer parser.
with open('upward_map.txt','r') as f:
    upward_map = eval(f.read())  # upward_map: dict from item ID to item code

with open('downward_map.txt','r') as f:
    downward_map = eval(f.read())  # downward_map: dict from item code to item ID

with open('item_Apriori.txt','r') as f:
    item_dict = eval(f.read())  # item similarity matrix

In [4]:
# Build the per-user recall lists, passing the configured target day and requested list length.
recall_logs = get_recall_list(data, targetDay=targetday, k=lenth)

In [5]:
# Flatten the recall dict into a DataFrame with columns userID / itemID / apriori.
recall_df = reshape_recall_to_dataframe(recall_logs)

In [6]:
# Left-join the target day's interactions onto the recalled (userID, itemID) pairs.
# The joined 'behavior' value is renamed to 'label'; pairs with no interaction on
# the target day get NaN.
temp = pd.merge(left=recall_df, right=data[data['day'] == targetday][['userID','itemID','behavior']], 
         on=['userID','itemID'], how='left').rename(columns={'behavior':'label'})

In [None]:
# Number of users in recall_df who also appear in the target day's behavior log.
len(set(recall_df['userID']) & set(data[data['day'] == targetday]['userID']))

In [None]:
# Number of distinct users present in recall_df.
len(set(recall_df['userID']))

In [None]:
# Write the recall pairs to the CSV file given by `name`, without the index column.
recall_df.to_csv(name, index=False)