# 导入工具包

In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import warnings
warnings.filterwarnings("ignore")

# 数据路径地址

In [2]:
phase = 0
nrows = None
train_path = '../../data/underexpose_train'  
test_path = '../../data/underexpose_test' 

In [3]:
click_train = pd.read_csv(
                        train_path + '/underexpose_train_click-{phase}.csv'.format(phase=phase)
                        ,header=None
                        ,nrows=nrows
                        ,names=['user_id', 'item_id', 'time']
                        ,sep=','
                        ,dtype={'user_id':np.str,'item_id':np.str,'time':np.str}
                        ) 

In [4]:
click_test = pd.read_csv(
                        test_path + '/underexpose_test_click-{phase}/underexpose_test_click-{phase}.csv'.format(phase=phase)
                        ,header=None
                        ,nrows=nrows
                        ,names=['user_id', 'item_id', 'time']
                        ,sep=','
                        ,dtype={'user_id':np.str,'item_id':np.str,'time':np.str}
                        )

# 合并数据集

In [5]:
click_all = click_train.append(click_test)

In [6]:
click_all.head()

Unnamed: 0,user_id,item_id,time
0,4965,18,0.9837634760241426
1,20192,34,0.9837723702817136
2,30128,91,0.9837801863518688
3,29473,189,0.9839301484049712
4,10625,225,0.9839253997117712


# 数据集按时间排序

In [7]:
click_all = click_all.sort_values('time')

# 数据按 user_id, item_id, time 去重，保存最后一条

In [8]:
click_all = click_all.drop_duplicates(['user_id','item_id','time'],keep='last')

# 获取用户 训练用户集 和 测试用户集

In [9]:
set_pred = set(click_test['user_id'])
set_train = set(click_all['user_id']) - set_pred

# 获取 训练集合最后一条数据 当作召回的标签评判召回的效果

In [10]:
temp_ = click_all
temp_['pred'] = temp_['user_id'].map(lambda x: 'test' if x in set_pred else 'train')
temp_ = temp_[temp_['pred']=='train'].drop_duplicates(['user_id'],keep='last')
temp_['remove'] = 'remove'

train_test = click_all
train_test = train_test.merge(temp_,on=['user_id','item_id','time','pred'],how='left')
train_test = train_test[train_test['remove']!='remove']
    
dict_label_user_item = dict(zip(temp_['user_id'],temp_['item_id']))

# 获取热门候选集 用于补全

In [11]:
temp_ = train_test.groupby(['item_id'])['user_id'].count().reset_index()
temp_ = temp_.sort_values(['user_id']) # 修改
hot_list = list(temp_['item_id'])[::-1]

# 召回

In [12]:
from collections import Counter

In [13]:
click_all = train_test

# 统计热门的商品 用于关联规则未召回足够数量商品补全

In [14]:
temp_ = click_all.groupby(['item_id'])['user_id'].count().reset_index()
temp_ = temp_.sort_values(['user_id']) # 修改
hot_list = list(temp_['item_id'])[::-1]
hot_list[:5]

['113569', '52766', '87107', '87254', '31005']

# 统计商品频率

In [15]:
stat_cnt = Counter(list(click_all['item_id'])) 
stat_cnt.most_common(10)

[('113569', 234),
 ('52766', 200),
 ('87107', 193),
 ('87254', 148),
 ('31005', 141),
 ('115860', 136),
 ('116073', 135),
 ('2420', 130),
 ('39291', 128),
 ('82029', 127)]

# 聚合数据以便后续使用

In [18]:
group_by_col, agg_col = 'user_id', 'item_id'
data_ = click_all.groupby(['user_id'])[['item_id','time']].agg({'item_id':lambda x:','.join(list(x)), 'time':lambda x:','.join(list(x))}).reset_index()
data_.head(5)

Unnamed: 0,user_id,item_id,time
0,1,"78142,26646,89568,76240,87533,78380,85492,9779...","0.9837416195438411,0.9837566561259767,0.983763..."
1,10,"74658,114577,9946,69421,59454,68248,110931,864...","0.9837580005480852,0.9837590434057788,0.983786..."
2,100,"62106,550,51671,79574,49546,74156,112139,69773...","0.9837622205572277,0.9837735235447893,0.983812..."
3,10000,632062273,"0.9837585557294344,0.9837843931123657"
4,10002,"71749,27646,82001,102124,45579,54124,7847,3899...","0.9838403516070484,0.9838409345474651,0.983840..."


# 统计用户序列长度

In [19]:
stat_length = np.mean([ len(item_txt.split(',')) for item_txt in data_['item_id']])
stat_length

13.3022426371251

# 构建打分矩阵

In [20]:
matrix_association_rules = {}
print('------- association rules matrix 生成 ---------')
for i, row in tqdm(data_.iterrows()):

    list_item_id = row['item_id'].split(',')
    len_list_item = len(list_item_id)

    for i, item_i in enumerate(list_item_id):
        for j, item_j in enumerate(list_item_id):

            if i <= j:
                if item_i not in matrix_association_rules:
                        matrix_association_rules[item_i] = {}
                if item_j not in matrix_association_rules[item_i]:
                        matrix_association_rules[item_i][item_j] = 0

                alpha, beta, gama = 1.0, 0.8, 0.8
                matrix_association_rules[item_i][item_j] += 1.0 * alpha  / (beta + np.abs(i-j)) * 1.0 / stat_cnt[item_i] * 1.0 / (1 + gama * len_list_item / stat_length)
            if i >= j:
                if item_i not in matrix_association_rules:
                    matrix_association_rules[item_i] = {}
                if item_j not in matrix_association_rules[item_i]:
                    matrix_association_rules[item_i][item_j] = 0

                alpha, beta, gama = 0.5, 0.8, 0.8
                matrix_association_rules[item_i][item_j] += 1.0 * alpha  / (beta + np.abs(i-j)) * 1.0 / stat_cnt[item_i] * 1.0 / (1 + gama * len_list_item / stat_length)

assert len(matrix_association_rules.keys()) == len(set(click_all['item_id']))

21it [00:00, 204.81it/s]

------- association rules matrix 生成 ---------


18505it [00:54, 337.58it/s]


In [21]:
for item, score in matrix_association_rules.items():
    print("item : ", item, " score : ", score, '\n')
    break

item :  78142  score :  {'78142': 0.9422978158521171, '26646': 0.02482166993759562, '89568': 0.015956787817025753, '76240': 0.011757633128334766, '87533': 0.009308126226598358, '78380': 0.0077032768771848475, '85492': 0.006570442042304722, '97795': 0.006451408557301851, '18522': 0.00507715975996274, '47611': 0.004559082233435929, '31443': 0.004136944989599269, '17887': 0.0037863564311586533, '77989': 0.003818800241615233, '109553': 0.00447721407637648, '109061': 0.005409967008954914, '58168': 0.006833642537627259, '81427': 0.009274229158208422, '69707': 0.014426578690546437, '46095': 0.028853157381092873, '113176': 0.00035952540751002477, '113394': 0.0003704865479828914, '113175': 0.00038213706835971183, '114982': 0.00039454411603372834, '115845': 0.00040778385147110185, '13953': 0.00042194301298051514, '93195': 0.00043712081920283576, '19365': 0.0004534312975312998, '28676': 0.0004710061540247611, '93854': 0.0004899983376547918, '77333': 0.0005105865031024721, '87069': 0.0005329806479

In [22]:
# data_

# 召回 50个

In [23]:
k = 50

In [24]:
list_user_id = []
list_item_similar = []
list_score_similar = []
print('------- association rules 召回 ---------')
for i, row in tqdm(data_.iterrows()):

    list_item_id = row['item_id'].split(',')

    dict_item_id_score = {}
    for i, item_i in enumerate(list_item_id[::-1]):
        for item_j, score_similar in sorted(matrix_association_rules[item_i].items(), reverse=True)[0:k]:
            if item_j not in list_item_id:
                if item_j not in dict_item_id_score:
                    dict_item_id_score[item_j] = 0
                sigma = 0.8
                dict_item_id_score[item_j] +=  1.0 / (1 + sigma * i) * score_similar

    dict_item_id_score_topk = sorted(dict_item_id_score.items(), key=lambda kv: kv[1], reverse=True)[:k]
    dict_item_id_set = set([item_similar for item_similar, score_similar in dict_item_id_score_topk])

    # 不足的热度补全
    if len(dict_item_id_score_topk) < k:
        for i, item in enumerate(hot_list):
            if (item not in list_item_id) and (item not in dict_item_id_set):
                item_similar = item
                score_similar = - i - 100 
                dict_item_id_score_topk.append( (item_similar, score_similar) )
            if len(dict_item_id_score_topk) == k:
                break

    assert len(dict_item_id_score_topk) == k
    dict_item_id_set = set([item_similar for item_similar, score_similar in dict_item_id_score_topk])
    assert len(dict_item_id_set) == k
    for item_similar, score_similar in dict_item_id_score_topk:
        list_item_similar.append(item_similar)
        list_score_similar.append(score_similar)
        list_user_id.append(row['user_id'])

28it [00:00, 275.41it/s]

------- association rules 召回 ---------


18505it [01:18, 235.39it/s]


# 打上标签 以便后续rank使用

In [25]:
dict_label = dict_label_user_item

In [26]:
topk_recall = pd.DataFrame({'user_id':list_user_id,'item_similar':list_item_similar,'score_similar':list_score_similar})
topk_recall['next_item_id'] = topk_recall['user_id'].map(dict_label)
topk_recall['pred'] = topk_recall['user_id'].map(lambda x: 'train' if x in dict_label else 'test')

# 查看模型召回效果

In [27]:
sep = 5
data_2 = topk_recall[topk_recall['pred']=='train'].sort_values(['user_id','score_similar'],ascending=False)
data_2 = data_2.groupby(['user_id']).agg({'item_similar':lambda x:list(x),'next_item_id':lambda x:''.join(set(x))})

data_2['index'] = [recall_.index(label_) if label_ in recall_ else -1 for (label_, recall_) in zip(data_2['next_item_id'],data_2['item_similar'])]

print('-------- 召回效果 -------------')
print('--------:phase: ', phase,' -------------')
data_num = len(data_2)
for topk in range(0,k+1,sep):
    hit_num = len(data_2[(data_2['index']!=-1) & (data_2['index']<=topk)]) 
    hit_rate = hit_num * 1.0 / data_num
    print('phase: ', phase, ' top_', topk, ' : ', 'hit_num : ', hit_num, 'hit_rate : ', hit_rate, ' data_num : ', data_num)
    print() 

hit_rate = len(data_2[data_2['index']!=-1]) * 1.0 / data_num

-------- 召回效果 -------------
--------:phase:  0  -------------
phase:  0  top_ 0  :  hit_num :  129 hit_rate :  0.0076594228713929464  data_num :  16842

phase:  0  top_ 5  :  hit_num :  387 hit_rate :  0.02297826861417884  data_num :  16842

phase:  0  top_ 10  :  hit_num :  519 hit_rate :  0.03081581759885999  data_num :  16842

phase:  0  top_ 15  :  hit_num :  625 hit_rate :  0.037109606935043345  data_num :  16842

phase:  0  top_ 20  :  hit_num :  697 hit_rate :  0.041384633653960334  data_num :  16842

phase:  0  top_ 25  :  hit_num :  756 hit_rate :  0.04488778054862843  data_num :  16842

phase:  0  top_ 30  :  hit_num :  815 hit_rate :  0.04839092744329652  data_num :  16842

phase:  0  top_ 35  :  hit_num :  855 hit_rate :  0.05076594228713929  data_num :  16842

phase:  0  top_ 40  :  hit_num :  908 hit_rate :  0.05391283695523097  data_num :  16842

phase:  0  top_ 45  :  hit_num :  945 hit_rate :  0.05610972568578554  data_num :  16842

phase:  0  top_ 50  :  hit_num :  97

# 修改序列召回个数 

In [28]:
k = 50

In [29]:
list_user_id = []
list_item_similar = []
list_score_similar = []
print('------- association rules 召回 ---------')
for i, row in tqdm(data_.iterrows()):

    list_item_id = row['item_id'].split(',')

    dict_item_id_score = {}
    for i, item_i in enumerate(list_item_id[::-1]):
        # for item_j, score_similar in sorted(matrix_association_rules[item_i].items(), reverse=True)[0:k]:
        # 每个item 招回 1888个会不会更好？
        for item_j, score_similar in sorted(matrix_association_rules[item_i].items(), reverse=True)[0:1888]:   
            if item_j not in list_item_id:
                if item_j not in dict_item_id_score:
                    dict_item_id_score[item_j] = 0
                sigma = 0.8
                dict_item_id_score[item_j] +=  1.0 / (1 + sigma * i) * score_similar

    dict_item_id_score_topk = sorted(dict_item_id_score.items(), key=lambda kv: kv[1], reverse=True)[:k]
    dict_item_id_set = set([item_similar for item_similar, score_similar in dict_item_id_score_topk])

    # 不足的热度补全
    if len(dict_item_id_score_topk) < k:
        for i, item in enumerate(hot_list):
            if (item not in list_item_id) and (item not in dict_item_id_set):
                item_similar = item
                score_similar = - i - 100 
                dict_item_id_score_topk.append( (item_similar, score_similar) )
            if len(dict_item_id_score_topk) == k:
                break

    assert len(dict_item_id_score_topk) == k
    dict_item_id_set = set([item_similar for item_similar, score_similar in dict_item_id_score_topk])
    assert len(dict_item_id_set) == k
    for item_similar, score_similar in dict_item_id_score_topk:
        list_item_similar.append(item_similar)
        list_score_similar.append(score_similar)
        list_user_id.append(row['user_id'])

9it [00:00, 89.93it/s]

------- association rules 召回 ---------


18505it [03:16, 93.95it/s] 


# 打上标签

In [30]:
dict_label = dict_label_user_item
topk_recall = pd.DataFrame({'user_id':list_user_id,'item_similar':list_item_similar,'score_similar':list_score_similar})
topk_recall['next_item_id'] = topk_recall['user_id'].map(dict_label)
topk_recall['pred'] = topk_recall['user_id'].map(lambda x: 'train' if x in dict_label else 'test')

In [31]:
data_.head()

Unnamed: 0,user_id,item_id,time
0,1,"78142,26646,89568,76240,87533,78380,85492,9779...","0.9837416195438411,0.9837566561259767,0.983763..."
1,10,"74658,114577,9946,69421,59454,68248,110931,864...","0.9837580005480852,0.9837590434057788,0.983786..."
2,100,"62106,550,51671,79574,49546,74156,112139,69773...","0.9837622205572277,0.9837735235447893,0.983812..."
3,10000,632062273,"0.9837585557294344,0.9837843931123657"
4,10002,"71749,27646,82001,102124,45579,54124,7847,3899...","0.9838403516070484,0.9838409345474651,0.983840..."


In [32]:
topk_recall.head()

Unnamed: 0,item_similar,score_similar,user_id,next_item_id,pred
0,91290,0.086018,1,69359,train
1,42845,0.066693,1,69359,train
2,87837,0.064948,1,69359,train
3,30474,0.061375,1,69359,train
4,19228,0.057324,1,69359,train


# 查看模型召回效果

In [33]:
sep = 5
data_2 = topk_recall[topk_recall['pred']=='train'].sort_values(['user_id','score_similar'],ascending=False)
data_2 = data_2.groupby(['user_id']).agg({'item_similar':lambda x:list(x),'next_item_id':lambda x:''.join(set(x))})

data_2['index'] = [recall_.index(label_) if label_ in recall_ else -1 for (label_, recall_) in zip(data_2['next_item_id'],data_2['item_similar'])]

print('-------- 召回效果 -------------')
print('--------:phase: ', phase,' -------------')
data_num = len(data_2)
for topk in range(0,k+1,sep):
    hit_num = len(data_2[(data_2['index']!=-1) & (data_2['index']<=topk)]) 
    hit_rate = hit_num * 1.0 / data_num
    print('phase: ', phase, ' top_', topk, ' : ', 'hit_num : ', hit_num, 'hit_rate : ', hit_rate, ' data_num : ', data_num)
    print() 

hit_rate = len(data_2[data_2['index']!=-1]) * 1.0 / data_num

-------- 召回效果 -------------
--------:phase:  0  -------------
phase:  0  top_ 0  :  hit_num :  294 hit_rate :  0.017456359102244388  data_num :  16842

phase:  0  top_ 5  :  hit_num :  811 hit_rate :  0.04815342595891224  data_num :  16842

phase:  0  top_ 10  :  hit_num :  1073 hit_rate :  0.06370977318608241  data_num :  16842

phase:  0  top_ 15  :  hit_num :  1243 hit_rate :  0.0738035862724142  data_num :  16842

phase:  0  top_ 20  :  hit_num :  1382 hit_rate :  0.08205676285476785  data_num :  16842

phase:  0  top_ 25  :  hit_num :  1511 hit_rate :  0.08971618572616079  data_num :  16842

phase:  0  top_ 30  :  hit_num :  1601 hit_rate :  0.09505996912480703  data_num :  16842

phase:  0  top_ 35  :  hit_num :  1691 hit_rate :  0.10040375252345327  data_num :  16842

phase:  0  top_ 40  :  hit_num :  1771 hit_rate :  0.10515378221113882  data_num :  16842

phase:  0  top_ 45  :  hit_num :  1845 hit_rate :  0.10954755967224795  data_num :  16842

phase:  0  top_ 50  :  hit_num :

# 优化模型rank

# 获取label 召回的item == item_next 真实 则为1

In [34]:
data_list = []

print('------- 构建样本 -----------')
temp_ = topk_recall
temp_['label'] = [ 1 if next_item_id == item_similar else 0 for (next_item_id, item_similar) in zip(temp_['next_item_id'], temp_['item_similar'])] 

------- 构建样本 -----------


# 为了减少训练样本，降低内存的压力，只取召回样本中存在真实next_item_id训练

In [35]:
set_user_label_1 = set(temp_[temp_['label']==1]['user_id'])
temp_['keep'] = temp_['user_id'].map(lambda x: 1 if x in set_user_label_1 else 0)
train_data = temp_[temp_['keep']==1][['user_id','item_similar','score_similar','label']]

# temp_['pred'] = temp_['user_id'].map(lambda x: 'test' if x in set_pred else 'train')
test_data = temp_[temp_['pred']=='test'][['user_id','item_similar','score_similar']]

# 加入用户行为序列 方便后续构建特征

In [36]:
train_data = train_data.merge(data_,on=['user_id'],how='left')
test_data = test_data.merge(data_,on=['user_id'],how='left')

In [40]:
list_train_test = [('train', train_data), ('test', test_data)]

# 加入训练特征

In [41]:
data_list = []
for flag, data in list_train_test:

    print('------- 加入特征 {flag} -----------'.format(flag=flag))
        
    list_train_flag, list_user_id, list_item_similar, list_label, list_features = [], [], [], [], []
    
    for i,row in tqdm(data.iterrows()):

        user_id, item_id, score_similar = str(row['user_id']), str(row['item_similar']), float(row['score_similar'])
        
        list_item_id = row['item_id'].split(',')[::-1]
        
        feature = [score_similar, len(list_item_id), stat_cnt[item_id]]
#         feature = []
        feature_col_name = ['score_similar','len_item_clicked','recall_item_cnt']
        
        len_ = len(list_item_id)
        
        for i in range(10):
            if i < len_:
                item_i = list_item_id[i]
            
#                 feature += [item_i, stat_cnt[item_i]]
                if (item_i in matrix_association_rules) and (item_id in matrix_association_rules[item_i]):
                    feature += [matrix_association_rules[item_i][item_id]]
                else:
                    feature += [0]
                if (item_id in matrix_association_rules) and (item_i in matrix_association_rules[item_id]):
                    feature += [matrix_association_rules[item_id][item_i]]
                else:
                    feature += [0]
        
            else:
                feature += [0] * 2
                
            feature_col_name += ['clicked_item_'+str(i)+'_to_item_'+str(i)+'_score',
                                 'item_'+str(i)+'_to_'+'clicked_item_'+str(i)+'_score']

        list_features.append(feature)

        list_train_flag.append(flag)
        list_user_id.append(user_id)
        list_item_similar.append(item_id)

        if flag == 'train':
            label = int(row['label'])
            list_label.append(label)

        if flag == 'test':  
            label = -1
            list_label.append(label)

    feature_all = pd.DataFrame(list_features)
    feature_all.columns = ['f_'+str(i) for i in range(len(feature_all.columns))]

    feature_all['train_flag'] = list_train_flag
    feature_all['user_id'] = list_user_id
    feature_all['item_similar'] = list_item_similar
    feature_all['label'] = list_label

    data_list.append(feature_all)

feature_all_train_test = pd.concat(data_list)


print('--------------------------- 特征数据 ---------------------')
len_f = len(feature_all_train_test)
len_train = len(feature_all_train_test[feature_all_train_test['train_flag']=='train'])
len_test = len(feature_all_train_test[feature_all_train_test['train_flag']=='test'])
len_train_1 = len(feature_all_train_test[(feature_all_train_test['train_flag']=='train') & (feature_all_train_test['label']== 1)]) 
print('所有数据条数', len_f)
print('训练数据 : ', len_train)
print('训练数据 label 1 : ', len_train_1)
print('训练数据 1 / 0 rate : ', len_train_1 * 1.0 / len_f)
print('测试数据 : ' , len_test)
print('flag : ', set(feature_all_train_test['train_flag']))
print('--------------------------- 特征数据 ---------------------')

309it [00:00, 3081.50it/s]

------- 加入特征 train -----------


95000it [00:21, 4454.36it/s]
109it [00:00, 1055.10it/s]

------- 加入特征 test -----------


83150it [00:19, 4369.84it/s]


--------------------------- 特征数据 ---------------------
所有数据条数 178150
训练数据 :  95000
训练数据 label 1 :  1900
训练数据 1 / 0 rate :  0.010665169800729722
测试数据 :  83150
flag :  {'train', 'test'}
--------------------------- 特征数据 ---------------------


In [42]:
feature_all_train_test.head()

Unnamed: 0,f_0,f_1,f_2,f_3,f_4,f_5,f_6,f_7,f_8,f_9,...,f_17,f_18,f_19,f_20,f_21,f_22,train_flag,user_id,item_similar,label
0,0.10111,9,7,0.0,0.0,0.150323,0.072382,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,train,10017,115250,0
1,0.074928,9,9,0.049158,0.032772,0.0,0.0,0.067001,0.05351,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,train,10017,105160,0
2,0.043984,9,8,0.027955,0.052613,0.0,0.0,0.031298,0.046947,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,train,10017,112366,0
3,0.042977,9,4,0.020458,0.030688,0.040533,0.020266,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,train,10017,113906,0
4,0.041175,9,13,0.038058,0.040754,0.0,0.0,0.008103,0.007479,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,train,10017,109955,0


# 导入模型 训练 rank

In [43]:
from sklearn.ensemble import RandomForestClassifier

  from numpy.core.umath_tests import inner1d


# 获取训练集和测试集

In [44]:
feature_all = feature_all_train_test
train_data = feature_all[feature_all['train_flag']=='train']
test_data = feature_all[feature_all['train_flag']=='test']

# 根据用户切分数据集 验证集占20%

In [45]:
valid=0.2

df_user = pd.DataFrame(list(set(train_data['user_id'])))
df_user.columns = ['user_id']

# df = df_user.sample(frac=1.0)  
df = df_user
cut_idx = int(round(valid * df.shape[0]))
df_train_0, df_train_1 = df.iloc[:cut_idx], df.iloc[cut_idx:]

train_data_0 = df_train_0.merge(train_data,on=['user_id'],how='left')
train_data_1 = df_train_1.merge(train_data,on=['user_id'],how='left')

# 获取特征数据

In [46]:
f_col = [c for c in feature_all.columns if c not in ['train_flag','label','user_id','item_similar']]
f_label = 'label'

X0 = train_data_0[f_col].values
y0 = train_data_0[f_label].values

X1 = train_data_1[f_col].values
y1 = train_data_1[f_label].values

X_pred = test_data[f_col].values

# 训练模型

In [47]:
clf = RandomForestClassifier(n_estimators=100)
clf = clf.fit(X0, y0)

# 验证集验证模型效果并和没有rank的对比

# 获取用户、召回、标签、以及召回score

In [48]:
train_eval = train_data_1[['user_id','item_similar','label','f_0']]
len_hot = len(hot_list)
high_half_item, low_half_item = hot_list[:len_hot//2], hot_list[len_hot//2:] 
train_eval['half'] = train_eval['item_similar'].map(lambda x: 1 if x in low_half_item else 0)

# 验证集预测，并排序

In [49]:
topk = 50

y1_pred = clf.predict_proba(X1)[:,1]
train_eval['pred_prob'] = y1_pred 

train_eval['rank'] = train_eval.groupby(['user_id'])['pred_prob'].rank(ascending=False, method='first')
train_eval['rank_recall'] = train_eval.groupby(['user_id'])['f_0'].rank(ascending=False, method='first')

train_eval_rank = train_eval[train_eval['rank']<=topk]
train_eval_rank_recall = train_eval[train_eval['rank_recall']<=topk]

# 打印模型rank效果并和原始召回对比

In [50]:
recall_rate = hit_rate

#  rank 模型训练 效果

In [51]:
len_user_id = len(set(train_eval.user_id))

hitrate_50_full = np.sum(train_eval_rank['label']) / len_user_id * recall_rate
hitrate_50_half = np.sum(train_eval_rank['label'] * train_eval_rank['half']) / len_user_id * recall_rate
ndcg_50_full = np.sum(train_eval_rank['label'] / np.log2(train_eval_rank['rank'] + 2.0) * recall_rate)
ndcg_50_half = np.sum(train_eval_rank['label'] * train_eval_rank['half'] / np.log2(train_eval_rank['rank'] + 2.0) * recall_rate)

print("------------- eval result -------------")
print("hitrate_50_full : ", hitrate_50_full, 'ndcg_50_full : ', ndcg_50_full, '\n')
print("hitrate_50_half : ", hitrate_50_half, 'ndcg_50_half : ', ndcg_50_half, '\n')
print("------------- eval result -------------")

------------- eval result -------------
hitrate_50_full :  0.11281320508253176 ndcg_50_full :  60.06381568921192 

hitrate_50_half :  0.03362130388314927 ndcg_50_half :  17.758613810817334 

------------- eval result -------------


# 原始直接召回 效果

In [52]:
len_user_id = len(set(train_eval.user_id))

hitrate_50_full = np.sum(train_eval_rank_recall['label']) / len_user_id * recall_rate
hitrate_50_half = np.sum(train_eval_rank_recall['label'] * train_eval_rank_recall['half']) / len_user_id * recall_rate
ndcg_50_full = np.sum(train_eval_rank_recall['label'] / np.log2(train_eval_rank_recall['rank_recall'] + 2.0) * recall_rate)
ndcg_50_half = np.sum(train_eval_rank_recall['label'] * train_eval_rank_recall['half'] / np.log2(train_eval_rank_recall['rank_recall'] + 2.0) * recall_rate)

print("------------- eval result -------------")
print("hitrate_50_full : ", hitrate_50_full, 'ndcg_50_full : ', ndcg_50_full, '\n')
print("hitrate_50_half : ", hitrate_50_half, 'ndcg_50_half : ', ndcg_50_half, '\n')
print("------------- eval result -------------")

------------- eval result -------------
hitrate_50_full :  0.11281320508253176 ndcg_50_full :  58.92521907471739 

hitrate_50_half :  0.03362130388314927 ndcg_50_half :  15.763012514521709 

------------- eval result -------------


# 查看特征重要性

In [55]:
df_feature_importances = pd.DataFrame()
df_feature_importances['f_name'] = feature_col_name
df_feature_importances['importances'] = clf.feature_importances_

In [56]:
df_feature_importances.sort_values(['importances'],ascending=0)

Unnamed: 0,f_name,importances
0,score_similar,0.168788
4,item_0_to_clicked_item_0_score,0.131143
3,clicked_item_0_to_item_0_score,0.120445
1,len_item_clicked,0.098493
2,recall_item_cnt,0.093328
6,item_1_to_clicked_item_1_score,0.06326
5,clicked_item_1_to_item_1_score,0.058691
7,clicked_item_2_to_item_2_score,0.037717
8,item_2_to_clicked_item_2_score,0.035174
10,item_3_to_clicked_item_3_score,0.022363


# 我还想提高怎么办？

## 增加新特征

In [57]:
data_list = []
for flag, data in list_train_test:

    print('------- 加入特征 {flag} -----------'.format(flag=flag))
        
    list_train_flag, list_user_id, list_item_similar, list_label, list_features = [], [], [], [], []
    
    for i,row in tqdm(data.iterrows()):

        user_id, item_id, score_similar = str(row['user_id']), str(row['item_similar']), float(row['score_similar'])
        
        list_item_id = row['item_id'].split(',')[::-1]
        
        feature = [score_similar, len(list_item_id), stat_cnt[item_id]]
        feature_col_name = ['score_similar','len_item_clicked','recall_item_cnt']
#         feature = []
        
        len_ = len(list_item_id)
        
        for i in range(10):
            if i < len_:
                item_i = list_item_id[i]
            
                feature += [item_i, stat_cnt[item_i]]
                if (item_i in matrix_association_rules) and (item_id in matrix_association_rules[item_i]):
                    feature += [matrix_association_rules[item_i][item_id]]
                else:
                    feature += [0]
                if (item_id in matrix_association_rules) and (item_i in matrix_association_rules[item_id]):
                    feature += [matrix_association_rules[item_id][item_i]]
                else:
                    feature += [0]
        
            else:
                feature += [0] * 4
            feature_col_name += ['clicked_item_'+str(i),
                                 'clicked_item_'+str(i)+'_cnt',
                                 'clicked_item_'+str(i)+'_to_item_'+str(i)+'_score',
                                 'item_'+str(i)+'_to_'+'clicked_item_'+str(i)+'_score']

        list_features.append(feature)

        list_train_flag.append(flag)
        list_user_id.append(user_id)
        list_item_similar.append(item_id)

        if flag == 'train':
            label = int(row['label'])
            list_label.append(label)

        if flag == 'test':  
            label = -1
            list_label.append(label)

    feature_all = pd.DataFrame(list_features)
    feature_all.columns = ['f_'+str(i) for i in range(len(feature_all.columns))]

    feature_all['train_flag'] = list_train_flag
    feature_all['user_id'] = list_user_id
    feature_all['item_similar'] = list_item_similar
    feature_all['label'] = list_label

    data_list.append(feature_all)

feature_all_train_test = pd.concat(data_list)


print('--------------------------- 特征数据 ---------------------')
len_f = len(feature_all_train_test)
len_train = len(feature_all_train_test[feature_all_train_test['train_flag']=='train'])
len_test = len(feature_all_train_test[feature_all_train_test['train_flag']=='test'])
len_train_1 = len(feature_all_train_test[(feature_all_train_test['train_flag']=='train') & (feature_all_train_test['label']== 1)]) 
print('所有数据条数', len_f)
print('训练数据 : ', len_train)
print('训练数据 label 1 : ', len_train_1)
print('训练数据 1 / 0 rate : ', len_train_1 * 1.0 / len_f)
print('测试数据 : ' , len_test)
print('flag : ', set(feature_all_train_test['train_flag']))
print('--------------------------- 特征数据 ---------------------')

352it [00:00, 3514.03it/s]

------- 加入特征 train -----------


95000it [00:22, 4136.91it/s]
282it [00:00, 2817.97it/s]

------- 加入特征 test -----------


83150it [00:19, 4306.29it/s]


--------------------------- 特征数据 ---------------------
所有数据条数 178150
训练数据 :  95000
训练数据 label 1 :  1900
训练数据 1 / 0 rate :  0.010665169800729722
测试数据 :  83150
flag :  {'train', 'test'}
--------------------------- 特征数据 ---------------------


In [58]:
feature_all_train_test.head()

Unnamed: 0,f_0,f_1,f_2,f_3,f_4,f_5,f_6,f_7,f_8,f_9,...,f_37,f_38,f_39,f_40,f_41,f_42,train_flag,user_id,item_similar,label
0,0.10111,9,7,104653,12,0.0,0.0,117009,4,0.150323,...,0.0,0.0,0,0,0.0,0.0,train,10017,115250,0
1,0.074928,9,9,104653,12,0.049158,0.032772,117009,4,0.0,...,0.0,0.0,0,0,0.0,0.0,train,10017,105160,0
2,0.043984,9,8,104653,12,0.027955,0.052613,117009,4,0.0,...,0.0,0.0,0,0,0.0,0.0,train,10017,112366,0
3,0.042977,9,4,104653,12,0.020458,0.030688,117009,4,0.040533,...,0.0,0.0,0,0,0.0,0.0,train,10017,113906,0
4,0.041175,9,13,104653,12,0.038058,0.040754,117009,4,0.0,...,0.0,0.0,0,0,0.0,0.0,train,10017,109955,0


# 导入模型

In [59]:
from sklearn.ensemble import RandomForestClassifier

## 获取训练集和测试集

In [60]:
feature_all = feature_all_train_test
train_data = feature_all[feature_all['train_flag']=='train']
test_data = feature_all[feature_all['train_flag']=='test']

# 根据用户切分数据集 验证集占20%

In [61]:
valid=0.2

df_user = pd.DataFrame(list(set(train_data['user_id'])))
df_user.columns = ['user_id']

# df = df_user.sample(frac=1.0)  
df = df_user
cut_idx = int(round(valid * df.shape[0]))
df_train_0, df_train_1 = df.iloc[:cut_idx], df.iloc[cut_idx:]

train_data_0 = df_train_0.merge(train_data,on=['user_id'],how='left')
train_data_1 = df_train_1.merge(train_data,on=['user_id'],how='left')

# 获取特征数据

In [62]:
f_col = [c for c in feature_all.columns if c not in ['train_flag','label','user_id','item_similar']]
f_label = 'label'

X0 = train_data_0[f_col].values
y0 = train_data_0[f_label].values

X1 = train_data_1[f_col].values
y1 = train_data_1[f_label].values

X_pred = test_data[f_col].values

# 训练模型

In [63]:
clf = RandomForestClassifier(n_estimators=100)
clf = clf.fit(X0, y0)

# 获取用户、召回、标签、以及召回score

In [64]:
train_eval = train_data_1[['user_id','item_similar','label','f_0']]
len_hot = len(hot_list)
high_half_item, low_half_item = hot_list[:len_hot//2], hot_list[len_hot//2:] 
train_eval['half'] = train_eval['item_similar'].map(lambda x: 1 if x in low_half_item else 0)

# 验证集预测，并排序

In [65]:
topk = 50

y1_pred = clf.predict_proba(X1)[:,1]
train_eval['pred_prob'] = y1_pred 

train_eval['rank'] = train_eval.groupby(['user_id'])['pred_prob'].rank(ascending=False, method='first')
train_eval['rank_recall'] = train_eval.groupby(['user_id'])['f_0'].rank(ascending=False, method='first')

train_eval_rank = train_eval[train_eval['rank']<=topk]
train_eval_rank_recall = train_eval[train_eval['rank_recall']<=topk]

# 打印模型rank效果并和原始召回对比

In [66]:
recall_rate = hit_rate

# rank 模型训练 效果

In [67]:
len_user_id = len(set(train_eval.user_id))

hitrate_50_full = np.sum(train_eval_rank['label']) / len_user_id * recall_rate
hitrate_50_half = np.sum(train_eval_rank['label'] * train_eval_rank['half']) / len_user_id * recall_rate
ndcg_50_full = np.sum(train_eval_rank['label'] / np.log2(train_eval_rank['rank'] + 2.0) * recall_rate)
ndcg_50_half = np.sum(train_eval_rank['label'] * train_eval_rank['half'] / np.log2(train_eval_rank['rank'] + 2.0) * recall_rate)

print("------------- eval result -------------")
print("hitrate_50_full : ", hitrate_50_full, 'ndcg_50_full : ', ndcg_50_full, '\n')
print("hitrate_50_half : ", hitrate_50_half, 'ndcg_50_half : ', ndcg_50_half, '\n')
print("------------- eval result -------------")

------------- eval result -------------
hitrate_50_full :  0.11281320508253176 ndcg_50_full :  61.584075915351264 

hitrate_50_half :  0.03362130388314927 ndcg_50_half :  18.2432843789127 

------------- eval result -------------


# 原始召回结果

In [68]:
len_user_id = len(set(train_eval.user_id))

hitrate_50_full = np.sum(train_eval_rank_recall['label']) / len_user_id * recall_rate
hitrate_50_half = np.sum(train_eval_rank_recall['label'] * train_eval_rank_recall['half']) / len_user_id * recall_rate
ndcg_50_full = np.sum(train_eval_rank_recall['label'] / np.log2(train_eval_rank_recall['rank_recall'] + 2.0) * recall_rate)
ndcg_50_half = np.sum(train_eval_rank_recall['label'] * train_eval_rank_recall['half'] / np.log2(train_eval_rank_recall['rank_recall'] + 2.0) * recall_rate)

print("------------- eval result -------------")
print("hitrate_50_full : ", hitrate_50_full, 'ndcg_50_full : ', ndcg_50_full, '\n')
print("hitrate_50_half : ", hitrate_50_half, 'ndcg_50_half : ', ndcg_50_half, '\n')
print("------------- eval result -------------")

------------- eval result -------------
hitrate_50_full :  0.11281320508253176 ndcg_50_full :  58.92521907471739 

hitrate_50_half :  0.03362130388314927 ndcg_50_half :  15.763012514521709 

------------- eval result -------------


# 查看特征重要性

In [69]:
df_feature_importances = pd.DataFrame()
# df_feature_importances['f_name'] = f_col
df_feature_importances['f_name'] = feature_col_name
df_feature_importances['importances'] = clf.feature_importances_
df_feature_importances.sort_values(['importances'],ascending=0)

Unnamed: 0,f_name,importances
0,score_similar,0.108926
6,item_0_to_clicked_item_0_score,0.090925
5,clicked_item_0_to_item_0_score,0.081878
2,recall_item_cnt,0.072961
10,item_1_to_clicked_item_1_score,0.044522
9,clicked_item_1_to_item_1_score,0.038812
4,clicked_item_0_cnt,0.027211
3,clicked_item_0,0.027063
13,clicked_item_2_to_item_2_score,0.025555
14,item_2_to_clicked_item_2_score,0.02555


# 我怎么优化half指标

# 获取特征数据

In [70]:
f_col = [c for c in feature_all.columns if c not in ['train_flag','label','user_id','item_similar']]
f_label = 'label'

X0 = train_data_0[f_col].values
y0 = train_data_0[f_label].values

X1 = train_data_1[f_col].values
y1 = train_data_1[f_label].values

X_pred = test_data[f_col].values

# half 数据增加

In [71]:
train_add = train_data_0
len_hot = len(hot_list)
high_half_item, low_half_item = hot_list[:len_hot//2], hot_list[len_hot//2:] 
train_add['half'] = train_eval['item_similar'].map(lambda x: 1 if x in low_half_item else 0)
train_add = train_add[train_add['half']==1]

In [72]:
del train_add['half']

In [73]:
train_data_0_add = pd.concat([train_data_0,train_add,train_add])

In [74]:
X0_add = train_data_0_add[f_col].values
y0_add = train_data_0_add[f_label].values

# 训练模型

In [75]:
clf = RandomForestClassifier(n_estimators=100)
clf = clf.fit(X0, y0)

# 获取用户、召回、标签、以及召回score

In [76]:
train_eval = train_data_1[['user_id','item_similar','label','f_0']]
len_hot = len(hot_list)
high_half_item, low_half_item = hot_list[:len_hot//2], hot_list[len_hot//2:] 
train_eval['half'] = train_eval['item_similar'].map(lambda x: 1 if x in low_half_item else 0)

# 验证集预测，并排序

In [77]:
topk = 50

y1_pred = clf.predict_proba(X1)[:,1]
train_eval['pred_prob'] = y1_pred 

train_eval['rank'] = train_eval.groupby(['user_id'])['pred_prob'].rank(ascending=False, method='first')
train_eval['rank_recall'] = train_eval.groupby(['user_id'])['f_0'].rank(ascending=False, method='first')

train_eval_rank = train_eval[train_eval['rank']<=topk]
train_eval_rank_recall = train_eval[train_eval['rank_recall']<=topk]

# 打印模型rank效果并和原始召回对比

In [78]:
recall_rate = hit_rate

# rank 模型训练 效果

In [277]:
len_user_id = len(set(train_eval.user_id))

hitrate_50_full = np.sum(train_eval_rank['label']) / len_user_id * recall_rate
hitrate_50_half = np.sum(train_eval_rank['label'] * train_eval_rank['half']) / len_user_id * recall_rate
ndcg_50_full = np.sum(train_eval_rank['label'] / np.log2(train_eval_rank['rank'] + 2.0) * recall_rate)
ndcg_50_half = np.sum(train_eval_rank['label'] * train_eval_rank['half'] / np.log2(train_eval_rank['rank'] + 2.0) * recall_rate)

print("------------- eval result -------------")
print("hitrate_50_full : ", hitrate_50_full, 'ndcg_50_full : ', ndcg_50_full, '\n')
print("hitrate_50_half : ", hitrate_50_half, 'ndcg_50_half : ', ndcg_50_half, '\n')
print("------------- eval result -------------")

------------- eval result -------------
hitrate_50_full :  0.11281320508253176 ndcg_50_full :  62.18469719610212 

hitrate_50_half :  0.03295333095831849 ndcg_50_half :  18.341852749227353 

------------- eval result -------------


# 换好模型？

# Stacking？

# 加特征？什么是有效特征？

# 召回提高？