In [1]:
# import packages
import time, math, os
from tqdm import tqdm
import gc
import pickle
import random
from datetime import datetime
from operator import itemgetter
import numpy as np
import pandas as pd
import warnings
from collections import defaultdict
import collections
warnings.filterwarnings('ignore')

In [11]:
# Directory the click-log CSVs are read from. The cell that loads train_click_log.csv
# builds its path by plain string concatenation, so the trailing slash matters.
data_path = 'data/'
# Prefix under which itemcf_sim writes its pickled similarity matrix (also concatenated).
save_path = 'tmp_results/'

# df节省内存函数

In [8]:
# Standard helper for shrinking a DataFrame's memory footprint.
def reduce_mem(df):
    """Downcast the numeric columns of ``df`` to the narrowest dtype that holds their range.

    Only columns whose dtype is int16/int32/int64/float16/float32/float64 are
    considered. Columns whose min or max is null (for example all-NaN columns)
    are left untouched. The columns are reassigned on ``df`` itself, and a
    one-line summary of the memory saving is printed.

    Note: float columns are narrowed purely by value range, so a float16 or
    float32 result may carry less precision than the original column.

    :param df: DataFrame to shrink (modified in place).
    :return: the same DataFrame object, after downcasting.
    """
    starttime = time.time()
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    # Candidate dtypes ordered from narrowest to widest.
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if pd.isnull(c_min) or pd.isnull(c_max):
            continue
        if str(col_type)[:3] == 'int':
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                # Inclusive bounds: a value equal to the dtype limit still fits that dtype.
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            # Values outside the float32 range (including +/-inf) fall back to float64.
            target = np.float64
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    target = candidate
                    break
            df[col] = df[col].astype(target)
    end_mem = df.memory_usage().sum() / 1024**2
    # Avoid a 0/0 percentage when the frame reports no memory at all.
    reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('-- Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction),time spend:{:2.2f} min'.format(end_mem,
                                                                                                           reduction,
                                                                                                           (time.time()-starttime)/60))
    return df

# 读取采样或全量数据

In [14]:
# Load the full training click log and collect the distinct user ids found in it.
all_click = pd.read_csv(data_path + 'train_click_log.csv')
all_user_ids = all_click.user_id.unique()

In [15]:
all_user_ids   # 200,000 users

array([199999, 199998, 199997, ...,      2,      1,      0], dtype=int64)

In [17]:
# Draw 10,000 distinct user ids (replace=False). No seed is set in the visible cells,
# so the sampled ids differ from run to run.
sample_user_ids = np.random.choice(all_user_ids, size=10000, replace=False)
sample_user_ids

array([182965,  36017, 173760, ...,  93597,  53896, 144854], dtype=int64)

# 从整个数据集中取出采样的用户点击数据

In [18]:
# Keep only the click rows of the sampled users. This rebinds all_click, so the
# full log is no longer available under that name after this cell runs.
all_click = all_click[all_click['user_id'].isin(sample_user_ids)]
all_click

Unnamed: 0,user_id,click_article_id,click_timestamp,click_environment,click_deviceGroup,click_os,click_country,click_region,click_referrer_type
3,199998,157770,1507029532200,4,1,17,1,25,5
4,199998,96613,1507029671831,4,1,17,1,25,5
5,199998,144978,1507029804858,4,1,17,1,25,5
6,199998,285371,1507029834858,4,1,17,1,25,5
14,199994,235230,1507053409054,4,1,17,1,8,2
...,...,...,...,...,...,...,...,...,...
1112544,88053,209122,1508211093336,4,3,20,10,28,2
1112545,88053,224730,1508211232678,4,3,20,10,28,2
1112546,88053,234481,1508211262678,4,3,20,10,28,2
1112619,0,30760,1508211672520,4,1,17,1,25,2


In [19]:
all_click = all_click.drop_duplicates((['user_id', 'click_article_id', 'click_timestamp'])) # drop rows repeating the same (user, article, timestamp)
all_click

Unnamed: 0,user_id,click_article_id,click_timestamp,click_environment,click_deviceGroup,click_os,click_country,click_region,click_referrer_type
3,199998,157770,1507029532200,4,1,17,1,25,5
4,199998,96613,1507029671831,4,1,17,1,25,5
5,199998,144978,1507029804858,4,1,17,1,25,5
6,199998,285371,1507029834858,4,1,17,1,25,5
14,199994,235230,1507053409054,4,1,17,1,8,2
...,...,...,...,...,...,...,...,...,...
1112544,88053,209122,1508211093336,4,3,20,10,28,2
1112545,88053,224730,1508211232678,4,3,20,10,28,2
1112546,88053,234481,1508211262678,4,3,20,10,28,2
1112619,0,30760,1508211672520,4,1,17,1,25,2


In [9]:
# Debug mode: carve a subset of users out of the training log to iterate on the code quickly.
def get_all_click_sample(data_path, sample_nums=10000, seed=None):
    """Read the training click log and keep the clicks of a random subset of users.

    :param data_path: directory that contains ``train_click_log.csv``.
    :param sample_nums: number of distinct users to sample. Sampling is done per
        user rather than per row, to cope with limited machine memory. If the
        log has fewer users than this, all users are kept.
    :param seed: optional integer seed for a reproducible sample. ``None`` draws
        from NumPy's global random state.
    :return: DataFrame with the sampled users' clicks, de-duplicated on
        (user_id, click_article_id, click_timestamp).
    """
    all_click = pd.read_csv(os.path.join(data_path, 'train_click_log.csv'))
    all_user_ids = all_click.user_id.unique()

    # choice(..., replace=False) raises ValueError when asked for more ids than exist,
    # so cap the request at the number of available users.
    n_pick = min(sample_nums, len(all_user_ids))
    rng = np.random if seed is None else np.random.RandomState(seed)
    sample_user_ids = rng.choice(all_user_ids, size=n_pick, replace=False)
    all_click = all_click[all_click['user_id'].isin(sample_user_ids)]

    all_click = all_click.drop_duplicates(['user_id', 'click_article_id', 'click_timestamp'])
    return all_click

# Load the click data in one of two modes. To produce an online submission, the test-set
# clicks should be merged into the overall data; to validate a model or a feature offline,
# the training set alone is enough.
def get_all_click_df(data_path, offline=True):
    """Read the click log(s) and drop duplicated click rows.

    :param data_path: directory that contains ``train_click_log.csv`` (and
        ``testA_click_log.csv`` when ``offline`` is False).
    :param offline: True reads only the training log; False reads the training
        log followed by the test-A log.
    :return: DataFrame de-duplicated on (user_id, click_article_id,
        click_timestamp). Row labels of the source frames are preserved.
    """
    train_csv = os.path.join(data_path, 'train_click_log.csv')
    if offline:
        all_click = pd.read_csv(train_csv)
    else:
        trn_click = pd.read_csv(train_csv)
        tst_click = pd.read_csv(os.path.join(data_path, 'testA_click_log.csv'))

        # DataFrame.append was removed in pandas 2.0; pd.concat stacks the frames the
        # same way and likewise keeps each frame's original row labels.
        all_click = pd.concat([trn_click, tst_click])

    all_click = all_click.drop_duplicates(['user_id', 'click_article_id', 'click_timestamp'])
    return all_click

In [20]:
# Sample the click log: keep the clicks of 10,000 randomly chosen users.
sample_df = get_all_click_sample(data_path,sample_nums=10000)

In [21]:
sample_df

Unnamed: 0,user_id,click_article_id,click_timestamp,click_environment,click_deviceGroup,click_os,click_country,click_region,click_referrer_type
56,199982,156624,1507029692899,4,4,20,1,21,1
57,199982,156447,1507029954121,4,4,20,1,21,1
58,199982,149623,1507029984121,4,4,20,1,21,1
119,199963,272143,1507029783117,4,1,17,1,10,2
120,199963,199198,1507029813117,4,1,17,1,10,2
...,...,...,...,...,...,...,...,...,...
1112343,51224,209122,1508210320395,4,1,12,1,6,7
1112344,51224,284470,1508210350395,4,1,12,1,6,7
1112417,42919,234308,1508212604522,4,3,2,1,25,2
1112418,42919,62465,1508212745879,4,3,2,1,25,1


# 获取 用户-文章-点击时间字典

## zip函数解释

In [25]:
# zip pairs the i-th element of each input; it is lazy, so list() is used to show the pairs.
seq1 = ['foo', 'bar', 'baz']
seq2 = ['one', 'two', 'three']
zipped = zip(seq1, seq2)
list(zipped)

[('foo', 'one'), ('bar', 'two'), ('baz', 'three')]

In [26]:
# Pair each click row's article id with its timestamp, in row order (one tuple per click row).
list(zip(sample_df['click_article_id'],sample_df['click_timestamp']))

[(156624, 1507029692899),
 (156447, 1507029954121),
 (149623, 1507029984121),
 (272143, 1507029783117),
 (199198, 1507029813117),
 (156624, 1507029647927),
 (156560, 1507030470548),
 (338350, 1507030500548),
 (338350, 1507029725778),
 (62675, 1507029755778),
 (175040, 1507029913037),
 (336476, 1507029943037),
 (24764, 1507032616256),
 (162655, 1507032718370),
 (156560, 1507032748370),
 (336476, 1507030089123),
 (156624, 1507030119123),
 (348111, 1507030548367),
 (156560, 1507030578367),
 (65373, 1507030076561),
 (65794, 1507030106561),
 (74778, 1507030586964),
 (50823, 1507030719356),
 (299697, 1507030749356),
 (182513, 1507030122568),
 (156624, 1507030152568),
 (36605, 1507031711625),
 (199198, 1507031741625),
 (182513, 1507030335957),
 (336476, 1507030365957),
 (156624, 1507030229420),
 (158536, 1507030608621),
 (272143, 1507031182992),
 (162765, 1507031212992),
 (64329, 1507030604031),
 (285343, 1507030782729),
 (284547, 1507055886565),
 (286161, 1507055916565),
 (199198, 1507030380

In [39]:
# Peek at the first item produced by the groupby. Iterating a groupby yields
# (user_id, group_frame) pairs, so slicing the pair with [1:2] gives a 1-tuple
# holding only that user's DataFrame.
for group_pair in sample_df.groupby('user_id'):
    print(group_pair[1:2])
    break  # only the first group is inspected; no need to walk the remaining users

(         user_id  click_article_id  click_timestamp  click_environment  \
1112021       66            211442    1508209372192                  4   
1112022       66            156279    1508209402192                  4   

         click_deviceGroup  click_os  click_country  click_region  \
1112021                  1        17              1            25   
1112022                  1        17              1            25   

         click_referrer_type  
1112021                    2  
1112022                    2  ,)


In [30]:
def make_item_time_pair(df):
    """Return the (click_article_id, click_timestamp) pairs of ``df`` as a list of tuples, in row order."""
    return list(zip(df['click_article_id'], df['click_timestamp']))
# Several columns must be selected from a groupby with a list ([[...]]): the tuple form
# gb['a', 'b'] was deprecated and raises on pandas >= 2.0.
# apply() returns an unnamed Series, so reset_index() labels its column 0; rename it.
user_item_time_df = sample_df.groupby('user_id')[['click_article_id', 'click_timestamp']].apply(lambda x: make_item_time_pair(x))\
                                                            .reset_index().rename(columns={0: 'item_time_list'})
user_item_time_df

Unnamed: 0,user_id,item_time_list
0,66,"[(211442, 1508209372192), (156279, 15082094021..."
1,75,"[(79851, 1508209112750), (235323, 1508209142750)]"
2,79,"[(331116, 1508209102439), (61452, 1508209132439)]"
3,121,"[(209122, 1508208351053), (211442, 15082083810..."
4,123,"[(205824, 1508208324380), (50644, 1508208354380)]"
...,...,...
9995,199887,"[(175040, 1507029913037), (336476, 15070299430..."
9996,199905,"[(338350, 1507029725778), (62675, 150702975577..."
9997,199933,"[(156624, 1507029647927), (156560, 15070304705..."
9998,199963,"[(272143, 1507029783117), (199198, 15070298131..."


In [22]:
# Build each user's click sequence ordered by click time:
# {user1: [(item1, time1), (item2, time2), ...], ...}
def get_user_item_time(click_df):
    """Map every user to the list of articles they clicked, ordered by timestamp.

    :param click_df: click log with ``user_id``, ``click_article_id`` and
        ``click_timestamp`` columns. It is not modified; sorting works on a copy.
    :return: dict ``{user_id: [(click_article_id, click_timestamp), ...]}`` with
        each list in ascending ``click_timestamp`` order.
    """
    click_df = click_df.sort_values('click_timestamp')

    def make_item_time_pair(df):
        return list(zip(df['click_article_id'], df['click_timestamp']))

    # Select the two columns with a list: the tuple form gb['a', 'b'] raises on pandas >= 2.0.
    # apply() yields an unnamed Series, which reset_index() exposes as column 0.
    user_item_time_df = click_df.groupby('user_id')[['click_article_id', 'click_timestamp']].apply(lambda x: make_item_time_pair(x))\
                                                            .reset_index().rename(columns={0: 'item_time_list'})
    user_item_time_dict = dict(zip(user_item_time_df['user_id'], user_item_time_df['item_time_list']))

    return user_item_time_dict

In [23]:
# Build the per-user click history for the sampled log and display it.
sample_user_item_time_dict = get_user_item_time(sample_df)
sample_user_item_time_dict

{66: [(211442, 1508209372192), (156279, 1508209402192)],
 75: [(79851, 1508209112750), (235323, 1508209142750)],
 79: [(331116, 1508209102439), (61452, 1508209132439)],
 121: [(209122, 1508208351053), (211442, 1508208381053)],
 123: [(205824, 1508208324380), (50644, 1508208354380)],
 141: [(211442, 1508208214146), (156279, 1508208244146)],
 142: [(248438, 1508207869518), (249524, 1508207899518)],
 166: [(70986, 1508207403604), (205824, 1508207433604)],
 168: [(234308, 1508207336088), (205824, 1508207366088)],
 198: [(156279, 1508207023972), (234481, 1508207053972)],
 213: [(36162, 1508206742813),
  (321350, 1508207086298),
  (277107, 1508207116298)],
 252: [(30760, 1508205908436), (315104, 1508205938436)],
 260: [(162355, 1508205727327), (159405, 1508205757327)],
 275: [(224730, 1508206763474), (211442, 1508206793474)],
 308: [(154578, 1508204924515),
  (180879, 1508205232977),
  (180234, 1508205690611),
  (180346, 1508206245093),
  (180260, 1508206865273),
  (180388, 1508207154992),
 

# 获取点击最多的topk个文章

In [40]:
# Most-clicked articles in the supplied click log.
def get_item_topk_click(click_df, k):
    """Return the ids of the ``k`` articles with the most rows in ``click_df``.

    Counting covers every row of ``click_df``; no time window is applied here.

    :param click_df: click log with a ``click_article_id`` column.
    :param k: number of article ids to keep from the top of the ranking.
    :return: pandas Index of article ids, ordered from most to least clicked.
    """
    click_counts = click_df['click_article_id'].value_counts()
    ranked_articles = click_counts.index
    return ranked_articles[:k]

In [42]:
# Ten most-clicked article ids in the sampled log.
topk_click = get_item_topk_click(sample_df, 10)
topk_click

Int64Index([336223, 123909, 234698, 336221, 96210, 183176, 168623, 331116,
            336245, 235616],
           dtype='int64')

# itemcf的物品相似度计算

## Python 字典的 setdefault() 方法与 get() 方法类似：如果键不存在于字典中，会添加该键并将其值设为默认值；无论键是否已存在，都会返回该键对应的值。

## dict.setdefault(key, default=None)

In [43]:
def itemcf_sim(df):
    """Compute the item-to-item similarity matrix used by item-based collaborative filtering.

    For every user, each ordered pair of distinct articles (i, j) in that user's
    click list adds ``1 / log(len(click_list) + 1)`` to the raw score of (i, j),
    so users with long histories contribute less per pair. Each raw score is
    then divided by ``sqrt(item_cnt[i] * item_cnt[j])``, where ``item_cnt``
    counts how many click entries each article has across all users.

    The result is also pickled to ``save_path + 'itemcf_i2i_sim.pkl'``.

    :param df: click log; it is passed to ``get_user_item_time`` to obtain the
        per-user click lists.
    :return: nested dict ``{item_i: {item_j: similarity}}``.
    """

    user_item_time_dict = get_user_item_time(df)

    # Accumulate the raw co-occurrence scores.
    i2i_sim = {}
    item_cnt = defaultdict(int)
    for user, item_time_list in tqdm(user_item_time_dict.items()):
        # The click times are unpacked but unused; they are the hook for a
        # time-aware weighting when optimising item-based CF.
        for i, i_click_time in item_time_list:
            item_cnt[i] += 1
            i2i_sim.setdefault(i, {})
            for j, j_click_time in item_time_list:
                if i == j:
                    continue
                i2i_sim[i].setdefault(j, 0)

                i2i_sim[i][j] += 1 / math.log(len(item_time_list) + 1)

    # Build the normalised matrix as a genuinely new nested dict. A shallow
    # dict.copy() would share the inner dicts with i2i_sim and overwrite the raw scores.
    i2i_sim_ = {
        i: {j: wij / math.sqrt(item_cnt[i] * item_cnt[j]) for j, wij in related_items.items()}
        for i, related_items in i2i_sim.items()
    }

    # Persist the similarity matrix; the context manager closes the file handle.
    with open(save_path + 'itemcf_i2i_sim.pkl', 'wb') as sim_file:
        pickle.dump(i2i_sim_, sim_file)

    return i2i_sim_

In [44]:
# setdefault inserts the key with the given default when the key is missing.
a = {}
a.setdefault(0,{})
a

{0: {}}

In [45]:
# Same call on the inner dict: create the entry for key 1 with a starting value of 0.
a[0].setdefault(1,0)
a

{0: {1: 0}}

In [46]:
# Same increment form as itemcf_sim uses, 1 / log(len + 1), here with len = 2.
a[0][1] += 1 / math.log(2 + 1)
a[0][1]

0.9102392266268373

In [47]:
a.items()

dict_items([(0, {1: 0.9102392266268373})])

In [48]:
# Iterating the outer dict's items yields each key with its inner {key: value} dict.
for i,related_items in a.items():
    print(related_items)

{1: 0.9102392266268373}
