In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import utils
reload(utils)
import json
from collections import Counter, defaultdict
%matplotlib inline

In [2]:
# Load the raw experiment logs through the project helper (one entry per
# participant, as the later cells iterate them) and show how many were loaded.
logs = utils.loadlogs()
len(logs)

19

# News

In [3]:
class News:
    """A single news item as it was shown to one user inside one task.

    Combines the impression position, the quality/topic looked up through
    ``utils``, the post-task annotation (present when the item was annotated
    as viewed) and the reading annotation plus page logs (present when the
    item was clicked).

    Parameters
    ----------
    newsid : id of the news item; also used for the quality/topic lookups.
    position : impression position of the item in the list.
    viewport_time : time the item spent in the viewport; only stored on the
        instance when ``post_annotation`` is given.
    read_logs : list of JSON-encoded event strings of the article page, or
        None when the article page was never opened.
    read_annotation : dict with the keys 'why_click', 'pre-pref',
        'read-item-pref', 'quality' and 'title'; the first element of each
        value is used.  May be None.
    post_annotation : pair ``(annotation index, preference)`` or None.
    """

    # Class-level defaults so that every instance exposes these attributes
    # even when the item was neither viewed nor clicked.
    viewport_time = 0
    anno_pos, post_pref = None, None
    pre_pref, read_pref = None, None
    whyclick, uquality, utitle, dwell_time = None, None, None, None

    def __init__(self, newsid, position, viewport_time, read_logs=None, read_annotation=None, post_annotation=None):
        self.newsid = newsid
        self.imp_position = position
        self.equality = utils.get_news_quality(newsid)
        self.topic = utils.get_news_topic(newsid)

        self.ifview = False
        self.ifclick = False

        # --------------- view information -------------- #
        if post_annotation:
            self.ifview = True
            self.viewport_time = viewport_time
            self.anno_pos, self.post_pref = post_annotation[0], int(post_annotation[1])

        # --------------- click information -------------- #
        # A click is only recorded when both the page logs and the reading
        # annotation exist; logs without an annotation used to raise a
        # TypeError here and are now treated as "not clicked".
        # NOTE(review): answers containing "4" in 'why_click' are excluded,
        # presumably because that option marks an invalid click - confirm.
        if read_logs and read_annotation and "4" not in read_annotation['why_click'][0]:
            self.ifclick = True
            self.read_logs = read_logs
            self.pre_pref = int(read_annotation['pre-pref'][0])
            self.read_pref = int(read_annotation['read-item-pref'][0])
            self.whyclick = read_annotation['why_click'][0]
            self.uquality = int(read_annotation['quality'][0])
            self.utitle = int(read_annotation['title'][0])
            self.dwell_time = self.__cal_dwell_time()

    def __cal_dwell_time(self):
        """Return the dwell time on the article page, or NaN if it cannot be derived.

        The start is the 'time' of the first log entry (the first element
        when that entry is a list).  The end is the largest 'time' among the
        single-event entries that precede the first entry containing
        "PAGE_END".  NaN is returned when there are fewer than two entries or
        no usable end time was found.
        """
        if len(self.read_logs) > 1:
            begin_event = json.loads(self.read_logs[0])
            if isinstance(begin_event, list):
                begin_time = begin_event[0]['time']
            else:
                begin_time = begin_event['time']

            end_time = 0
            for raw_event in self.read_logs:
                # Stop at the first PAGE_END: the same article may have been
                # opened several times and only the first visit is measured.
                if "PAGE_END" in raw_event:
                    break
                event = json.loads(raw_event)
                if not isinstance(event, list):
                    end_time = max(end_time, event['time'])

            if end_time != 0:
                # NOTE(review): timestamps are presumably milliseconds.  Under
                # Python 2 this is integer division (whole units), under
                # Python 3 it is true division - confirm which is intended.
                return (int(end_time) - int(begin_time)) / 1000
        return np.nan

# Task

In [4]:
class Task:
    """One task of a user session: impression list, annotations and behaviour logs.

    Parameters
    ----------
    tasklog : dict with the keys 'task_id', 'condition', 'news_list',
        'list_annotation', 'item_prefs', 'item_annotation', 'behavior_logs',
        'item_heights', 'item_positions' and 'screen_height'.

    After construction ``news_dict`` maps every id of ``imp_list`` to a
    ``News`` object, ``click_list`` holds the news ids of all reading
    annotations, and ``browse_time`` is the span of the list-page log.
    ``pair_pref`` is None when the list annotation has no 'pair-pref' entry.
    """

    # Number of leading characters stripped from every behaviour-log key.
    # NOTE(review): presumably the length of the scheme+host prefix of the
    # logged URLs - confirm against the logging front end.
    URL_PREFIX_LEN = 25

    def __init__(self, tasklog):
        self.task_id = tasklog['task_id']
        self.condition = tasklog['condition']
        self.tasklog = tasklog
        self.imp_list = tasklog['news_list']

        list_annotation = tasklog['list_annotation']
        self.satisfaction = int(list_annotation['satisfaction'][0])
        # Starts as the raw satisfaction; callers may overwrite it with a
        # per-user normalised value.
        self.sat_zscore = self.satisfaction
        self.quality = int(list_annotation['quality'][0])

        # Always define the attribute so that reading it never raises
        # AttributeError for tasks without a pairwise preference.
        self.pair_pref = None
        if "pair-pref" in list_annotation:
            self.pair_pref = int(list_annotation['pair-pref'][0])

        self.read_annotations = tasklog['item_prefs']
        self.post_annotations = tasklog['item_annotation']

        self.behavior_logs = dict(
            [(tk[self.URL_PREFIX_LEN:], tv) for tk, tv in tasklog['behavior_logs'].items()]
        )
        self.browse_log = self.behavior_logs['/newslist/']

        self.news_dict = {}
        self.__init_news()

        self.click_list = [utils.get_newsid(tanno[0]) for tanno in self.read_annotations]
        self.browse_time = self.__get_browse_time()

    @staticmethod
    def _event_time(event, index=0):
        """Return the 'time' of a decoded log entry.

        A decoded entry is either one event dict or a list of event dicts;
        for a list, ``index`` selects the element whose time is returned.
        """
        if isinstance(event, list):
            return event[index]['time']
        return event['time']

    def __init_news(self):
        """Create one ``News`` per impression and store it in ``news_dict``."""
        vt_list = self.__get_viewport_time()
        for tpos, (newsid, vt) in enumerate(zip(self.tasklog['news_list'], vt_list)):
            turl = '/news/{}?state=content'.format(newsid)
            read_logs = self.behavior_logs.get(turl)

            # First reading annotation whose item resolves to this news id.
            read_annotation = next(
                (tanno[1] for tanno in self.read_annotations
                 if utils.get_newsid(tanno[0]) == newsid),
                None,
            )
            # First post annotation for this news id, with its list index.
            post_annotation = next(
                ((ti, tanno[1]) for ti, tanno in enumerate(self.post_annotations)
                 if tanno[0] == newsid),
                None,
            )

            # Impression positions are 1-based.
            self.news_dict[newsid] = News(newsid, tpos + 1, vt, read_logs, read_annotation, post_annotation)

    def __get_browse_time(self):
        """Return the time between the first and the last list-page log entry."""
        begin_time = self._event_time(json.loads(self.browse_log[0]), 0)
        end_time = self._event_time(json.loads(self.browse_log[-1]), -1)
        return int(end_time) - int(begin_time)

    def __get_viewport_time(self):
        """Return, per impression, the accumulated time it was fully inside the viewport.

        Time is only credited on SCROLL events: the interval since the
        previous PAGE_BEGIN/SCROLL event is added to every item whose top and
        bottom both lie inside the viewport at the scroll offset that was
        valid during that interval.  Time after the last SCROLL event is not
        counted.  The result is a numpy array aligned with 'item_positions'.
        """
        item_heights = self.tasklog['item_heights']
        item_positions = self.tasklog['item_positions']
        screen_height = self.tasklog['screen_height']
        offset_top = item_positions[0] - item_heights[0]
        # (top, bottom) of every item; 'item_positions' holds the bottoms.
        item_field = [(tpos - theight, tpos) for tpos, theight in zip(item_positions, item_heights)]

        vt_list = np.zeros(shape=(len(item_positions), ))

        # The first entry may be a list of events (as __get_browse_time already
        # allows); indexing it like a dict used to raise a TypeError.
        last_time = self._event_time(json.loads(self.browse_log[0]), 0)
        y = 0
        for te in self.browse_log:
            tlog = json.loads(te)
            is_single_event = not isinstance(tlog, list)
            if "PAGE_BEGIN" in te and is_single_event:
                last_time = tlog['time']
                continue

            if "SCROLL" in te and is_single_event:
                time_delta = (tlog['time'] - last_time)
                last_time = tlog['time']
                # Viewport for the elapsed interval uses the previous offset;
                # only afterwards is y advanced to the new scroll position.
                view_field = (y + offset_top, y + screen_height)
                y = tlog['y']

                for ti, item in enumerate(item_field):
                    if (view_field[0] <= item[0] <= view_field[1]) and (view_field[0] <= item[1] <= view_field[1]):
                        vt_list[ti] += time_delta
        return vt_list

# Generate

In [5]:
def _mean_and_var(values):
    """Return ``(mean, variance)`` of ``values``; ``(nan, nan)`` for an empty list.

    numpy yields NaN for empty input as well, but with a RuntimeWarning;
    the explicit guard keeps the result and drops the warning.
    """
    if not values:
        return np.nan, np.nan
    return np.mean(values), np.var(values)


# Per-user table: the satisfaction of every task plus mean/variance of the
# user's post-task, pre-click and after-reading preference ratings.
ds_user = utils.DataSet()
for ulog in logs:
    userid = ulog['studentID']

    pre_prefs, read_prefs, post_prefs = [], [], []
    satlist = []
    # The first task of every user is skipped.
    # NOTE(review): presumably a practice task - confirm.
    for tasklog in ulog['tasks'][1:]:
        task = Task(tasklog)
        satlist.append(task.satisfaction)

        for newsid in task.imp_list:
            news = task.news_dict[newsid]
            if news.ifview:
                post_prefs.append(news.post_pref)
            if news.ifclick:
                pre_prefs.append(news.pre_pref)
                read_prefs.append(news.read_pref)

    ds_user.add(userid, "userid")
    ds_user.add(satlist, "satisfactions")
    # Column order is kept: post, pre, read - each as <name>_mean, <name>_var.
    for name, prefs in (("post_prefs", post_prefs), ("pre_prefs", pre_prefs), ("read_prefs", read_prefs)):
        pref_mean, pref_var = _mean_and_var(prefs)
        ds_user.add(pref_mean, name + "_mean")
        ds_user.add(pref_var, name + "_var")

df_user = ds_user.to_pandas()

In [6]:
# Per-impression table: one row for every news item shown in every task
# (the first task of each user is skipped).
ds_news = utils.DataSet()
for ulog in logs:
    userid = ulog['studentID']
    topic_pref = ulog['pre_questions']

    for task_pos, raw_task in enumerate(ulog['tasks'][1:]):
        task = Task(raw_task)

        for newsid in task.imp_list:
            news = task.news_dict[newsid]

            # (column, value) pairs in the order the columns are added.
            row = [
                # identification
                ("userid", userid),
                ("newsid", newsid),
                ("topic", news.topic),
                ("topic_pref", int(topic_pref[news.topic][0])),
                ("taskid", task.task_id),
                ("task_pos", task_pos),
                # experimental condition and editorial quality
                ("condition", task.condition),
                ('equality', news.equality),
                # exposure
                ('imp_position', news.imp_position),
                ('ifview', news.ifview),
                ('ifclick', news.ifclick),
                ('viewport_time', news.viewport_time),
                # click annotations (None when the item was not clicked)
                ('dwell_time', news.dwell_time),
                ('pre_pref', news.pre_pref),
                ('read_pref', news.read_pref),
                ('whyclick', news.whyclick),
                ('uquality', news.uquality),
                ('utitle', news.utitle),
                # post-task annotation (None when the item was not viewed)
                ('post_pref', news.post_pref),
            ]
            for column, value in row:
                ds_news.add(value, column)

In [7]:
# Convert the accumulated per-impression rows into the news-level table.
df_news = ds_news.to_pandas()

In [8]:
# Persist the news-level table next to the notebook (relative path).
df_news.to_pickle("df_news.pkl")

In [9]:
# Show a bounded preview instead of rendering the whole table.
df_news.head(10)

Unnamed: 0,userid,newsid,topic,topic_pref,taskid,task_pos,condition,equality,imp_position,ifview,ifclick,viewport_time,dwell_time,pre_pref,read_pref,whyclick,uquality,utitle,post_pref
0,2014010219,20829158,社会,5,7,0,c0,1,1,True,True,1994.0,30.0,2.0,3.0,3,3.0,4.0,3.0
1,2014010219,20306627,社会,5,7,0,c0,1,2,True,True,4829.0,22.0,3.0,3.0,1,3.0,4.0,3.0
2,2014010219,20130572,社会,5,7,0,c0,1,3,True,False,5088.0,,,,,,,3.0
3,2014010219,20649086,娱乐,3,7,0,c0,1,4,True,False,5062.0,,,,,,,1.0
4,2014010219,21025476,娱乐,3,7,0,c0,1,5,True,True,4317.0,109.0,2.0,4.0,12,4.0,5.0,3.0
5,2014010219,20382034,娱乐,3,7,0,c0,1,6,True,True,4491.0,26.0,5.0,4.0,3,5.0,4.0,5.0
6,2014010219,20287111,科技,3,7,0,c0,1,7,True,True,4719.0,69.0,5.0,4.0,13,3.0,3.0,4.0
7,2014010219,20918819,科技,3,7,0,c0,1,8,True,True,2670.0,42.0,3.0,4.0,1,4.0,5.0,5.0
8,2014010219,20698371,科技,3,7,0,c0,1,9,True,False,277.0,,,,,,,3.0
9,2014010219,21161170,体育,1,7,0,c0,1,10,True,False,0.0,,,,,,,3.0


# Task-level dataset

In [10]:
def normalize(dlist, userid, key):
    """Standardise ``dlist`` with one user's own mean and spread (z-scores).

    The statistics are read from the module-level ``df_user`` table, columns
    ``<key>_mean`` and ``<key>_var``, in the first row whose 'userid' equals
    ``userid``.

    Parameters
    ----------
    dlist : sequence of numbers to standardise.
    userid : value matched against ``df_user['userid']``.
    key : column prefix, e.g. "post_prefs".

    Returns
    -------
    numpy array ``(dlist - mean) / sqrt(var)``.  Every element is NaN when
    the user's variance is zero or NaN, because the z-score is undefined.

    Raises
    ------
    IndexError : if ``df_user`` has no row for ``userid``.
    """
    user_rows = df_user[df_user['userid'] == userid]
    mean = float(user_rows[key + "_mean"].iloc[0])
    var = float(user_rows[key + "_var"].iloc[0])

    centered = np.asarray(dlist, dtype=float) - mean
    # A z-score divides by the standard deviation, not by the variance.
    std = np.sqrt(var)
    if not std > 0:
        return np.full(centered.shape, np.nan)
    return centered / std

In [11]:
# Task ids whose "topic" column is taken from the items of the list; every
# other task is labelled "mixed".
# NOTE(review): presumably these are the single-topic lists - confirm.
# Built with list(...) so that the expression also works on Python 3, where
# range objects cannot be concatenated with "+".
SINGLE_TOPIC_TASK_IDS = list(range(1, 6)) + list(range(10, 12))


def _add_pref_stats(dataset, prefs, name):
    """Add mean/sum/max/min of ``prefs`` as ``<stat>_<name>`` columns.

    Each value is None when ``prefs`` is empty.
    """
    for stat, func in (('mean', np.mean), ('sum', np.sum), ('max', np.max), ('min', np.min)):
        dataset.add(func(prefs) if prefs else None, '{}_{}'.format(stat, name))


# Per-task table: list-level annotations, click/view counts and summary
# statistics of the item-level preference ratings.
ds_task = utils.DataSet()
for ulog in logs:
    userid = ulog['studentID']

    # The first task of every user is skipped, as in the other dataset cells.
    tasks = [Task(tasklog) for tasklog in ulog['tasks'][1:]]
    satlist = [task.satisfaction for task in tasks]

    mean_sat = np.mean(satlist) if satlist else np.nan
    # A z-score divides by the standard deviation (not the variance).
    std_sat = np.std(satlist) if satlist else np.nan

    for task_pos, task in enumerate(tasks):
        if std_sat > 0:
            task.sat_zscore = (task.satisfaction - float(mean_sat)) / float(std_sat)
        else:
            # All ratings of this user are identical: the z-score is undefined.
            task.sat_zscore = np.nan

        post_prefs, clk_post_prefs, pre_prefs, read_prefs = [], [], [], []
        # Reset per task so an empty impression list cannot reuse the topic of
        # the previous task; otherwise this ends up as the topic of the last item.
        list_topic = None
        for newsid in task.imp_list:
            news = task.news_dict[newsid]
            if news.ifview:
                post_prefs.append(news.post_pref)

            if news.ifclick:
                clk_post_prefs.append(news.post_pref)
                pre_prefs.append(news.pre_pref)
                read_prefs.append(news.read_pref)
            list_topic = news.topic

        has_click = len(pre_prefs) > 0
        # News objects of all reading annotations, in annotation order.  Only
        # resolved when at least one valid click exists, as before.
        clicked_news = [task.news_dict[tnews] for tnews in task.click_list] if has_click else []

        ds_task.add(userid, 'userid')
        ds_task.add(task.task_id, 'task_id')
        ds_task.add(list_topic if task.task_id in SINGLE_TOPIC_TASK_IDS else "mixed", 'topic')

        ds_task.add(task_pos, "task_pos")
        ds_task.add(task.condition, 'condition')
        ds_task.add(task.satisfaction, 'satisfaction')
        ds_task.add(task.sat_zscore, "sat_zscore")
        ds_task.add(task.quality, 'quality')

        # The second character of the condition code is stored as the number
        # of low-quality impressions.
        ds_task.add(int(task.condition[1]), "num_low_news_imp")
        ds_task.add(sum([n.equality for n in clicked_news]) if has_click else 0, "num_high_news_clk")
        ds_task.add(sum([n.equality == 0 for n in clicked_news]) if has_click else 0, "num_low_news_clk")

        ds_task.add(len(post_prefs), 'view_cnt')
        ds_task.add(len(pre_prefs), 'click_cnt')
        # No viewed item means the click-through rate is undefined; this used
        # to raise ZeroDivisionError.
        ds_task.add(len(pre_prefs) / float(len(post_prefs)) if post_prefs else None, "ctr")
        ds_task.add(clicked_news[0].imp_position if has_click else None, "first_click_pos")
        ds_task.add([n.imp_position for n in clicked_news] if has_click else None, "click_pos_list")
        ds_task.add([n.dwell_time for n in clicked_news] if has_click else None, "clk_dwells")
        ds_task.add(task.browse_time, "browse time")

        ds_task.add(post_prefs, "post_prefs")
        ds_task.add(pre_prefs, "pre_prefs")
        ds_task.add(read_prefs, "read_prefs")

        # mean/sum/max/min per rating list, in the original column order.
        for name, prefs in (
            ('post_prefs', post_prefs),
            ('clk_post_prefs', clk_post_prefs),
            ('pre_prefs', pre_prefs),
            ('read_prefs', read_prefs),
        ):
            _add_pref_stats(ds_task, prefs, name)

        ds_task.add(pre_prefs[-1] if pre_prefs else None, "last_pre_prefs")
        ds_task.add(read_prefs[-1] if read_prefs else None, "last_read_prefs")
        ds_task.add(post_prefs[-1] if post_prefs else None, "last_post_prefs")

In [12]:
# Convert the accumulated per-task rows into the task-level table.
df_task = ds_task.to_pandas()

In [13]:
# Persist the task-level table next to the notebook (relative path).
df_task.to_pickle("df_task.pkl")