In [1]:
import random
from collections import Counter, defaultdict
from pymongo import MongoClient
import pickle
import os
import json
from tqdm import tqdm_notebook
import pandas as pd
import numpy as np
import datetime
import seaborn as sns
from utils import *
from sklearn.metrics.pairwise import cosine_similarity
import jieba
import re
import matplotlib.pyplot as plt
from datetime import timedelta

In [2]:
# Connect to the local MongoDB server (empty username/password) and select the
# "Assess" database that the rest of the notebook reads from.
client = MongoClient('mongodb://127.0.0.1:27017', username='', password="")
dataset = client["Assess"]

# Collection handles used by the cells below.
movie_info = dataset["movie_info"]
user_phase1_results = dataset["user_phase1_results"]
user_phase2_results = dataset["user_phase2_results"]
peer_results = dataset["peer_phase1_results"]

# Referenced Functions

In [3]:
# from ExpPlatform_User_P2/models.py
def get_item_info(item_id):
    """Fetch one movie from the ``movie_info`` collection and reshape it for display.

    Args:
        item_id: value matched against the ``movieId`` field of the collection.

    Returns:
        A dict of display fields (title, ids, URLs, textual attributes, rating
        count and ``(name, image path)`` pairs for directors / writers / casts),
        or ``False`` when no document matches ``item_id``.
    """
    record = movie_info.find_one({'movieId': item_id})
    if not record:
        return False

    def _with_image_path(people):
        # Pair every person's name with the path of the image keyed by their id.
        return [(person['name'], "/ex2/static/figures/attribute/c_{}.jpg".format(person['id']))
                for person in people]

    # Keys are filled in the same order as the stored document is read, so the
    # first missing field is still the one reported by a KeyError.
    info = {
        "title": record['show_title'],
        "movieId": record['movieId'],
        "itemID": record['show_itemID'],
        "itemUrl": "https://movie.douban.com/subject/{}/".format(record['show_itemID']),
        "imgUrl": "/ex2" + record['show_imgUrl'],
        "information": record['show_information'],
        "summary": record['show_summary'],
        "directors": record["show_directors"],
        "writers": record["show_writers"],
        "casts": record["show_casts"],
        "genres": record["show_genres"],
        "countries": record["show_countries"],
        "languages": record["show_languages"],
        "date": record["show_date"],
        "duration": record["show_duration"],
        "rating_count": int(record['db_ratings_count']),
        "db_directors": _with_image_path(record['db_directors']),
        "db_writers": _with_image_path(record['db_writers']),
        "db_casts": _with_image_path(record['db_casts']),
        "aka": record['show_aka'],
    }
    return info
    
def get_movie_attributes(item_id, ispeer=False):
    """Build the list of attribute entries that are shown for a movie.

    Each entry is a dict with ``key`` (display name of the attribute kind),
    ``value``, ``text`` (HTML snippet) and ``key_last`` (``True`` on the last
    value of a kind); some entries also carry ``"replay": "no"``.

    Args:
        item_id: movie id forwarded to ``get_item_info``.
        ispeer: ``False`` appends the three generic factors of the user view
            (popularity, similar item, friends); any other value appends the
            two generic factors of the peer view.

    Returns:
        list[dict]: people entries (directors, writers, casts), then genre and
        country entries, then the generic factors.

    Raises:
        ValueError: if ``get_item_info`` finds no movie for ``item_id``.
            Previously the ``False`` result was subscripted, which surfaced as
            an opaque ``TypeError``.
    """
    # Named ``item`` (not ``movie_info``) so the module-level collection handle
    # of that name is not shadowed inside this function.
    item = get_item_info(item_id)
    if not item:
        raise ValueError("No movie_info record found for movieId={!r}".format(item_id))

    ans_attr = []
    namedic = {"db_directors": "导演", "db_writers": "编剧", "db_casts": "主演", "genres": "类型", "countries": "制片国家/地区"}

    # People: each value is a (name, image path) pair; the image goes into a tooltip.
    for tk in ['db_directors', 'db_writers', 'db_casts']:
        tvs = item[tk]
        for ti, (tv, tv_url) in enumerate(tvs):
            ans_attr.append({
                "key": namedic[tk],
                "value": tv,
                "text": """<b>{}:</b><span style="text-decoration:underline" data-toggle="tooltip" data-placement="right" data-html="true" title='<img src="{}" style="height:200px"/>'>{} </span>""".format(namedic[tk], tv_url, tv),
                "key_last": ti == len(tvs) - 1,
            })

    # Genres / countries are stored as one " / "-separated string.
    for tk in ['genres', 'countries']:
        tvs = item[tk].split(" / ")
        for ti, tv in enumerate(tvs):
            ans_attr.append({
                "key": namedic[tk],
                "value": tv,
                "text": "<b>{}:</b>{} ".format(namedic[tk], tv),
                "key_last": ti == len(tvs) - 1,
                "replay": "no",
            })

    # ``== False`` is kept on purpose: only a value equal to False selects the
    # user view, every other value (including None) selects the peer view.
    if ispeer == False:
        ans_attr.append({"key": "热度（评分数量）", "value": "1", "text": "<b>热度（评分数量）</b>", "key_last": False, "replay": "no"})
        ans_attr.append({"key": "和某部看过的电影相似", "value": "1", "text": "<b>和某部看过的电影相似</b>", "key_last": False, "replay": "no"})
        ans_attr.append({"key": "我的朋友也喜欢", "value": "1", "text": "<b>我的朋友也喜欢</b>", "key_last": False, "replay": "no"})
    else:
        ans_attr.append({"key": "热度（评分数量）", "value": "1", "text": "<b>热度（评分数量）</b>", "key_last": False})
        # NOTE(review): the key below contains "Ta" while the displayed label does not.
        ans_attr.append({"key": "和Ta某部看过的电影相似", "value": "1", "text": """<span style="text-decoration:underline" data-toggle="tooltip" data-placement="right" data-html="true" title="<div style='width:200px'>例如：你发现这电影和Ta看过的电影A很像，但Ta不喜欢A，那可能就是负向的，反之是无影响或正向"> <b>和某部看过的电影相似</b> </span>""", "key_last": True})
    return ans_attr

""" New """
def get_shown_attributes(item_id, ispeer=False):
    """Return the attributes shown for a movie as ``"key=value"`` strings.

    Args:
        item_id: movie id forwarded to ``get_movie_attributes``.
        ispeer: forwarded unchanged to ``get_movie_attributes``.

    Returns:
        list[str]: one ``"key=value"`` string per attribute entry, in the order
        produced by ``get_movie_attributes``.
    """
    return ["{}={}".format(attr['key'], attr['value'])
            for attr in get_movie_attributes(item_id, ispeer=ispeer)]

# <User, Item> Dataframe

In [4]:
# Read the phase-2 collection once and group the documents per user, instead of
# running one additional query per user inside the loop.
# "u_2019310837" is excluded from the analysis (presumably a test account — TODO confirm).
phase2_docs_by_user = defaultdict(list)
for doc in user_phase2_results.find():
    if doc['user_id'] != "u_2019310837":
        phase2_docs_by_user[doc['user_id']].append(doc)
user_ids = set(phase2_docs_by_user)

# Short names for the three generic factors; all other attributes keep their
# "key=value" form. Loop-invariant, so it is built once.
keydic = {u"热度（评分数量）=1": u"pop", u"我的朋友也喜欢=1": u"user", u"和某部看过的电影相似=1": u"item"}

ds_user = PDtable()
# Sorted so the row order of df_user is reproducible: iterating a set of strings
# directly depends on the per-process hash seed.
for user_id in sorted(user_ids):

    # Drop users that submitted more than once.
    # NOTE(review): the printed ids look like phone numbers — consider clearing
    # the cell output before sharing the notebook.
    print (user_id)
    user_res = phase2_docs_by_user[user_id]
    if len(user_res) > 1:
        print ("Duplicated", user_id)
        continue
    user_res = user_res[0]

    # One row per recommended item.
    for i, (tinfo, tlabel) in enumerate(zip(user_res['rec_item_list'], user_res['label_item_results'])):

        # Answers recorded for this item with page == "3_detail".
        ans_label = [t for t in tlabel if t['page'] == "3_detail"][0]

        # Every shown attribute starts as "normal"; explicit answers overwrite it below.
        shown_attrs = get_shown_attributes(tinfo['item'], ispeer=False)
        attr_label = {keydic.get(ta, ta): "normal" for ta in shown_attrs}

        err = False
        for _t in ans_label.values():
            if not isinstance(_t, str):
                continue
            # Attribute answers look like "key=value=label". Splitting on the
            # LAST "=" keeps values that themselves contain "=" intact and
            # skips plain strings instead of failing on a three-way unpack.
            key, _sep, _l = _t.rpartition("=")
            if not _sep or _l not in ('pos', 'neg', 'normal'):
                continue
            key = keydic.get(key, key)
            if key not in attr_label:
                # The answer refers to an attribute that was not shown for this item.
                print ("Error", user_id, i, key)
                err = True
            attr_label[key] = _l

        if err:
            continue

        self_attributes = ["{}={}".format(k, v) for k, v in attr_label.items()] # all attribute labels
        pos_attrs = [k for k, v in attr_label.items() if v == "pos"] # positive attributes
        normal_attrs = [k for k, v in attr_label.items() if v == "normal"]
        neg_attrs = [k for k, v in attr_label.items() if v == "neg"]
        assert len(pos_attrs) + len(neg_attrs) + len(normal_attrs) == len(attr_label)

        # Build the row.
        ds_user.add(user_id, "user_id")
        ds_user.add(tinfo['item'], "movie_id") # item = movielens ID

        ds_user.add(ans_label['watch_intent'], "post_watch_intent")
        ds_user.add(ans_label['expected_preference'], "post_expected_preference")
        ds_user.add(ans_label['self_explanation'], "self_explanation")
        ds_user.add(self_attributes, "self_attributes")
        ds_user.add(pos_attrs, "self_pos")
        ds_user.add(normal_attrs, "self_normal")
        ds_user.add(neg_attrs, "self_neg")
        ds_user.add(ans_label['ifwatched'], "ifwatched")

        # User-level answers, repeated on every row of that user.
        ds_user.add(user_res['user_summary_ans'], "self_summary")
        ds_user.add(user_res['user_rank_ans']['rank-output'], "rank_ans")

df_user = ds_user.to_pandas()

u_15123372089
u_18801378212
u_15810795617
u_18811195178
u_15900292575
u_15071347094
u_18222716322
u_18811400801
u_13701195791
u_19801210262
u_18712328742
u_18993873008
u_15313346392
u_17801182378
u_18800182977
u_18221771895
u_19920091165


# <Peer, User, Item> Dataframe

In [5]:
def parse_peer_ans(expl_ans):
    """Regroup a flat ``"<task_id>-<field>"`` answer mapping into one dict per task.

    Args:
        expl_ans: mapping whose keys look like ``"1-peer_watch_intent"``.
            ``itemId`` and ``peer_explanation_long`` are copied as-is, the four
            ``peer_*`` rating fields are converted with ``int``, and the value
            of every other field is appended to the task's ``attr_factors``.

    Returns:
        list[dict]: the per-task dicts for task ids ``1..n`` in that order,
        where ``n`` is the number of distinct task ids.

    Raises:
        ValueError: if a key has no ``"-"`` or its task id is not an integer.
        KeyError: if the task ids are not exactly ``1..n``.
    """
    text_fields = ('itemId', 'peer_explanation_long')
    int_fields = ('peer_watch_intent', 'peer_preference',
                  'peer_ifwatched', 'peer_own_preference')

    tasks = {}
    for full_key, value in expl_ans.items():
        # Split on the FIRST "-" only: the field part may itself contain "-",
        # which made the two-name unpack of an unlimited split fail.
        task_id, task_key = full_key.split("-", 1)
        task = tasks.setdefault(int(task_id), {})

        if task_key in text_fields:
            task[task_key] = value
        elif task_key in int_fields:
            task[task_key] = int(value)
        else:
            task.setdefault("attr_factors", []).append(value)

    return [tasks[i] for i in range(1, len(tasks) + 1)]

def parse_time_list(tinfo):
    """Compute per-task durations of the ``user_summary`` and ``peer_explanation`` phases.

    ``tinfo['time_list']`` is a sequence of ``(event_name, timestamp)`` pairs;
    timestamps are passed to ``pd.to_datetime(..., unit="s")``. A
    ``"task_begin-out"`` event arms both phases. While a phase is armed, the
    time between its ``"<phase>-in"`` and the following ``"<phase>-out"`` is
    recorded once and the phase is disarmed until the next ``"task_begin-out"``.

    The pending start time is tracked separately from the results, so the
    returned lists only ever contain floats:
      * a repeated ``-in`` restarts the measurement from the latest ``-in``
        instead of leaving a ``[Timestamp]`` entry in the output;
      * an ``-out`` without a pending ``-in`` is ignored instead of raising.

    Args:
        tinfo: mapping with a ``'time_list'`` entry as described above.

    Returns:
        tuple[list[float], list[float]]: ``(summary_durations,
        estimate_durations)`` in seconds.
    """
    tracked = {
        "user_summary": {"armed": False, "start": None, "durations": []},
        "peer_explanation": {"armed": False, "start": None, "durations": []},
    }

    for tname, ttime in tinfo['time_list']:
        if tname == "task_begin-out":
            # New task: re-arm both phases and forget any unfinished visit.
            for state in tracked.values():
                state["armed"] = True
                state["start"] = None
            continue

        phase, _sep, direction = tname.rpartition("-")
        state = tracked.get(phase)
        if state is None or not state["armed"]:
            continue

        if direction == "in":
            state["start"] = pd.to_datetime(ttime, unit="s")
        elif direction == "out" and state["start"] is not None:
            elapsed = pd.to_datetime(ttime, unit="s") - state["start"]
            state["durations"].append(elapsed.total_seconds())
            state["start"] = None
            state["armed"] = False  # only the first completed visit per task counts

    return tracked["user_summary"]["durations"], tracked["peer_explanation"]["durations"]

In [6]:
# Read the peer collection once and group the documents per peer, instead of
# running one additional query per peer inside the loop.
# "e_2019310837" is excluded from the analysis (presumably a test account — TODO confirm).
peer_docs_by_id = defaultdict(list)
for doc in peer_results.find():
    if doc['user_id'] != "e_2019310837":
        peer_docs_by_id[doc['user_id']].append(doc)
# First-seen order, each id once (a plain list of ids would revisit a peer once
# per duplicate submission).
exp_ids = list(peer_docs_by_id)

# Short names for the generic factors; all other attributes keep their
# "key=value" form. Loop-invariant, so it is built once.
# NOTE(review): get_movie_attributes(ispeer=True) emits the key
# "和Ta某部看过的电影相似", which has no entry here, so that shown attribute is
# not renamed to "item" — confirm whether a mapping for it is missing.
keydic = {u"热度（评分数量）=1": u"pop", u"Ta的朋友也喜欢=1": u"user", u'口味相似的其它人喜欢=1': "user", u"和某部看过的电影相似=1": u"item"}

# Known bad id and its correction (original note: "TODO: bug").
peer_id_fixes = {"e_130152427992": "e_13015242799"}

ds_peer = PDtable()
for texp_id in exp_ids:

    # Drop peers that submitted more than once.
    exp_res = peer_docs_by_id[texp_id]
    if len(exp_res) > 1:
        print ("Duplicated", texp_id)
        continue
    exp_res = exp_res[0]

    # summary_time_list, estimate_time_list = parse_time_list(exp_res)

    print (texp_id, len(exp_res['peer_list']))
    # for peer, task, result, st, et in zip(exp_res['peer_list'], exp_res['task_data'], exp_res['task_results'], summary_time_list, estimate_time_list):
    for peer, task, result in zip(exp_res['peer_list'], exp_res['task_data'], exp_res['task_results']):

        if "u_2019310837" in peer:
            continue
        if len(result['peer_explanations_log']) == 0: # empty results
            print ("Empty results")
            continue

        #--------  Parse Questionnaires to items ---------#
        # Prefer the per-item log; when its length does not match the
        # candidates, rebuild the per-item answers from the flat answer mapping.
        log_list = result['peer_explanations_log']
        if len(task['peer_candidates']) != len(log_list):
            log_list = parse_peer_ans(result['peer_explanations_ans'])

        if len(task['peer_candidates']) != len(log_list):
            print ("Length not match, skip", texp_id, peer, len(result['peer_explanations_log']))
            continue

        print ("--", peer)
        #--------  Main ---------#
        for tm, texp in zip(task['peer_candidates'], log_list):

            #--------  For attributes ---------#
            # Every shown attribute starts as "normal"; explicit answers overwrite it below.
            shown_attrs = get_shown_attributes(tm['movieId'], ispeer=True)
            attr_label = {keydic.get(ta, ta): "normal" for ta in shown_attrs}

            # attr_factors is either already a list or a JSON-encoded string.
            raw_attrs = texp['attr_factors']
            attrs = raw_attrs if isinstance(raw_attrs, list) else json.loads(raw_attrs)
            for _t in attrs:
                if not isinstance(_t, str):
                    continue
                # Answers look like "key=value=label". Splitting on the LAST "="
                # keeps values that themselves contain "=" intact and skips
                # plain strings instead of failing on a three-way unpack.
                key, _sep, _l = _t.rpartition("=")
                if not _sep or _l not in ('pos', 'neg', 'normal'):
                    continue
                key = keydic.get(key, key)
                if key not in attr_label:
                    # NOTE(review): unlike the <User, Item> cell, an unknown
                    # attribute is only reported here and the row is still
                    # kept — confirm this is intended.
                    print ("Error", key)
                attr_label[key] = _l

            peer_attributes = ["{}={}".format(k, v) for k, v in attr_label.items()] # all attribute labels
            pos_attrs = [k for k, v in attr_label.items() if v == "pos"] # positive attributes
            normal_attrs = [k for k, v in attr_label.items() if v == "normal"]
            neg_attrs = [k for k, v in attr_label.items() if v == "neg"]
            assert len(pos_attrs) + len(neg_attrs) + len(normal_attrs) == len(attr_label)

            #--------  BUILD ---------#
            ds_peer.add(peer, "user_id") # peer's peer is user
            ds_peer.add(peer_id_fixes.get(texp_id, texp_id), "peer_id")

            ds_peer.add(tm['movieId'], 'movie_id')
            ds_peer.add(tm['title'], 'movie_title')
            ds_peer.add(int(texp['peer_watch_intent']), "peer_watch_intent")
            ds_peer.add(int(texp['peer_preference']), "peer_preference")
            ds_peer.add(texp['peer_explanation_long'], 'peer_explanation_long')

            ds_peer.add(peer_attributes, "peer_attributes")
            ds_peer.add(pos_attrs, "peer_pos")
            ds_peer.add(normal_attrs, "peer_normal")
            ds_peer.add(neg_attrs, "peer_neg")

            ds_peer.add(int(texp['peer_ifwatched']), 'peer_ifwatched')
            ds_peer.add(int(texp['peer_own_preference']), 'peer_own_preference')

            # ds_peer.add(st, "summary_time")
            # ds_peer.add(et, "estimate_time")

            ds_peer.add(result['user_summary_ans']["user_summary"], "user_summary")

df_peer = ds_peer.to_pandas()

e_15033065144 4
-- u_18221771895
-- u_18397123106
-- u_18811400801
e_18811378926 4
-- u_17801182378
-- u_13572956735
-- u_18993873008
e_15313387657 4
-- u_17801182378
-- u_13572956735
-- u_18993873008
e_13015242799 4
-- u_19920091165
-- u_15313346392
-- u_13701195791
e_13552099266 4
-- u_19920091165
-- u_15810795617
-- u_18811195178
e_18811307291 4
-- u_18221771895
-- u_18397123106
-- u_18811400801
e_13522737135 4
-- u_18712328742
-- u_19801210262
-- u_15313346392
e_18701593682 4
-- u_15810795617
-- u_15900292575
-- u_18222716322
e_18810755387 4
-- u_15900292575
-- u_17801182378
-- u_18811400801
e_18211082695 4
-- u_15313346392
-- u_15071347094
-- u_18712328742
e_13001261200 4
-- u_18993873008
-- u_15810795617
-- u_15071347094
e_18813035877 4
-- u_15232417786
-- u_18222716322
-- u_18811195178
e_13029231055 4
-- u_18221771895
-- u_15071347094
-- u_18800182977
e_18801357182 4
-- u_18800182977
-- u_15900292575
-- u_19801210262
e_13120301998 4
-- u_15232417786
-- u_18222716322
-- u_1881119

In [7]:
# Outer join on (user, movie): rows present in only one of the two frames are
# kept; the result is re-indexed 0..n-1.
df_up_items = (
    df_user
    .merge(df_peer, how="outer", on=["user_id", "movie_id"])
    .reset_index(drop=True)
)

In [8]:
OUT_PATH = "data/user-study/"
# Create the output directory when it does not exist yet; without this the
# first to_pickle call fails on a fresh checkout.
os.makedirs(OUT_PATH, exist_ok=True)

# Persist the three frames built above.
df_user.to_pickle(os.path.join(OUT_PATH, "df_user.pkl"))
df_peer.to_pickle(os.path.join(OUT_PATH, "df_peer.pkl"))
df_up_items.to_pickle(os.path.join(OUT_PATH, "df_up_items.pkl"))

# Statistics

In [10]:
# A user counts as finished when at least one merged row has a self-reported
# expected preference; a peer when at least one row has a peer preference.
has_self_answer = ~pd.isnull(df_up_items["post_expected_preference"])
has_peer_answer = ~pd.isnull(df_up_items["peer_preference"])

finish_user_ids = set(df_up_items['user_id'][has_self_answer])
finish_peer_ids = set(df_up_items['peer_id'][has_peer_answer])

print ("#Finished Users:", len(finish_user_ids))
print ("#Finished Peers:", len(finish_peer_ids))

#Finished Users: 17
#Finished Peers: 19
