In [1]:
import pandas as pd
import h5py
import numpy as np
import pickle
import json
import math
import time
from glob import glob
import os
from datetime import datetime
import seaborn as sns
from matplotlib import pyplot as plt

In [81]:
class Pareto:
    """Saturating exponential ("Pareto") transform.

    Maps a raw value ``v`` to ``max_value * (1 - exp(-alpha * v))`` with
    ``alpha = ln(5) / eighty_percent_level``, so the output reaches 80% of
    ``max_value`` when ``v == eighty_percent_level`` and approaches
    ``max_value`` as ``v`` grows.

    Usage:
        pareto = Pareto(100, 25)
        pareto.transform(20)   # ~72.4
    """

    def __init__(self, max_value: float = 0, eighty_percent_level: float = 1, minimum_threshold: float = 0):
        """Store the curve parameters.

        Args:
            max_value: Upper bound the output approaches.
            eighty_percent_level: Input value at which the output equals
                80% of ``max_value``.
            minimum_threshold: Inputs strictly below this value map to 0.
        """
        self.max_value = max_value
        self.eighty_percent_level = eighty_percent_level
        self.minimum_threshold = minimum_threshold

    def transform(self, value: float):
        """Return the transformed score for ``value``.

        Args:
            value: Raw value to squash.

        Returns:
            0 when ``value`` is below ``minimum_threshold``; otherwise
            ``max_value * (1 - exp(-ln(5) * value / eighty_percent_level))``.
            When ``eighty_percent_level`` is 0 the limiting step function is
            used: ``max_value`` for positive values and 0 otherwise.
        """
        if value < self.minimum_threshold:
            return 0

        if self.eighty_percent_level == 0:
            # A zero scale would divide by zero (ZeroDivisionError with Python
            # numbers, NaN for value == 0 with NumPy scalars).  Use the limit
            # of the curve as the scale shrinks to zero instead.
            return self.max_value if value > 0 else 0

        alpha = math.log(5) / self.eighty_percent_level
        return self.max_value * (1 - math.exp(-alpha * value))


class ItemScoring:
    """Positive scoring of items from aesthetics, interaction and age signals.

    Wraps a DataFrame of items and writes an ``item_score`` column using
    either a Hacker-News-style formula (:meth:`item_score`) or a
    Reddit-style formula (:meth:`item_score2`).  Raw signals are squashed
    through ``Pareto`` transforms whose ceiling is the column maximum and
    whose 80% level is the column median.

    The input frame must provide the columns ``publish_time``, ``score``,
    ``num_clicks`` and ``num_collects``.
    """

    _DATETIME_FORMAT = "%Y-%m-%d %H:%M:%S"
    _DATE_FORMAT = "%Y-%m-%d"

    def __init__(self, zhuke_data):
        """Derive the age columns and fit the Pareto transforms.

        Args:
            zhuke_data: DataFrame with ``publish_time``, ``score``,
                ``num_clicks`` and ``num_collects`` columns.  It is kept by
                reference and modified in place: the columns ``month2now``
                and ``secondspan`` are added.
        """
        self.click_weight = 0.3
        self.collection_weight = 0.7

        self.data = zhuke_data
        self.data['month2now'] = self.data['publish_time'].apply(self.diff_month)
        # Despite its name this column holds whole days since 2019-01-01
        # (see span_date).
        self.data['secondspan'] = self.data['publish_time'].apply(self.span_date)

        self.max_date_span = self.data['month2now'].max()
        self.median_date_span = self.data['month2now'].median()
        self.max_second_span = self.data['secondspan'].max()
        self.median_second_span = self.data['secondspan'].median()
        self.max_aesthetics_score = self.data['score'].max()
        self.median_aesthetics_score = self.data['score'].median()
        self.max_num_clicks = self.data['num_clicks'].max()
        self.median_num_clicks = self.data['num_clicks'].median()
        self.max_num_collects = self.data['num_collects'].max()
        self.median_num_collects = self.data['num_collects'].median()

        # Pareto transforms: ceiling = column max, 80% level = column median.
        self.pareto_date = Pareto(self.max_date_span, self.median_date_span)
        self.pareto_span = Pareto(self.max_second_span, self.median_second_span)
        self.pareto_aesthetics = Pareto(self.max_aesthetics_score, self.median_aesthetics_score)
        self.pareto_click = Pareto(self.max_num_clicks, self.median_num_clicks)
        self.pareto_collect = Pareto(self.max_num_collects, self.median_num_collects)

    # ------------------------------------------------------------------
    # Date helpers
    # ------------------------------------------------------------------
    @classmethod
    def _to_datetime(cls, value):
        """Return ``value`` as a datetime-like object.

        Strings longer than 10 characters are parsed as
        ``%Y-%m-%d %H:%M:%S``, shorter ones as ``%Y-%m-%d``.  Non-string
        values (``datetime``, ``pandas.Timestamp``) are returned unchanged
        so they can be used directly in date arithmetic.
        """
        if not isinstance(value, str):
            return value
        fmt = cls._DATETIME_FORMAT if len(value) > 10 else cls._DATE_FORMAT
        return datetime.strptime(value, fmt)

    def diff_month(self, x):
        """Return the number of calendar months from ``x`` to now.

        Args:
            x: Date string, ``datetime`` or ``pandas.Timestamp``.
        """
        now = datetime.now()
        query = self._to_datetime(x)
        return (now.year - query.year) * 12 + (now.month - query.month)

    def diff_day(self, x: str):
        """Return the number of whole days from ``x`` to now.

        Args:
            x: Date string or datetime-like object.
        """
        return (datetime.now() - self._to_datetime(x)).days

    def diff_span_day(self, datetime_str: str, point_date: str = '1970-1-1'):
        """Return the whole days from ``point_date`` to ``datetime_str``.

        Args:
            datetime_str: Date string or datetime-like object.
            point_date: Reference date (string or datetime-like).  A falsy
                value means "now".
        """
        if not point_date:
            point_date = datetime.now()
        reference = self._to_datetime(point_date)
        return (self._to_datetime(datetime_str) - reference).days

    def span_date(self, datetime_str: str, point_date: str = '2019-1-1'):
        """Return the whole days from ``point_date`` to ``datetime_str``.

        Identical to :meth:`diff_span_day` except for the default reference
        date.  The result is in days, not seconds.
        """
        return self.diff_span_day(datetime_str, point_date)

    def diff_span_month(self, datetime_str: str, point_date: str = '1970-1-1'):
        """Return the calendar months from ``point_date`` to ``datetime_str``.

        Args:
            datetime_str: Date string or datetime-like object.
            point_date: Reference date (string or datetime-like).  A falsy
                value means "now".
        """
        if not point_date:
            point_date = datetime.now()
        reference = self._to_datetime(point_date)
        date_time = self._to_datetime(datetime_str)
        return (date_time.year - reference.year) * 12 + (date_time.month - reference.month)

    def stamp2date(self, timestamp):
        """Format a Unix timestamp (seconds) as a local-time
        ``%Y-%m-%d %H:%M:%S`` string."""
        return time.strftime(self._DATETIME_FORMAT, time.localtime(timestamp))

    # ------------------------------------------------------------------
    # Scoring formulas
    # ------------------------------------------------------------------
    def pareto_dist(self, pareto_func, x):
        """Apply ``pareto_func.transform`` to ``x``."""
        return pareto_func.transform(x)

    def hackernews_scoring(self, h_init, h_interact=0, item_month_age=0, h_weight=0):
        """Hacker-News-style score.

        Computes ``(h_init + h_interact + 2) / (item_month_age + 2) ** 1.8
        + h_weight``, so the score decays as ``item_month_age`` grows.
        """
        gravity = 1.8
        return ((h_init + (h_interact + 2)) / pow((item_month_age + 2), gravity)) + h_weight

    def reddit_scoring(self, h_init, ups, downs=0, item_date=0, h_weight=0):
        """Reddit-style score.

        ``score = h_init + ups - downs + h_weight``; the result is
        ``log10(max(|score|, 1)) + sign(score) * item_date / 3000`` rounded
        to 7 decimals.
        """
        score = h_init + ups - downs + h_weight
        order = np.log10(max(abs(score), 1))
        sign = 1 if score > 0 else -1 if score < 0 else 0
        # 3000 is the divisor applied to the date term; item_date is used in
        # whatever unit the caller supplies.
        return round(order + sign * item_date / 3000, 7)

    def get_hackernews_score(self, x):
        """Return the Hacker-News-style score for one row.

        Args:
            x: Mapping/row with ``score``, ``num_clicks``, ``num_collects``
                and ``month2now``.
        """
        aesthetics_score = float(x['score'])
        num_click = float(x['num_clicks'])
        # Collections have their own column; reading 'num_clicks' here made
        # the collection weight act on clicks.
        num_collection = float(x['num_collects'])
        publish_span = float(x['month2now'])

        # The aesthetics score is the initial value.
        h_init = self.pareto_aesthetics.transform(aesthetics_score)
        click = self.pareto_click.transform(num_click)
        fav = self.pareto_collect.transform(num_collection)
        # Weighted interaction total.
        h_interact = h_init + click * self.click_weight + fav * self.collection_weight

        # Time-decay term from the item's age in months.
        item_date = self.pareto_date.transform(publish_span)

        return self.hackernews_scoring(h_init, h_interact, item_date)

    def get_reddit_score(self, x):
        """Return the Reddit-style score for one row.

        Args:
            x: Mapping/row with ``num_clicks``, ``num_collects`` and
                ``secondspan``.
        """
        num_click = float(x['num_clicks'])
        # Same column fix as in get_hackernews_score.
        num_collection = float(x['num_collects'])
        second_span = int(x['secondspan'])

        # NOTE(review): the previous code computed an aesthetics-based
        # initial score and immediately overwrote it with 0; the effective
        # value (0) is preserved here -- confirm whether that is intended.
        h_init = 0
        click = self.pareto_click.transform(num_click)
        fav = self.pareto_collect.transform(num_collection)
        item_date = self.pareto_span.transform(second_span)
        h_interact = h_init + click * self.click_weight + fav * self.collection_weight
        return self.reddit_scoring(h_init=h_init, ups=h_interact, downs=0, item_date=item_date)

    def item_score(self):
        """Write the Hacker-News-style ``item_score`` column and return the frame."""
        self.data['item_score'] = self.data.apply(self.get_hackernews_score, axis=1)
        return self.data

    def item_score2(self):
        """Write the Reddit-style ``item_score`` column and return the frame."""
        self.data['item_score'] = self.data.apply(self.get_reddit_score, axis=1)
        return self.data


#### 重新制作数据集

In [None]:
## 重新获取数据库数据
import json
import pandas as pd
from tqdm.auto import tqdm
from tortoise import Tortoise
from app.model import spider
from app.model import feature_index, dh_project

async def new_mysql_connection(host, port, user, pwd):
    """Initialise Tortoise ORM with MySQL connections to the project databases.

    Registers one connection and one app for each of the ``spider``,
    ``dh_project`` and ``feature_index`` databases on the same server; each
    app loads its models from ``app.model.<database name>``.

    Args:
        host: MySQL server host name or IP address.
        port: MySQL server port.
        user: MySQL user name.
        pwd: MySQL password (plain text; it is percent-encoded here).
    """
    # Local import keeps this notebook cell self-contained.
    from urllib.parse import quote_plus

    # The password is embedded in a URL, so characters such as "@", "/",
    # "#", "%" or "+" must be percent-encoded or the URL is mis-parsed.
    # Only the password is encoded: Tortoise decodes the password component
    # of a DB URL but uses the user name as given.
    safe_pwd = quote_plus(str(pwd))

    databases = ("spider", "dh_project", "feature_index")
    config = {
        "connections": {
            name: f"mysql://{user}:{safe_pwd}@{host}:{port}/{name}"
            for name in databases
        },
        "apps": {
            name: {
                "models": [f"app.model.{name}"],
                "default_connection": name,
            }
            for name in databases
        },
    }
    await Tortoise.init(config)



async def close_mysql_connections() -> None:
    """Close the Tortoise ORM database connections via ``Tortoise.close_connections()``."""
    await Tortoise.close_connections()


# Initialise the Tortoise ORM connections for this session.  Top-level
# `await` presumably relies on the notebook/IPython event loop.
# NOTE(review): the database password is hard-coded here; consider loading it
# from an environment variable or secrets store and rotating this value.
await new_mysql_connection("127.0.0.1", "33061", "root", "FfRyn2b5BKM3MNPz")
# Handle to the connection registered under the name 'feature_index'.
db = Tortoise.get_connection('feature_index')


# Load the online-show snapshot, keeping only the four columns read below.
onlines = pd.read_parquet("/data/1_qunosen/project/res/rank/zhuke_online/data/zhuke_online_show.parquet")[['image_id', 'score', 'zm_image_id', 'zm_url']]
zhuke_items = []
for i, row in tqdm(enumerate(onlines.itertuples())):

    # if i > 10:
        # break
    
    mid = row.image_id
    zm_url = row.zm_url
    zm_mid = row.zm_image_id
    score = row.score

    interact = await feature_index.ItemInteract.get_or_none(id=mid)
    if not interact:
        continue

    medias = await dh_project.Media.get_or_none(id=mid)
    if not medias:
        continue
    
    media_project_id = medias.table_id.split(":")[-1] if medias.table_id else ''
    
    projects = await dh_project.Project.get_or_none(id=media_project_id)
    if not projects:
        continue
    
    title = projects.title
    publish_time = projects.publish_time

    media_filekey = json.loads(medias.file).get('key', '') if medias.file else ''
    if media_filekey:
        img_url = f"http://media.zhuke.com/{media_filekey}~736x.jpg"
    else:
        img_url = ''
    