根据 `analyze.md` 的内容，对数据库中的数据进行处理和运算。

处理目标数据表：`invite_info_0926` 与 `invite_info_evaluate_1_0926` 两张表。

在 train 与 test 数据上，为每条邀请记录计算 `关注话题命中数`、`感兴趣话题与此话题加权正余弦距离和`、`邀请天数间隔`、`邀请小时`。

In [1]:
import pandas as pd
import pymysql
from tqdm import tqdm

In [35]:
BATCH_SIZE = 2000000  # maximum number of rows pulled from the database per batch (used as the SQL LIMIT)
DB_NAME = 'zhihu2019_dataset'  # schema name; also the default `db` argument of DB()

In [3]:
class DB():
    """Context manager that opens a MySQL connection and yields a dict cursor.

    Usage::

        with DB() as cur:
            cur.execute(...)

    On a clean exit the transaction is committed; if the ``with`` body raised,
    it is rolled back instead. The cursor and the connection are always
    closed, and the original exception (if any) propagates to the caller.
    """

    # NOTE(review): the default credentials are hard-coded; consider loading
    # them from the environment or a config file before sharing this notebook.
    def __init__(self, host='localhost', port=3306, db=DB_NAME, user='root', passwd='123456', charset='utf8'):
        self.conn = pymysql.connect(host=host, port=port, db=db, user=user, passwd=passwd, charset=charset)
        # DictCursor returns each row as a dict keyed by column name.
        self.cur = self.conn.cursor(cursor=pymysql.cursors.DictCursor)

    def __enter__(self):
        return self.cur

    def __exit__(self, exc_type, exc_val, exc_tb):
        try:
            if exc_type is None:
                self.conn.commit()
            else:
                # Do not persist the half-finished work of a failed block.
                self.conn.rollback()
        finally:
            # Release the resources even if commit/rollback itself failed.
            try:
                self.cur.close()
            finally:
                self.conn.close()

## 1 关注话题命中数

查看已处理数据数。

In [32]:
# Report how many rows of invite_info_0926 have already been processed.
count_queries = (
    # every row of the table
    'select count(*) as length from invite_info_0926',
    # rows whose hit count is still the -1 placeholder
    'select count(*) as length from invite_info_0926 where 关注话题命中数=-1',
)
with DB(db=DB_NAME) as db:
    counts = []
    for query in count_queries:
        db.execute(query)
        counts.append(db.fetchone()['length'])
    length_all, length_need_do = counts
    # Printed as "<processed> / <total>".
    print('%d / %d' % (length_all - length_need_do, length_all))

20016 / 9489162


处理函数。

In [17]:
# Compute the followed-topic hit count for one (question, user) row.
def get_aim(qid, uid, qtid, utid):
    """Return how many topics bound to the question the user also follows.

    Args:
        qid: Question ID. Accepted for call-site compatibility; not used.
        uid: User ID. Accepted for call-site compatibility; not used.
        qtid: Comma-separated topic IDs bound to the question, or a
            missing-value marker (``-1``, ``'-1'``, ``''`` or ``None``).
        utid: Comma-separated topic IDs followed by the user, or a
            missing-value marker as above.

    Returns:
        The number of distinct topic IDs present in both lists.
    """
    def _topics(raw):
        # Normalise one raw field to a set of topic IDs. Missing-value
        # markers and empty tokens are dropped so that two "missing" fields
        # are never counted as a shared topic.
        if raw is None:
            return set()
        tokens = (token.strip() for token in str(raw).split(','))
        return {token for token in tokens if token and token != '-1'}

    # A set intersection counts only topics shared by both sides; duplicates
    # inside a single list do not inflate the result.
    return len(_topics(qtid) & _topics(utid))

In [27]:
# Bulk-write computed hit counts back to `table` through a temporary table.
def batch_update(db, table, data):
    """Write the `关注话题命中数` values in `data` back to `table`.

    The rows are staged in a temporary table ``tmp`` and then applied with a
    single multi-table UPDATE matched on (问题ID, 用户ID). This function never
    calls commit itself.

    Args:
        db: Database cursor providing ``execute`` and ``executemany`` with
            ``%s`` placeholders.
        table: Name of the table to update. It is interpolated into the SQL
            text, so it must come from trusted code, never from user input.
        data: DataFrame with the columns `问题ID`, `用户ID` and
            `关注话题命中数`. It is left unmodified.
    """
    # Nothing to write; an INSERT with an empty VALUES list is invalid SQL.
    if len(data) == 0:
        return

    # Plain Python values, so the driver can escape them itself.
    rows = [
        (str(question_id), str(user_id), int(hits))
        for question_id, user_id, hits in zip(
            data['问题ID'], data['用户ID'], data['关注话题命中数'])
    ]

    sql_drop = 'DROP TEMPORARY TABLE IF EXISTS tmp;'
    sql_create = '''CREATE TEMPORARY TABLE tmp (
            问题ID VARCHAR (20),
            用户ID VARCHAR (20),
            关注话题命中数 INT (11)
        );'''
    # Index the join key so the UPDATE below can look rows up in tmp.
    sql_index = 'ALTER TABLE tmp ADD INDEX tmp_index(问题ID, 用户ID);'
    sql_insert = 'INSERT INTO tmp VALUES (%s, %s, %s)'
    sql_update = '''UPDATE %s t, tmp
                SET t.关注话题命中数 = tmp.关注话题命中数
                WHERE
                    t.问题ID = tmp.问题ID
                AND t.用户ID = tmp.用户ID;''' % table

    # Clear any tmp left behind by an earlier failed call on this connection.
    db.execute(sql_drop)
    try:
        db.execute(sql_create)
        db.execute(sql_index)
        db.executemany(sql_insert, rows)
        db.execute(sql_update)
    finally:
        # Always remove the staging table, even when a statement failed.
        db.execute(sql_drop)

**处理数据并更新。**

In [34]:
# Fill in 关注话题命中数 for every row that still holds the -1 placeholder,
# working through each table in batches of at most BATCH_SIZE rows.
tables = ['invite_info_0926', 'invite_info_evaluate_1_0926']
for table in tables:
    # The batch query depends only on the table name, so build it once per
    # table instead of once per batch.
    #
    # Rows without a match in question_info_0926 or member_info_0926 are
    # dropped by the INNER JOINs and therefore keep their -1 placeholder.
    # NOTE(review): LIMIT is applied before the joins and without ORDER BY,
    # so a batch made up only of unmatched rows would end the loop early;
    # confirm the number of unmatched rows stays far below BATCH_SIZE.
    batch_sql = '''SELECT
            t.问题ID,
            t.用户ID,
            q.问题绑定话题ID,
            m.关注话题
        FROM
            (
                (
                    SELECT
                        问题ID,
                        用户ID
                    FROM
                        %s
                    WHERE
                        关注话题命中数 = -1
                    LIMIT %d
                ) t
                INNER JOIN (
                    SELECT
                        问题ID,
                        问题绑定话题ID
                    FROM
                        question_info_0926
                ) q ON t.问题ID = q.问题ID
            )
        INNER JOIN (
            SELECT
                用户ID,
                关注话题
            FROM
                member_info_0926
        ) m ON t.用户ID = m.用户ID''' % (table, BATCH_SIZE)

    with DB(db=DB_NAME) as db:
        # Number of rows still to process; used as the progress-bar total.
        sql = 'select count(*) as length from %s where 关注话题命中数=-1' % table
        db.execute(sql)
        result = db.fetchone()
        with tqdm(total=result['length']) as pbar:
            while True:
                # Fetch the next batch of unprocessed rows.
                db.execute(batch_sql)
                data = pd.DataFrame(db.fetchall())
                if data.empty:
                    break

                # Compute the hit count per row. Iterating the columns by
                # label avoids positional access on a label-indexed Series.
                data['关注话题命中数'] = [
                    get_aim(question_id, user_id, question_topics, followed_topics)
                    for question_id, user_id, question_topics, followed_topics in zip(
                        data['问题ID'],
                        data['用户ID'],
                        data['问题绑定话题ID'],
                        data['关注话题'])
                ]

                # Write the batch back to the database.
                batch_update(db, table, data)

                # Advance the progress bar by the number of rows handled.
                pbar.update(len(data))


100%|█████████▉| 9466907/9469146 [1:03:57<00:00, 7154.49it/s]
100%|█████████▉| 1141630/1141683 [05:10<00:00, 3489.87it/s]


## 2 感兴趣话题与此话题加权正余弦距离和

## 3 邀请天数间隔

```mysql
-- Step 1: stage the day gap for every invite of invite_info_0926 that has a
-- matching question in question_info_0926 (INNER JOIN on 问题ID).
-- 邀请天数间隔 is the difference of two values extracted the same way from
-- 邀请创建时间 and 问题创建时间: the inner SUBSTRING_INDEX(..., 'H', -2) keeps the
-- text after the second-to-last 'H' (the whole string when there are fewer
-- than two), and the outer SUBSTRING_INDEX(..., '-', 1) keeps the text before
-- the first '-'.
-- NOTE(review): the timestamp format is not visible here; confirm that the
-- extracted text is purely numeric, because the subtraction casts it to a
-- number and a non-numeric prefix would be read as 0.
CREATE TEMPORARY TABLE tmp AS (
	SELECT
		i.问题ID,
		i.用户ID,
		i.邀请创建时间,
		SUBSTRING_INDEX(
			SUBSTRING_INDEX(
				i.邀请创建时间,
				'H' ,- 2
			),
			'-',
			1
		) - SUBSTRING_INDEX(
			SUBSTRING_INDEX(
				q.问题创建时间,
				'H' ,- 2
			),
			'-',
			1
		) AS 邀请天数间隔
	FROM
		invite_info_0926 i
	INNER JOIN question_info_0926 q ON i.问题ID = q.问题ID
);

-- Step 2: inspect the staged rows (returns every row of tmp).
SELECT
	*
FROM
	tmp;

-- Step 3: copy the staged value back into invite_info_0926. Rows are matched
-- on 邀请创建时间 and 问题ID; 用户ID is not part of the condition.
-- NOTE(review): tmp is created without an index, so this join may be slow on
-- a large table; consider indexing the join columns first.
UPDATE invite_info_0926 i,
 tmp
SET i.`邀请天数间隔` = tmp.邀请天数间隔
WHERE
	i.邀请创建时间 = tmp.邀请创建时间
AND i.问题ID = tmp.问题ID;

-- Step 4: remove the staging table.
DROP TABLE tmp;
```

## 4 邀请小时

```mysql
-- Set 邀请小时 on every row of invite_info_0926 (there is no WHERE clause) to
-- the text that follows the last 'H' in 邀请创建时间.
update invite_info_0926
set 邀请小时=SUBSTRING_INDEX(邀请创建时间,'H',-1)
```