# DFTC模型資料前處理

## 引入套件

In [1]:
import json
import os
import random

import jieba
import numpy as np
import pandas as pd
import pymysql
from tqdm import tqdm
# Turn on jieba's parallel segmentation mode with 6 workers
# NOTE(review): jieba's parallel mode is reportedly unavailable on Windows — confirm the target platform.
jieba.enable_parallel(6)

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.582 seconds.
Prefix dict has been built successfully.


## 獲取資料
從資料庫中獲取資料

In [5]:
# Connection settings are read from environment variables so that real
# credentials do not have to be stored in the notebook; each fallback is the
# value that used to be hard-coded here.
db = pymysql.connect(
    host=os.environ.get('PTT_DB_HOST', '140.128.102.114'),
    user=os.environ.get('PTT_DB_USER', 'user'),
    passwd=os.environ.get('PTT_DB_PASSWORD', 'pwd'),
    charset='utf8',
    db=os.environ.get('PTT_DB_NAME', 'ptt_data'),
)
cursor = db.cursor()

取得文章內容

In [6]:
# Articles published before 2020-10-01, excluding announcement posts
content_query = "select * from ptt_content where tp < '2020-10-01' and title not like '%[公告]%'"
cursor.execute(content_query)
content_result = cursor.fetchall()

取得推文內容

In [7]:
# Pushes (comments) posted before 2020-10-10
push_query = "select * from ptt_push where tp < '2020-10-10'"
cursor.execute(push_query)
push_result = cursor.fetchall()

## 推文資料觀察

In [8]:
# Materialise the fetched push rows as a DataFrame (columns are still unnamed here)
push_df = pd.DataFrame(list(push_result))

In [9]:
# Name the columns of the push table
# NOTE(review): order is assumed to match the ptt_push schema returned by `select *` — confirm.
push_df.columns = ['url', 'seq', 'board', 'tag', 'content', 'tp', 'userid', 'reply']

In [10]:
# Preview the first rows of the push table
push_df.head()

Unnamed: 0,url,seq,board,tag,content,tp,userid,reply
0,M.1577893589.A.8F1,1,HatePolitics,推,元兄 裴翊到底有沒有男友,2020-06-18 02:36:00,l9830826,
1,M.1577893589.A.8F1,2,HatePolitics,→,翊沒有男友的機率應該比牛14號打擊率低,2020-06-18 02:39:00,septhsu,--
2,M.1577894164.A.AF0,1,HatePolitics,推,元兄 裴翊到底有沒有男友,2020-06-18 02:36:00,l9830826,
3,M.1577894164.A.AF0,2,HatePolitics,→,翊沒有男友的機率應該比牛14號打擊率低,2020-06-18 02:39:00,septhsu,--
4,M.1578190023.A.E48,1,HatePolitics,推,元兄 裴翊到底有沒有男友,2020-06-18 02:36:00,l9830826,


In [11]:
# Group pushes by article url so one article's pushes can be fetched with get_group()
push_group = push_df.groupby("url")

In [14]:
def get_time_range(row, groups=None):
    """Return the time span, in seconds, covered by the pushes of one article.

    The span is measured between the earliest and the latest push timestamp,
    so the result does not depend on the order in which the rows were fetched
    (the push query has no ORDER BY) and can never be negative.

    Parameters
    ----------
    row : mapping or pandas.Series
        One article record; only ``row['url']`` is read.
    groups : pandas GroupBy, optional
        Pushes grouped by ``url`` with a ``tp`` timestamp column. Defaults to
        the notebook-level ``push_group``.

    Returns
    -------
    float
        Seconds between the first and last push, or ``0.0`` when the article
        has no pushes.
    """
    if groups is None:
        groups = push_group
    try:
        push_times = groups.get_group(row['url'])['tp']
    except KeyError:
        # No pushes were recorded for this article.
        return 0.0
    return (push_times.max() - push_times.min()).total_seconds()

## 文章資料觀察

In [15]:
# Materialise the fetched article rows as a DataFrame (columns are still unnamed here)
content_df = pd.DataFrame(list(content_result))

In [17]:
# Name the columns of the article table
# NOTE(review): order is assumed to match the ptt_content schema returned by `select *` — confirm.
content_df.columns = ['url', 'title', 'content', 'tp', 'author', 'board', 'ip', 'country']

In [18]:
# Preview the first rows of the article table
content_df.head()

Unnamed: 0,url,title,content,tp,author,board,ip,country
0,M.1588262410.A.1AB,[問卦] 肇事責任判斷？,有沒有機車停在慢車道熄火關燈撿東西\n結果燈光昏暗下 撞倒他\n初判出來是 一方未注意車況\...,2020-05-01 00:00:08,maymay82407,Gossiping,101.12.68.2,
1,M.1588262422.A.020,[問卦] 登入破一千五算ptt權威嗎,登入最近破1500了\n這樣應該已經算老鳥了吧\n\n本魯每天用在ptt的時間是10小時up...,2020-05-01 00:00:20,DavFlow,Gossiping,223.140.98.213,
2,M.1588262422.A.127,[問卦] 有沒有邊緣人的八卦,生日都快要過來\n為什麼都沒人來祝賀呢？\n\n是被遺忘了嗎？\n\n還是被遺忘了？\n\n...,2020-05-01 00:00:18,organ63521,Gossiping,180.217.80.208,
3,M.1588262469.A.B09,Re: [新聞] 殺警鄭男判無罪 精神科專家：建構社會安,: 殺警鄭男判無罪 精神科專家：建構社會安全網人人有責\n: 2020-04-30 11:4...,2020-05-01 00:01:07,afiend0927,Gossiping,36.239.188.170,
4,M.1588262484.A.D88,Re: [問卦] 為啥一堆郵輪取名公主號？,請問這家公司的公主號到底有多想停靠台灣補給?\n\n因為疫情關係，機關防守很嚴格拒絕該艘公主...,2020-05-01 00:01:21,kangta2030,Gossiping,61.230.68.131,


### 計算每篇文章第一則推文與最後一則推文之間的時間差（秒）

In [19]:
# Per-article push time span in seconds (get_time_range returns 0.0 when an article has no pushes)
content_df['tp_range'] = content_df.apply(get_time_range, axis=1)

In [20]:
# Preview the article table with the new tp_range column
content_df.head()

Unnamed: 0,url,title,content,tp,author,board,ip,country,tp_range
0,M.1588262410.A.1AB,[問卦] 肇事責任判斷？,有沒有機車停在慢車道熄火關燈撿東西\n結果燈光昏暗下 撞倒他\n初判出來是 一方未注意車況\...,2020-05-01 00:00:08,maymay82407,Gossiping,101.12.68.2,,3180.0
1,M.1588262422.A.020,[問卦] 登入破一千五算ptt權威嗎,登入最近破1500了\n這樣應該已經算老鳥了吧\n\n本魯每天用在ptt的時間是10小時up...,2020-05-01 00:00:20,DavFlow,Gossiping,223.140.98.213,,33240.0
2,M.1588262422.A.127,[問卦] 有沒有邊緣人的八卦,生日都快要過來\n為什麼都沒人來祝賀呢？\n\n是被遺忘了嗎？\n\n還是被遺忘了？\n\n...,2020-05-01 00:00:18,organ63521,Gossiping,180.217.80.208,,420.0
3,M.1588262469.A.B09,Re: [新聞] 殺警鄭男判無罪 精神科專家：建構社會安,: 殺警鄭男判無罪 精神科專家：建構社會安全網人人有責\n: 2020-04-30 11:4...,2020-05-01 00:01:07,afiend0927,Gossiping,36.239.188.170,,59820.0
4,M.1588262484.A.D88,Re: [問卦] 為啥一堆郵輪取名公主號？,請問這家公司的公主號到底有多想停靠台灣補給?\n\n因為疫情關係，機關防守很嚴格拒絕該艘公主...,2020-05-01 00:01:21,kangta2030,Gossiping,61.230.68.131,,9540.0


## 只保留推文時間跨度大於一小時且小於一天的文章

In [21]:
# Keep articles whose pushes span more than one hour but less than one day
spans_over_an_hour = content_df['tp_range'] > 3600.0
spans_under_a_day = content_df['tp_range'] < 86400.0
content_df_f = content_df.loc[spans_over_an_hour & spans_under_a_day]

In [23]:
# Preview the filtered articles
content_df_f.head()

Unnamed: 0,url,title,content,tp,author,board,ip,country,tp_range
1,M.1588262422.A.020,[問卦] 登入破一千五算ptt權威嗎,登入最近破1500了\n這樣應該已經算老鳥了吧\n\n本魯每天用在ptt的時間是10小時up...,2020-05-01 00:00:20,DavFlow,Gossiping,223.140.98.213,,33240.0
3,M.1588262469.A.B09,Re: [新聞] 殺警鄭男判無罪 精神科專家：建構社會安,: 殺警鄭男判無罪 精神科專家：建構社會安全網人人有責\n: 2020-04-30 11:4...,2020-05-01 00:01:07,afiend0927,Gossiping,36.239.188.170,,59820.0
4,M.1588262484.A.D88,Re: [問卦] 為啥一堆郵輪取名公主號？,請問這家公司的公主號到底有多想停靠台灣補給?\n\n因為疫情關係，機關防守很嚴格拒絕該艘公主...,2020-05-01 00:01:21,kangta2030,Gossiping,61.230.68.131,,9540.0
6,M.1588262520.A.358,Re: [新聞] 【殺警判無罪】「思覺失調症裝不出來的！,: 1.媒體來源:\n: 蘋果日報\n: 2.記者署名:\n: 吳慧芬、李恩慈／綜合報導\n...,2020-05-01 00:01:58,ip001,Gossiping,114.136.224.161,,5760.0
7,M.1588262528.A.C9E,[新聞] 世衛開會 評估新冠肺炎是否維持國際緊急,1.媒體來源:\n經濟日報\n\n2.記者署名:\n中央社 記者唐佩君\n\n3.完整新聞標...,2020-05-01 00:02:04,zeuswell,Gossiping,27.52.158.35,,33180.0


## 製作累積量特徵
將每篇文章的所有推文資料以5分鐘為一組做group by  
計算第一個小時中每5分鐘的推文數量成長量 & 分布量

In [61]:
# Per-article push-count features over the first twelve 5-minute bins.
seq_data, seq_data2, level = [], [], []
for url in tqdm(content_df_f['url']):
    pushes = push_group.get_group(url)
    # Number of pushes in each 5-minute bin
    per_bin = pushes.groupby(pd.Grouper(key='tp', freq='300s')).count()['url'].values

    growth = per_bin.cumsum().tolist()[:12]  # cumulative push count
    growth.extend([-1] * (12 - len(growth)))  # pad with -1 when fewer than 12 bins exist

    spread = per_bin.tolist()[:12]  # per-bin push count
    spread.extend([0] * (12 - len(spread)))  # pad with 0 when fewer than 12 bins exist

    level.append(len(pushes))  # total number of pushes on the article
    seq_data.append(growth)
    seq_data2.append(spread)

100%|██████████| 145522/145522 [09:42<00:00, 249.61it/s]


寫檔

In [62]:
# Write the article-level features to data.json.
# NOTE: `content_data` is built by the tokenisation cell further down, so that
# cell has to be executed before this one.
# The article table has no 'publish_time' column unless some other cell adds
# one, so fall back to the raw 'tp' timestamp (as strings, which are always
# JSON-serialisable) instead of failing with a KeyError.
if 'publish_time' in content_df_f.columns:
    publish_time = content_df_f['publish_time'].values.tolist()
else:
    publish_time = content_df_f['tp'].astype(str).tolist()

with open("data.json", "w") as f:
    json.dump({
        "urls": content_df_f['url'].values.tolist(),
        "seq_data": seq_data,
        "seq_data2": seq_data2,
        "level": level,
        "content": content_data,
        "author": content_df_f['author'].values.tolist(),
        "publish_time": publish_time
    }, f)

## 文章內容前處理
- 斷詞
- 對齊每篇長度

In [32]:
# Bundle the url list and the push-count features into one dict
source_data = {
    "urls": content_df_f['url'].values.tolist(),
    "seq_data": seq_data,
    "level": level,
    "seq_data2": seq_data2,
}

In [33]:
# Pull the three aligned sequences back out of the bundle
seq_data, level, urls = (source_data[key] for key in ('seq_data', 'level', 'urls'))

將資料進行shuffle

In [35]:
# Shuffle the three sequences together so their rows stay aligned.
random.seed(1)  # fixed seed keeps the shuffle order reproducible
row_pair = [*zip(seq_data, level, urls)]
random.shuffle(row_pair)
seq_data, level, urls = zip(*row_pair)

取得stop word & 讀取jieba的自定義字典

In [31]:
# 讀取jieba的自定義字典
jieba.load_userdict("name")
jieba.load_userdict("ldkrsi_dict_zh_tw.txt.big")
jieba.load_userdict("jieba_dict.txt.big")
# 取得stop word
with open("stopwords.json", 'r', encoding='utf8') as f:
    stop_wd = {}
    for w in json.load(f):
        stop_wd[w] = len(stop_wd)
with open("stopwords2.json", 'r', encoding='utf8') as f:
    for w in json.load(f):
        if w not in stop_wd:
            stop_wd[w] = len(stop_wd)

In [32]:
# Stop word -> position lookup; only key membership is used for filtering
stop_wd_dict = {word: idx for idx, word in enumerate(stop_wd)}

將資料進行斷詞

In [39]:
# Tokenise every article, drop stop words, and keep at most the first 126
# tokens (shorter articles are left as they are; no padding happens here).
content_data = []
for text in tqdm(content_df_f['content']):
    kept = [token for token in jieba.cut(text) if token not in stop_wd_dict]
    content_data.append(kept[:126])

100%|██████████| 145522/145522 [03:47<00:00, 638.48it/s]


## 製作以下三種特徵資料
- publish time
- content length
- fans number of publisher

In [42]:
# Pushes grouped by article url, for per-article lookups with get_group()
push4url = push_df.groupby('url')
# Articles grouped by author (uses the unfiltered article table)
author_gp = content_df.groupby("author")

In [43]:
# Per-author statistics over all of the author's articles:
#   pop_a       - share of articles with more than 175 pushes
#   avg_comment - average number of pushes per article
#   avg_pos     - average number of "推" pushes per article
#   avg_nag     - average number of "噓" pushes per article
pop_author = {}
for author in tqdm(author_gp.groups):
    Aui = 0             # number of articles written by this author
    pop_a = 0           # articles with more than 175 pushes
    total_comments = 0
    pos = 0
    nag = 0
    for url in author_gp.get_group(author)['url']:
        Aui += 1
        try:
            comments = push4url.get_group(url)
        except KeyError:
            # The article has no pushes, so it adds nothing to the counters.
            continue
        # Count each article's pushes exactly once (they used to be added
        # twice whenever the lookup succeeded, doubling avg_comment).
        total_comments += len(comments)
        if len(comments) > 175:
            pop_a += 1
        # Positive / negative push totals for this article
        tags = comments['tag']
        pos += int((tags == "推").sum())
        nag += int((tags == "噓").sum())
    pop_author[author] = {
        "pop_a": pop_a / Aui,
        "avg_comment": total_comments / Aui,
        "avg_pos": pos / Aui,
        "avg_nag": nag / Aui
    }

100%|██████████| 19411/19411 [03:26<00:00, 93.87it/s] 


In [44]:
# 存檔
# Save the per-author features as JSON
with open("author_feature.json", "w") as f:
    json.dump(pop_author, f)