In [1]:
from google.colab import drive
drive.mount('/content/drive')
import csv
import re
import random
import tqdm.notebook as tq
import pandas as pd
import json
import pprint

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## 1. 讀入語料與構式辭典

In [2]:
file_path = '/content/drive/Shareddrives/LOPE/產學合作/中華電信/2020-2021/指向情緒案/data/threads/cht-2020-merged.txt'

with open(file_path, 'r', encoding = 'utf-8') as f:
  comments = [ line.strip() for line in f ] # comments 中的每一個 element 是一則評論

In [3]:
construcion_path = '/content/drive/Shareddrives/LOPE/產學合作/中華電信/2020-2021/指向情緒案/sentiment-construction-list/constructions_list/constructions_0406.csv'

with open(construcion_path) as f:
    constructions = [ row.strip() for row in f ]

## 2. 從每則評論中找出符合的構式例子

In [4]:
n = 5  # sample size
random.seed(10)

matched = {}
matched_con = {}
for k in tq.tqdm(constructions):
  pat = re.compile(k)
  if k not in matched: matched[k] = []
  
  # Get all cnstr from all comments
  candidates = set()
  for cmt in comments:
    for c in pat.finditer(cmt): candidates.add(c[0])
  
  # Count all cnstr
  matched_con[k] = len(candidates)

  # Sample
  if len(candidates) > n:
    for c in random.sample(candidates, n): matched[k].append(c)
  else:
    for c in candidates: matched[k].append(c)

HBox(children=(FloatProgress(value=0.0, max=39.0), HTML(value='')))




In [5]:
pprint.pprint(matched)

{'(對\\w+)?一點(都|也)不\\w+\\b': ['一點都不香',
                             '一點都不顯得突兀',
                             '一點都不痛快',
                             '一點都不優惠',
                             '一點也不想上班'],
 '(很|也)夠\\w+了\\b': ['也夠長的了',
                    '也夠測一輪了',
                    '很夠但操控的真實感有點不足上次看完台北場的跨年用到遠傳的5g技術感覺5g技術應該會越來越成熟了',
                    '也夠我用了',
                    '很夠看了'],
 '(真的)?有夠\\w+的?\\b': ['有夠蕭貪', '真的有夠卡', '真的有夠腦殘的', '有夠糟', '有夠扯的扯'],
 '(置|拉)板凳等\\w+\\b': ['拉板凳等中華', '置板凳等後續'],
 '\\b\\w+(到|的|得)離譜': ['上傳更是低得離譜',
                      '只是今天晚上慢的離譜',
                      '一堆辦吃到飽的速度低的離譜',
                      '難以置信的離譜',
                      '邏輯到離譜'],
 '\\b\\w+(掰|bye)了': ['lm就掰掰了',
                     '過沒半年就掰了',
                     '所以要和便宜的吃到飽說掰了',
                     '你朋友ptt帳號要掰了',
                     '反正年底就要掰了'],
 '\\b\\w+(的|得)要死': ['我每次都在那邊對消費和出帳對得要死',
                    '我家這4g訊號都差的要死',
                    '也爛的要死',
                    'app跑起來還是慢得要死',
                  

In [6]:
# export as csv file
with open('matched_random.csv', 'w') as f:
    for key in matched.keys():
        f.write("%s,%s\n"%(key, matched[key]))

## 3. 將符合構式的例子及其數量整理成表格

In [7]:
df = pd.DataFrame([matched_con, matched]).T
df.columns = ['count', 'examples']
df = df.sort_values(by='count', ascending=False) # 數量由大到小排序
df

Unnamed: 0,count,examples
直接\w+了\b,672,"[直接辦好了, 直接破100mbps上去了, 直接不見了, 直接領獎了, 直接設在你家就好了]"
(真的)?有夠\w+的?\b,464,"[有夠蕭貪, 真的有夠卡, 真的有夠腦殘的, 有夠糟, 有夠扯的扯]"
也太\w+了?吧?\b,408,"[也太差, 也太傷長輩的眼xddd, 也太刁鑽, 也太扯了吧, 也太難看得出來了吧]"
\b\w+根本\w{2}\b,292,"[解約和繳完金額根本一樣, 後來才知根本不用, 那個根本沒差, 我自己就是用中華網路還是一樣..."
坐等\w+\b,188,"[坐等5g也499, 坐等加一門94的簡訊, 坐等聯繫, 坐等40元, 坐等499吃到飽不限速]"
\b\w+到\w*炸了?,162,"[訊號爛到炸, 桃園以前用遠傳晚上卡到炸, 網頁爛到爆炸, 你599真他媽的盤到爆炸, 爛到爆炸]"
(對\w+)?一點(都|也)不\w+\b,133,"[一點都不香, 一點都不顯得突兀, 一點都不痛快, 一點都不優惠, 一點也不想上班]"
\b\w+屌打\w+\b,113,"[487還是屌打啊, 的資費隨便都屌打中華, 某些教室是每家都爛所以也不用奢望至於其他大部分..."
\b\w+(的|得)要死,105,"[我每次都在那邊對消費和出帳對得要死, 我家這4g訊號都差的要死, 也爛的要死, app跑起..."
\b\w+給力\b,87,"[收訊很給力, 數位門市申辦最給力, 遠傳網速都很給力, 現在亞太最大問題在於基地台不給力,..."


## 4. 用非電信語料 (PTT 男女板) 測試構式辭典

In [8]:
!gdown --id "1Aoeq6dPkgHhxJQonqPtyVPwPoRhpesPC" -O "bg.json" 
with open("bg.json", "r", encoding = "UTF-8") as f:
    bg_articles = json.load(f)['articles']

bg_content = []
for a in bg_articles:
  if 'content' in a:
    bg_content.append(a['content'])

Downloading...
From: https://drive.google.com/uc?id=1Aoeq6dPkgHhxJQonqPtyVPwPoRhpesPC
To: /content/bg.json
0.00B [00:00, ?B/s]5.24MB [00:00, 52.0MB/s]27.3MB [00:00, 125MB/s] 


In [9]:
n = 5  # sample size
random.seed(10)

bg_matched = {}
bg_matched_con = {}
for k in tq.tqdm(constructions):
  pat = re.compile(k)
  if k not in bg_matched: bg_matched[k] = []
  
  # Get all cnstr from all comments
  candidates = set()
  for cmt in bg_content:
    for c in pat.finditer(cmt): candidates.add(c[0])
  
  # Count all cnstr
  bg_matched_con[k] = len(candidates)

  # Sample
  if len(candidates) > n:
    for c in random.sample(candidates, n): bg_matched[k].append(c)
  else:
    for c in candidates: bg_matched[k].append(c)

HBox(children=(FloatProgress(value=0.0, max=39.0), HTML(value='')))




In [10]:
bg_df = pd.DataFrame([bg_matched_con, bg_matched]).T
bg_df.columns = ['count', 'examples']
bg_df = bg_df.sort_values(by='count', ascending=False) # 數量由大到小排序
bg_df

Unnamed: 0,count,examples
直接\w+了\b,33,"[直接花錢找個明買明賣的女生解決就對了, 直接投訴主管了, 直接不演了, 直接講分手了, 直..."
(對\w+)?一點(都|也)不\w+\b,30,"[一點都不想要吃虧, 一點都不現實, 一點也不考慮到女生工作地點跟意願, 一點都不期待不興奮..."
(真的)?有夠\w+的?\b,30,"[有夠蠢, 有夠可憐, 有夠多, 真的有夠不委屈的, 有夠煩]"
也太\w+了?吧?\b,23,"[也太怪力亂神, 也太敢講了吧, 也太多了吧, 也太快就馬上跟我說她答應其他男生的邀約, 也..."
誰\w誰\w+\b,16,"[誰高誰低這時出錢多的人會加重話語權, 誰跟誰告白的, 誰跟誰告白的啊, 誰是誰了, 誰對誰..."
\b\w+根本\w{2}\b,16,"[BMI根本不準, 旁門左道想歪路這跟工作努不努力根本無關, 以後不用養這個老番婆根本賺到,..."
八成\w+\b,15,"[八成確定這是真的理由因為是私下跟她的朋友說的, 八成的人呢, 八成女生跟另一個男生也這樣描..."
\b\w+(的|得)要死,14,"[妳捨得他累得要死, 摳門小氣的要死, 而且外食貴得要死, 很可能你整天累得要死, 已經忙得要死]"
越來越\w\b,13,"[越來越近, 越來越遠, 越來越胖, 越來越忙, 越來越高]"
\b\w+到\w*炸了?,10,"[或是上網到處狂轟亂炸, 根本幸運到爆炸, 煩到爆炸, 現在想想如果交往第14天跟她求婚他應..."
