In [112]:
from ckiptagger import WS, POS, NER, construct_dictionary
import pandas as pd
import numpy as np
from pprint import pprint
import re
import pycantonese as pc
import string
import random
from tqdm import tqdm

#Ckip Tagger models loaded from ./ckip_dict:
#word segmentation (ws), part-of-speech tagging (pos), named-entity recognition (ner)
ws = WS("./ckip_dict")
pos = POS("./ckip_dict")
ner = NER("./ckip_dict")

#Cantonese Dict
# NOTE(review): eval() executes whatever the file contains. If the file is a
# plain Python dict literal, ast.literal_eval would be a safer drop-in — confirm
# the file format before switching.
# `with` guarantees the handle is closed even when read()/eval() raises.
with open("./dict/cantonese_dict_converted_no_english_in_pydict.txt", "r") as file:
    Cantonese_Dict = eval(file.read())
Cantonese_Dict = construct_dictionary(Cantonese_Dict)

#Stopwords(Chinese Punctuation), one entry per line of the file
with open("./stopwords/punctuation_symbol.txt", "r") as file:
    punctuation_ch = file.read().split('\n')

#Stopwords(Written Chinese), one entry per line of the file
with open("./stopwords/written.txt", "r") as file:
    stopwords_written = file.read().split('\n')

#Stopwords(Cantonese)(Small set)
# Start from pycantonese's stop words plus four extra characters,
# then take "冇" back out so it is kept as a token.
stop_words_cantonese = pc.stop_words(add=["俾","哋","地","埋"])
stop_words_cantonese = stop_words_cantonese.difference({"冇"})

In [113]:
#Preprocess of whole string(remove useless string)(cofact dataset cleaning)
def remove_bad_string(sentence):
    """Clean one raw text before tokenization.

    Removes a set of fixed boilerplate fragments (cofact dataset cleaning),
    a leading "RT", hyperlinks, ASCII commas and emoji, and collapses every
    whitespace run into a single space.

    Args:
        sentence (str): the raw text.

    Returns:
        str: the cleaned text.
    """
    #dataset cleaning
    sentence = re.sub(r"Upload Date & Time\n.*發布時間.*\n.*\n.*","",sentence)
    sentence = re.sub(r"Add Friend.*Add LINE Friends via QR Code\n.*\n\n.*\n.*","",sentence)
    sentence = re.sub(r"get data from URL","",sentence)
    # The parentheses are escaped: unescaped they form a regex group, so the
    # pattern would only match the text *without* the literal "(" and ")".
    sentence = re.sub(r"Looked EverywhereFor This Page! \(Error 404\)","",sentence)
    sentence = re.sub(r"But maybe we can still help you findwhat you're looking for","",sentence)

    # remove old style retweet text "RT" (only at the very start of the text)
    sentence = re.sub(r'^RT[\s]+', '', sentence)

    # remove hyperlinks
    # NOTE(review): the scheme is optional, so any run of word characters joined
    # by dots (e.g. "3.14") is removed as well — confirm this is intended.
    sentence = re.sub(r'((http|ftp|https):\/\/)?[\w\-_]+(\.[\w\-_]+)+([\w\-\.,@?^=%&amp;:/~\+#]*[\w\-\@?^=%&amp;/~\+#])?', '', sentence)

    # remove hashtags
    # only removing the hash # sign from the word
    #sentence = re.sub(r'#', '', sentence)

    #standardize space
    sentence = re.sub(r'\s+', ' ', sentence)

    #1,600 -> 1600("," often used for this way)
    sentence = re.sub(r',', '', sentence)

    #remove emojify
    regrex_pattern = re.compile(pattern = "["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           "]+", flags = re.UNICODE)
    sentence = regrex_pattern.sub(r'',sentence)

    return sentence

#tokenization for Chinese
def tokenize(topics, use_can_dict=True):
    """Run the CKIP word segmenter `ws` over a list of texts.

    Args:
        topics (list): texts to segment.
        use_can_dict (bool): when True, `Cantonese_Dict` is passed to the
            segmenter as its recommend dictionary.

    Returns:
        The value returned by `ws` (callers index it per input text and
        iterate the tokens of each entry).
    """
    # Options shared by both modes: split into sentences on these delimiters.
    ws_options = {
        "sentence_segmentation": True,
        "segment_delimiter_set": {",", "。", ":", "?", "!", ";"},
    }
    if use_can_dict:
        ws_options["recommend_dictionary"] = Cantonese_Dict

    return ws(topics, **ws_options)

#Preprocess for token(Stopwords/Punctuations)
def filter_stopword(sentence):
    """Drop single-space tokens, stopwords and punctuation from a token list.

    Args:
        sentence: iterable of string tokens.

    Returns:
        list: the tokens that are not " " and are not found in any of the
        stopword / punctuation collections, in their original order.
    """
    # `string.punctuation` is a str, so membership against it is a substring test.
    stop_groups = (stopwords_written, stop_words_cantonese, string.punctuation, punctuation_ch)

    return [
        token
        for token in sentence
        if token != " " and not any(token in group for group in stop_groups)
    ]

#Find POS of word segment
#https://github.com/ckiplab/ckiptagger/wiki/POS-Tags
def pos_tag(topic):
    """Tag one segmented text with the module-level `pos` tagger.

    Args:
        topic: token list of a single text.

    Returns:
        The first (only) entry of the tagger's result for `[topic]`.

    NOTE(review): a later cell rebinds the global name `pos` to the positive
    training examples; calling this helper after that cell has run will fail.
    """
    tags_per_text = pos([topic])
    return tags_per_text[0]

#Find Entity(Name/place/Time etc.) in word segment
#Name Entity Recognition
#https://github.com/ckiplab/ckiptagger/wiki/Entity-Types
def ner_tag(topic, ps):
    """Run the module-level `ner` model on one segmented text.

    Args:
        topic: token list of a single text.
        ps: POS tags for `topic`.

    Returns:
        The first (only) entry of the model's result for `[topic]`, `[ps]`.
    """
    entities_per_text = ner([topic], [ps])
    return entities_per_text[0]

#Word Segmentation
def segment(topics, use_can_dict=True):
    """Clean, tokenize and stopword-filter a list of raw texts.

    The input list is left untouched; cleaning works on a new list.

    Args:
        topics (list): raw texts.
        use_can_dict (bool): forwarded to `tokenize`.

    Returns:
        list: one filtered token list per input text.
    """
    # Build new lists instead of overwriting the caller's `topics` in place.
    cleaned = [remove_bad_string(topic) for topic in topics]
    tokenized = tokenize(cleaned, use_can_dict)

    return [filter_stopword(tokens) for tokens in tokenized]

In [120]:
#Create a Dict that count the word freqs in/not in that category
#format: {(word(str), in_that_cate(bool)): number(int)}
#e.g. {("中國", 1): 50, ("中國", 0): 25}
def count_tweets(result, tweets, ys):
    """Accumulate (word, label) occurrence counts into `result`.

    Args:
        result (dict): counts so far; updated in place.
        tweets: iterable of token lists.
        ys: iterable of labels, paired with `tweets` by position.

    Returns:
        dict: the same `result` object, with the new counts added.
    """
    for label, tokens in zip(ys, tweets):
        for token in tokens:
            # the key is the (word, label) tuple
            key = (token, label)
            if key in result:
                result[key] += 1
            else:
                result[key] = 1

    return result

#Train Naive Bayes Model
def train_naive_bayes(freqs, train_x, train_y):
    """Fit a binary Naive Bayes model from (word, label) counts.

    Args:
        freqs (dict): {(word, label): count}; a label > 0 counts as positive,
            anything else as negative.
        train_x: unused; kept so existing callers keep working.
        train_y: iterable of document labels (1 = positive, 0 = negative).

    Returns:
        tuple: (logprior, loglikelihood) where
            logprior = log(#positive docs) - log(#negative docs) and
            loglikelihood[word] = log P(word | pos) - log P(word | neg),
            both probabilities using add-one (Laplace) smoothing.
    """
    # V: number of unique words in the vocabulary
    vocab = {pair[0] for pair in freqs}
    V = len(vocab)

    # N_pos / N_neg: total word occurrences seen with each label
    N_pos = N_neg = 0
    for pair, count in freqs.items():
        if pair[1] > 0:
            N_pos += count
        else:
            N_neg += count

    # D_pos / D_neg: number of positive / negative documents
    D_pos = sum(1 for label in train_y if label == 1)
    D_neg = sum(1 for label in train_y if label == 0)

    logprior = np.log(D_pos) - np.log(D_neg)

    loglikelihood = {}
    for word in vocab:
        # counts of the word under each label (0 when never seen with it)
        freq_pos = freqs.get((word, 1), 0)
        freq_neg = freqs.get((word, 0), 0)

        # add-one smoothed probability of the word given each class
        p_w_pos = (freq_pos + 1) / (N_pos + V)
        p_w_neg = (freq_neg + 1) / (N_neg + V)

        loglikelihood[word] = np.log(p_w_pos) - np.log(p_w_neg)

    return logprior, loglikelihood

#Prediction of Naive Bayes Model
#The score is the log prior plus the log likelihood of every known word
#Normally, if score > 0 mean that text belong to that cate
def naive_bayes_predict(tweet, logprior, loglikelihood):
    """Score a tokenized text against the trained model.

    Args:
        tweet: iterable of tokens.
        logprior: log prior returned by training.
        loglikelihood (dict): per-word log likelihood returned by training.

    Returns:
        The log prior plus the log likelihood of each token found in
        `loglikelihood`; tokens not in the dictionary contribute nothing.
    """
    # prior first, then one term per known token, in token order
    contributions = [logprior]
    contributions.extend(
        loglikelihood[token] for token in tweet if token in loglikelihood
    )

    return sum(contributions)

#Same as naive_bayes_predict
#But also print the score of each word
def naive_bayes_predict_detail(word_l, logprior, loglikelihood):
    """Score a tokenized text and print every contribution.

    Prints the prior, then one line per token: its log likelihood when the
    token is in `loglikelihood`, otherwise 0.

    Args:
        word_l: iterable of tokens.
        logprior: log prior returned by training.
        loglikelihood (dict): per-word log likelihood returned by training.

    Returns:
        The log prior plus the log likelihood of each known token.
    """
    print(f"Prior: {logprior}")
    score = 0
    score += logprior

    for word in word_l:
        if word not in loglikelihood:
            # unknown words are reported but do not change the score
            print(f"{word}: {0}")
            continue

        print(f"{word}: {loglikelihood[word]}")
        score += loglikelihood[word]

    return score

#Testing of Model
#(Feed in testing data to see the accuracy)
def test_naive_bayes(test_x, test_y, logprior, loglikelihood):
    """Compute the accuracy of the model on labelled test data.

    Args:
        test_x: iterable of tokenized texts.
        test_y: true labels (0 or 1), one per text; list or numpy array.
        logprior: log prior returned by training.
        loglikelihood (dict): per-word log likelihood returned by training.

    Returns:
        1 minus the mean absolute difference between predictions and labels.
    """
    # a score > 0 predicts class 1, anything else predicts class 0
    y_hats = np.array([
        1 if naive_bayes_predict(tweet, logprior, loglikelihood) > 0 else 0
        for tweet in test_x
    ])

    # Convert the labels too, so a plain Python list works as well as an array.
    labels = np.asarray(test_y)

    # error is the average of the absolute differences between predictions and labels
    error = np.mean(np.abs(y_hats - labels))

    # Accuracy is 1 minus the error
    return 1 - error

#### Test with data (text) related to the "政治" (politics) category

In [115]:
# Load the labelled cofact data and segment the topics of each class.
# NOTE(review): assigning to `pos` here replaces the POS tagger created in the
# setup cell, so pos_tag() no longer works once this cell has run.
df = pd.read_csv('./data/cofact_label.csv')

# topics labelled as "政治" (1) -> positive examples
pos = segment(df.loc[df["政治"] == 1, "topic"].tolist())

# topics labelled as not "政治" (0) -> negative examples
neg = segment(df.loc[df["政治"] == 0, "topic"].tolist())

In [116]:
# Split each class 80/20 by position: first 80% for training, the rest for testing (validation set)
pos_cut = int(len(pos)*0.8)
neg_cut = int(len(neg)*0.8)

train_pos, test_pos = pos[:pos_cut], pos[pos_cut:]
train_neg, test_neg = neg[:neg_cut], neg[neg_cut:]

#Text (positives first, then negatives)
train_x = train_pos + train_neg
test_x = test_pos + test_neg

#True label (1 for each positive text, 0 for each negative text, same order as above)
train_y = np.append(np.ones(len(train_pos)), np.zeros(len(train_neg)))
test_y = np.append(np.ones(len(test_pos)), np.zeros(len(test_neg)))

In [123]:
# Count (word, label) frequencies on the training split, then fit the model.
freqs = count_tweets({}, train_x, train_y)
logprior, loglikelihood = train_naive_bayes(freqs, train_x, train_y)

# log prior and vocabulary size of the trained model
print(logprior)
print(len(loglikelihood))

-0.9808292530117266
2654


### Test with a source that is related to "政治"

In [127]:
# Segment one sample text and show the score contributed by each token.
sample_text = "一天到晚看著台灣被中國打壓，又聽到政治名嘴的口水戰，讓你誤以為台灣真的沒有競爭力？別再被洗腦啦！台灣在國際間的重要性，可是無人可取代的！"
w = segment([sample_text])[0]
naive_bayes_predict_detail(w, logprior, loglikelihood)

Prior: -0.9808292530117266
一天到晚: 0
台灣: 2.121879976496536
中國: 1.0745609822159778
打壓: 0
聽到: 0
政治: 0
名嘴: 0
口水戰: 0
誤以為: 0
台灣: 2.121879976496536
真的: 1.1233511463854091
競爭力: 0
洗腦: 0
台灣: 2.121879976496536
國際: 2.7609599357862065
間: -0.5348769302181227
重要性: 0
取代: 0


9.808805810647353

In [None]:
# The score is the log-odds (not a probability) that the text belongs to the category.
# Normally, a score > 0 means the text belongs to the category.

### Test with a source that is not related to "政治"

In [125]:
# Segment one sample text and show the score contributed by each token.
sample_text = "新型冠狀病毒肆虐，全球確診病例突破1千萬，約50萬人死亡。根據美國約翰霍普金斯大學的統計顯示，美國的確診及死亡個案最多，佔全球四分一；巴西的感染及死亡個案排第二，俄羅斯排第三。"
w = segment([sample_text])[0]
naive_bayes_predict_detail(w, logprior, loglikelihood)

Prior: -0.9808292530117266
新型: 0
冠狀: -0.8225590026699043
病毒: -2.87025184603516
肆虐: 0
全球: -0.12941182210995894
確診: -3.3280849396606405
病例: -1.045702553984114
突破: 0
1千萬: 0
約: -1.228024110778068
50萬: 1.6623476471180973
死亡: 0
美國: 1.5288162544935737
約翰霍普金斯: 0
大學: -1.5157061832298488
統計: 0
顯示: 0.969200466558152
美國: 1.5288162544935737
確診: -3.3280849396606405
死亡: 0
個案: -2.6551404664182146
最多: 0
佔: 0
全球: -0.12941182210995894
四分一: 0
巴西: -0.12941182210995805
感染: -2.144314842652223
死亡: 0
個案: -2.6551404664182146
排: 0
第二: -0.8225590026699043
俄羅斯: 0
排: 0
第三: -1.382174790605327


-19.477627241460464

In [132]:
# Accuracy of the trained model on the held-out test split
accuracy = test_naive_bayes(test_x, test_y, logprior, loglikelihood)
print("Naive Bayes accuracy = %0.4f" % accuracy)

Naive Bayes accuracy = 1.0000


#### To be honest, the model reaches 100% accuracy mainly because the dataset content is highly duplicated and the test set (20 items) is small.

#### But at least this shows that the model classifies (categorizes) in a reasonable direction.

### Test data:

#### label: 1 (True) if the text belongs to the class ("政治"), else 0 (False)
#### Predict: the model's prediction
#### text: the text to predict (after segmentation)

In [135]:
# Print the true label, the predicted class and the tokens of every test item.
for tokens, label in zip(test_x, test_y):
    predicted = naive_bayes_predict(tokens, logprior, loglikelihood) > 0
    print(f"label: {label}")
    print(f"Predict: {predicted}")
    print(f"Text: {'/'.join(tokens)}")

label: 1.0
Predict: True
Text: 台灣/價值/霸/罷/出征/世衛/秘書長/結束/出征/新加波/總理/夫人/台灣/真的/霸氣/發起/罷免/89萬/票/選出來/高雄/市長/高呼/罷免/世衛/秘書長/譚德塞/台灣/罷免/昌/逆/亡/意思/罷免/韓國瑜/貪污/市政/做/爛/內政部/市政/評比/前/二/名/罷免/罷免/譚德塞/好笑/根本/投票/投票/資格/喊/罷免/喊/大聲/先/整理/一下/新加波/總理/夫人/得罪/台灣/網民/始末/好了/ 1/25日/蘇貞昌/宣布/禁止/口罩/包含/新加波/ST/公司/台灣/兩/條/生產線/台灣/徵用/兩/條/產線/通融/新加坡/台灣/生產/口罩/運回/新加坡/ 2月/新加坡/口罩/缺貨/只好/台灣/新加坡/產線/撤回/新加坡/提供/新加坡人/使用/ 2/26/新加坡/總理/夫人/宣布產線/撤回/新加坡/ 4/7/台灣/宣布/捐/口罩/新加坡/總理/夫人/表示/台灣/網民/排山倒海/出征/新加坡/總理/夫人/謾罵/羞辱/譏諷/詛咒/滿滿/台灣/價值/種/國罵/紛紛/出籠/譚德塞/好歹/公開/記者會/指責/台灣/三/分鐘/總理/夫人/含蓄/臉書/寫/Err/台灣/網民/兩/國際/知名度/不/假/辭色/一視同仁/狂酸/怒罵/潑/糞/羞辱/台灣/走進/國際/社會/方式/台灣/價值/國格/蔡英文/真的/不/制止/一下/幼稚園/大學/教育/教導/溝通/兩/幼稚園/學生/爭奪/玩具/吵/時/老師/父母/引導/兩/小學生/誤會/大打出手/時/老師/父母/教育/兩/國中生/發生/討厭/霸凌/時/老師/父母/開導/從小到大/受/教育/教/歧視/羞辱/蠻橫/方式/解決/紛爭/先/誰是誰非/ 光/看看/網路/低級/下流/語言/真的/暈/兩/件/事/發生/字典/字典/命名為/台灣/罵人/詞彙/寶典/真的/種/方式/揚名/國際/島/嗨/好奇/國家/尊敬/中華民國/過去/台灣/世界/看見/早期/非洲/國家/幫忙/建設/農耕隊/踏足/災難/現場/悲憫/救世/慈濟人/助人/榮登/時代/雜誌/陳樹菊/阿嬤/棒球/好手/王建民/網球/王子/盧彥勳/電影界/奇才/李安/麵包/達人/吳寶春/科技業/巨擘/台積電/強國/台灣/彈丸之地/走到/世界/抬頭挺胸/才智/不/輸/世界/貢獻/有目共睹/曾經/驕傲/說/台灣/美/風景/現在/臉/自稱/美/風景/罵人

#### Problem:

1. Most of the data (text) is irrelevant to Hong Kong, so good predictions on the cofact dataset do not mean the model predicts well for HK news/articles.
    1. An HK source (dataset) is needed for training.