In [2]:
%load_ext autoreload
%autoreload 2

In [None]:
from import_casa import casa

In [4]:
import pickle
# Locate the project data directory and deserialize the raw thread dump.
# CAUTION: pickle.load can execute arbitrary code while deserializing, so this
# file must come from a trusted source.
data_path = casa.get_data_path()
raw_threads_file = data_path / "raw-data-cht-202010.pkl"
with raw_threads_file.open("rb") as fin:
    threads = pickle.load(fin)

In [5]:
# Sanity check: how many thread objects were loaded from the pickle.
len(threads)

7772

In [6]:
# Peek at the first thread: materialize its opinion texts so the raw strings
# that will be written to the corpus can be inspected.
list(threads[0].opinion_texts())

['#問題 499母親節方案要續約嗎\u3000最近各家電信一直打電話傳簡訊詢問要不要續約 有夠煩的但畢竟合約有30個月 而5G未來會普及也是不變事實(?想問板上大大覺得要續約嗎？還是等合約到期 直接續488學生方案呢？',
 '續',
 'B1 怎麼說',
 '5g現在還不穩定 現在辦5g繳5g的錢只能享受4g ，5g全台基地台只有某些地方比較穩 甚至有些地方連4g都有問題 可能還要1.2年才會普及 個人看法',
 'B3 所以也不建議現在就換5g手機囉？',
 '換5g手機可以 但是5g網路可以再等等 如果1.2年內不會申辦5g網路 覺得拿4g手機就夠用了 B4',
 'B5 好的~',
 '我自己 還有家人和同學 都是續30個月 B6',
 '等1111',
 'B8 1111是手機方案到期日嗎😂',
 'B9各大電信優惠日']

In [6]:
# Dump every opinion text of every thread into a plain-text corpus file,
# one text per line.
out_path = data_path / "opinion_texts.cht-202010.txt"
with out_path.open("w", encoding="UTF-8") as fout:
    for thread_x in threads:
        texts = list(thread_x.opinion_texts())
        try:
            # Terminate every text with its own newline. Joining with "\n"
            # alone left no separator after the last text of a thread, so it
            # was glued onto the first text of the following thread.
            fout.write("".join(text + "\n" for text in texts))
        except TypeError:
            # A non-string entry slipped in: show the offending thread, then
            # fail loudly instead of silently leaving a truncated corpus.
            print(texts)
            raise
        

In [7]:
import sentencepiece as spm

In [8]:
# Train a 6000-piece BPE SentencePiece model on the dumped opinion texts.
spm_train_options = dict(
    input=out_path,
    vocab_size=6000,
    model_prefix="../../data/eda/spm/cht-202010",
    model_type="bpe",
    # Do not prepend the "▁" whitespace marker to the start of every line.
    add_dummy_prefix=False,
    # Allow pieces to span different Unicode scripts and to mix digits with
    # other characters.
    split_by_unicode_script=False,
    split_by_number=False,
)
spm.SentencePieceTrainer.train(**spm_train_options)

In [9]:
# Load the trained model file directly with SentencePiece.
# NOTE(review): `sp` is not referenced again in the cells visible here; the
# project wrapper created in the next section is what gets used — confirm
# whether this cell is still needed.
sp = spm.SentencePieceProcessor(model_file="../../data/eda/spm/cht-202010.model")

## Process by SPM

In [8]:
# Build the project's SPM processor from the trained model file; it is passed
# to each thread's process() call below.
# (Presumably a tokenizer adapter around SentencePiece — TODO confirm against casa.)
sp_processor = casa.SpmEdaProcessor("../../data/eda/spm/cht-202010.model")

In [9]:
# Run the SPM processor over every thread. The return values are discarded,
# exactly as before; a plain loop avoids building a throwaway list and avoids
# rebinding `_`, which IPython reserves for the most recent cell output
# (assigning to it makes IPython stop updating `_`, `__` and `___`).
# NOTE(review): process() presumably stores the tokens on the thread itself,
# since later cells read them back via opinion_tokens() — confirm.
for thread_x in threads:
    thread_x.process(sp_processor)

In [10]:
from itertools import chain, islice
from collections import Counter
# Lazily walk threads -> opinions -> tokens, strip the SentencePiece "▁"
# whitespace marker from every token, and count the resulting strings.
op_tokens_iter = map(lambda thread_x: thread_x.opinion_tokens(), threads)
thread_tokens_iter = (
    opinion_tokens
    for thread_opinions in op_tokens_iter
    for opinion_tokens in thread_opinions
)
flat_tokens_iter = (
    token
    for opinion_tokens in thread_tokens_iter
    for token in opinion_tokens
)
preproc_iter = (token.replace("▁", "") for token in flat_tokens_iter)
spm_freq = Counter(preproc_iter)

In [11]:
def _by_length_then_freq(entry):
    """Sort key for (token, count) pairs: longest tokens first, then highest counts."""
    token, freq = entry
    return (-len(token), -freq)


# Order the (token, count) pairs so the longest pieces come first and, within
# one length, the most frequent ones lead.
sorted_spm = sorted(spm_freq.most_common(), key=_by_length_then_freq)

In [12]:
import pandas as pd
# Tabulate the sorted (token, freq) pairs and keep only rows whose token is
# non-empty (tokens that consisted solely of "▁" became "" after stripping).
spm_frame = (
    pd.DataFrame.from_records(sorted_spm, columns=["token", "freq"])
    .loc[lambda frame: frame.token.str.len() > 0, :]
)

In [13]:
# Export the token/frequency table (empty tokens already filtered out) as CSV,
# omitting the DataFrame index column.
spm_frame.to_csv("../../data/eda/spm/spm_frequency.csv", index=False)

## FastText

In [14]:
from gensim.models import FastText
import re
# Characters removed from every token: the ASCII comma and the SentencePiece
# "▁" whitespace marker.
pat = re.compile("[,▁]")


def sanitize(x):
    """Return ``x`` with every "," and "▁" character removed.

    The result may be an empty string when ``x`` consists only of those
    characters.
    """
    cleaned = re.sub(pat, "", x)
    return cleaned

# Build the training corpus: one "sentence" per opinion, each a list of
# sanitized SPM tokens. The generator is rebuilt here because the one from the
# frequency-count cell has already been consumed.
op_tokens_iter = (x.opinion_tokens() for x in threads)
thread_tokens = [
    [sanitize(token) for token in opinion_tokens]
    for opinion_tokens in chain.from_iterable(op_tokens_iter)
]

# 100-dimensional FastText vectors, context window of 5, ignoring tokens seen
# fewer than 2 times; vocabulary is built first, then trained for 10 epochs.
model = FastText(size=100, window=5, min_count=2)
model.build_vocab(sentences=thread_tokens)
sentence_count = len(thread_tokens)
model.train(sentences=thread_tokens, total_examples=sentence_count, epochs=10)

[INFO] 2020-11-24 15:39:09,733 gensim.models.word2vec: resetting layer weights
[INFO] 2020-11-24 15:39:16,851 gensim.models.word2vec: collecting all words and their counts
[INFO] 2020-11-24 15:39:16,852 gensim.models.word2vec: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
[INFO] 2020-11-24 15:39:16,928 gensim.models.word2vec: PROGRESS: at sentence #10000, processed 484463 words, keeping 5279 word types
[INFO] 2020-11-24 15:39:16,999 gensim.models.word2vec: PROGRESS: at sentence #20000, processed 929943 words, keeping 5905 word types
[INFO] 2020-11-24 15:39:17,061 gensim.models.word2vec: PROGRESS: at sentence #30000, processed 1354191 words, keeping 6283 word types
[INFO] 2020-11-24 15:39:17,125 gensim.models.word2vec: PROGRESS: at sentence #40000, processed 1762574 words, keeping 6578 word types
[INFO] 2020-11-24 15:39:17,128 gensim.models.word2vec: collected 6581 word types from a corpus of 1768673 raw words and 40135 sentences
[INFO] 2020-11-24 15:39:17,131 gensim

In [35]:
# Query the 20 nearest neighbours of the combined vector of five telecom
# operator names.
telecom_queries = ["中華電信", "遠傳電信", "台灣大哥大", "台灣之星", "亞太電信"]
model.wv.most_similar(positive=telecom_queries, topn=20)

[('台灣大', 0.7099623680114746),
 ('中華電', 0.6874696016311646),
 ('台灣之星的', 0.6767827272415161),
 ('亞太電', 0.6112518310546875),
 ('。遠傳', 0.6010882258415222),
 ('中華電信的', 0.5904132723808289),
 ('遠傳', 0.5849370956420898),
 ('今日', 0.5521818399429321),
 ('#iPhone', 0.5488035678863525),
 ('實體門市', 0.546295702457428),
 ('網路門市', 0.5450388789176941),
 ('iPhone', 0.5425416231155396),
 ('。中華電信', 0.5404197573661804),
 ('亞太5G', 0.5296310782432556),
 ('台哥大', 0.5261499881744385),
 ('12、iPhone', 0.525742769241333),
 ('亞太的', 0.5132642984390259),
 ('電信三雄', 0.501976728439331),
 ('iPhone12', 0.5016946792602539),
 ('和遠傳', 0.500913679599762)]

In [24]:
# Probe the vector lookup with an arbitrary digit string.
# NOTE(review): presumably this checks that FastText can still return a vector
# (composed from character n-grams) for a string outside the vocabulary —
# whether "123123" is actually out-of-vocabulary is not shown here.
model.wv.get_vector("123123")

array([ 0.04915698,  0.45745426,  0.41741407,  0.1977035 ,  0.07172263,
       -0.5270766 ,  0.12231938, -0.01552046, -0.18223609, -0.0202192 ,
       -0.03669434, -0.43246984, -0.35885167, -0.1331142 ,  0.17527956,
       -0.11173597,  0.28620502,  0.02082222,  0.01726513,  0.05518986,
       -0.2388872 ,  0.2915667 ,  0.2661886 ,  0.15432915,  0.33016172,
        0.6088445 ,  0.08653054,  0.133386  , -0.00344209, -0.03395419,
        0.01283646, -0.30415154, -0.76913995, -0.16977854,  0.10662071,
        0.24383529,  0.18520495,  0.05134694, -0.21708278,  0.12188572,
        0.1289382 ,  0.08620791,  0.1799437 ,  0.19648352, -0.11858976,
       -0.20809227, -0.3062387 , -0.11886251, -0.22456175, -0.2663437 ,
        0.22548397,  0.11629876, -0.1063494 , -0.00578792,  0.01329483,
       -0.02728292,  0.20289893,  0.07384316,  0.16674879,  0.12767471,
       -0.29119405, -0.18927856,  0.00709701,  0.21857251, -0.31414282,
       -0.1892137 ,  0.14208838, -0.08353277, -0.09640163,  0.15

In [20]:
# Number of rows in the word-vector matrix, i.e. how many vocabulary entries
# have a stored vector after min_count pruning.
len(model.wv.vectors)

5885

In [36]:
# Persist the complete FastText model (not only the `model.wv` keyed vectors):
# this path is later restored with `FastText.load(...)`, which expects a file
# written by `FastText.save`, including vocabulary and trainable state.
model.save("../../data/eda/spm/sp_vectors.model")

[INFO] 2020-11-24 15:53:01,713 gensim.utils: saving FastText object under ../../data/eda/spm/sp_vectors.model, separately None
[INFO] 2020-11-24 15:53:01,715 gensim.utils: storing np array 'vectors_ngrams' to ../../data/eda/spm/sp_vectors.model.wv.vectors_ngrams.npy
[INFO] 2020-11-24 15:53:09,675 gensim.utils: not storing attribute vectors_ngrams_norm
[INFO] 2020-11-24 15:53:09,676 gensim.utils: not storing attribute vectors_norm
[INFO] 2020-11-24 15:53:09,677 gensim.utils: not storing attribute vectors_vocab_norm
[INFO] 2020-11-24 15:53:09,677 gensim.utils: not storing attribute buckets_word
[INFO] 2020-11-24 15:53:09,678 gensim.utils: storing np array 'vectors_ngrams_lockf' to ../../data/eda/spm/sp_vectors.model.trainables.vectors_ngrams_lockf.npy
[INFO] 2020-11-24 15:53:18,839 gensim.utils: saved ../../data/eda/spm/sp_vectors.model


In [38]:
# Restore the FastText model from disk, rebinding `model` to the loaded object.
# NOTE(review): FastText.load expects a file produced by `FastText.save` on a
# full model; verify that the cell writing this path saves the whole model
# rather than only its keyed vectors.
model = FastText.load("../../data/eda/spm/sp_vectors.model")

[INFO] 2020-11-24 16:15:20,954 gensim.utils: loading FastText object from ../../data/eda/spm/sp_vectors.model
[INFO] 2020-11-24 16:15:21,049 gensim.utils: loading wv recursively from ../../data/eda/spm/sp_vectors.model.wv.* with mmap=None
[INFO] 2020-11-24 16:15:21,050 gensim.utils: loading vectors_ngrams from ../../data/eda/spm/sp_vectors.model.wv.vectors_ngrams.npy with mmap=None
[INFO] 2020-11-24 16:15:21,612 gensim.utils: setting ignored attribute vectors_ngrams_norm to None
[INFO] 2020-11-24 16:15:21,613 gensim.utils: setting ignored attribute vectors_norm to None
[INFO] 2020-11-24 16:15:21,614 gensim.utils: setting ignored attribute vectors_vocab_norm to None
[INFO] 2020-11-24 16:15:21,614 gensim.utils: setting ignored attribute buckets_word to None
[INFO] 2020-11-24 16:15:21,615 gensim.utils: loading vocabulary recursively from ../../data/eda/spm/sp_vectors.model.vocabulary.* with mmap=None
[INFO] 2020-11-24 16:15:21,615 gensim.utils: loading trainables recursively from ../../da