In [1]:
from pythainlp import word_tokenize
from pythainlp.corpus import thai_stopwords
STOPWORDS = thai_stopwords()
from glob import glob
from tqdm import tqdm
import json, html, re
import numpy as np
import pandas as pd
    
def clean(text, hashtag):
    text = text.replace(hashtag, '')
    text = html.unescape(text)
    text = re.sub(r'http.+?(?:\\s|$)', '', text) # URL link
    text = re.sub(r'[‚Äú‚Äù‚Äû]', '"', text) # convert double quotations into "
    text = re.sub(r'[‚Äò‚Äô‚Ä≤‚Ä≤‚Ä≤‚Ä≤`]', "'", text) # convert single quotations into '
    text = re.sub(r'[ \u00a0\xa0\u3000\u2002-\u200a\t\n#]+', ' ', text) # shrink whitespaces e.g. good  boy -> good boy
    text = re.sub(r'[\r\u200b\ufeff]+', '', text) # remove non-breaking space
    text = re.sub(r'‡πç‡∏≤','‡∏≥', text) # am
    return text.strip()

def tokenize(text, hashtag):
    tokens = word_tokenize(clean(text, hashtag), keep_whitespace='False')
    tokens = [token for token in tokens if token not in STOPWORDS and re.match(r'[‡∏Å-‡πô][‡∏Å-‡πô\\.\\-]+$', token)]
    return tokens

In [2]:
jsons = glob('tweets/*.json')
for i, f in enumerate(jsons):
    print(i,f)

0 tweets/#‡∏Å‡∏π‡∏™‡∏±‡πà‡∏á‡πÉ‡∏´‡πâ‡∏°‡∏∂‡∏á‡∏≠‡∏¢‡∏π‡πà‡πÉ‡∏ï‡πâ‡∏£‡∏±‡∏ê‡∏ò‡∏£‡∏£‡∏°‡∏ô‡∏π‡∏ç.json
1 tweets/#‡∏ò‡∏£‡∏£‡∏°‡∏®‡∏≤‡∏™‡∏ï‡∏£‡πå‡πÅ‡∏•‡∏∞‡∏Å‡∏≤‡∏£‡∏ä‡∏∏‡∏°‡∏ô‡∏∏‡∏°.json
2 tweets/#‡πÄ‡∏Å‡∏µ‡∏¢‡∏°‡∏≠‡∏∏‡∏î‡∏°‡πÑ‡∏°‡πà‡∏Å‡πâ‡∏°‡∏´‡∏±‡∏ß‡πÉ‡∏´‡πâ‡πÄ‡∏ú‡∏î‡πá‡∏à‡∏Å‡∏≤‡∏£.json
3 tweets/#25‡∏û‡∏§‡∏®‡∏à‡∏¥‡∏Å‡∏≤‡πÑ‡∏õSCB.json
4 tweets/#‡∏°‡πá‡∏≠‡∏ö25‡∏û‡∏§‡∏®‡∏à‡∏¥‡∏Å‡∏≤‡∏ó‡∏ß‡∏á‡∏Ñ‡∏∑‡∏ô‡∏™‡∏°‡∏ö‡∏±‡∏ï‡∏¥‡∏ä‡∏≤‡∏ï‡∏¥.json
5 tweets/#‡∏°‡πá‡∏≠‡∏ö2‡∏ò‡∏±‡∏ô‡∏ß‡∏≤.json


In [15]:
%%time
hashtag = "#‡∏Å‡∏π‡∏™‡∏±‡πà‡∏á‡πÉ‡∏´‡πâ‡∏°‡∏∂‡∏á‡∏≠‡∏¢‡∏π‡πà‡πÉ‡∏ï‡πâ‡∏£‡∏±‡∏ê‡∏ò‡∏£‡∏£‡∏°‡∏ô‡∏π‡∏ç"

df = pd.read_json(jsons[0], lines=True)
df['tokens'] = df.tweet.apply(lambda x: tokenize(x, hashtag))

df.tokens

CPU times: user 9.27 s, sys: 1.2 s, total: 10.5 s
Wall time: 12.1 s


0        [‡∏ú‡∏ô‡∏á‡∏£‡∏à‡∏ï‡∏Å‡∏°, ‡∏´‡∏±‡∏ß, ‡∏Ç‡∏µ‡πà, ‡πÄ‡∏•‡∏∑‡πà‡∏≠‡∏¢, ‡∏õ‡∏±‡∏ç‡∏ç‡∏≤, ‡∏ö‡∏£‡∏¥‡∏´‡∏≤‡∏£, ‡∏õ‡∏£...
1        [‡∏≠‡∏¥‡∏™‡∏±‡∏™, ‡∏Ç‡πâ‡∏≠‡∏≠‡πâ‡∏≤‡∏á, ‡∏£‡∏±‡∏ê‡∏õ‡∏£‡∏∞‡∏´‡∏≤‡∏£, ‡πÅ‡∏ö‡∏ö‡∏ô‡∏µ‡πâ, ‡∏à‡∏±‡∏ç‡πÑ‡∏£, ‡∏î‡∏µ‡∏Å...
2        [‡∏£‡∏±‡∏ê, ‡∏ó‡∏≥, ‡∏õ‡∏£‡∏∞‡∏ä‡∏≤‡∏ä‡∏ô, ‡πÅ‡∏ö‡∏ö‡∏ô‡∏µ‡πâ, ‡∏™‡∏°‡∏Ñ‡∏ß‡∏£, ‡πÑ‡∏≠‡πâ, ‡∏™‡∏•‡∏¥‡πà‡∏°, ...
3        [‡∏£‡∏î, ‡πÄ‡∏¢‡∏µ‡πà‡∏¢‡∏ß, ‡∏™‡∏¥, ‡πÇ‡∏Ñ‡∏ß‡∏¥‡∏î, ‡πÇ‡∏Ñ‡∏ß‡∏¥‡∏î, ‡∏ä‡∏±‡∏Å‡∏ß‡πà‡∏≤‡∏ß, ‡∏Ç‡πà‡∏≤‡∏ß, ...
4            [‡∏Ç‡∏∏‡∏î, ‡∏•‡∏∂‡∏Å, ‡πÄ‡∏´‡∏µ‡πâ‡∏¢, ‡∏Ç‡πâ‡∏≤‡∏£‡∏≤‡∏ä‡∏Å‡∏≤‡∏£, ‡πÑ‡∏ó‡∏¢, ‡∏ö‡πà‡∏≠, ‡∏ö‡∏≤‡∏î‡∏≤‡∏•]
                               ...                        
19060    [‡∏≠‡∏µ, ‡πÅ‡∏õ‡∏•, ‡∏õ‡∏è‡∏¥‡∏£‡∏π‡∏õ, ‡πÅ‡∏ü‡∏ô, ‡∏°‡∏µ‡∏ï, ‡∏≠‡∏µ, ‡πÑ‡∏û‡∏£‡πà, ‡πÅ‡∏õ‡∏•, ‡∏õ‡∏è‡∏¥...
19061      [‡πÑ‡∏≠‡πâ, ‡∏°‡∏∑‡∏≠‡∏õ‡∏£‡∏≤‡∏ö, ‡∏´‡∏°‡∏π, ‡∏Å‡∏∞‡∏ó‡∏∞, ‡πÄ‡∏£‡∏∑‡∏≠‡πÄ‡∏õ‡πá‡∏î, ‡∏Å‡∏∏‡πâ‡∏á, ‡∏¢‡πà‡∏≤‡∏á]
19062                                                   []
19063                  

In [16]:
df.tokens.apply(len).mean()

8.542669813794912

# LDA

In [17]:
%%time

from gensim.models import LdaModel
from gensim.corpora.dictionary import Dictionary

corpus_dictionary = Dictionary(df.tokens)
corpus = []
for tokens_list in tqdm(df.tokens):
    corpus.append(corpus_dictionary.doc2bow(tokens_list))
lda = LdaModel(corpus, num_topics=5, id2word=corpus_dictionary, passes=10)

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 19065/19065 [00:00<00:00, 73441.08it/s]


CPU times: user 1min 8s, sys: 525 ms, total: 1min 9s
Wall time: 1min 19s


In [18]:
print("|rank|topic 1||topic 2||topic 3||topic 4||topic 5||\n|:-:|:-:|:-:|:-:|:-:|:-:|:-:|:-:|:-:|:-:|:-:|")
for i in range(10):
    print(f'|{i+1}', end='')
    for j in range(5):
        word, score = lda.show_topic(j)[i]
        print(f"|{word}|{score:.3f}",end="")
    print('|')

|rank|topic 1||topic 2||topic 3||topic 4||topic 5||
|:-:|:-:|:-:|:-:|:-:|:-:|:-:|:-:|:-:|:-:|:-:|
|1|‡∏°‡πá‡∏≠‡∏ö|0.060|‡πÄ‡∏´‡∏µ‡πâ‡∏¢|0.052|‡∏°‡πá‡∏≠‡∏ö|0.224|‡∏°‡πá‡∏≠‡∏ö|0.087|‡∏°‡πá‡∏≠‡∏ö|0.079|
|2|‡∏û‡∏§‡∏®‡∏à‡∏¥‡∏Å‡∏≤|0.052|‡∏Ñ‡∏ô|0.031|‡∏û‡∏§‡∏®‡∏à‡∏¥‡∏Å‡∏≤|0.215|‡∏û‡∏§‡∏®‡∏à‡∏¥‡∏Å‡∏≤|0.083|‡∏û‡∏§‡∏®‡∏à‡∏¥‡∏Å‡∏≤|0.077|
|3|‡∏ô‡πâ‡∏≥|0.037|‡∏ó‡∏≥|0.030|‡∏õ‡∏£‡∏∞‡∏ä‡∏≤‡∏ä‡∏ô|0.068|‡∏ó‡∏≤‡∏á‡∏≠‡∏≠‡∏Å|0.076|‡∏ô‡∏∞‡∏Ñ‡∏∞|0.024|
|4|‡∏â‡∏µ‡∏î|0.029|‡∏≠‡∏µ|0.027|‡∏´‡∏¢‡∏∏‡∏î|0.026|‡∏ó‡∏∏|0.076|‡∏£‡∏µ|0.016|
|5|‡∏Å‡∏£‡∏∞‡∏™‡∏∏‡∏ô|0.024|‡∏´‡∏£‡∏≠|0.016|‡∏Ñ‡∏∏‡∏Å‡∏Ñ‡∏≤‡∏°|0.019|‡∏Å‡∏£‡πà‡∏≤‡∏á|0.076|‡∏≠‡∏¢‡πà‡∏≤|0.015|
|6|‡∏¢‡∏≤‡∏á|0.022|‡πÑ‡∏≠‡πâ|0.016|‡πÄ‡∏ú‡∏î‡πá‡∏à‡∏Å‡∏≤‡∏£|0.019|‡∏™‡∏†‡∏≤|0.057|‡∏ä‡πà‡∏ß‡∏¢‡∏Å‡∏±‡∏ô|0.014|
|7|‡∏ä‡∏∏‡∏°‡∏ô‡∏∏‡∏°|0.018|‡∏û‡πà‡∏≠|0.012|‡∏ï‡∏≥‡∏£‡∏ß‡∏à|0.018|‡∏õ‡∏£‡∏∞‡∏ä‡∏∏‡∏°|0.052|‡πÅ‡∏ö‡∏ô|0.014|
|8|‡πÉ‡∏™‡πà|0.015|‡∏£‡∏±‡∏Å|0.011|‡∏Ç‡∏µ‡πâ‡∏Ç‡πâ‡∏≤|0.018|‡∏£‡∏±‡∏ê‡∏ò‡∏£‡∏£‡∏°‡∏ô‡∏π‡∏ç|0.048|‡∏Ç‡πà‡∏≤‡∏ß|0.011|
|9|‡πÅ‡∏Å‡πä‡∏™‡∏ô‡πâ‡∏≥‡∏ï‡∏≤|0.013|‡∏≠‡

# TF-IDF vectorization + SVD + K-means

In [19]:
%%time

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import KMeans

### prepare data : list of sentences (delimitered by space) -> TF-IDF vectorization
corpus = df.tokens.apply(lambda x: ' '.join(x))
vectorizer = TfidfVectorizer(max_features=20000, ngram_range=(1,2))
X = vectorizer.fit_transform(corpus)
print(X.shape)

### singular value decomposition & normalize
dim = 50

svd = TruncatedSVD(n_components=dim, n_iter=7, random_state=42)
X = svd.fit_transform(X)
normalized = [vec/np.linalg.norm(vec) if np.linalg.norm(vec) != 0 else np.zeros(dim) for vec in X ]

(19065, 20000)
CPU times: user 3.22 s, sys: 340 ms, total: 3.56 s
Wall time: 2.37 s


In [21]:
%%time

### clustering 
num = 5

result = KMeans(n_clusters=num).fit_predict(normalized)

CPU times: user 2.41 s, sys: 516 ms, total: 2.93 s
Wall time: 878 ms


In [22]:
print("|tweet|topic|\n|:-:|:-:|")
for _ in range(30):
    i = np.random.randint(0, len(X))
    print(f"|{df.tweet[i]}|{result[i]+1}|")

|tweet|topic|
|:-:|:-:|
|‡πÑ‡∏≠‡∏û‡∏ß‡∏Å‡πÄ‡∏´‡∏µ‡πâ‡∏¢ ‡πÑ‡∏≠‡∏û‡∏ß‡∏Å‡πÄ‡∏´‡∏µ‡πâ‡∏¢‡∏¢‡∏¢ ‡πÇ‡∏°‡πÇ‡∏´‡∏°‡∏≤‡∏Å ‡πÑ‡∏î‡πâ‡πÄ‡∏á‡∏¥‡∏ô‡πÄ‡∏ó‡πà‡∏≤‡πÑ‡∏´‡∏£‡πà‡∏´‡πâ‡∏∞ ‡πÄ‡∏Ñ‡πâ‡∏≤‡∏à‡πâ‡∏≤‡∏á‡∏û‡∏ß‡∏Å‡∏°‡∏∂‡∏á‡∏°‡∏≤‡πÄ‡∏ó‡πà‡∏≤‡πÑ‡∏´‡∏£‡πà #‡∏°‡πá‡∏≠‡∏ö17‡∏û‡∏§‡∏®‡∏à‡∏¥‡∏Å‡∏≤ #‡∏Å‡∏π‡∏™‡∏±‡πà‡∏á‡πÉ‡∏´‡πâ‡∏°‡∏∂‡∏á‡∏≠‡∏¢‡∏π‡πà‡πÉ‡∏ï‡πâ‡∏£‡∏±‡∏ê‡∏ò‡∏£‡∏£‡∏°‡∏ô‡∏π‡∏ç|1|
|‡∏ô‡∏µ‡πâ‡∏™‡∏¥.. ‡∏Ñ‡∏ô‡∏ö‡∏ô‡∏ü‡πâ‡∏≤‚Äã #‡∏õ‡∏£‡∏∞‡πÄ‡∏ó‡∏®‡∏Å‡∏π‡∏°‡∏µ #‡∏Å‡∏π‡∏™‡∏±‡πà‡∏á‡πÉ‡∏´‡πâ‡∏°‡∏∂‡∏á‡∏≠‡∏¢‡∏π‡πà‡πÉ‡∏ï‡πâ‡∏£‡∏±‡∏ê‡∏ò‡∏£‡∏£‡∏°‡∏ô‡∏π‡∏ç|5|
|‡πÄ‡∏≠‡∏≤‡∏≠‡∏µ‡∏Å‡πÅ‡∏•‡πâ‡∏ß@amarintvhd ‡∏Å‡∏•‡πâ‡∏≤‡πÄ‡∏£‡∏µ‡∏¢‡∏Å‡∏ï‡∏±‡∏ß‡πÄ‡∏≠‡∏á‡∏ß‡πà‡∏≤‡∏™‡∏∑‡πà‡∏≠‡πÑ‡∏î‡πâ‡πÑ‡∏á‡∏≠‡πà‡∏∞ ‡∏ó‡∏µ‡πà‡∏î‡πà‡∏≤‡πÜ‡πÅ‡∏•‡∏∞‡πÅ‡∏ó‡πá‡∏Å‡πÑ‡∏õ‡∏Ñ‡∏∑‡∏≠‡πÑ‡∏°‡πà‡πÑ‡∏î‡πâ‡πÉ‡∏´‡πâ‡πÄ‡∏ã‡∏£‡∏µ‡∏ö‡∏£‡∏±‡∏°‡πÉ‡∏ô‡∏´‡∏±‡∏ß‡∏™‡∏°‡∏≠‡∏á‡πÑ‡∏î‡πâ‡∏ó‡∏≥‡∏á‡∏≤‡∏ô‡∏ö‡πâ‡∏≤‡∏á‡πÄ‡∏•‡∏¢‡∏´‡∏£‡∏≠ #‡∏°‡πá‡∏≠‡∏ö17‡∏û‡∏§‡∏®‡∏à‡∏¥‡∏Å‡∏≤  #‡∏Å‡∏π‡∏™‡∏±‡πà‡∏á‡πÉ‡∏´‡πâ‡∏°‡∏∂‡∏á‡∏≠‡∏¢‡∏π‡πà‡πÉ‡∏ï‡πâ‡∏£‡∏±‡∏ê‡∏ò‡∏£‡∏£‡∏°‡∏ô‡∏π‡∏ç|5|
|#‡∏°‡