In [1]:
from pythainlp import word_tokenize
from pythainlp.corpus import thai_stopwords
STOPWORDS = thai_stopwords()
from glob import glob
from tqdm import tqdm
import json, html, re
import numpy as np
import pandas as pd
    
def clean(text, hashtag):
    text = text.replace(hashtag, '')
    text = html.unescape(text)
    text = re.sub(r'http.+?(?:\\s|$)', '', text) # URL link
    text = re.sub(r'[‚Äú‚Äù‚Äû]', '"', text) # convert double quotations into "
    text = re.sub(r'[‚Äò‚Äô‚Ä≤‚Ä≤‚Ä≤‚Ä≤`]', "'", text) # convert single quotations into '
    text = re.sub(r'[ \u00a0\xa0\u3000\u2002-\u200a\t\n#]+', ' ', text) # shrink whitespaces e.g. good  boy -> good boy
    text = re.sub(r'[\r\u200b\ufeff]+', '', text) # remove non-breaking space
    text = re.sub(r'‡πç‡∏≤','‡∏≥', text) # am
    return text.strip()

def tokenize(text, hashtag):
    tokens = word_tokenize(clean(text, hashtag), keep_whitespace='False')
    tokens = [token for token in tokens if token not in STOPWORDS and re.match(r'[‡∏Å-‡πô][‡∏Å-‡πô\\.\\-]+$', token)]
    return tokens

In [2]:
jsons = glob('tweets/*.json')
for i, f in enumerate(jsons):
    print(i,f)

0 tweets/#‡∏Å‡∏π‡∏™‡∏±‡πà‡∏á‡πÉ‡∏´‡πâ‡∏°‡∏∂‡∏á‡∏≠‡∏¢‡∏π‡πà‡πÉ‡∏ï‡πâ‡∏£‡∏±‡∏ê‡∏ò‡∏£‡∏£‡∏°‡∏ô‡∏π‡∏ç.json
1 tweets/#16‡∏ï‡∏∏‡∏•‡∏≤‡πÑ‡∏õ‡πÅ‡∏¢‡∏Å‡∏õ‡∏ó‡∏∏‡∏°‡∏ß‡∏±‡∏ô.json
2 tweets/#save‡∏ß‡∏±‡∏ô‡πÄ‡∏â‡∏•‡∏¥‡∏°.json
3 tweets/#‡∏ò‡∏£‡∏£‡∏°‡∏®‡∏≤‡∏™‡∏ï‡∏£‡πå‡πÅ‡∏•‡∏∞‡∏Å‡∏≤‡∏£‡∏ä‡∏∏‡∏°‡∏ô‡∏∏‡∏°.json
4 tweets/#‡πÄ‡∏Å‡∏µ‡∏¢‡∏°‡∏≠‡∏∏‡∏î‡∏°‡πÑ‡∏°‡πà‡∏Å‡πâ‡∏°‡∏´‡∏±‡∏ß‡πÉ‡∏´‡πâ‡πÄ‡∏ú‡∏î‡πá‡∏à‡∏Å‡∏≤‡∏£.json
5 tweets/#25‡∏û‡∏§‡∏®‡∏à‡∏¥‡∏Å‡∏≤‡πÑ‡∏õSCB.json
6 tweets/#‡∏°‡πá‡∏≠‡∏ö25‡∏û‡∏§‡∏®‡∏à‡∏¥‡∏Å‡∏≤‡∏ó‡∏ß‡∏á‡∏Ñ‡∏∑‡∏ô‡∏™‡∏°‡∏ö‡∏±‡∏ï‡∏¥‡∏ä‡∏≤‡∏ï‡∏¥.json
7 tweets/#‡∏°‡πá‡∏≠‡∏ö2‡∏ò‡∏±‡∏ô‡∏ß‡∏≤.json


In [3]:
%%time
hashtag = "#25‡∏û‡∏§‡∏®‡∏à‡∏¥‡∏Å‡∏≤‡πÑ‡∏õSCB"

df = pd.read_json(jsons[5], lines=True)
df['tokens'] = df.tweet.apply(lambda x: tokenize(x, hashtag))

df.tokens

CPU times: user 25 s, sys: 769 ms, total: 25.8 s
Wall time: 26.7 s


0        [‡∏ä‡∏ß‡∏ô, ‡∏ó‡∏ö‡∏ó‡∏ß‡∏ô, ‡∏°‡πá‡∏≠‡∏ö, ‡∏û‡∏§‡∏®‡∏à‡∏¥‡∏Å‡∏≤, ‡∏õ‡∏£‡∏∞‡πÄ‡∏î‡πá‡∏ô, ‡∏ó‡∏£‡∏±‡∏û‡∏¢‡πå‡∏™‡∏¥‡∏ô...
1        [‡∏ß‡∏≤‡∏ô, ‡πÅ‡∏≠‡∏°‡∏õ‡πå, ‡∏ß‡∏£‡∏£‡∏©, ‡∏£‡∏±‡∏ö‡∏ó‡∏£‡∏≤‡∏ö, ‡∏Ç‡πâ‡∏≠‡∏´‡∏≤, ‡∏£‡πâ‡∏≠‡∏á‡πÄ‡∏û‡∏•‡∏á, ‡∏â...
2        [‡∏ß‡∏£‡∏£‡∏©, ‡πÄ‡∏•‡∏µ‡πâ‡∏¢‡∏á, ‡∏ß‡∏±‡∏í‡∏ô‡∏≤, ‡∏ö‡∏±‡∏ì‡∏ë‡∏¥‡∏ï, ‡∏°., ‡∏®‡∏¥‡∏•‡∏õ‡∏≤‡∏Å‡∏£, ‡∏´‡∏°‡∏≤...
3                                                       []
4                                                       []
                               ...                        
49045                                                   []
49046    [‡∏°‡πá‡∏≠‡∏ö, ‡∏û‡∏§‡∏®‡∏à‡∏¥‡∏Å‡∏≤, ‡πÄ‡∏õ‡∏•‡∏µ‡πà‡∏¢‡∏ô‡πÑ‡∏õ, ‡∏™‡∏≥‡∏ô‡∏±‡∏Å‡∏á‡∏≤‡∏ô‡πÉ‡∏´‡∏ç‡πà, ‡∏•‡∏î, ‡∏Å...
49047                                                [‡∏Ñ‡∏±‡∏ö]
49048    [‡∏°‡πá‡∏≠‡∏ö, ‡∏û‡∏§‡∏®‡∏à‡∏¥‡∏Å‡∏≤, ‡πÄ‡∏õ‡∏•‡∏µ‡πà‡∏¢‡∏ô‡πÑ‡∏õ, ‡∏™‡∏≥‡∏ô‡∏±‡∏Å‡∏á‡∏≤‡∏ô‡πÉ‡∏´‡∏ç‡πà, ‡∏•‡∏î, ‡∏Å...
49049    [‡∏°‡πá‡∏≠‡∏ö, ‡∏û‡∏§‡∏®‡∏à‡∏¥‡∏Å‡∏≤, ‡πÄ‡∏õ‡∏•‡∏µ‡πà‡∏

In [55]:
df.tokens.apply(len).mean()

8.79480122324159

# LDA

In [56]:
from gensim.models import LdaModel
from gensim.corpora.dictionary import Dictionary

corpus_dictionary = Dictionary(df.tokens)
corpus = []
for tokens_list in tqdm(df.tokens):
    corpus.append(corpus_dictionary.doc2bow(tokens_list))
lda = LdaModel(corpus, num_topics=5, id2word=corpus_dictionary, passes=10)

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 49050/49050 [00:00<00:00, 63046.94it/s]


In [57]:
for i in range(10):
    print(f'|{i+1}', end='')
    for j in range(5):
        word, score = lda.show_topic(j)[i]
        print(f"|{word}|{score:.3f}",end="")
    print('|')

|1|‡∏ó‡∏≥|0.029|‡∏ä‡∏∏‡∏°‡∏ô‡∏∏‡∏°|0.019|‡∏ô‡∏∞‡∏Ñ‡∏∞|0.049|‡∏°‡πá‡∏≠‡∏ö|0.148|‡∏ï‡∏≥‡∏£‡∏ß‡∏à|0.034|
|2|‡∏≠‡∏µ|0.023|‡∏Ç‡πà‡∏≤‡∏ß|0.017|‡∏Ñ‡∏ô|0.040|‡∏û‡∏§‡∏®‡∏à‡∏¥‡∏Å‡∏≤|0.101|‡∏õ‡∏£‡∏∞‡∏ä‡∏≤‡∏ä‡∏ô|0.027|
|3|‡∏ï‡∏≥‡∏£‡∏ß‡∏à|0.023|‡∏£‡∏≤‡∏©‡∏é‡∏£|0.017|‡∏≠‡∏¢‡πà‡∏≤|0.029|‡∏£‡∏±‡∏ê‡∏õ‡∏£‡∏∞‡∏´‡∏≤‡∏£|0.023|‡∏õ‡∏£‡∏∞‡πÄ‡∏ó‡∏®|0.022|
|4|‡∏Ñ‡∏ô|0.022|‡∏õ‡∏£‡∏∞‡∏Å‡∏≤‡∏®|0.016|‡πÅ‡∏Å‡∏á|0.028|‡∏ï‡πà‡∏≠‡∏ï‡πâ‡∏≤‡∏ô|0.019|‡πÑ‡∏ó‡∏¢|0.018|
|5|‡∏Ç‡∏ô‡∏≤‡∏î|0.018|‡∏ó‡∏£‡∏±‡∏û‡∏¢‡πå‡∏™‡∏¥‡∏ô|0.014|‡πÇ‡∏î‡∏ô|0.020|‡∏£‡∏ñ‡∏ï‡∏¥‡∏î|0.016|‡∏™‡∏≥‡∏ô‡∏±‡∏Å‡∏á‡∏≤‡∏ô‡πÉ‡∏´‡∏ç‡πà|0.016|
|6|‡πÄ‡∏´‡∏µ‡πâ‡∏¢|0.016|‡πÅ‡∏Å‡∏á|0.013|‡∏ó‡∏≥|0.011|‡∏ä‡∏∏‡∏°‡∏ô‡∏∏‡∏°|0.015|‡∏ó‡∏´‡∏≤‡∏£|0.016|
|7|‡∏Å‡∏•‡∏±‡∏ß|0.016|‡πÅ‡∏ó‡πá‡∏Å|0.013|‡∏Ç‡∏≠‡πÉ‡∏´‡πâ|0.010|‡∏ï‡∏π‡πâ|0.010|‡∏´‡∏±‡∏ß|0.015|
|8|‡∏≠‡πà‡∏∞|0.014|‡∏û‡∏£‡∏∏‡πà‡∏á‡∏ô‡∏µ‡πâ|0.012|‡∏™‡∏π‡πâ|0.009|‡∏ñ‡∏ô‡∏ô|0.010|‡πÄ‡∏Å‡∏£‡∏µ‡∏¢‡∏ô|0.012|
|9|‡πÅ‡∏°‡πà|0.014|‡∏¢‡∏Å‡πÄ‡∏•‡∏¥‡∏Å|0.011|‡πÉ‡∏™‡πà|0.009|‡∏Ñ‡∏≠‡∏ô‡πÄ‡∏ó‡∏ô‡πÄ‡∏ô‡∏≠‡∏£‡πå|0.008|‡∏†‡∏≤‡∏©‡∏µ|0.010|
|10|

# TF-IDF vectorization + SVD + K-means

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import KMeans

### prepare data : list of sentences (delimitered by space) -> TF-IDF vectorization
corpus = df.tokens.apply(lambda x: ' '.join(x))
vectorizer = TfidfVectorizer(max_features=20000, ngram_range=(1,2))
X = vectorizer.fit_transform(corpus)
print(X.shape)

### singular value decomposition & normalize
dim = 50

svd = TruncatedSVD(n_components=dim, n_iter=7, random_state=42)
X = svd.fit_transform(X)
normalized = [vec/np.linalg.norm(vec) if np.linalg.norm(vec) != 0 else np.zeros(dim) for vec in X ]

(49050, 20000)


In [15]:
### clustering 
num = 5

result = KMeans(n_clusters=num).fit_predict(normalized)

In [16]:
print("|tweet|topic|\n|:-:|:-:|")
for _ in range(30):
    i = np.random.randint(0, len(X))
    print(f"|{df.tweet[i]}|{result[i]+1}|")

|tweet|topic|
|:-:|:-:|
|‡∏ó‡∏∏‡∏Å‡∏Ñ‡∏ô‡πÉ‡∏Ñ‡∏£‡πÑ‡∏°‡πà‡πÄ‡∏Å‡∏µ‡πà‡∏¢‡∏ß‡∏£‡∏µ‡∏ö‡∏Å‡∏•‡∏±‡∏ö‡∏ö‡πâ‡∏≤‡∏ô‡πÄ‡∏•‡∏¢‡∏ô‡∏∞ ‡πÄ‡∏î‡∏µ‡πã‡∏¢‡∏ß‡∏û‡∏ß‡∏Å‡∏°‡∏±‡∏ô‡∏™‡∏£‡πâ‡∏≤‡∏á‡∏™‡∏ñ‡∏≤‡∏ô‡∏∞‡∏Å‡∏≤‡∏£‡∏ì‡πå‡∏≠‡∏µ‡∏Å #25‡∏û‡∏§‡∏®‡∏à‡∏¥‡∏Å‡∏≤‡πÑ‡∏õSCB|4|
|‡∏ï‡∏£.‡∏à‡πà‡∏≠‡∏´‡∏°‡∏≤‡∏¢‡∏à‡∏±‡∏ö‡∏≠‡∏≤‡∏ä‡∏µ‡∏ß‡∏∞‡∏°‡∏∑‡∏≠‡∏õ‡∏∑‡∏ô‡∏õ‡πà‡∏ß‡∏ô‡∏°‡πá‡∏≠‡∏ö ‡∏¢‡∏±‡∏ô‡∏¢‡∏¥‡∏á‡∏Å‡∏±‡∏ô‡πÄ‡∏≠‡∏á-‡∏õ‡∏°‡πÄ‡∏£‡∏∑‡πà‡∏≠‡∏á‡∏™‡πà‡∏ß‡∏ô‡∏ï‡∏±‡∏ß‡πÑ‡∏°‡πà‡πÄ‡∏Å‡∏µ‡πà‡∏¢‡∏ß‡∏ä‡∏∏‡∏°‡∏ô‡∏∏‡∏°  https://t.co/jG6TIc1t4T #25‡∏û‡∏§‡∏®‡∏à‡∏¥‡∏Å‡∏≤‡πÑ‡∏õSCB #‡∏ó‡πç‡∏≤‡πÉ‡∏ô‡∏™‡∏¥‡πà‡∏á‡∏ó‡∏µ‡πà‡∏î‡∏µ‡∏ó‡πç‡∏≤‡πÉ‡∏ô‡∏™‡∏¥‡πà‡∏á‡∏ó‡∏µ‡πà‡∏ñ‡∏π‡∏Å #‡∏õ‡∏£‡∏∞‡∏ä‡∏∏‡∏°‡∏™‡∏†‡∏≤ #‡∏°‡πá‡∏≠‡∏ö26‡∏û‡∏§‡∏®‡∏à‡∏¥‡∏Å‡∏≤ #‡∏°‡∏∑‡∏≠‡∏•‡∏±‡πà‡∏ô‡πÄ‡∏´‡∏°‡∏∑‡∏≠‡∏ô‡∏û‡πà‡∏≠‡∏°‡∏∂‡∏á‡πÄ‡∏•‡∏¢ #‡∏Å‡∏π‡∏™‡∏±‡πà‡∏á‡πÉ‡∏´‡πâ‡∏°‡∏∂‡∏á‡∏≠‡∏¢‡∏π‡πà‡πÉ‡∏ï‡πâ‡∏£‡∏±‡∏ê‡∏ò‡∏£‡∏£‡∏°‡∏ô‡∏π‡∏ç #‡∏™‡πà‡∏á‡∏ï‡πà‡∏≠‡∏Ñ‡∏™‡∏≠ #‡∏ï‡∏•‡∏≤‡∏î‡∏ô‡∏±‡∏îtreasure #‡∏ï‡πç‡∏≤‡∏£‡∏ß‡∏à‡πÑ‡∏ó‡∏¢‡∏Ç‡∏¢‡∏∞‡∏™‡∏±‡∏á‡∏Ñ‡∏°|3|
|LIVE! ‡∏Å‡∏≤‡∏£‡∏ä‡∏∏‡∏°‡∏ô‡∏∏‡∏

In [None]:
for normalized