# Model Comparison
---

1. Data Preparation
2. Model Comparison
- K-mean
- LDA
- NMF
- Top2Vec
- BERTopic

## 0. Import Libraries

In [1]:
# Import Basic Libraries
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

# Import Sklearn Libraries
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, plot_confusion_matrix
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_extraction import DictVectorizer
from sklearn.decomposition import NMF

# Import NLP Libraries
from gensim.models import LdaModel
from gensim.corpora.dictionary import Dictionary
from pythainlp.tokenize import sent_tokenize, word_tokenize
from pythainlp.corpus import thai_stopwords
import re
from tqdm import tqdm 
import pyLDAvis
import pyLDAvis.gensim_models


# Set default Thai font
mpl.font_manager.fontManager.addfont('./THSarabunNew/THSarabunNew.ttf')
mpl.rc('font', family='TH Sarabun New', size=20)

# Suppress warnings
import warnings
warnings.filterwarnings('ignore')

  from imp import reload


## 1. Data Preparation

### 1.1 Topic Selection

In [11]:
econ_b = pd.read_json('../datasets/เศรษฐกิจ_processed.json')
econ_m = pd.read_json('../datasets/economy_processed.json')

### 1.2 Bag-of-Words

In [12]:
# Finction to store n_word in dict
def featurize(token_list):
    token_list=token_list
    features = {}
    for token in token_list:
        features[token]=1
    return features

In [14]:
econ_b_bow = econ_b['article_tokenize'].apply(featurize)
econ_m_bow = econ_m['article_tokenize'].apply(featurize)

In [23]:
econ_b_bow.shape, econ_m_bow.shape

((6801,), (5884,))

In [46]:
vectorizer_1 = DictVectorizer(sparse=True)
econ_b_vec = vectorizer_1.fit_transform(econ_b_bow)

vectorizer_2 = DictVectorizer(sparse=True)
econ_m_vec = vectorizer_2.fit_transform(econ_m_bow)

In [47]:
econ_b_vec, econ_m_vec

(<6801x35157 sparse matrix of type '<class 'numpy.float64'>'
 	with 844532 stored elements in Compressed Sparse Row format>,
 <5884x40835 sparse matrix of type '<class 'numpy.float64'>'
 	with 891436 stored elements in Compressed Sparse Row format>)

## 2. Model Comparison

### 2.1 K-Mean

### 2.2 LDA

In [None]:
# Function for modeling with LDA
def lda_model(data=None, num_topics=None): #, topicid=None
    
    dictionary = Dictionary(data)
    corpus = [dictionary.doc2bow(txt) for txt in data]
    
    model = LdaModel(corpus=corpus, num_topics=num_topics)
    #topic = pd.DataFrame(model.get_topic_terms(topicid=1, topn=20)).rename(columns={0:'index', 1:'probability'})
    for i in range(num_topics):
        top_n = [dictionary[index] for index, prob in model.get_topic_terms(topicid=i, topn=30)]
        print(f'Topic {i+1}')
        print(top_n)
        print('-'*60)
    return model

In [None]:
# Visualization
def lda_vis(data=None, num_topics=20):
    dictionary = Dictionary(data)
    corpus = [dictionary.doc2bow(txt) for txt in data]
    model = LdaModel(corpus=corpus, num_topics=num_topics)
    pyLDAvis.enable_notebook()
    viz = pyLDAvis.gensim_models.prepare(model, corpus, dictionary)
    return viz

In [None]:
lda_10b = lda_model(data=econ_b['article_tokenize'], num_topics=10)

In [None]:
#lda_vis(data=econ_b['article_tokenize'], num_topics=10)
# Topic overlaps

In [None]:
lda_7b = lda_model(data=econ_b['article_tokenize'], num_topics=7)

In [None]:
#lda_vis(data=econ_b['article_tokenize'], num_topics=7)
# Topic overlap to the low right

In [None]:
lda_15b = lda_model(data=econ_b['article_tokenize'], num_topics=15)

In [None]:
#lda_vis(data=econ_b['article_tokenize'], num_topics=15)
# To the left with one to the right

In [None]:
lda_30b = lda_model(data=econ_b['article_tokenize'], num_topics=30)

In [None]:
#lda_vis(data=econ_b['article_tokenize'], num_topics=15)

In [None]:
lda_10m = lda_model(data=econ_m['article_tokenize'], num_topics=10)

In [None]:
#overlap กันสูง
#lda_vis(data=econ_m['article_tokenize'], num_topics=10)

In [None]:
lda_7m = lda_model(data=econ_m['article_tokenize'], num_topics=7)

In [None]:
#lda_vis(data=econ_m['article_tokenize'], num_topics=7)

### 2.3 NMF

In [48]:
#data = econ_b['article_tokenize'].apply(lambda x:' '.join(x))
#cvec = CountVectorizer(token_pattern= "\b[A-zก-๙][A-z\.\-ก-๙]*\b")
#data = cvec.fit_transform(data)

In [49]:
nmf = NMF(n_components=10, random_state=42)
nmf.fit(econ_b_vec)

NMF(n_components=10, random_state=42)

In [50]:
nmf_features = nmf.transform(econ_b_vec)
nmf_features.shape

(6801, 10)

In [51]:
nmf.components_.shape

(10, 35157)

In [61]:
econ_b_components = pd.DataFrame(nmf.components_, columns=vectorizer_1.get_feature_names())
econ_b_components 

Unnamed: 0,0,0.0,0.00,0.001,0.003,0.005,0.0098,0.01,0.010,0.014,...,์เบิต,์เบิร์ก,์เวง,์เฮ้าส์,ํ่า,ํ้า,๒,๒๐๑๙,๒๕๔๑,๒๕๖๓
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.013844,0.0,0.0,0.000491,4.5e-05
1,0.0,0.0,0.0,0.0,0.001582,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000809,0.0,0.0,0.0,0.0
2,0.107089,0.005027,0.002787,0.000134,0.0,0.000134,0.15644,0.025602,0.0,0.0,...,0.000928,0.0,0.0,0.002389,0.003974,0.0,0.0,0.0,0.0,0.0
3,0.031088,0.0,0.0,0.0,0.000108,0.0,0.0,0.005034,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.2e-05
4,0.051447,0.0,0.0,0.000278,0.000473,0.000278,0.09748,0.023438,0.000862,0.0,...,0.0,0.000782,0.001125,0.0,0.0,0.0,0.001757,0.001757,0.00025,0.002077
5,0.055716,0.0,0.0,0.003978,0.0,0.003978,0.0,0.004295,0.00339,0.003535,...,0.000312,0.000488,0.003625,0.002073,0.0,0.0,0.0,0.0,0.0,0.0
6,0.01238,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001368,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.004262,0.000691,0.012437,0.0,0.0,0.0,0.063902,0.0,0.000538,0.001078,...,0.0,0.0,0.0,0.0,0.0,0.00051,0.0,0.0,0.0,0.0
8,0.041804,0.0,0.0,0.0,0.000662,0.0,0.014382,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000489,0.0,0.0006,0.00022,0.00022,0.0,7.6e-05
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.001679,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [60]:
# Topic & Words
for topic in range(10):
    topic = econ_b_components.iloc[topic]
    print(type(topic))
    print(f'For topic {topic+1} the words with the highest value are:')
    print(topic.nlargest(10))
    print('\n')

<class 'pandas.core.series.Series'>
For topic 0        1.000000
0.0      1.000000
0.00     1.000000
0.001    1.000000
0.003    1.000000
           ...   
ํ้า      1.013844
๒        1.000000
๒๐๑๙     1.000000
๒๕๔๑     1.000491
๒๕๖๓     1.000045
Name: 0, Length: 35157, dtype: float64 the words with the highest value are:
พัฒนา       2.926964
โครงการ     2.557163
การพัฒนา    2.390126
พื้นที่     2.142602
สร้าง       2.046783
ปี          1.981855
การลงทุน    1.735550
ส่งเสริม    1.728226
ระบบ        1.692924
เขต         1.596687
Name: 0, dtype: float64


<class 'pandas.core.series.Series'>
For topic 0        1.000000
0.0      1.000000
0.00     1.000000
0.001    1.000000
0.003    1.001582
           ...   
ํ้า      1.000809
๒        1.000000
๒๐๑๙     1.000000
๒๕๔๑     1.000000
๒๕๖๓     1.000000
Name: 1, Length: 35157, dtype: float64 the words with the highest value are:
กรมอุตุนิยมวิทยา    3.343484
พยากรณ์อากาศ        3.337378
ฝนตก                3.328116
ไท                  3.287378
หนัก  

### 2.4 Top2Vec

### 2.5 BERTopic