# Digital
---

1. Data Preparation
2. Model Comparison
- K-Means
- LDA
- NMF
- Top2Vec: still in progress; will be continued if time permits
- BERTopic: dropped because of time constraints

## 0. Import Libraries

In [1]:
# Import Basic Libraries
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

# Import Sklearn Libraries
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, plot_confusion_matrix
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_extraction import DictVectorizer
from sklearn.decomposition import NMF
from sklearn.cluster import KMeans


# Import NLP Libraries
from gensim.models import LdaModel
from gensim.corpora.dictionary import Dictionary
from pythainlp.tokenize import sent_tokenize, word_tokenize
from pythainlp.corpus import thai_stopwords
import re
from tqdm import tqdm 
import pyLDAvis
import pyLDAvis.gensim_models
from top2vec import Top2Vec


# Set default Thai font: register the bundled TH Sarabun New font file with
# matplotlib first, then make it the default font family (size 20) for plots.
mpl.font_manager.fontManager.addfont('./THSarabunNew/THSarabunNew.ttf')
mpl.rc('font', family='TH Sarabun New', size=20)

# Suppress warnings: the 'ignore' filter applies to every warning category
# raised after this point.
import warnings
warnings.filterwarnings('ignore')

  from imp import reload


## 1. Data Preparation

### 1.1 Topic Selection

In [None]:
# Load the processed "digital" articles of the two sources (bkkbiz and
# matichon); later cells read the `article_tokenize` column of both.
digitalb = pd.read_json('../datasets/bkkbiz_digital_processed.json')
digitalm = pd.read_json('../datasets/matichon_digital_processed.json')

### 1.2 Bag-of-Words

In [None]:
# Function to store the tokens of one document in a presence dict
def featurize(token_list):
    """Return a ``{token: 1}`` dict for the tokens in *token_list*.

    Repeated tokens collapse into a single key, so the result records
    which tokens are present rather than how often they occur. Keys keep
    the order in which each token first appears.
    """
    return dict.fromkeys(token_list, 1)

In [None]:
# Turn every article's token list into a {token: 1} presence dict
digitalb_bow = digitalb['article_tokenize'].apply(featurize)
digitalm_bow = digitalm['article_tokenize'].apply(featurize)

In [None]:
# Sanity check: number of documents in each bag-of-words Series
digitalb_bow.shape, digitalm_bow.shape

In [None]:
# One DictVectorizer per source: each is fitted on its own presence dicts,
# so the two sparse matrices have separate vocabularies.
vectorizerb = DictVectorizer(sparse=True)
digitalb_vec = vectorizerb.fit_transform(digitalb_bow)

vectorizerm = DictVectorizer(sparse=True)
digitalm_vec = vectorizerm.fit_transform(digitalm_bow)

In [None]:
# Display the two vectorized matrices
digitalb_vec, digitalm_vec

## 2. Model Comparison

### 2.1 K-Means

In [None]:
def kmeans_topic(data_vec=None, n_clusters=10, terms=None):
    """Cluster documents with K-Means and print the top 10 terms per cluster.

    Parameters
    ----------
    data_vec : sparse matrix
        Document-term matrix to cluster. When ``terms`` is not given this
        must be the very object ``digitalb_vec`` or ``digitalm_vec`` so the
        matching vectorizer vocabulary can be looked up.
    n_clusters : int, default 10
        Number of clusters (printed as "topics").
    terms : sequence of str, optional
        Vocabulary aligned with the columns of ``data_vec``. Supplying it
        allows clustering a matrix other than the two module-level ones.

    Raises
    ------
    ValueError
        If ``terms`` is omitted and ``data_vec`` is neither
        ``digitalb_vec`` nor ``digitalm_vec``.
    """
    # Resolve the vocabulary before fitting so a bad argument fails fast.
    # Use identity (`is`), not `==`: comparing SciPy sparse matrices with
    # `==` is element-wise and does not produce a plain True/False.
    if terms is None:
        if data_vec is digitalb_vec:
            terms = vectorizerb.get_feature_names_out()
        elif data_vec is digitalm_vec:
            terms = vectorizerm.get_feature_names_out()
        else:
            raise ValueError(
                'data_vec must be digitalb_vec or digitalm_vec '
                'unless `terms` is provided'
            )

    kmeans = KMeans(n_clusters=n_clusters,
                    init='k-means++',
                    max_iter=100,
                    n_init=1,
                    random_state=42)
    kmeans.fit(data_vec)

    # Per cluster, term indices sorted by descending centroid weight
    order_centroids = kmeans.cluster_centers_.argsort()[:, ::-1]

    for cluster_idx in range(n_clusters):
        print(f'Topic {cluster_idx+1}')
        print('-'*10)
        for term_idx in order_centroids[cluster_idx, :10]:
            print(terms[term_idx])
        print('\n')

In [None]:
# Print 10 K-Means topics for the bkkbiz matrix
kmeans_topic(data_vec=digitalb_vec, n_clusters=10)

### 2.2 LDA

In [None]:
# Fit an LDA model and print the top terms of every topic
def lda_model(data=None, num_topics=None):
    """Train a gensim LDA model on tokenized documents and print its topics.

    Parameters
    ----------
    data : iterable of list of str
        Tokenized documents.
    num_topics : int
        Number of topics to fit.

    Returns
    -------
    LdaModel
        The fitted model.
    """
    vocab = Dictionary(data)
    bow_corpus = [vocab.doc2bow(tokens) for tokens in data]

    # The model is trained without an id2word mapping, so term ids are
    # translated back to words through `vocab` when printing.
    model = LdaModel(corpus=bow_corpus, num_topics=num_topics)

    for topic_id in range(num_topics):
        term_probs = model.get_topic_terms(topicid=topic_id, topn=30)
        top_terms = [vocab[term_id] for term_id, _ in term_probs]
        print(f'Topic {topic_id+1}')
        print(top_terms)
        print('-'*60)
    return model

In [None]:
# Interactive pyLDAvis view of a freshly trained LDA model
def lda_vis(data=None, num_topics=20):
    """Train an LDA model on *data* and return its pyLDAvis visualization.

    A new model is fitted on every call; models returned by ``lda_model``
    are not reused.

    Parameters
    ----------
    data : iterable of list of str
        Tokenized documents.
    num_topics : int, default 20
        Number of topics to fit.
    """
    vocab = Dictionary(data)
    bow_corpus = [vocab.doc2bow(tokens) for tokens in data]
    fitted = LdaModel(corpus=bow_corpus, num_topics=num_topics)

    pyLDAvis.enable_notebook()
    return pyLDAvis.gensim_models.prepare(fitted, bow_corpus, vocab)

In [None]:
# 10-topic LDA on the bkkbiz articles (`econ_b` is not defined in this notebook)
lda_10b = lda_model(data=digitalb['article_tokenize'], num_topics=10)

In [None]:
#lda_vis(data=digitalb['article_tokenize'], num_topics=10)
# Result: the topics overlap

In [None]:
# 7-topic LDA on the bkkbiz articles (`econ_b` is not defined in this notebook)
lda_7b = lda_model(data=digitalb['article_tokenize'], num_topics=7)

In [None]:
#lda_vis(data=digitalb['article_tokenize'], num_topics=7)
# Result: the topics overlap towards the lower right

In [None]:
# 15-topic LDA on the bkkbiz articles (`econ_b` is not defined in this notebook)
lda_15b = lda_model(data=digitalb['article_tokenize'], num_topics=15)

In [None]:
#lda_vis(data=digitalb['article_tokenize'], num_topics=15)
# Result: the topics sit on the left, with one on the right

In [None]:
# 30-topic LDA on the bkkbiz articles (`econ_b` is not defined in this notebook)
lda_30b = lda_model(data=digitalb['article_tokenize'], num_topics=30)

In [None]:
#lda_vis(data=digitalb['article_tokenize'], num_topics=30)

In [None]:
# 10-topic LDA on the matichon articles (`econ_m` is not defined in this notebook)
lda_10m = lda_model(data=digitalm['article_tokenize'], num_topics=10)

In [None]:
# Result: high overlap between topics
#lda_vis(data=digitalm['article_tokenize'], num_topics=10)

In [None]:
# 7-topic LDA on the matichon articles (`econ_m` is not defined in this notebook)
lda_7m = lda_model(data=digitalm['article_tokenize'], num_topics=7)

In [None]:
#lda_vis(data=digitalm['article_tokenize'], num_topics=7)

### 2.3 NMF

In [None]:
#data = digitalb['article_tokenize'].apply(lambda x:' '.join(x))
#cvec = CountVectorizer(token_pattern=r"\b[A-zก-๙][A-z\.\-ก-๙]*\b")
#data = cvec.fit_transform(data)

In [None]:
# Factorise the bkkbiz document-term matrix into 10 non-negative components.
# (`econ_b_vec` is not defined in this notebook; `digitalb_vec` is the matrix
# built in section 1.2.)
nmf = NMF(n_components=10, random_state=42)
nmf.fit(digitalb_vec)

In [None]:
# Per-document component weights for the bkkbiz matrix
# (`econ_b_vec` is not defined in this notebook).
nmf_features = nmf.transform(digitalb_vec)
nmf_features.shape

In [None]:
# Shape of the fitted component matrix
nmf.components_.shape

In [None]:
# Component-by-term weight table: one row per NMF component, one column per
# vocabulary term. The column labels come from `vectorizerb`, the
# DictVectorizer fitted on the bkkbiz bag-of-words; this assumes `nmf` was
# fitted on the matrix that vectorizer produced, so the widths match.
# `get_feature_names_out()` is the same API used by `kmeans_topic`.
# The variable name is kept because the next cell reads it.
econ_b_components = pd.DataFrame(nmf.components_,
                                 columns=vectorizerb.get_feature_names_out())
econ_b_components

In [None]:
# Topic & Words: print the 10 highest-weighted words of every NMF component.
# The row index and the row itself are kept in separate variables so the
# heading shows the topic number rather than a whole Series.
for topic_idx in range(len(econ_b_components)):
    topic_weights = econ_b_components.iloc[topic_idx]
    print(f'For topic {topic_idx+1} the words with the highest value are:')
    print(topic_weights.nlargest(10))
    print('\n')

### 2.4 Top2Vec
- Speed options: `fast-learn`, `learn`, and `deep-learn`

In [None]:
#print(Top2Vec.__doc__)

In [None]:
# min_count =
# ngram_vocab =
# embedding_model= (doc2vec)
# workers = cpu amount

In [None]:
# Raw article texts of the bkkbiz set as a plain list for Top2Vec.
# (`econ_b` is not defined in this notebook; the list name is kept because
# the next cell reads it. Assumes the frame has an `article` column — TODO confirm.)
econ_b_list = digitalb['article'].tolist()
#econ_b_list

In [None]:
# Train Top2Vec on the raw articles with a multilingual sentence encoder.
# `tokenizer` must be a callable, so pass pythainlp's `word_tokenize`
# function itself rather than its name as a string.
top2vec = Top2Vec(documents=econ_b_list,
                  embedding_model='distiluse-base-multilingual-cased',
                  speed='fast-learn',
                  tokenizer=word_tokenize)

In [None]:
# Number of documents per topic, and the matching topic ids
topic_sizes, topic_nums =top2vec.get_topic_sizes()
print(topic_sizes)
print(topic_nums)

In [None]:
# Fetch 18 topics and print each topic id followed by its word list.
topic_words, word_scores, topic_nums = top2vec.get_topics(num_topics=18)
for words, scores, num in zip(topic_words, word_scores, topic_nums):
    print(num, f'Words: {words}', sep='\n')

In [None]:
# Show the 10 documents returned for topic 0, each with its id and score.
documents, document_scores, document_ids = top2vec.search_documents_by_topic(
    topic_num=0,
    num_docs=10,
)
for doc, score, doc_id in zip(documents, document_scores, document_ids):
    # One print call per document: header, rule, text, rule, blank line
    print(f'Document: {doc_id}, Score: {score}', '-'*20, doc, '-'*20, '', sep='\n')

### 2.5 BERTopic