In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import pythainlp
from pythainlp.tokenize import sent_tokenize, word_tokenize
import gensim
import string
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
import re
pyLDAvis.enable_notebook()
import sefr_cut
sefr_cut.load_model(engine='tl-deepcut-ws1000')

import warnings
warnings.filterwarnings('ignore')

# 1. Load and View Dataset

In [None]:
# Read the raw customer reviews from the CSV file under ./data.
review_df = pd.read_csv('./data/CustomerReviews.csv')

# Report the frame's (rows, columns) shape, then preview the first rows.
print("Review Dimension (num reviews/num columns) : ", review_df.shape, sep=' ')
review_df.head()

# 2. Text Processing
• Text Wrangling เช่น ปรับแก้ไขคำผิดและลบตัวอักขระพิเศษต่างๆ <br>
• ตัดคำ

In [None]:
# Text Cleansing
# Correct misspelled words and remove unwanted words or characters.
# Keys are the misspelled/variant spellings, values are their replacements.
# The rules are applied one after another in insertion order, so the output of
# an earlier rule can be picked up by a later one.
correct_word_list = {
    'โมโม่' : 'momo', 'มากก' : 'มาก', 'กกก' : 'ก', 'แวท' : 'vat', 'พิซเซอเรีย' : 'pizzeria', 'พิซซ่า' : 'pizza', 
    'เก้บ' : 'เก็บ', 'ชูชิ' : 'ซูชิ', 'ซึป' : 'ซุป', 'ชาบูชิ' : 'shabushi', 'อารามณ์' : 'อารมณ์', 'ส่งน' : 'ส่วน', 
    'สไลต์' : 'สไลด์', 'ประยุคก์' : 'ประยุกต์', 'ไอติม' : 'icecream', 'พาราไดส์' : 'paradise', 'พาราไดซ์' : 'paradise', 
    'อัพเดท' : 'update', 'นาราย' : 'narai ', 'พรีเมี่ยม' : 'พรีเมียม','บ๋วย' : 'บ๊วย', 'เฟรนไชส์' : 'แฟรนไชส์',
    'บุฟเฟ่ต์่ต์ต์ต์':'บุฟเฟ่ต์','บุฟเฟ่ต์่ต์ต์':'บุฟเฟ่ต์','บุฟเฟ่ต์ต์':'บุฟเฟ่ต์',
    'บุพเฟ่' : 'บุฟเฟ่ต์','บุฟเฟ่' : 'บุฟเฟ่ต์','บุฟเฟ' : 'บุฟเฟ่ต์', 'บุฟเฟต' : 'บุฟเฟ่ต์',
    'รสชาต' : 'รสชาติ'
}

# Substrings that are deleted outright (replaced by the empty string) before
# the punctuation pass runs.
unwanted_words = ['(', ')' ,'😆','🤣','"','','%','\u200b','::']


def _apply_correction(text, old, new):
    """Replace `old` with `new` in `text` without damaging correct spellings.

    When `old` is a prefix of `new` (e.g. a word missing its final characters),
    a plain ``str.replace`` also fires inside text that is already spelled as
    `new` and appends the missing tail a second time. In that case occurrences
    of `old` that are directly followed by the remainder of `new` are skipped.
    All other rules behave exactly like ``text.replace(old, new)``.
    """
    if old and new != old and new.startswith(old):
        already_present_tail = re.escape(new[len(old):])
        guarded = re.escape(old) + '(?!' + already_present_tail + ')'
        # A function replacement keeps `new` literal (no backslash expansion).
        return re.sub(guarded, lambda _match: new, text)
    return text.replace(old, new)


def do_text_preprocessing(text):
    """Clean one review string for tokenization.

    Steps, in order:
      1. apply every spelling correction from ``correct_word_list``;
      2. delete every entry of ``unwanted_words``;
      3. replace the listed punctuation characters (including the Thai
         marks in the character class) with a space;
      4. replace every digit with a space.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    str
        The cleaned text with leading/trailing whitespace stripped.
    """
    # 1. correct some misspelling in text
    for old, new in correct_word_list.items():
        text = _apply_correction(text, old, new)

    # 2. remove unwanted word
    for word in unwanted_words:
        text = text.replace(word, '')

    # 3. remove punctuations character in text
    text = re.sub(r'[ๆฯ!#$&%\"\'()*+,-./:;<=>?@\[\]\\^_`{}|~]',' ', text)

    # 4. remove digit character in text
    text = re.sub(r'\d',' ', text)

    return text.strip()

# The review text is the review's headline followed by its contents, cleaned
# with do_text_preprocessing. Missing headlines/contents (NaN) are treated as
# empty strings so that one incomplete row does not abort the whole column
# with a TypeError from `float + str`.
review_df['review_text'] = (
    review_df['Headline'].fillna('').astype(str)
    + ' '
    + review_df['Review'].fillna('').astype(str)
).apply(do_text_preprocessing)
print(review_df['review_text'])

In [None]:
# word segmentation and remove the common words(stop words)
thai_stopwords = list(pythainlp.corpus.thai_stopwords())
unwanted_word = ['ร้าน','บาท','สำหรับ','ชื่อ','ทาน','ดี','กิน','อาหาร',
                 'ดู','คน','ตัว','ลอง','ตอน','เลือก','ใจ','ที่']
remove_word_list = thai_stopwords + unwanted_word

def do_word_tokenization(text):
    word_list = []
    for sentence in sent_tokenize(text, engine='whitespace+newline'):
        for words in sefr_cut.tokenize(sentence,k=100):
            for word in words:
                if len(word) >1 and word not in remove_word_list:
                    word_list.append(word)
    return word_list

%time review_df['review_token'] = review_df['review_text'].apply(do_word_tokenization)
print(review_df['review_token'])

# 3. Topic Models with Gensim
Gensim เป็น framework สำหรับทำ Topic Model, Text Similarity, Semantic Analysis และ Text Summarization โดย Gensim มีความยืดหยุ่นกว่า scikit-learn

### 3.1 Create a dictionary representation of the user's reviews.

In [None]:
# One token list per review, in row order.
review_token_list = review_df['review_token'].to_list()

# Build the gensim dictionary (token id <-> token) from all review tokens.
dictionary = gensim.corpora.Dictionary(documents=review_token_list)

# Vocabulary size and a sample of the first 30 (id, token) entries.
print('Number of Vocabulary : {} \n'.format(len(dictionary)))
print('Dictionary Items : {}\n'.format(list(dictionary.items())[:30]))

In [None]:
# Convert every review's token list into its bag-of-words vector
# via the dictionary.
bow_corpus = list(map(dictionary.doc2bow, review_token_list))

# Bag-of-words vector of the first review.
print(bow_corpus[0])

In [None]:
# Number of bag-of-words documents in the corpus (one per review).
print('Total number of corpus:', len(bow_corpus))

# Show the first document as (term, count) pairs instead of (token id, count).
print([(dictionary[token_id], count) for token_id, count in bow_corpus[0]])

### 3.2 Topic Models with Latent Dirichlet Allocation (LDA)

In [None]:
%%time
TOTAL_TOPICS = 5
lda_model = gensim.models.LdaModel(corpus=bow_corpus, 
                                   id2word=dictionary, 
                                   chunksize=1000, 
                                   alpha='auto', 
                                   eta='auto', 
                                   random_state=42,
                                   iterations=500, 
                                   num_topics=TOTAL_TOPICS, 
                                   passes=20, 
                                   eval_every=None)

In [None]:
# Print every topic of the trained model with its 20 highest-weighted terms.
# The number of topics is taken from the model itself instead of a hard-coded
# 10, so the listing stays complete if the model is retrained with more than
# ten topics (gensim only returns a subset when fewer topics are requested
# than the model contains).
for topic_id, topic in lda_model.print_topics(num_topics=lda_model.num_topics, num_words=20):
    print('Topic #'+str(topic_id+1)+':')
    print(topic)
    print()

In [None]:
# top_topics returns one (topic terms, coherence) pair per topic; average the
# coherence component over all topics.
topics_coherences = lda_model.top_topics(bow_corpus, topn=20)
avg_coherence_score = np.mean([coherence for _, coherence in topics_coherences])
print('Avg. Coherence Score:', avg_coherence_score)

In [None]:
# Keep only the term lists (first element of each pair); every entry of a
# term list is a (weight, term) tuple.
topics_with_wts = [topic_terms for topic_terms, _ in topics_coherences]

print('LDA Topics with Weights')
print('=' * 50)
for topic_no, weighted_terms in enumerate(topics_with_wts, start=1):
    print('Topic #' + str(topic_no) + ':')
    # Show each term with its weight rounded to three decimals.
    print([(term, round(wt, 3)) for wt, term in weighted_terms])
    print()

In [None]:
# Same listing as the weighted one, showing only the terms.
print('LDA Topics without Weights')
print('=' * 50)
for topic_no, weighted_terms in enumerate(topics_with_wts, start=1):
    print('Topic #' + str(topic_no) + ':')
    print([term for _, term in weighted_terms])
    print()

In [None]:
# Coherence of the baseline model under the 'c_v' measure.
cv_coherence_model_lda = gensim.models.CoherenceModel(
    model=lda_model,
    dictionary=dictionary,
    corpus=bow_corpus,
    texts=review_token_list,
    coherence='c_v',
)
avg_coherence_cv = cv_coherence_model_lda.get_coherence()

# Coherence of the same model under the 'u_mass' measure.
umass_coherence_model_lda = gensim.models.CoherenceModel(
    model=lda_model,
    dictionary=dictionary,
    corpus=bow_corpus,
    texts=review_token_list,
    coherence='u_mass',
)
avg_coherence_umass = umass_coherence_model_lda.get_coherence()

# Value returned by the model's log_perplexity on the training corpus.
perplexity = lda_model.log_perplexity(bow_corpus)

print('Avg. Coherence Score (Cv):', avg_coherence_cv)
print('Avg. Coherence Score (UMass):', avg_coherence_umass)
print('Model Perplexity:', perplexity)

### 3.3 Finding Optimal Number of Topics

In [None]:
from tqdm import tqdm

def topic_model_coherence_generator(corpus, texts, dictionary, 
                                    start_topic_count=2, end_topic_count=10, step=1,
                                    cpus=1):
    """Train one LDA model per topic count and score each with 'c_v' coherence.

    Parameters
    ----------
    corpus : iterable
        Bag-of-words corpus used both to train every model and to compute
        its coherence.
    texts : list
        Tokenized documents passed to the coherence model.
    dictionary : gensim.corpora.Dictionary
        Dictionary used as ``id2word`` and for the coherence computation.
    start_topic_count, end_topic_count : int
        First and last number of topics to try (both inclusive).
    step : int
        Increment between successive topic counts.
    cpus : int
        Accepted for backward compatibility; currently unused.

    Returns
    -------
    (list, list)
        The trained models and their coherence scores, in the order the
        topic counts were tried.
    """
    models = []
    coherence_scores = []
    for topic_nums in tqdm(range(start_topic_count, end_topic_count+1, step)):
        # Use the `corpus`/`texts` arguments (not notebook globals) so the
        # function evaluates exactly the data it was given.
        lda_model = gensim.models.LdaModel(corpus=corpus, 
                                           id2word=dictionary, 
                                           chunksize=1740,
                                           alpha='auto', 
                                           eta='auto', 
                                           random_state=42,
                                           iterations=500, 
                                           num_topics=topic_nums,
                                           passes=20,
                                           eval_every=None)
        
        cv_coherence_model_lda = gensim.models.CoherenceModel(model=lda_model, 
                                                              corpus=corpus, 
                                                              texts=texts,
                                                              dictionary=dictionary, 
                                                              coherence='c_v')
        coherence_scores.append(cv_coherence_model_lda.get_coherence())
        models.append(lda_model)
    
    return models, coherence_scores

In [None]:
# Sweep the number of topics from 2 to 15 (inclusive, step 1), keeping every
# trained model together with its coherence score.
lda_models, coherence_scores = topic_model_coherence_generator(
    bow_corpus,
    review_token_list,
    dictionary,
    start_topic_count=2,
    end_topic_count=15,
    step=1,
    cpus=16,
)

In [None]:
# One row per trained model. The topic counts are read from the models
# themselves rather than repeating the sweep bounds as a hard-coded range, so
# the table cannot go out of sync with (or fail on a length mismatch against)
# the sweep that produced `lda_models` / `coherence_scores`.
coherence_df = pd.DataFrame({'Number of Topics': [model.num_topics for model in lda_models],
                             'Coherence Score': np.round(coherence_scores, 4)})
coherence_df.sort_values(by=['Coherence Score'], ascending=False).head(15)

In [None]:
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
%matplotlib inline

x_ax = range(2, 16, 1)
y_ax = coherence_scores
plt.figure(figsize=(12, 6))
plt.plot(x_ax, y_ax, c='r')
plt.axhline(y=0.3423, c='k', linestyle='--', linewidth=2)
plt.rcParams['figure.facecolor'] = 'white'
xl = plt.xlabel('Number of Topics')
yl = plt.ylabel('Coherence Score')

In [None]:
# Locate the row for the 7-topic model and fetch the model at that position.
# NOTE(review): 7 is hard-coded, presumably picked from the coherence
# table/plot of an earlier run -- confirm against the current scores.
best_model_idx = coherence_df.index[coherence_df['Number of Topics'] == 7][0]
best_lda_model = lda_models[best_model_idx]
best_lda_model.num_topics

In [None]:
# Top 20 (term, weight) pairs of every topic in the selected model, with the
# weights rounded to three decimals.
topics = [[(term, round(wt, 3)) 
               for term, wt in best_lda_model.show_topic(n, topn=20)] 
                   for n in range(0, best_lda_model.num_topics)]
# Show full cell contents. `None` is the supported "no truncation" value;
# the former -1 sentinel is deprecated and rejected by current pandas.
pd.set_option('display.max_colwidth', None)
# One row per topic ('Topic1', 'Topic2', ...) holding its terms as a single
# comma-separated string.
topics_df = pd.DataFrame([', '.join([term for term, wt in topic])  
                              for topic in topics],
                         columns = ['Terms per Topic'],
                         index=['Topic'+str(t) for t in range(1, best_lda_model.num_topics+1)]
                         )
topics_df

# 4. Print Result from Topic Modeling

In [None]:
# Topic distribution of every review under the selected model.
tm_results = best_lda_model[bow_corpus]

# For each review keep the (topic id, probability) pair with the highest
# probability; on ties the first such pair wins.
corpus_topics = [max(doc_topics, key=lambda record: record[1])
                     for doc_topics in tm_results]

# Assemble the result table: reviewer, cleaned review text, the 1-based id of
# the dominant topic and that topic's terms.
# result_df['Restaurant'] = review_df['Restaurant']
result_df = pd.DataFrame({
    'User': review_df['User'],
    'Review': review_df['review_text'],
    'Selected Topic': [topic_id + 1 for topic_id, _ in corpus_topics],
    'Topic Text': [topics_df['Terms per Topic'].iloc[topic_id]
                   for topic_id, _ in corpus_topics],
})

In [None]:
from pandas import option_context

# Preview the first five result rows, limiting column width to 100 characters
# for this display only.
with option_context('display.max_colwidth', 100):
    display(result_df.head(5))