## **Load libraries and data**

In [None]:
!pip install pythainlp
!pip install pyLDAvis



You should consider upgrading via the 'c:\users\ngeklai\anaconda3\python.exe -m pip install --upgrade pip' command.


In [None]:
# Standard library
import warnings

# Third-party
import pandas as pd
import pythainlp
import gensim
# NOTE(review): newer pyLDAvis releases appear to ship this module under the
# name pyLDAvis.gensim_models — confirm against the installed version.
import pyLDAvis.gensim

# Switch pyLDAvis to its notebook display mode.
pyLDAvis.enable_notebook()

# Ignore DeprecationWarning messages from here on.
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [None]:
# Read the review dataset from a CSV file given by a relative path.
reviews_csv = 'Wongnai Reviews - Small.csv'
df = pd.read_csv(reviews_csv)

In [None]:
# Preview the last rows of the loaded reviews.
df.tail()

## **Tokenize Words**

In [None]:
# Tokens to drop during tokenization: pythainlp's Thai stop words plus a few
# whitespace strings, brackets and the word 'ร้าน'.
stopwords = list(pythainlp.corpus.thai_stopwords())
removed_words = [' ', '  ', '\n', 'ร้าน', '(', ')']
# A frozenset keeps the per-token membership test cheap; a list would be
# scanned from the start for every token of every review.
screening_words = frozenset(stopwords + removed_words)

def tokenize_with_space(sentence):
  """Tokenize a review and return its kept tokens as one comma-separated string.

  The text is split with pythainlp's 'newmm' engine, and every token found in
  the module-level `screening_words` collection is dropped.

  Args:
    sentence: The review text. Non-string values are converted with `str()`;
      missing values (None / NaN) are treated as an empty review.

  Returns:
    The kept tokens joined by ','; an empty string when nothing is kept.
  """
  # A missing review would otherwise be tokenized as the literal text
  # 'None' / 'nan' and pollute the vocabulary.
  if sentence is None or (isinstance(sentence, float) and sentence != sentence):
    return ''

  kept = []
  for word in pythainlp.word_tokenize(str(sentence), engine='newmm'):
    # ',' is the separator of the returned string, so commas inside a token
    # are removed to keep a later split(',') from breaking the token apart.
    word = word.replace(',', '')
    if word and word not in screening_words:
      kept.append(word)
  return ','.join(kept)

In [None]:
# Tokenize every review; the function is handed to apply() directly.
df['Review_tokenized'] = df['Review'].apply(tokenize_with_space)

In [None]:
# Preview the last rows, now including the 'Review_tokenized' column.
df.tail()

## **Create Dictionary**

In [None]:
# Turn each comma-separated token string back into a list of tokens.
documents = df['Review_tokenized'].to_list()
# ''.split(',') returns [''], so empty pieces are filtered out; otherwise the
# empty string would be registered as a word in the dictionary.
texts = [[token for token in doc.split(',') if token] for doc in documents]
# Map every distinct token to an integer id.
dictionary = gensim.corpora.Dictionary(texts)

In [None]:
# Show the vocabulary size and a small sample instead of dumping every token
# into the cell output.
print(len(dictionary), 'unique tokens')
list(dictionary.token2id)[:20]

In [None]:
# Bag-of-words form of every document: a list of (token_id, count) pairs.
# allow_update is left at its default: the dictionary was already built from
# `texts`, and updating it again would count every document a second time in
# its statistics (and once more on each re-run of this cell).
gensim_corpus = [dictionary.doc2bow(text) for text in texts]
# Same data with token ids replaced by the tokens themselves, for inspection.
word_frequencies = [
  [(dictionary[token_id], count) for token_id, count in bow]
  for bow in gensim_corpus
]

## **Topic Modeling**

In [None]:
num_topics = 30
chunksize = 4000 # size of the doc looked at every pass
passes = 20 # number of passes through documents
iterations = 50
eval_every = 1  # Don't evaluate model perplexity, takes too much time.

# Make a index to word dictionary.
temp = dictionary[0]  # This is only to "load" the dictionary.
id2word = dictionary.id2token

%time model = gensim.models.LdaModel(corpus=gensim_corpus, id2word=id2word, chunksize=chunksize, \
                       alpha='auto', eta='auto', \
                       iterations=iterations, num_topics=num_topics, \
                       passes=passes, eval_every=eval_every)

In [None]:
# Build the pyLDAvis data for the trained model; as the last expression of
# the cell, the result is what the notebook displays.
pyLDAvis.gensim.prepare(model, gensim_corpus, dictionary)

In [None]:
# Inspect what the model reports for the topic with id 1.
model.show_topic(1)

In [None]:
def dominant_topic(tokenized_review):
  """Return the (topic_id, probability) pair of a review's most probable topic.

  Args:
    tokenized_review: Comma-separated tokens, as stored in the
      'Review_tokenized' column.

  Returns:
    The (topic_id, probability) pair with the highest probability among those
    returned by `model.get_document_topics`.
  """
  bow = dictionary.doc2bow(tokenized_review.split(','))
  # Ask for the whole distribution: the returned list is not ordered by
  # probability, so taking element [0] would not give the best topic.
  topic_dist = model.get_document_topics(bow, minimum_probability=0.0)
  return max(topic_dist, key=lambda pair: pair[1])

# Run inference once per review so that topic and score come from the same
# result instead of from two separate inference calls.
dominant_topics = [dominant_topic(review) for review in df['Review_tokenized']]
df['topics'] = [topic_id for topic_id, _ in dominant_topics]
df['score'] = [probability for _, probability in dominant_topics]

In [None]:
# Preview the last rows, now including the 'topics' and 'score' columns.
df.tail()