<a href="https://colab.research.google.com/github/nguyenhaidang94/CustomerReviewAnalysis/blob/master/lda.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import pandas as pd
import numpy as np
import os
import re
import numpy.random as npr
from gensim.models.phrases import Phrases, Phraser

In [2]:
# Colab-only: mount Google Drive at /content/drive so the dataset and model
# files referenced by the path constants below are reachable.
from google.colab import drive
drive.mount("/content/drive")

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


## Clone repo and install packages

In [3]:
# Fetch the core_nlp repository (source of CrfTokenizer and its pretrained model)
# and install the CRF packages (presumably required by core_nlp — confirm).
# NOTE(review): versions are not pinned; the recorded run installed
# python-crfsuite 0.9.6 and sklearn-crfsuite 0.3.6.
!git clone https://github.com/deepai-solutions/core_nlp.git
!pip install python-crfsuite
!pip install sklearn_crfsuite

Cloning into 'core_nlp'...
remote: Enumerating objects: 205, done.[K
remote: Total 205 (delta 0), reused 0 (delta 0), pack-reused 205[K
Receiving objects: 100% (205/205), 9.69 MiB | 10.46 MiB/s, done.
Resolving deltas: 100% (101/101), done.
Collecting python-crfsuite
[?25l  Downloading https://files.pythonhosted.org/packages/2f/86/cfcd71edca9d25d3d331209a20f6314b6f3f134c29478f90559cee9ce091/python_crfsuite-0.9.6-cp36-cp36m-manylinux1_x86_64.whl (754kB)
[K     |████████████████████████████████| 757kB 6.7MB/s 
[?25hInstalling collected packages: python-crfsuite
Successfully installed python-crfsuite-0.9.6
Collecting sklearn_crfsuite
  Downloading https://files.pythonhosted.org/packages/25/74/5b7befa513482e6dee1f3dd68171a6c9dfc14c0eaa00f885ffeba54fe9b0/sklearn_crfsuite-0.3.6-py2.py3-none-any.whl
Installing collected packages: sklearn-crfsuite
Successfully installed sklearn-crfsuite-0.3.6


### Test tokenizer

In [4]:
from core_nlp.tokenization.crf_tokenizer import CrfTokenizer
# Test tokenizer
# Smoke test: load the pretrained CRF model shipped in the cloned repository
# and segment a single Vietnamese phrase.
tokenizer = CrfTokenizer(config_root_path='core_nlp/tokenization/',
                          model_path='core_nlp/models/pretrained_tokenizer.crfsuite')
test_sent = "Thuế thu nhập cá nhân"
tokenized_sent = tokenizer.get_tokenized(test_sent)
# In the recorded output the syllables of each word are joined with underscores.
print(tokenized_sent)

Loading model from file core_nlp/models/pretrained_tokenizer.crfsuite
Thuế_thu_nhập cá_nhân


## First look

In [0]:
# Folder on the mounted Google Drive that holds the unlabeled review data.
unlabeled_folder_path = "/content/drive/My Drive/Master/2nd Semester/AdvMLDM/Project_AdvMLDM/dataset/unlabeled_data"
# Separator inserted between folder and file name when load_data() builds the path.
seperator = "/"
# Pickle file read by load_data().
file_name = "foody_data.pkl"

def load_data():
  """
    Read the pickled review data from
    `unlabeled_folder_path` + `seperator` + `file_name` and return it.

    Note: unpickling executes code stored in the file, so only load pickles
    from a trusted source.
  """
  pickle_path = seperator.join((unlabeled_folder_path, file_name))
  return pd.read_pickle(pickle_path)

## Preprocessing

### Remove unnecessary columns

In [0]:
def remove_unnecessary_columns(df, columns):
  """
    Drop `columns` from `df` in place.

    The frame passed in is mutated; nothing is returned. A column name that
    is not present makes pandas raise KeyError.
  """
  df.drop(labels=columns, axis=1, inplace=True)

### Remove reviews with stray (combining) tone marks or with no tone marks at all

In [0]:
# Tone marks that occur as separate characters rather than as part of an
# accented letter. Each literal is indexed with [1] to take its second
# character (the literals appear to be a space followed by one combining
# mark: acute, grave, hook above, dot below, tilde — confirm when editing).
false_mark = ' ́'[1] + ' ̀'[1] + ' ̉'[1] + ' ̣'[1] + ' ̃'[1]
# Lower-case Vietnamese vowels that carry a diacritic. is_have_mark() uses this
# to recognise text that contains at least one accented letter.
vietnam_alp = 'áàãảạăắằẳẵặâấầẩẫậóòỏõọôốồổộỗơớờởỡợúùủũụưứừữựửíìỉĩịýỳỷỹỵéèẻẽẹêếềểễệ'


def is_false_mark(string):
  """
    Return True when at least one character of `string` is one of the
    stand-alone tone marks listed in `false_mark`, otherwise False.
  """
  for char in string:
    if char in false_mark:
      return True
  return False
    
  
def is_have_mark(string):
  """
    Return True when `string` contains at least one accented letter from
    `vietnam_alp`; return False when it contains none.
  """
  for char in string:
    if char in vietnam_alp:
      return True
  return False

def find_false_mark(df, col):
  """
    Return the index labels of the rows whose lower-cased `col` text contains
    a stand-alone tone mark (see is_false_mark), in row order.
  """
  return [label
          for label, record in df.iterrows()
          if is_false_mark(record[col].lower())]

def find_not_having_mark(df, col):
  """
    Return the index labels of the rows whose lower-cased `col` text contains
    no accented Vietnamese letter (see is_have_mark), in row order.
  """
  return [label
          for label, record in df.iterrows()
          if not is_have_mark(record[col].lower())]

def remove_false_mark_not_having_mark(df):
  """
    Drop, in place, every review that contains a stand-alone tone mark or that
    has no accented Vietnamese letter at all, then renumber the index from 0.

    The review text is read from the module-level `origin_review_col` column.
    The size and percentage of each category are printed. Returns None.
  """
  false_mark_indices = find_false_mark(df, origin_review_col)
  not_have_mark_indices = find_not_having_mark(df, origin_review_col)

  n_rows = len(df)

  def percentage(count):
    # An empty frame has nothing to report; avoid ZeroDivisionError.
    return count / n_rows * 100 if n_rows else 0.0

  print("False mark: {}, take {}%".format(len(false_mark_indices)
                                        , percentage(len(false_mark_indices))))
  print("Don't have mark: {}, take {}%".format(len(not_have_mark_indices)
                                             , percentage(len(not_have_mark_indices))))

  # Merge the labels as plain Python objects: np.concatenate would upcast them
  # to float whenever one of the two lists is empty. dict.fromkeys removes
  # labels that were flagged by both checks.
  removed_indices = list(dict.fromkeys(list(false_mark_indices) + list(not_have_mark_indices)))
  df.drop(index=removed_indices, inplace=True)
  df.reset_index(drop=True, inplace=True)

### Remove new line, comma, special chars, numbers, special words

In [0]:
# Character class of symbols stripped from reviews: + - " ( ) : ; \ / ^ < > '
# Written as a raw string so the backslashes reach the regex engine unchanged
# and Python does not warn about invalid string escapes such as "\-" or "\^".
special_chars_re = r"""[+\-"():;\\/\^<>']"""
# One or more consecutive digits.
numbers_re = r"\d+"

# TODO: some reviews, user starts new line without dot
def remove_new_line(str):
  """
    Replace every line break in `str` with a single space and return the result.

    A space (rather than nothing) is substituted so that the last word of one
    line and the first word of the next are not fused into one token.
    Handles "\\r\\n", "\\n" and a lone "\\r".
  """
  return str.replace("\r\n", " ").replace("\n", " ").replace("\r", " ")

def remove_comma(str):
  """Return `str` with every "," character deleted."""
  pieces = str.split(",")
  return "".join(pieces)

def remove_sepcial_chars(str):
  """Return `str` with every match of the `special_chars_re` pattern deleted."""
  pattern = re.compile(special_chars_re)
  return pattern.sub("", str)

def remove_numbers(str):
  """Return `str` with every match of the `numbers_re` pattern (digit runs) deleted."""
  pattern = re.compile(numbers_re)
  return pattern.sub("", str)

def remove_special_word(str, special_word):
  """
    Return `str` with every occurrence of `special_word` deleted.
    The match is an exact, case-sensitive substring match.
  """
  cleaned = str.replace(special_word, "")
  return cleaned

### Remove stop words

In [0]:
def read_file_content(path, encoding="utf-8"):
  """
    Return the whole text content of the file at `path`.

    `encoding` defaults to UTF-8 so that Vietnamese text is decoded the same
    way on every machine instead of depending on the locale's default encoding.
  """
  with open(path, 'r', encoding=encoding) as f:
    return f.read()
def remove_stop_words(str, stop_words):
  """
    Return `str` with every whitespace-separated word that appears in
    `stop_words` removed; the remaining words are re-joined with single spaces.

    `stop_words` may be any container supporting `in`. A list or tuple is
    converted to a frozenset first so each membership test is O(1) instead of
    a linear scan.
  """
  words = str.split()
  if not words:
    return ''
  if isinstance(stop_words, (list, tuple)):
    stop_words = frozenset(stop_words)
  result_words = [word for word in words if word not in stop_words]
  return ' '.join(result_words)
def remove_stop_words_in_list(list_str, stop_words):
  """
    Apply remove_stop_words to every string in `list_str` and return the
    results as a new list of the same length and order.
  """
  # Build the lookup set once for the whole list rather than scanning a
  # list/tuple of stop words again for every word of every sentence.
  if isinstance(stop_words, (list, tuple)):
    stop_words = frozenset(stop_words)
  return [remove_stop_words(sentence, stop_words) for sentence in list_str]

### Lower case

In [0]:
def lower_case(str):
  """Return a lower-cased copy of `str`."""
  lowered = str.lower()
  return lowered

### Split sentence

In [0]:
def split_sentence(str):
  """
    Split `str` into sentences on ".", "!", "?" and ";".

    Each piece is stripped of surrounding whitespace, and pieces that are
    empty after stripping are discarded, so the result never contains "".
    Returns a list of strings (empty when `str` holds no sentence text).
  """
  pieces = re.split(r"[.!?;]", str)
  # Strip first, then filter: filtering before stripping lets whitespace-only
  # fragments (e.g. the " " after a final ". ") through as empty sentences.
  stripped = (piece.strip() for piece in pieces)
  return [piece for piece in stripped if piece]

### Combine words using gensim Phrases

In [0]:
def collect_sentences(df):
  """
    Split every review in the `review_col` column of `df` into sentences
    (see split_sentence) and return all sentences as one flat list, in row order.
  """
  # Iterate the column directly: iterrows() builds a Series per row that was
  # never used, and the extra df.loc lookup fails on duplicate index labels.
  return [sentence
          for review in df[review_col]
          for sentence in split_sentence(review)]

def train_bigram_model(df, min_count):
  """
    Train a gensim phrase model on the sentences of `df` and return it
    wrapped in a Phraser.

    `min_count` is forwarded to Phrases. Progress messages are printed
    before each stage.
  """
  print("Split sentences...")
  corpus_sentences = collect_sentences(df)

  print("Split words...")
  tokenized_sentences = []
  for sentence in corpus_sentences:
    tokenized_sentences.append(sentence.split())

  print("Train bi-gram model...")
  return Phraser(Phrases(tokenized_sentences, min_count=min_count))

def combine_words(review, bigram_model):
  """
    Split `review` on whitespace, pass the word list through `bigram_model`
    (indexed with the list), and return the resulting tokens joined by spaces.
  """
  merged_tokens = bigram_model[review.split()]
  return " ".join(merged_tokens)

### Tokenize

In [0]:
def tokenize(list_str, tokenizer):
  """
    Tokenize each string in `list_str` with `tokenizer.tokenize`.

    Returns a list with one entry per input string, in order; an empty
    string yields an empty list without calling the tokenizer.
  """
  return [tokenizer.tokenize(text) if text != "" else []
          for text in list_str]

In [0]:
# Tokenizing n sentences
def tokenize_data(df, tokenizer, n_sentences):
  """
    Tokenize the sentences stored in the `sentences_col` column of `df`,
    review by review, until at least `n_sentences` sentences are processed.

    A review is always processed as a whole, so the result can cover more
    than `n_sentences` sentences. Returns a flat list with one token list
    per sentence.
  """
  completed_sentences = 0
  all_tokens = []
  # Iterate the column directly: iterrows() builds a Series per row that was
  # never used, and the extra df.loc lookup fails on duplicate index labels.
  for sentences in df[sentences_col]:
    all_tokens.extend(tokenize(sentences, tokenizer))
    completed_sentences += len(sentences)
    if completed_sentences >= n_sentences:
      break

  return all_tokens

In [0]:
def collect_tokens(df, n_sentences):
  """
    Gather the per-sentence token lists stored in the `tokens_col` column of
    `df`, review by review, until at least `n_sentences` of them are collected.

    A review is always taken as a whole, so the result can hold more than
    `n_sentences` entries. Returns a flat list of token lists.
  """
  all_tokens = []
  completed_sentences = 0
  # Iterate the column directly: iterrows() builds a Series per row that was
  # never used, and the extra df.loc lookup fails on duplicate index labels.
  for review_tokens in df[tokens_col]:
    all_tokens.extend(review_tokens)
    completed_sentences += len(review_tokens)
    if completed_sentences >= n_sentences:
      break

  return all_tokens

def collect_all_tokens(df):
  """
    Return every per-sentence token list stored in the `tokens_col` column of
    `df` as one flat list, in row order.
  """
  # Iterate the column directly (no per-row Series, no index lookup); the
  # unused `completed_sentences` counter of the earlier version is gone.
  return [tokens
          for review_tokens in df[tokens_col]
          for tokens in review_tokens]

### Combine all steps

In [0]:
# Columns dropped at the start of preprocessing().
unnecessary_columns = ["time", "user_name", "user_link", "review_link"
                       , "avg_score", "location_point", "space_point"
                       , "quality_point", "service_point", "price_point"]
# Column holding the raw review text.
origin_review_col = "review_content"
# Column holding the progressively cleaned copy of the review text.
review_col = "review_cleaned"
# Literal text removed from every review by preprocessing().
special_word = "Xem thêm"
# Stop-word file; preprocessing() splits its content on newlines.
file_stop_words = "/content/drive/My Drive/Master/2nd Semester/AdvMLDM/Project_AdvMLDM/vietnamese-stopwords.txt"
# Column names for the per-review sentence list, the same list after stop-word
# removal, and the per-sentence token lists.
sentences_col = "sentences"
sentences_cleaned_col = "sentences_cleaned"
tokens_col = "tokens"
# Word tokenizer used by preprocessing(); built with the same arguments as the
# instance created in the "Test tokenizer" cell.
tokenizer = CrfTokenizer(config_root_path='core_nlp/tokenization/',
                          model_path='core_nlp/models/pretrained_tokenizer.crfsuite')
# Settings for the bigram step, which is currently commented out in preprocessing().
bigram_model_file = "/content/drive/My Drive/Master/2nd Semester/AdvMLDM/Project_AdvMLDM/dang/lda/bigram.dat"
bigram_min_count = 20

def preprocessing(df):
  """
    Run the whole cleaning pipeline on `df`, mutating it in place.

    Steps, in order: drop `unnecessary_columns`; drop reviews with stray tone
    marks or without any accented letter; copy the raw text to `review_col`;
    remove line breaks, commas, special characters, numbers and `special_word`;
    lower-case; split into sentences (`sentences_col`); remove stop words
    (`sentences_cleaned_col`); tokenize (`tokens_col`).

    Every step works on the `df` argument. Returns None.
  """
  print("Remove unnecessary columns...")
  remove_unnecessary_columns(df, unnecessary_columns)

  print("Removing false mark and not having mark reviews...")
  remove_false_mark_not_having_mark(df)
  print("Number of remaining reviews: {}".format(len(df)))

  # duplicate review to create column review_cleaned
  df[review_col] = df[origin_review_col].values

  print("Remove new line...")
  df[review_col] = df[review_col].apply(remove_new_line)

  # The next four steps used to read and write the global `data` frame, which
  # only worked when the caller happened to pass that very object.
  print("Remove comma...")
  df[review_col] = df[review_col].apply(remove_comma)

  print("Remove special chars...")
  df[review_col] = df[review_col].apply(remove_sepcial_chars)

  print("Remove numbers...")
  df[review_col] = df[review_col].apply(remove_numbers)

  print("Remove special words...")
  df[review_col] = df[review_col].apply(remove_special_word, args=(special_word,))

  print("Lower case...")
  df[review_col] = df[review_col].apply(lower_case)

#   print("Train bigram model...")
#   bigram = train_bigram_model(df, bigram_min_count)
#   print("Save bigram model...")
#   bigram.save(bigram_model_file)

#   print("Load bigram model...")
#   bigram = Phrases.load(bigram_model_file)

#   print("Combine words...")
#   df[review_col] = df[review_col].apply(combine_words, args=(bigram,))

  print("Split sentences...")
  df[sentences_col] = df[review_col].apply(split_sentence)

  print("Remove stop words...")
  full_stop_words = read_file_content(file_stop_words)
  # One stop word per line. Strip each entry so a trailing "\r" or space does
  # not stop it from matching, skip blank lines, and keep the result in a set
  # for constant-time membership tests.
  stop_words = frozenset(line.strip()
                         for line in full_stop_words.splitlines()
                         if line.strip())
  df[sentences_cleaned_col] = df[sentences_col].apply(remove_stop_words_in_list, args=(stop_words,))

  print("Tokenizing...")
  df[tokens_col] = df[sentences_cleaned_col].apply(tokenize, args=(tokenizer,))

  print("Finish preprocessing")

### Run preprocessing

In [0]:
# Load the raw reviews and clean them. preprocessing() mutates `data` in place
# and returns nothing, so `data` holds the cleaned frame afterwards.
data = load_data()
preprocessing(data)

In [0]:
# save cleaned data
# NOTE(review): this path is relative and contains a "JVN" folder, unlike the
# absolute "/content/drive/My Drive/Master/2nd Semester/..." paths used for the
# input data — confirm it points at the intended location.
data_saved_path = "drive/My Drive/Master/JVN/2nd Semester/AdvMLDM/Project_AdvMLDM/dang/lda/cleaned_data.pkl"
data.to_pickle(data_saved_path)

## LDA

In [0]:
from gensim.corpora import Dictionary
from gensim.models import TfidfModel
from gensim.models import LdaMulticore
from gensim.models import LdaModel

In [0]:
def save_dict(dictionary, path):
  """Write `dictionary` to `path` by calling its `save_as_text` method. Returns None."""
  write_text = dictionary.save_as_text
  write_text(path)
  
def save_lda_model(model, path):
  """Persist `model` to `path` by calling its `save` method. Returns None."""
  persist = model.save
  persist(path)
  
def load_dict(path):
  """Return the gensim Dictionary read from the text file at `path`."""
  loaded_dictionary = Dictionary.load_from_text(path)
  return loaded_dictionary
  
def load_lda_model(path):
  """Return the LdaModel loaded from `path`."""
  loaded_model = LdaModel.load(path)
  return loaded_model

In [0]:
def rank_topic(tuples):
  """
    Return the first element (topic id) of the pair with the largest second
    element (weight) in `tuples`.

    When several pairs share the largest weight, the one that comes first in
    `tuples` wins. Raises IndexError when `tuples` is empty.
  """
  # A single pass with max() replaces sorting the whole sequence; max() also
  # returns the first of several equal maxima, matching a stable descending sort.
  best = max(tuples, key=lambda pair: pair[1], default=None)
  if best is None:
    raise IndexError("rank_topic() needs at least one (topic, weight) pair")
  return best[0]

def get_topic(topic_mapping, topic):
  """Return the label stored under `topic` in `topic_mapping`, or None when absent."""
  lookup = topic_mapping.get
  return lookup(topic)

In [0]:
# Load the cleaned data written by the "save cleaned data" cell, so the LDA
# section can run without repeating the preprocessing.
# NOTE: unpickling executes code stored in the file; only load trusted pickles.
data_saved_path = "drive/My Drive/Master/JVN/2nd Semester/AdvMLDM/Project_AdvMLDM/dang/lda/cleaned_data.pkl"
data = pd.read_pickle(data_saved_path)

In [34]:
# Preview the first rows of the loaded frame (raw text, cleaned text,
# sentence lists and token lists, as shown in the recorded output).
data.head()

Unnamed: 0,brand_name,brand_link,review_content,review_cleaned,sentences,sentences_cleaned,tokens
0,El Sol - Meat & Wine - Võ Thị Sáu,https://www.foody.vn/ho-chi-minh/el-sol-meat-w...,Quán steak hiếm hoi mà mình cực kì ưng ý từ lâ...,quán steak hiếm hoi mà mình cực kì ưng ý từ lâ...,[quán steak hiếm hoi mà mình cực kì ưng ý từ l...,[quán steak hiếm hoi mình cực kì ưng ý lâu nay...,"[[quán, steak, hiếm_hoi, mình, cực_kì, ưng_ý, ..."
1,Busan Korean Food - Món Hàn Quốc - Đinh Tiên H...,https://www.foody.vn/ho-chi-minh/busan-korean-...,Vị trí dễ tìm. Giữ xe rất nhiệt tình. Dắt xe v...,vị trí dễ tìm. giữ xe rất nhiệt tình. dắt xe v...,"[vị trí dễ tìm, giữ xe rất nhiệt tình, dắt xe ...","[vị trí dễ tìm, giữ xe nhiệt tình, dắt xe khác...","[[vị_trí, dễ, tìm], [giữ, xe, nhiệt_tình], [dắ..."
2,TocoToco Bubble Tea - Cộng Hòa,https://www.foody.vn/ho-chi-minh/tocotoco-bubb...,"Địa điểm quán dễ tìm. Không gian cũng rộng, có...",địa điểm quán dễ tìm. không gian cũng rộng có ...,"[địa điểm quán dễ tìm, không gian cũng rộng có...","[địa điểm quán dễ tìm, gian rộng lầu view đẹp ...","[[địa_điểm, quán, dễ, tìm], [gian, rộng, lầu, ..."
3,Le Castella Viet Nam - Bánh Bông Lan Đài Loan ...,https://www.foody.vn/ho-chi-minh/le-castella-v...,Thấy bánh này đang hot rần rần mình cũng gọi t...,thấy bánh này đang hot rần rần mình cũng gọi t...,[thấy bánh này đang hot rần rần mình cũng gọi ...,[thấy bánh hot rần rần mình gọi thử hộp bánh b...,"[[thấy, bánh, hot, rần_rần, mình, gọi, thử, hộ..."
4,Tabletop - Boardgame & Coffee,https://www.foody.vn/ho-chi-minh/tabletop-boar...,Mỗi lần nghĩ tới boardgame thì sẽ nghĩ tới vô ...,mỗi lần nghĩ tới boardgame thì sẽ nghĩ tới vô ...,[mỗi lần nghĩ tới boardgame thì sẽ nghĩ tới vô...,"[lần nghĩ tới boardgame nghĩ tới vô, khoái món...","[[lần, nghĩ, tới, boardgame, nghĩ, tới, vô], [..."


In [37]:
# Locations of the previously saved dictionary (text format) and 6-topic LDA model.
dict_file = "drive/My Drive/Master/JVN/2nd Semester/AdvMLDM/Project_AdvMLDM/dang/lda/dict.txt"
lda_6_topics = "drive/My Drive/Master/JVN/2nd Semester/AdvMLDM/Project_AdvMLDM/dang/lda/6_topics/lda_model"

# Reload both artifacts instead of retraining; the training code is kept in a
# commented-out cell further down.
dictionary = load_dict(dict_file)
lda_model_tfidf = load_lda_model(lda_6_topics)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [0]:
# all_sentences = []
# all_cleand_sentences = []
# count = 0
# for index, row in data.iterrows():
#   sentences = data.loc[index, sentences_col]
#   for sentence in sentences:
#     all_sentences.append(sentence)
  
#   cleaned_sentences = data.loc[index, sentences_cleaned_col]
#   for sentence in cleaned_sentences:
#     all_cleand_sentences.append(sentence)
# print("Number of sentences: {}".format(len(all_sentences)))

In [0]:
# Get documents
# n_docs = 948330
# documents = collect_tokens(data, n_docs)

# n_sampled_docs = 10
# print("Sample {} in {} docs".format(n_sampled_docs, len(documents)))
# doc_sampled_indices = npr.randint(0, high=len(documents), size=n_sampled_docs)
# for index in doc_sampled_indices:
#   print(documents[index])

# dictionary = Dictionary(documents)
# bow_corpus = [dictionary.doc2bow(doc) for doc in documents]
# tfidf_model = TfidfModel(bow_corpus)
# corpus_tfidf = tfidf_model[bow_corpus]

# # Run LDA model
# n_topics = 6
# lda_model_tfidf = LdaMulticore(corpus_tfidf, num_topics=n_topics, id2word=dictionary)

In [38]:
# Print every topic of the loaded model (-1 = all topics) with its 20
# highest-weighted words, to support naming the topics by hand.
for index, topic in lda_model_tfidf.print_topics(-1, num_words=20):
  print('Topic: {} Word: {}'.format(index, topic))

Topic: 0 Word: 0.034*"trà" + 0.026*"sữa" + 0.016*"ngon" + 0.015*"uống" + 0.015*"vị" + 0.012*"ngọt" + 0.011*"mình" + 0.010*"trân_châu" + 0.010*"kem" + 0.009*"thơm" + 0.008*"thích" + 0.008*"thạch" + 0.006*"lắm" + 0.006*"đá" + 0.006*"thấy" + 0.006*"quá" + 0.006*"béo" + 0.006*"ở" + 0.006*"hơi" + 0.006*"loại"
Topic: 1 Word: 0.024*"xôi" + 0.014*"ăn" + 0.013*"gà" + 0.011*"ngon" + 0.011*"thịt" + 0.008*"cơm" + 0.007*"nướng" + 0.007*"món" + 0.007*"k" + 0.006*"bò" + 0.006*"mình" + 0.005*"xiên" + 0.005*"sốt" + 0.005*"mì" + 0.005*"kèm" + 0.005*"gọi" + 0.005*"chiên" + 0.005*"bánh" + 0.005*"pizza" + 0.005*"cay"
Topic: 2 Word: 0.030*"nhân_viên" + 0.023*"phục_vụ" + 0.018*"nhiệt_tình" + 0.012*"nhanh" + 0.011*"thân_thiện" + 0.009*"khách" + 0.009*"quán" + 0.008*"nhanh_nhẹn" + 0.008*"mình" + 0.008*"quay" + 0.008*"đông" + 0.008*"lâu" + 0.007*"dễ_thương" + 0.007*"khá" + 0.006*"đồ" + 0.006*"tốt" + 0.006*"chủ" + 0.006*"xe" + 0.006*"vui_vẻ" + 0.006*"order"
Topic: 3 Word: 0.020*"gian" + 0.016*"quán" + 0.016*"đẹp

In [0]:
# save_dict(dictionary, dict_file)
# save_lda_model(lda_model_tfidf, lda_6_topics)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [0]:
# Hand-assigned labels for the six LDA topics (topic 4 has no label yet).
topic_mapping_6 = {
    0: "Đánh giá thức uống",
    1: "Đánh giá món ăn",
    2: "Phục vụ, nhân viên",
    3: "Không gian",
    4: "Topic 4",
    5: "Giá, chất lượng"
}

topic_col = "topic"
n_sampled_sentences = 1000

# Flatten the per-review sentence lists into two parallel lists: the original
# sentences and the same sentences after stop-word removal. They are built here
# because the only other definition in the notebook is a commented-out cell,
# which left this cell failing with NameError on a fresh kernel.
all_sentences = [sentence
                 for sentences in data[sentences_col]
                 for sentence in sentences]
all_cleand_sentences = [sentence
                        for sentences in data[sentences_cleaned_col]
                        for sentence in sentences]
# Position i must refer to the same sentence in both lists.
assert len(all_sentences) == len(all_cleand_sentences)

# Sample sentence positions uniformly at random, with replacement.
# NOTE(review): no random seed is set, so the sample differs between runs.
sentence_sampled_indices = npr.randint(0, high=len(all_sentences), size=n_sampled_sentences)

# Collect plain records and build the frame once at the end; appending to a
# DataFrame row by row is quadratic and DataFrame.append no longer exists in
# pandas 2.x.
sampled_rows = []
for index in sentence_sampled_indices:
  # Sentences that are empty after stop-word removal keep the "EMPTY" label.
  topic = "EMPTY"
  sentence_cleaned = all_cleand_sentences[index]
  if sentence_cleaned != "":
    tokens = tokenizer.tokenize(sentence_cleaned)
    topic_dist = lda_model_tfidf.get_document_topics(dictionary.doc2bow(tokens))
    # Guard against an empty distribution, which rank_topic cannot handle.
    if topic_dist:
      topic_id = rank_topic(topic_dist)
      topic = get_topic(topic_mapping_6, topic_id)
  sampled_rows.append({review_col: all_sentences[index], topic_col: topic})

review_df = pd.DataFrame(sampled_rows, columns=[review_col, topic_col])
# Written without header or index, ";"-separated: <original sentence>;<topic label>
review_df.to_csv("/content/drive/My Drive/Master/2nd Semester/AdvMLDM/Project_AdvMLDM/dang/lda/review_aspect.csv"
                 , sep=';', header=False, index=False)