## Using Part of Speech to Extract Meaningful Keywords

## Setup

In [None]:
!pip install stanza
!pip install -U tqdm
!pip install translators --upgrade
!pip install google-cloud-language==2.4.2

In [None]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
import sys
sys.path.append('/content/gdrive/My Drive/nlp')

In [None]:
# for progress bars
from tqdm import tqdm

tqdm.pandas()

In [None]:
# For saving new sheet on same excel file
import os

def save_excel_sheet(df, filepath, sheetname, index=False):
    """Write ``df`` into the sheet ``sheetname`` of the Excel workbook at ``filepath``.

    Args:
        df: object providing ``to_excel`` (a pandas DataFrame in this notebook).
        filepath: path of the workbook to create or extend.
        sheetname: name of the sheet to write.
        index: forwarded to ``to_excel``; when False the index is not written.
    """
    if os.path.exists(filepath):
        # The workbook already exists: open it in append mode and replace any
        # sheet that already carries the same name.
        append_options = dict(engine='openpyxl', if_sheet_exists='replace', mode='a')
        with pd.ExcelWriter(filepath, **append_options) as workbook:
            df.to_excel(workbook, sheet_name=sheetname, index=index)
        return

    # No workbook yet: writing straight to the path creates it.
    df.to_excel(filepath, sheet_name=sheetname, index=index)

## Load Preprocessed Data

In [None]:
import pandas as pd

df = pd.read_excel('/content/gdrive/MyDrive/data/clean/attractions_reviews_cleaned.xlsx')

df

Unnamed: 0,user,date,rating,review,page_url,date_scraped,source,attraction
0,desuka Panjaitan,2021,10,"Belum Pernah Masuk kedalam , hanya diluar saja...",https://www.google.com/search?q=universal+stud...,2022-04-27,google_reviews,uss
1,Riska Septi Damayanti,2021,10,Sumpahh kalau ke Singapore wajib banget ke sin...,https://www.google.com/search?q=universal+stud...,2022-04-27,google_reviews,uss
2,Mochamad Naufal Irfansyah,2021,10,"Seru bisa jajan coklat enak dan murah , kalau ...",https://www.google.com/search?q=universal+stud...,2022-04-27,google_reviews,uss
3,T1y25,2022,10,"Luas banget , banyak spot fotonya , wahananya ...",https://www.google.com/search?q=universal+stud...,2022-04-27,google_reviews,uss
4,Rengga Utami,2021,10,Wahana Sangat Lengkap Pokoknya Tempatnya Luar ...,https://www.google.com/search?q=universal+stud...,2022-04-27,google_reviews,uss
...,...,...,...,...,...,...,...,...
4809,Brandon,2018,10,Harga murah dan cocok untuk berkunjung bersama...,https://www.klook.com/id/activity/120-adventur...,2022-06-08,klook,adventure_cove
4810,Alva,2018,10,sangat menyenangkan . berbeda dengan waterpark...,https://www.klook.com/id/activity/120-adventur...,2022-06-08,klook,adventure_cove
4811,Hariyanto,2018,10,"Tempatnya sangat bagus , dan permainannya sang...",https://www.klook.com/id/activity/120-adventur...,2022-06-08,klook,adventure_cove
4812,sandra,2018,10,"Bagus permainan airnya , anak saya suka sekali...",https://www.klook.com/id/activity/120-adventur...,2022-06-08,klook,adventure_cove


In [None]:
# drop columns
df.drop(['user', 'page_url', 'date_scraped'], axis = 1, inplace = True)

In [None]:
# insert review_id column
df.insert(0, "review_id", range(0, len(df)))

In [None]:
df

Unnamed: 0,review_id,date,rating,review,source,attraction
0,0,2021,10,"Belum Pernah Masuk kedalam , hanya diluar saja...",google_reviews,uss
1,1,2021,10,Sumpahh kalau ke Singapore wajib banget ke sin...,google_reviews,uss
2,2,2021,10,"Seru bisa jajan coklat enak dan murah , kalau ...",google_reviews,uss
3,3,2022,10,"Luas banget , banyak spot fotonya , wahananya ...",google_reviews,uss
4,4,2021,10,Wahana Sangat Lengkap Pokoknya Tempatnya Luar ...,google_reviews,uss
...,...,...,...,...,...,...
4809,4809,2018,10,Harga murah dan cocok untuk berkunjung bersama...,klook,adventure_cove
4810,4810,2018,10,sangat menyenangkan . berbeda dengan waterpark...,klook,adventure_cove
4811,4811,2018,10,"Tempatnya sangat bagus , dan permainannya sang...",klook,adventure_cove
4812,4812,2018,10,"Bagus permainan airnya , anak saya suka sekali...",klook,adventure_cove


## Part of Speech Distribution

ADJ: adjective

ADP: adposition

ADV: adverb

AUX: auxiliary

CCONJ: coordinating conjunction

DET: determiner

INTJ: interjection

NOUN: noun

NUM: numeral

PART: particle

PRON: pronoun

PROPN: proper noun

PUNCT: punctuation

SCONJ: subordinating conjunction

SYM: symbol

VERB: verb

X: other

*Source: https://universaldependencies.org/u/pos/*

## Download Stanza Library

https://stanfordnlp.github.io/stanza/pos.html

In [None]:
# this cell takes approximately 2 mins to run
import stanza

stanza.download('id')       # This downloads the Indonesian models for the neural pipeline
nlp = stanza.Pipeline('id', processors='tokenize,pos', verbose=False, use_gpu=False) # This sets up a default neural pipeline in Indonesian

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.4.0.json:   0%|   …

2022-06-26 03:35:37 INFO: Downloading default packages for language: id (Indonesian)...


Downloading https://huggingface.co/stanfordnlp/stanza-id/resolve/v1.4.0/models/default.zip:   0%|          | 0…

2022-06-26 03:35:41 INFO: Finished downloading models and saved to /root/stanza_resources.


## Sample of how Stanza works on One Sentence

In [None]:
id_doc = nlp("Seruuu bangett😄😄 tapi Sayang pas kesitu Baru Berapaan Kali main hujan gede😟.")

id_doc

[
  [
    {
      "id": 1,
      "text": "Seruuu",
      "upos": "PROPN",
      "xpos": "X--",
      "start_char": 0,
      "end_char": 6
    },
    {
      "id": 2,
      "text": "bangett😄😄",
      "upos": "PROPN",
      "xpos": "X--",
      "start_char": 7,
      "end_char": 16
    },
    {
      "id": 3,
      "text": "tapi",
      "upos": "CCONJ",
      "xpos": "S--",
      "start_char": 17,
      "end_char": 21
    },
    {
      "id": 4,
      "text": "Sayang",
      "upos": "PROPN",
      "xpos": "NSD",
      "start_char": 22,
      "end_char": 28
    },
    {
      "id": 5,
      "text": "pas",
      "upos": "VERB",
      "xpos": "VSA",
      "feats": "Mood=Ind",
      "start_char": 29,
      "end_char": 32
    },
    {
      "id": 6,
      "text": "kesitu",
      "upos": "VERB",
      "xpos": "X--",
      "start_char": 33,
      "end_char": 39
    },
    {
      "id": 7,
      "text": "Baru",
      "upos": "PROPN",
      "xpos": "ASP",
      "start_char": 40,
      "end_char":

## Create and Map Functions to DF

In [None]:
# annotate review
def doc(review):
  """Run the notebook's global Stanza pipeline ``nlp`` on one review.

  Args:
    review: the review text to annotate.

  Returns:
    Whatever ``nlp(review)`` returns (the annotated document).
  """
  annotated = nlp(review)
  return annotated

In [None]:
# count the number of words in each review
def word_count(doc):
    """Return the total number of words in an annotated review.

    The count is summed over every sentence in ``doc.sentences``. The earlier
    version returned from inside the loop, so it reported only the first
    sentence's length and gave ``None`` for a document with no sentences.

    Args:
        doc: annotated document exposing ``sentences``, where each sentence
            exposes ``words``.

    Returns:
        int: number of words across all sentences (0 when there are none).
    """
    return sum(len(sent.words) for sent in doc.sentences)

In [None]:
# extract parts of speech
def extract_pos(doc):
    """Flatten an annotated review into ``(lower-cased token, POS tag)`` tuples.

    Args:
        doc: annotated document exposing ``sentences``; each sentence exposes
            ``words`` and each word exposes ``text`` and ``pos``.

    Returns:
        list of ``(text.lower(), pos)`` tuples in document order.
    """
    return [
        (word.text.lower(), word.pos)
        for sentence in doc.sentences
        for word in sentence.words
    ]

In [None]:
# this cell takes approximately 5 mins to run
# apply doc function to review column in df
df['nlp'] = df['review'].progress_map(doc)

100%|██████████| 4814/4814 [05:26<00:00, 14.73it/s]


In [None]:
# apply extract_pos function to nlp column in df
df['pos'] = df['nlp'].progress_map(extract_pos)

100%|██████████| 4814/4814 [00:00<00:00, 50919.28it/s]


In [None]:
# apply word_count function to nlp column in df
df['word_count'] = df['nlp'].progress_map(word_count)

100%|██████████| 4814/4814 [00:00<00:00, 265973.52it/s]


In [None]:
# rearrange columns; .copy() makes df an independent DataFrame rather than a
# slice of the original, so adding columns to it later does not trigger
# pandas' SettingWithCopyWarning
df = df[['review_id', 'date', 'rating', 'source', 'attraction', 'word_count', 'review', 'nlp', 'pos']].copy()

In [None]:
df.head()

Unnamed: 0,review_id,date,rating,source,attraction,word_count,review,nlp,pos
0,0,2021,10,google_reviews,uss,33,"Belum Pernah Masuk kedalam , hanya diluar saja...","[\n [\n {\n ""id"": 1,\n ""text"": ""...","[(belum, PART), (pernah, ADV), (masuk, VERB), ..."
1,1,2021,10,google_reviews,uss,51,Sumpahh kalau ke Singapore wajib banget ke sin...,"[\n [\n {\n ""id"": 1,\n ""text"": ""...","[(sumpahh, PROPN), (kalau, SCONJ), (ke, ADP), ..."
2,2,2021,10,google_reviews,uss,24,"Seru bisa jajan coklat enak dan murah , kalau ...","[\n [\n {\n ""id"": 1,\n ""text"": ""...","[(seru, PROPN), (bisa, AUX), (jajan, VERB), (c..."
3,3,2022,10,google_reviews,uss,11,"Luas banget , banyak spot fotonya , wahananya ...","[\n [\n {\n ""id"": 1,\n ""text"": ""...","[(luas, NOUN), (banget, X), (,, PUNCT), (banya..."
4,4,2021,10,google_reviews,uss,18,Wahana Sangat Lengkap Pokoknya Tempatnya Luar ...,"[\n [\n {\n ""id"": 1,\n ""text"": ""...","[(wahana, PROPN), (sangat, ADV), (lengkap, ADJ..."


In [None]:
df.to_excel('/content/gdrive/MyDrive/data/output/attractions_stanfordnlp_v13.xlsx', sheet_name='pos', index=False)

## Extract Each Token and POS Tag into Its Own Row

In [None]:
df_pos = df[['review_id', 'date', 'attraction', 'pos']].explode('pos')
df_pos.reset_index(drop=True, inplace = True)
df_pos.head()

Unnamed: 0,review_id,date,attraction,pos
0,0,2021,uss,"(belum, PART)"
1,0,2021,uss,"(pernah, ADV)"
2,0,2021,uss,"(masuk, VERB)"
3,0,2021,uss,"(kedalam, ADP)"
4,0,2021,uss,"(,, PUNCT)"


In [None]:
# split every (token, pos_tag) tuple of the pos column into two parallel lists
token = [pair[0] for pair in df_pos['pos']]
pos_tag = [pair[1] for pair in df_pos['pos']]

In [None]:
df_pos['token'] = token
df_pos['pos_tag'] = pos_tag

df_pos.head()

Unnamed: 0,review_id,date,attraction,pos,token,pos_tag
0,0,2021,uss,"(belum, PART)",belum,PART
1,0,2021,uss,"(pernah, ADV)",pernah,ADV
2,0,2021,uss,"(masuk, VERB)",masuk,VERB
3,0,2021,uss,"(kedalam, ADP)",kedalam,ADP
4,0,2021,uss,"(,, PUNCT)",",",PUNCT


In [None]:
# this cell takes approximately 18 sec to run
save_excel_sheet(df_pos, '/content/gdrive/MyDrive/data/output/attractions_stanfordnlp_v13.xlsx', 'breakdown', index=False)

## Get Bigrams

In [None]:
def get_bigram(pos_1, pos_2, pos_lists=None):
    """Collect, per review, every pair of adjacent tokens tagged (pos_1, pos_2).

    Args:
        pos_1: POS tag required on the first token of the pair.
        pos_2: POS tag required on the second token of the pair.
        pos_lists: optional iterable holding one list of ``(token, pos_tag)``
            tuples per review. Defaults to the ``pos`` column of the global
            ``df``.

    Returns:
        list with one entry per review; each entry is the list of matching
        bigrams formatted as ``"token1 token2"`` (empty when nothing matches).
    """
    if pos_lists is None:
        pos_lists = df['pos']

    all_bigram = []

    for tagged in pos_lists:
        # zip(tagged, tagged[1:]) pairs each token only with its successor.
        # An index loop starting at i = 0 would read tagged[-1] as the
        # "previous" token and could emit a spurious (last, first) bigram.
        bigram = [
            first[0] + " " + second[0]
            for first, second in zip(tagged, tagged[1:])
            if first[1] == pos_1 and second[1] == pos_2
        ]
        all_bigram.append(bigram)

    return all_bigram

In [None]:
bi_adv_adj = get_bigram('ADV', 'ADJ')

In [None]:
df['bi_adv_adj'] = pd.Series(bi_adv_adj)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [None]:
df.head()

Unnamed: 0,review_id,date,rating,source,attraction,word_count,review,nlp,pos,bi_adv_adj
0,0,2021,10,google_reviews,uss,33,"Belum Pernah Masuk kedalam , hanya diluar saja...","[\n [\n {\n ""id"": 1,\n ""text"": ""...","[(belum, PART), (pernah, ADV), (masuk, VERB), ...",[]
1,1,2021,10,google_reviews,uss,51,Sumpahh kalau ke Singapore wajib banget ke sin...,"[\n [\n {\n ""id"": 1,\n ""text"": ""...","[(sumpahh, PROPN), (kalau, SCONJ), (ke, ADP), ...",[]
2,2,2021,10,google_reviews,uss,24,"Seru bisa jajan coklat enak dan murah , kalau ...","[\n [\n {\n ""id"": 1,\n ""text"": ""...","[(seru, PROPN), (bisa, AUX), (jajan, VERB), (c...",[]
3,3,2022,10,google_reviews,uss,11,"Luas banget , banyak spot fotonya , wahananya ...","[\n [\n {\n ""id"": 1,\n ""text"": ""...","[(luas, NOUN), (banget, X), (,, PUNCT), (banya...",[]
4,4,2021,10,google_reviews,uss,18,Wahana Sangat Lengkap Pokoknya Tempatnya Luar ...,"[\n [\n {\n ""id"": 1,\n ""text"": ""...","[(wahana, PROPN), (sangat, ADV), (lengkap, ADJ...",[sangat lengkap]


## Get Trigrams

In [None]:
def get_trigram(pos_1, pos_2, pos_3, pos_lists=None):
    """Collect, per review, every run of three adjacent tokens tagged (pos_1, pos_2, pos_3).

    Args:
        pos_1: POS tag required on the first token.
        pos_2: POS tag required on the second token.
        pos_3: POS tag required on the third token.
        pos_lists: optional iterable holding one list of ``(token, pos_tag)``
            tuples per review. Defaults to the ``pos`` column of the global
            ``df``.

    Returns:
        list with one entry per review; each entry is the list of matching
        trigrams formatted as ``"token1 token2 token3"`` (empty when nothing
        matches or the review has fewer than three tokens).
    """
    if pos_lists is None:
        pos_lists = df['pos']

    all_trigram = []

    for tagged in pos_lists:
        # Sliding window over genuinely consecutive tokens. An index loop
        # starting at i = 0 would read tagged[-2] / tagged[-1] and could build
        # trigrams that wrap from the end of the review back to its start.
        trigram = [
            first[0] + " " + second[0] + " " + third[0]
            for first, second, third in zip(tagged, tagged[1:], tagged[2:])
            if first[1] == pos_1 and second[1] == pos_2 and third[1] == pos_3
        ]
        all_trigram.append(trigram)

    return all_trigram

In [None]:
tri_adv_adj_noun = get_trigram('ADV', 'ADJ', 'NOUN')
tri_noun_adv_adj = get_trigram('NOUN', 'ADV', 'ADJ')

# Merge the two pattern results review by review. Both lists hold one entry
# per review, so a plain `+` would append the second list after the first and
# produce twice as many entries as there are reviews; the second half would
# then be dropped when the result is aligned with df's index.
trigram = [
    adv_adj_noun + noun_adv_adj
    for adv_adj_noun, noun_adv_adj in zip(tri_adv_adj_noun, tri_noun_adv_adj)
]

In [None]:
df['tri_adv_adj_noun'] = pd.Series(trigram)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [None]:
df.head()

Unnamed: 0,review_id,date,rating,source,attraction,word_count,review,nlp,pos,bi_adv_adj,tri_adv_adj_noun
0,0,2021,10,google_reviews,uss,33,"Belum Pernah Masuk kedalam , hanya diluar saja...","[\n [\n {\n ""id"": 1,\n ""text"": ""...","[(belum, PART), (pernah, ADV), (masuk, VERB), ...",[],[]
1,1,2021,10,google_reviews,uss,51,Sumpahh kalau ke Singapore wajib banget ke sin...,"[\n [\n {\n ""id"": 1,\n ""text"": ""...","[(sumpahh, PROPN), (kalau, SCONJ), (ke, ADP), ...",[],[]
2,2,2021,10,google_reviews,uss,24,"Seru bisa jajan coklat enak dan murah , kalau ...","[\n [\n {\n ""id"": 1,\n ""text"": ""...","[(seru, PROPN), (bisa, AUX), (jajan, VERB), (c...",[],[]
3,3,2022,10,google_reviews,uss,11,"Luas banget , banyak spot fotonya , wahananya ...","[\n [\n {\n ""id"": 1,\n ""text"": ""...","[(luas, NOUN), (banget, X), (,, PUNCT), (banya...",[],[]
4,4,2021,10,google_reviews,uss,18,Wahana Sangat Lengkap Pokoknya Tempatnya Luar ...,"[\n [\n {\n ""id"": 1,\n ""text"": ""...","[(wahana, PROPN), (sangat, ADV), (lengkap, ADJ...",[sangat lengkap],[sangat lengkap pokok]


In [None]:
# this cell takes approximately 30 sec to run
save_excel_sheet(df, '/content/gdrive/MyDrive/data/output/attractions_stanfordnlp_v13.xlsx', 'ngram', index=False)

## Create DF with one ngram per row

In [None]:
# separate bigrams into one per row
df1 = df[['review_id', 'date', 'rating', 'attraction', 'bi_adv_adj']].explode('bi_adv_adj')

# drop rows which are empty
df1 = df1.dropna()

# insert column to indicate these are bigrams
df1.insert(1, 'ngram', 'bigram')

# rename column to indo
df1 = df1.rename(columns={"bi_adv_adj": "indo"})

In [None]:
# separate trigrams into one per row
df2 = df[['review_id', 'date', 'rating', 'attraction', 'tri_adv_adj_noun']].explode('tri_adv_adj_noun')

# drop rows which are empty
df2 = df2.dropna()

# insert column to indicate these are trigrams
df2.insert(1, 'ngram', 'trigram')

# rename column to indo
df2 = df2.rename(columns={"tri_adv_adj_noun": "indo"})

In [None]:
# combine both df
df_sentiment = pd.concat([df1, df2], ignore_index=True)

In [None]:
df_sentiment

Unnamed: 0,review_id,ngram,date,rating,attraction,indo
0,4,bigram,2021,10,uss,sangat lengkap
1,8,bigram,2020,10,uss,sangat nyaman
2,12,bigram,2019,10,uss,cukup mahal
3,12,bigram,2019,10,uss,lebih baik
4,19,bigram,2020,10,uss,terlalu besar
...,...,...,...,...,...,...
2396,4718,trigram,2019,10,skyline_luge,semakin siang antrian
2397,4730,trigram,2019,9,wings_of_time,kurang lama durasi
2398,4767,trigram,2018,8,skyline_luge,terlalu terik matahari
2399,4804,trigram,2019,10,adventure_cove,paling spesial wahana


## Google Cloud Natural Language API for Sentiment Analysis

Installation:
https://cloud.google.com/natural-language/docs/reference/libraries#create-service-account-console

In [None]:
import time

In [None]:
import os
os.environ["GOOGLE_APPLICATION_CREDENTIALS"]="/content/gdrive/MyDrive/data/sdcsentiment-353012-947babb67df7.json"

# Imports the Google Cloud client library
from google.cloud import language_v1

# Instantiates a client
client = language_v1.LanguageServiceClient()

In [None]:
def get_sentiment_gcp(text):
  """Label ``text`` as 'positive', 'neutral' or 'negative' via ``client.analyze_sentiment``.

  The text is sent as a plain-text document with language 'id'. The returned
  document sentiment score is mapped as: score >= 0.5 -> 'positive',
  0 <= score < 0.5 -> 'neutral', anything else -> 'negative'.

  Args:
    text: the phrase to analyse.

  Returns:
    str: one of 'positive', 'neutral', 'negative'.
  """
  request = {
    "document": language_v1.Document(
      content=text, type_=language_v1.Document.Type.PLAIN_TEXT, language='id'
    )
  }
  score = client.analyze_sentiment(request=request).document_sentiment.score

  # Start from the fallback label and upgrade it as the score clears each threshold.
  output = 'negative'
  if score >= 0:
    output = 'neutral'
  if score >= 0.5:
    output = 'positive'

  # Fixed 0.1 s pause after every call (presumably to throttle API requests).
  time.sleep(0.1)
  return output

In [None]:
# Sample Google Cloud Platform Sentiment Analysis API on a phrase
# get_sentiment_gcp('sangat lengkap')

'positive'

In [None]:
# this cell takes approx 6 min to run for 2405 rows with a delay of 0.1 sec per row
df_sentiment["sentiment"] = df_sentiment['indo'].progress_apply(lambda x: get_sentiment_gcp(x))

100%|██████████| 2401/2401 [05:33<00:00,  7.20it/s]


In [None]:
df_sentiment.head()

Unnamed: 0,review_id,ngram,date,rating,attraction,indo,sentiment
0,4,bigram,2021,10,uss,sangat lengkap,positive
1,8,bigram,2020,10,uss,sangat nyaman,positive
2,12,bigram,2019,10,uss,cukup mahal,negative
3,12,bigram,2019,10,uss,lebih baik,positive
4,19,bigram,2020,10,uss,terlalu besar,negative


In [None]:
# this cell takes approx 1 min to run
save_excel_sheet(df_sentiment, '/content/gdrive/MyDrive/data/output/attractions_stanfordnlp_v13.xlsx', 'sentiment', index=False)

## Google Translate Ngrams to English

In [None]:
import translators as ts

Using United States server backend.


In [None]:
ts._google.language_map

In [None]:
# this cell takes approx 38 min to run for 2405 rows
df_sentiment['en'] = df_sentiment['indo'].progress_apply(lambda x: ts.google(x, from_language='id', to_language='en').lower())

100%|██████████| 2401/2401 [37:23<00:00,  1.07it/s]


In [None]:
df_sentiment = df_sentiment[['review_id', 'ngram',	'date',	'attraction', 'rating',	'sentiment', 'indo', 'en']]

In [None]:
df_sentiment.head()

Unnamed: 0,review_id,ngram,date,attraction,rating,sentiment,indo,en
0,4,bigram,2021,uss,10,positive,sangat lengkap,very complete
1,8,bigram,2020,uss,10,positive,sangat nyaman,very comfortable
2,12,bigram,2019,uss,10,negative,cukup mahal,quite expensive
3,12,bigram,2019,uss,10,positive,lebih baik,better
4,19,bigram,2020,uss,10,negative,terlalu besar,too big


In [None]:
# this cell takes approx 1 min to run
save_excel_sheet(df_sentiment, '/content/gdrive/MyDrive/data/output/attractions_stanfordnlp_v13.xlsx', 'sentiment_en', index=False)

## Link Ngrams back to Indonesian Reviews

In [None]:
# this is what our previous df looks like
df

Unnamed: 0,review_id,date,rating,source,attraction,word_count,review,nlp,pos
0,0,2021,10,google_reviews,uss,33,"Belum Pernah Masuk kedalam , hanya diluar saja...","[\n [\n {\n ""id"": 1,\n ""text"": ""...","[('belum', 'PART'), ('pernah', 'ADV'), ('masuk..."
1,1,2021,10,google_reviews,uss,51,Sumpahh kalau ke Singapore wajib banget ke sin...,"[\n [\n {\n ""id"": 1,\n ""text"": ""...","[('sumpahh', 'PROPN'), ('kalau', 'SCONJ'), ('k..."
2,2,2021,10,google_reviews,uss,24,"Seru bisa jajan coklat enak dan murah , kalau ...","[\n [\n {\n ""id"": 1,\n ""text"": ""...","[('seru', 'PROPN'), ('bisa', 'AUX'), ('jajan',..."
3,3,2022,10,google_reviews,uss,11,"Luas banget , banyak spot fotonya , wahananya ...","[\n [\n {\n ""id"": 1,\n ""text"": ""...","[('luas', 'NOUN'), ('banget', 'X'), (',', 'PUN..."
4,4,2021,10,google_reviews,uss,18,Wahana Sangat Lengkap Pokoknya Tempatnya Luar ...,"[\n [\n {\n ""id"": 1,\n ""text"": ""...","[('wahana', 'PROPN'), ('sangat', 'ADV'), ('len..."
...,...,...,...,...,...,...,...,...,...
4809,4809,2018,10,klook,adventure_cove,24,Harga murah dan cocok untuk berkunjung bersama...,"[\n [\n {\n ""id"": 1,\n ""text"": ""...","[('harga', 'NOUN'), ('murah', 'ADJ'), ('dan', ..."
4810,4810,2018,10,klook,adventure_cove,9,sangat menyenangkan . berbeda dengan waterpark...,"[\n [\n {\n ""id"": 1,\n ""text"": ""...","[('sangat', 'ADV'), ('menyenangkan', 'VERB'), ..."
4811,4811,2018,10,klook,adventure_cove,15,"Tempatnya sangat bagus , dan permainannya sang...","[\n [\n {\n ""id"": [\n 1,\n ...","[('tempat', 'NOUN'), ('nya', 'PRON'), ('sangat..."
4812,4812,2018,10,klook,adventure_cove,15,"Bagus permainan airnya , anak saya suka sekali...","[\n [\n {\n ""id"": 1,\n ""text"": ""...","[('bagus', 'ADJ'), ('permainan', 'NOUN'), ('ai..."


In [None]:
# use left join to merge df_sentiment with df by review_id
df_sentiment = df_sentiment.merge(df[['review_id', 'review', 'word_count']], on='review_id', how='left')
df_sentiment

Unnamed: 0,review_id,ngram,date,attraction,rating,sentiment,indo,en,review,word_count
0,4,bigram,2021,uss,10,positive,sangat lengkap,very complete,Wahana Sangat Lengkap Pokoknya Tempatnya Luar ...,18
1,8,bigram,2020,uss,10,positive,sangat nyaman,very comfortable,"Tempat luas dengan berbagai tema , atraksi dan...",56
2,12,bigram,2019,uss,10,negative,cukup mahal,quite expensive,"Good place.. harganya emang cukup mahal , tapi...",4
3,12,bigram,2019,uss,10,positive,lebih baik,better,"Good place.. harganya emang cukup mahal , tapi...",4
4,19,bigram,2020,uss,10,negative,terlalu besar,too big,Ga terlalu besar sebenarnya theme park ini tap...,28
...,...,...,...,...,...,...,...,...,...,...
2396,4718,trigram,2019,skyline_luge,10,positive,semakin siang antrian,in the afternoon the queue,Lebih mudah dengan pembelian di traveloka . Ji...,27
2397,4730,trigram,2019,wings_of_time,9,negative,kurang lama durasi,less than long duration,"Best place and best perform , hanya terasa kur...",12
2398,4767,trigram,2018,skyline_luge,8,negative,terlalu terik matahari,too hot the sun,Kalau naik hanya sekali pasti kurang ! . Kami ...,140
2399,4804,trigram,2019,adventure_cove,10,positive,paling spesial wahana,the most special vehicle,"Sangat menyenangkan , banyak wahana menarik , ...",29


In [None]:
# this cell takes approx 1 min to run
save_excel_sheet(df_sentiment, '/content/gdrive/MyDrive/data/output/attractions_stanfordnlp_v13.xlsx', 'ngram_review', index=False)

## Google Translate Reviews to English

In [None]:
import translators as ts

Using United States server backend.


In [None]:
ts._google.language_map

In [None]:
# this cell takes approx 51 min to run for 2405 rows
df_sentiment['review_en'] = df_sentiment['review'].progress_apply(lambda x: ts.google(x, from_language='id', to_language='en'))

100%|██████████| 2401/2401 [46:10<00:00,  1.15s/it]


In [None]:
df_sentiment = df_sentiment[['ngram',	'date',	'attraction',	'rating',	'sentiment',	'indo',	'en',	'review',	'review_en', 'review_id',	'word_count']]

In [None]:
df_sentiment

Unnamed: 0,ngram,date,attraction,rating,sentiment,indo,en,review,review_en,review_id,word_count
0,bigram,2021,uss,10,positive,sangat lengkap,very complete,Wahana Sangat Lengkap Pokoknya Tempatnya Luar ...,"The vehicle is very complete, anyway, the plac...",4,18
1,bigram,2020,uss,10,positive,sangat nyaman,very comfortable,"Tempat luas dengan berbagai tema , atraksi dan...","A wide place with various themes, attractions ...",8,56
2,bigram,2019,uss,10,negative,cukup mahal,quite expensive,"Good place.. harganya emang cukup mahal , tapi...",Good place ... the price is indeed quite expen...,12,4
3,bigram,2019,uss,10,positive,lebih baik,better,"Good place.. harganya emang cukup mahal , tapi...",Good place ... the price is indeed quite expen...,12,4
4,bigram,2020,uss,10,negative,terlalu besar,too big,Ga terlalu besar sebenarnya theme park ini tap...,Not too big actually this theme park but the q...,19,28
...,...,...,...,...,...,...,...,...,...,...,...
2396,trigram,2019,skyline_luge,10,positive,semakin siang antrian,in the afternoon the queue,Lebih mudah dengan pembelian di traveloka . Ji...,It's easier with purchases at Traveloka. If th...,4718,27
2397,trigram,2019,wings_of_time,9,negative,kurang lama durasi,less than long duration,"Best place and best perform , hanya terasa kur...","Best Place and Best Perform, only feels less d...",4730,12
2398,trigram,2018,skyline_luge,8,negative,terlalu terik matahari,too hot the sun,Kalau naik hanya sekali pasti kurang ! . Kami ...,"If it goes up only once, it will be less! . We...",4767,140
2399,trigram,2019,adventure_cove,10,positive,paling spesial wahana,the most special vehicle,"Sangat menyenangkan , banyak wahana menarik , ...","Very fun, many interesting rides, clean places...",4804,29


In [None]:
# this cell takes approx 1 min to run
save_excel_sheet(df_sentiment, '/content/gdrive/MyDrive/data/output/attractions_stanfordnlp_v13.xlsx', 'ngram_review_en', index=False)

## Visualisation in Tableau
https://public.tableau.com/app/profile/odelia.tan/viz/attractions_master_pos_v3_extract/Presentation