# Reviews Data Processing

written by: Muhammad Angga Muttaqien | muha.muttaqien@gmail.com

## Data Preparation

In [2]:
import os
import re, string, unicodedata
import nltk
import Sastrawi
import contractions
import inflect
import itertools
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from bs4 import BeautifulSoup
from nltk import word_tokenize, sent_tokenize
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer, WordNetLemmatizer

# for processing indonesian text
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory

%matplotlib inline

In [3]:
# my own stemmer
from stemmer import IndonesianStemmer
from stemmer import EnglishStemmer

#### XML Processing

In [4]:
import xml.etree.ElementTree as et

In [5]:
tree = et.parse('./datasets/training_set.xml')
root = tree.getroot()

In [6]:
reviews_corpus = []
labels_corpus = []
text_corpus = []

# grab all XML contents
for review in root.findall('review'):
    rid = review.get('rid')
    text = review.find('text').text 
    
    label = ""
    for aspects in review.findall('aspects'):
        id = aspects.get('id')
        
        food, price, service, ambience = 0, 0, 0, 0
        if id == '0':
            label = (food, price, service, ambience)
            
            for aspect in aspects.findall('aspect'):
                category = aspect.get('category')
                polarity = aspect.get('polarity')
                
                if category == "FOOD":
                    if polarity == 'POSITIVE': food = 1
                    else: food = -1
                elif category == "PRICE":
                    if polarity == 'POSITIVE': price = 1
                    else: price = -1
                elif category == "SERVICE":
                    if polarity == 'POSITIVE': service = 1
                    else: service = -1
                elif category == "AMBIENCE":
                    if polarity == 'POSITIVE': ambience = 1
                    else: ambience = -1

                label = (food, price, service, ambience)
                
            labels_corpus.append(label)
        
            
    text_corpus.append(text)

In [7]:
len(labels_corpus), len(text_corpus)

(3865, 3865)

In [8]:
labels_corpus = labels_corpus[0:]
text_corpus = text_corpus[0:]

In [9]:
def display_reviews(corpus):
    for id, content in enumerate(corpus):
        print("{}) {}\n".format(id+1, content))

In [10]:
display_reviews(labels_corpus)

1) (1, 0, 0, 1)

2) (1, 0, 0, 1)

3) (1, 0, 0, 0)

4) (1, 0, 0, 0)

5) (1, -1, 0, 1)

6) (1, 1, 0, 0)

7) (1, 0, 0, 0)

8) (1, 0, -1, 1)

9) (1, 0, 0, 1)

10) (1, 0, 0, 0)

11) (0, 1, 0, 1)

12) (1, 0, 0, 1)

13) (0, 0, 0, 1)

14) (1, 1, 1, 1)

15) (1, 0, 0, -1)

16) (0, 0, 0, 0)

17) (0, 0, 0, 1)

18) (1, 0, 0, -1)

19) (1, 0, 1, -1)

20) (1, -1, 1, 0)

21) (1, -1, 1, 1)

22) (1, 0, 1, 1)

23) (1, 1, 0, -1)

24) (1, 0, 0, 0)

25) (0, 0, -1, -1)

26) (1, 1, 0, 1)

27) (1, 0, -1, 1)

28) (1, -1, 0, 0)

29) (1, 1, 0, 0)

30) (1, 1, 1, 1)

31) (1, -1, -1, -1)

32) (1, 0, 0, 0)

33) (1, 0, 0, 1)

34) (1, 1, 0, 0)

35) (1, 0, 0, 0)

36) (1, 0, 1, 1)

37) (1, 0, 0, 1)

38) (1, 1, 0, 0)

39) (-1, -1, 0, 0)

40) (0, 0, 0, 0)

41) (1, 0, 0, 1)

42) (1, 0, 0, 1)

43) (-1, -1, 0, 0)

44) (1, 0, 0, 1)

45) (1, 0, 0, 0)

46) (1, 0, 0, 0)

47) (-1, 1, 1, 1)

48) (1, 1, 1, 0)

49) (1, 1, 0, 1)

50) (1, 0, 1, 0)

51) (1, 0, 0, 0)

52) (1, 1, 1, 1)

53) (1, 1, 0, 1)

54) (1, 0, 0, 0)

55) (1, 0, 0, 1)


561) (1, 0, 0, 0)

562) (1, 0, 0, 0)

563) (1, 0, 1, 1)

564) (1, 0, 0, 0)

565) (0, 0, 0, 1)

566) (0, 0, 0, 0)

567) (1, 0, 0, 0)

568) (1, 0, 0, 0)

569) (1, 0, 0, 0)

570) (1, 0, 0, 1)

571) (1, 0, 0, 1)

572) (1, 0, 0, 0)

573) (1, 0, 1, 0)

574) (1, 0, 0, 0)

575) (-1, -1, 1, 0)

576) (1, 0, 0, -1)

577) (1, 1, 0, 1)

578) (1, 0, 0, 0)

579) (-1, 0, -1, 0)

580) (1, 0, 0, 0)

581) (1, 0, 1, 1)

582) (1, 1, 0, 0)

583) (1, 1, 1, 1)

584) (1, 0, 1, 1)

585) (1, 1, 0, 0)

586) (1, 0, 1, 0)

587) (1, 0, 1, 1)

588) (1, 1, 1, 0)

589) (1, 1, 0, 0)

590) (1, 0, 0, 1)

591) (1, 0, 0, 0)

592) (1, 0, 1, 1)

593) (1, 1, 0, 0)

594) (1, 0, 0, 0)

595) (1, 0, 1, 1)

596) (1, 1, 0, -1)

597) (1, 1, 0, 1)

598) (1, 0, 0, 1)

599) (-1, -1, 0, 0)

600) (-1, -1, 0, 0)

601) (1, 1, 0, 1)

602) (1, 1, 0, 0)

603) (1, 0, 0, 0)

604) (1, 0, 0, 1)

605) (1, -1, 0, 0)

606) (1, -1, 1, 1)

607) (1, 1, 0, 0)

608) (1, 1, 0, -1)

609) (1, 1, 0, 0)

610) (1, 1, 0, 1)

611) (1, 1, 1, 0)

612) (1, 1, 0, 0)


2152) (1, 0, 1, 0)

2153) (1, 0, 0, 0)

2154) (0, 1, 0, 1)

2155) (1, 0, 0, 1)

2156) (1, 0, 0, 0)

2157) (1, 0, 0, 1)

2158) (1, 0, 1, 1)

2159) (1, 1, 1, 1)

2160) (1, 0, 1, 1)

2161) (1, -1, 0, 0)

2162) (1, 0, 0, 0)

2163) (1, 0, -1, -1)

2164) (1, 0, 1, 1)

2165) (1, 0, 1, 1)

2166) (1, 0, 0, 0)

2167) (1, 0, 0, 1)

2168) (1, 0, 0, 0)

2169) (1, 0, -1, 0)

2170) (1, 0, 0, 1)

2171) (1, 0, 0, 1)

2172) (1, 0, 0, 0)

2173) (1, 0, 0, 0)

2174) (1, 0, 1, 1)

2175) (1, 1, 1, 1)

2176) (1, 0, 0, 0)

2177) (1, 0, 0, 0)

2178) (1, 0, 0, 0)

2179) (1, 0, 0, 0)

2180) (1, 1, 0, 0)

2181) (1, -1, 0, 0)

2182) (1, 1, 0, 0)

2183) (1, 0, 0, 0)

2184) (1, 0, -1, 1)

2185) (0, 0, 0, 1)

2186) (1, 0, 0, 0)

2187) (1, 0, 1, 0)

2188) (1, 0, 0, 1)

2189) (1, 0, 0, -1)

2190) (1, 1, -1, 1)

2191) (1, 1, 0, 0)

2192) (-1, -1, -1, -1)

2193) (1, -1, 0, -1)

2194) (1, 0, 0, -1)

2195) (-1, -1, 0, 0)

2196) (1, 0, 1, 1)

2197) (1, 1, 1, 0)

2198) (0, 0, 0, 1)

2199) (1, 0, -1, 0)

2200) (-1, 0, 0, 1)

2

3528) (1, 0, 0, 0)

3529) (1, 0, 0, -1)

3530) (1, 0, 0, 1)

3531) (1, 0, 0, 1)

3532) (1, 0, 0, 0)

3533) (-1, 0, 0, 0)

3534) (1, 0, 0, 0)

3535) (1, 0, 1, 1)

3536) (1, 0, 0, 0)

3537) (1, 0, 0, -1)

3538) (1, 0, 0, 0)

3539) (1, 0, 1, 1)

3540) (1, 1, 0, -1)

3541) (1, 0, -1, 0)

3542) (1, 0, 0, -1)

3543) (1, 0, 1, 0)

3544) (1, 0, 0, 0)

3545) (1, 1, 0, 0)

3546) (1, -1, 0, 0)

3547) (1, 0, -1, 0)

3548) (1, 0, 0, 0)

3549) (-1, 0, 0, 1)

3550) (1, 0, 1, 0)

3551) (1, 1, 1, 1)

3552) (1, 0, 0, 1)

3553) (1, 0, -1, -1)

3554) (-1, 0, 0, 0)

3555) (1, 0, -1, 1)

3556) (1, 1, 0, 1)

3557) (1, 0, 1, 1)

3558) (1, 0, 0, 0)

3559) (-1, 1, 1, 0)

3560) (1, 0, -1, 0)

3561) (1, 1, 1, 1)

3562) (1, 1, 0, 0)

3563) (1, 1, 0, 0)

3564) (1, 0, 0, 1)

3565) (1, 0, 0, 0)

3566) (1, 1, 0, 1)

3567) (1, 1, 0, 0)

3568) (0, 0, 0, 0)

3569) (1, 0, 1, 1)

3570) (1, 0, 1, 1)

3571) (1, 0, 1, 1)

3572) (1, -1, 0, 0)

3573) (1, 0, 1, 0)

3574) (1, 0, 0, 0)

3575) (1, 0, 0, 0)

3576) (1, 0, 1, 1)

3577

In [11]:
display_reviews(text_corpus)        

1) I love the concept. I feel like in swiss traditional market. The place is amazing. The food is awesome. But, in my opinion, they need to make a change/rotation in menu or even new menu. I choose this place for lunch frequently. Sometimes I feel bored with the menu.  Overall, thanks Marche for the delicious food, also the nice place.

2) Sengaja macet2an kesini cuman buat nyobain nasi goreng cakalang yang orang2 bilang enak. Dan emang beneran enak sih nasi gorengnya wkkw suasana nya juga enak buat makan ramai2 gitu.

3) Suka sama bebek ini karna dulu d ajak tmn makan di sini, ehh malah jd ketagihan sama dagingnya yg empuk dan sambel mentah nya yg dasyatttt    Dulu tempatnya masih tenda, sekarang udh ada kiosnya, kursinya lumayan banyak ada toilet nya juga..    Kalo makan bebek ini selalu order dua bebek, nasi uduk, sate rempela, sambel mentah ekstra pedas dan es teh manis, sambel mentah nya bisa request pedasnya..

4) Very good and very delish!!! Gokils deh enaknya... Highly Recommen


565) Vietnamenese food.  Lokasinya persis di depan toko barang antik di jalan surabaya, bersebelahan dengan cali deli yang ternyata adalah 1 management.  Interiornya cukup menarik model vietnam / chinese resto, banyak lukisan dan juga pajangan.  Utk jenis makanannya ga terlalu banyak, mereka specialnya adalah pho noddle.   Karena lg malas makan mie akhirnya pesen grilled chicken, modek kyk chicken steak, ada salad dan juga tahu jamur dibentuk segiempat dan kuah bening, sambel asam cirikhas vietnam.  Harga makanannya cukup standar sekitar 50 rban, harga minumannya yg cukup mahal menurut saya sekitar 35-50 rban.  Mereka juga nyediaiin makanan pembuka kayak spring roll.  Untuk segi rasa buat saya not bad but also not too special but recommended to try.

566) This is the very first time i taste a same quality taste flat white, like I had in Melbourne. But, when i came for the second time, the taste was ""so-so"". Well, like they said, it depends on the barista?  You guys need to make sure

1873) Makanan khas pekalongan.    Hidangan spesialnya megono pincuk, nasi dengan potongan nangka kecil dan halus, rasanya sedikit pedas, sriping (kerang goreng) , pecak cucut dan cumi hitem, garang asem ( semacam rawon), tauto soto dengan campuran tauco.  Kesukaan saya disini ayam goreng bumbu pekalongan.    Sambil menunggu makanan tersedia beraneka macam gorengan tahu, tempe, tahu isi dan pisang.    Untuk minuman es mangga boleh dicoba.    Tempatnya tidak begitu besar, hampir selalu ramai tiap hari karena makanannya enak dan harganya terjangkau.      Harus dicoba.

1874) Special yang suka bgt High Tea,aku saranin disini viewnya langsung ke arah bundaran HI.. Mulai dr Teanya sampe cakenya enak, pelayanya juga not bad lah :)

1875) The first time i went here my first impression were a tiny cafe with warmth atmosphere. It's beautiful how the interior made up into small space.   The food especially for the waffle (salt caramel) are totally gooooood!! One of the best waffle i have been tas

3245) Jadi ceritanya bingung mau kemana dan temen nyaranin yg tempat makan rooftop dan kebetulan lewatin kemang colony, yaudah karena emang gak tau gimana tempatnya dengan pedenya kesana aja, so far liat2 disana tempat emang cozy bgt dan asik tp kalo malem terlalu remang, price nya gk usah ditanya daerah kemang yah mahal itu gk dipungkiri, makanan sm minuman emang mahal bgt, jadi yah cm minum sama snack aja, itupun abis 300k dn gak kenyang, secara makanan gua gak bisa nilai karena gua gak pesen karena liat price nya udh shock dluan, tapi masalah tempat dan pelayanan bagus banget. So, gua cuma nikmatin tempatnya aja,tadi niatnya mau ke parc19, tp karena gatau ada dresscode rules dan cowo gw pake sendal, jadi gabisa masuk. malah nyasar ke fjon7. tempatnya asik buat nongkrong dan sekedar nyantai. suasananya enak banget sih. tadi cm pesen minum, pesen mocktailnya yang rasperberry lemonade sama fruit punch. yang rasperberry asem banget, yang fruit punch enak. untuk ikutan mocktail dia murah

In [12]:
for id, _ in enumerate(labels_corpus):
    reviews_corpus.append([labels_corpus[id], text_corpus[id]])

In [13]:
display_reviews(reviews_corpus)

1) [(1, 0, 0, 1), 'I love the concept. I feel like in swiss traditional market. The place is amazing. The food is awesome. But, in my opinion, they need to make a change/rotation in menu or even new menu. I choose this place for lunch frequently. Sometimes I feel bored with the menu.  Overall, thanks Marche for the delicious food, also the nice place.']

2) [(1, 0, 0, 1), 'Sengaja macet2an kesini cuman buat nyobain nasi goreng cakalang yang orang2 bilang enak. Dan emang beneran enak sih nasi gorengnya wkkw suasana nya juga enak buat makan ramai2 gitu.']

3) [(1, 0, 0, 0), 'Suka sama bebek ini karna dulu d ajak tmn makan di sini, ehh malah jd ketagihan sama dagingnya yg empuk dan sambel mentah nya yg dasyatttt    Dulu tempatnya masih tenda, sekarang udh ada kiosnya, kursinya lumayan banyak ada toilet nya juga..    Kalo makan bebek ini selalu order dua bebek, nasi uduk, sate rempela, sambel mentah ekstra pedas dan es teh manis, sambel mentah nya bisa request pedasnya..']

4) [(1, 0, 0, 0

599) [(-1, -1, 0, 0), 'Karena lg jalan d CP dan merasa haus, akhirny cb beli comebuy, pertama maunya yg matcha greentea tp lg kosong, trs ganti yg taro topping bubble tp katanya bubble lg dimasak, jd ganti lg topping red bean,, rasany biasa aja dan es baloknya yg bejibun sedangkan airny cm stg gelas g rasa dengan harga 28k ! Mending g bli cha*ime lah next time..']

600) [(-1, -1, 0, 0), 'selalu suka sama bubblenya comebuy, kalo ga suka terlalu manis bisa minta less sugar sampe 30% . Pesen green milk tea, greenteanya berasa banget']

601) [(1, 1, 0, 1), 'HIDDEN GEMS!!!    Kopi di Tuku enak banget sih aku sampe terharu hehehe. Aku nyobain kopi di sini pas gak sengaja lagi berada di Cipete Raya. Penasaran banget karena toko mungil di pojokan tapi kok rame banget yang antri. Lalu aku pesan Iced Latte tanpa gula.......and it was suprisingly fascinating and...affordable ! Dengan harga around 25k-30k aja kita udah bisa dapet kopi premium, sedangkan untuk kopi Tetangga (thats what they called 

1906) [(0, 0, 1, 1), 'Saya pecinta makanan Indonesia dan I really really recommended dengan Lumpang Emas ini.  Suasananya sangat hangat,hommie dengan interior vintage.  Asik banget buat kongkow bareng teman atau keluarga.  Makanannya ada yang ramesan dan ada juga yang ngelauk.  Banyak sekali pilihannya.\xa0  Berasa keliling nusantara dalam satu piring deh..(hahaha...)  Dan yang paling penting adalah bisa roko! ;d  Jadi wajib banget dateng kesini.']

1907) [(1, 1, 0, -1), "The place is Ok.. The price is about US$3.. My fav is Mie Aceh Rebus.. U'll like it..   I promise u.. Pls try to come at nite as there is no AC, so it would quite hot during the day.."]

1908) [(-1, -1, 0, 0), 'What a disappointment. I cannot believe that people actually like this food. It is tasteless. Not worth the price. Will never eat there again. The place is infested with cockroaches']

1909) [(1, 0, 0, 1), "Came by during one hot sunny day in Gading and i was grateful that it felt cozy and cold. We chose the fi

2472) [(1, 1, 1, 0), 'Suasananya enak, pelayannya ramah dan pertama kalinya ke sini. Makanannya enakk semuaa. Ramennyaa kaldunya berasa banget. Salmon riceny jg enakk (lupa nama lengkapnya). Harganya masih worth it c krn porsiny jg kenyang. Its recommended 😄']


2474) [(-1, 0, 0, 0), "Alasan utama beli martabak ini krn penasaran.  Gw pesen yg blackforest cream cheese oreo setengah dan keju setengah. Tekstur martabaknya BANTET. Gw ga tau kalo emang bantet atau martabaknya memang pendek, tapi menurut gw, martabaknya kurang tinggi banget. Cream cheese oreo was good, agak terlalu berair, gak gimana banget enaknya, biasa lah, standard.   Yg keju, gw enek. Don't know why, gw enek, pdhl gw suka keju. Mungkin krn martabaknya pendek dan keju nya tebek banget.  Overall, gw rada kecewa. Tapi buat yg penasaran dan pengen nyoba banget boleh lah dicoba, siapa tau nempel di hati anda.  Once again, selera org beda2 ya kan..."]

2475) [(-1, 0, 0, 0), 'Bakmie Singapur di Greenville ini sangat terkenal d

#### Splitting english and indonesian training data

In [14]:
english_vocab = set(w.lower() for w in nltk.corpus.words.words())

en_reviews_corpus = []
id_reviews_corpus = []
for id, review in enumerate(reviews_corpus):
    tokens = word_tokenize(review[1])
    added_vocab = ['tau', 'gue', 'saya', 'baru', 'gila', 'ga', 'paling', 'yang'] # manually add indo vocabularies
    
    if(tokens[0].lower() in english_vocab and (tokens[0].lower() not in added_vocab)):
        # print(tokens[0].lower())
        en_reviews_corpus.append([labels_corpus[id]," ".join(tokens)])
    else:
        id_reviews_corpus.append([labels_corpus[id]," ".join(tokens)])
        
print("Total training data: ", len(reviews_corpus))
print("English reviews: ", len(en_reviews_corpus))
print("Indonesian reviews:", len(id_reviews_corpus))

Total training data:  3865
English reviews:  1747
Indonesian reviews: 2118


In [15]:
en_processed_reviews = []
id_processed_reviews = []

In [16]:
display_reviews(id_reviews_corpus)

1) [(1, 0, 0, 1), 'Sengaja macet2an kesini cuman buat nyobain nasi goreng cakalang yang orang2 bilang enak . Dan emang beneran enak sih nasi gorengnya wkkw suasana nya juga enak buat makan ramai2 gitu .']

2) [(1, 0, 0, 0), 'Suka sama bebek ini karna dulu d ajak tmn makan di sini , ehh malah jd ketagihan sama dagingnya yg empuk dan sambel mentah nya yg dasyatttt Dulu tempatnya masih tenda , sekarang udh ada kiosnya , kursinya lumayan banyak ada toilet nya juga.. Kalo makan bebek ini selalu order dua bebek , nasi uduk , sate rempela , sambel mentah ekstra pedas dan es teh manis , sambel mentah nya bisa request pedasnya..']

3) [(1, 1, 0, 0), "Tempat dessert kelapa yang fresh banget ! Kalo kesini paling suka beli coco pouchnya sambil ngobrol '' '' sama temen '' '' .. Harga nya bersahabat banget , dan lumayan banyak isi coco pouch nya.. Paling suka coco pouch rasa honeydew asanya seger bangettt~ bener '' '' energy potion !"]

4) [(1, 0, 0, 0), "Tryin menya sakura ramen for the first time 


197) [(1, 1, 0, 0), 'Tempat yg udh gw datengin berkali2 . Favorite sm pacar krna dy penggila protein . Kesini sebulan seenggk ny sekali . Always pesen sirloin steak plus tenderloin steak import medium well , suka bgt sm mushroom sauce ny . N harga ny pun sudah termasuk murah , makan kenyangg..']

198) [(1, 0, -1, 0), 'Dapet referensi dari teman untuk makan disini dan disuru cobain mix wingsnya dan setelah gw order , ternyata memang enakk bgtttt ! ! ! Di menu mix wings ini terdapat 3 rasa yaitu Original wings , honey wings dan red wings ( pedes nya nendang ) . Diantara 3 rasa itu gw paling suka original , gurih2 asin rasanya . Gw bisa bilang klo chicken wings disini adl yg paling enak yg pernah gw coba . Tapi sayangnya , service nya agak kurang mendukung karna pada saat gw masih menikmati chicken wingsnya.. tiba2 gw di kasi bill dan langsung disuru bayar saat itu juga ( gak bisa menunggu sampai selesai makan dulu dan pada saat itu gw makan pakai tangan kosong ) karena mereka uda mau tu


252) [(1, 0, 0, 1), 'Tempat happening ala2 di daerah Kelapa Gading , selalu penuh dan waiting list padahal kapasitas kursi dan meja yang disedian lumayan banyak . Sistemnya , pesan di counter masing2 makanan . Yang paling terkenal disini es duren nya , mereka jual berbagai jenis es dengan topping kekinian , ini enak dan kalian harus coba ice durian cendol ( 30k ) , ice durian ori ( 25k ) . Dan yang gak kalah happeningnya adalah martabak dengan topping2 yang lagi hits belakangan ini macam nuttela , kitkat greentea , toblerone , you named it ... . Kemarin coba yang pandan dan nutella ( 103k ) . Kesini sih enaknya gak makan serius , jadi makan cemal cemil bareng temen2 jadi bisa sharing juga makannya . Interior dan Exteriornya juga lucu abis , kaya unfinished design gt . Jadi kalo lagi sepi , kalian bisa foto2 gaya disana .']

253) [(1, 0, 1, -1), 'Reservasi mudah cuma satu klik dari website , dan overall pelayanan juga bagus . Total cost berdua 700k . Cocktail yang kita order mojito dan


1431) [(1, -1, 0, -1), 'Baso nya bikin ketagihan .. kuah nya sedap dan baso nya mantap apalagi sambelnya .. ditemenin sama es campur saat kepedesan mantep banget .. sayang tempatnya ga ber AC jadi panas banget .. , bakso wonogiri yang ad di jakarta . Rasa kuahnya segar dan bakso nya enak . Akan tetapi menurut saya harga kurang worth lah ya dengan 1 porsi yang bisa mencapai 30k . Untuk seporsi bakso termasuk mahal sepertinya . But overall nice lah , lumayan enak juga kok']

1432) [(1, 0, 0, 1), 'Duh tmptnya dingin hehehehe norak banget deh gw . Gak luas tp nyaman . Non smoking area nya lebih kecil tmptnya . Sederetan sm loulou gelato . Krn gw liat di menu ada bbrp macam sambel , jdnya gw pesan iga penyet nya . Kt mas nya , yg paling pedes itu sambel korek , so that was what I chose . Dan tnyt ok jg itu sambel wlpn gak smp keringetan jg ( krn ac nya dingin ) . Iga nya enak , garing , tp kurang penyet deh hehehe.. Utk nasi , pilihannya ada nasi putih biasa , nasi merah , sm nasi uduk ( g

#### Splitting both labels

In [17]:
en_labels_corpus = []
id_labels_corpus = []

In [18]:
for corpus in en_reviews_corpus:
    en_labels_corpus.append(corpus[0])
    del corpus[0]

In [19]:
for corpus in id_reviews_corpus:
    id_labels_corpus.append(corpus[0])
    del corpus[0]

## Text Preprocessing

Since, text is the most unstructured form of all the available data, various types of noise are present in it and the data is not readily analyzable without any pre-processing. The entire process of cleaning and standardization of text, making it noise-free and ready for analysis is known as text preprocessing.

It is predominantly comprised of three steps:

1. Noise Removal
2. Lexicon Normalization
3. Object Standardization

#### 1. Noise removal

Any piece of text which is not relevant to the context of the data and the end-output can be specified as the noise. For example – language stopwords (commonly used words of a language – is, am, the, of, in etc), URLs or links, social media entities (mentions, hashtags), punctuations and industry specific words. This step deals with removal of all types of noisy entities present in the text.

In [20]:
# en stopwords
en_stopwords = stopwords.words('english')

# id stopwords
factory = StopWordRemoverFactory()
id_stopwords_remover = factory.create_stop_word_remover()

##### English

In [21]:
for id, review in enumerate(en_reviews_corpus):
    review = "".join(review)
    tokens = word_tokenize(review)
    
    review_list = [i.lower() for i in tokens if i not in en_stopwords]
    review_arr = " ".join(review_list)
    en_processed_reviews.append(review_arr)

In [22]:
# display_reviews(en_processed_reviews)

##### Indo

In [23]:
for id, review in enumerate(id_reviews_corpus):
    review = "".join(review)
    id_processed_reviews.append(id_stopwords_remover.remove(review))

In [24]:
# display_reviews(id_processed_reviews)

#### 2. Lexicon Normalization

Another type of textual noise is about the multiple representations exhibited by single word. For example – “play”, “player”, “played”, “plays” and “playing” are the different variations of the word – “play”, Though they mean different but contextually all are similar. The step converts all the disparities of a word into their normalized form (also known as lemma). Normalization is a pivotal step for feature engineering with text as it converts the high dimensional features (N different features) to the low dimensional space (1 feature), which is an ideal ask for any ML model.

The most common lexicon normalization practices are :

1. Stemming:  Stemming is a rudimentary rule-based process of stripping the suffixes (“ing”, “ly”, “es”, “s” etc) from a word.
2. Lemmatization: Lemmatization, on the other hand, is an organized & step by step procedure of obtaining the root form of the word, it makes use of vocabulary (dictionary importance of words) and morphological analysis (word structure and grammar relations).

#### English

In [25]:
joint_review = []
lem = WordNetLemmatizer()
stem = LancasterStemmer()

for id, review in enumerate(en_processed_reviews):
    review = "".join(review)
    tokens = word_tokenize(review)
    
    joint_token = []
    for token in tokens:
        token = lem.lemmatize(token, "v")
        token = stem.stem(token)
        joint_token.append(token)
        joint_token_str = " ".join(joint_token)
        
    joint_review.append(joint_token_str)

en_processed_reviews = joint_review

#### Indo

In [26]:
joint_review = []
idStemmer = IndonesianStemmer()

for id, review in enumerate(id_processed_reviews):
    review = "".join(review)
    print("Input: %s\n"%review)
    joint_token_str = idStemmer.stem(review)
    joint_token_str = review
    print("Output: %s\n"%joint_token_str)

    joint_review.append(joint_token_str)

id_processed_reviews = joint_review

Input: Sengaja macet2an kesini cuman buat nyobain nasi goreng cakalang orang2 bilang enak . Dan emang beneran enak sih nasi gorengnya wkkw suasana nya enak buat makan ramai2 gitu .

Output: Sengaja macet2an kesini cuman buat nyobain nasi goreng cakalang orang2 bilang enak . Dan emang beneran enak sih nasi gorengnya wkkw suasana nya enak buat makan ramai2 gitu .

Input: Suka sama bebek karna dulu d ajak tmn makan sini , ehh malah jd ketagihan sama dagingnya yg empuk sambel mentah nya yg dasyatttt Dulu tempatnya tenda , sekarang udh kiosnya , kursinya lumayan banyak toilet nya juga.. Kalo makan bebek selalu order bebek , nasi uduk , sate rempela , sambel mentah ekstra pedas es teh manis , sambel mentah nya request pedasnya..

Output: Suka sama bebek karna dulu d ajak tmn makan sini , ehh malah jd ketagihan sama dagingnya yg empuk sambel mentah nya yg dasyatttt Dulu tempatnya tenda , sekarang udh kiosnya , kursinya lumayan banyak toilet nya juga.. Kalo makan bebek selalu order bebek , nas

Input: Makanannya enak banget , terjangkau . Tempatnya kekinian pelayanannya ramah . Cocok buat tempat nongkrong pokoknya . Dijamiin puas . Maknyuss ...

Output: Makanannya enak banget , terjangkau . Tempatnya kekinian pelayanannya ramah . Cocok buat tempat nongkrong pokoknya . Dijamiin puas . Maknyuss ...

Input: Makan Super Suikiaw berasa makan Hongkong China suasana restoran nya sama persis interiornya sedikit dapur di depan bersama meja tamu lalu banyak tamu2nya berbicara bahasa china ... menarik di restoran yaitu banyaknya tamu2 Jepang makan ... terlepas itu semua restoran memang layak dikunjungi cita rasa kelezatannya ... Suikiaw rebus panggang sama lezatnya pula makanan lainnya ... harga memang sedikit mahal konsumen dijamin terpuaskan cita rasa kelezatan yang ditampilkannya ... tirta-lie.blogspot.com

Output: Makan Super Suikiaw berasa makan Hongkong China suasana restoran nya sama persis interiornya sedikit dapur di depan bersama meja tamu lalu banyak tamu2nya berbicara bahasa


Output: Salah satu restaurant favorit Bogor ... Mulai suasana , rasa makanan , pelayanannya membuat orang2 pernah bosan datang restaurant walaupun banyak restaurant cafe baru bertebaran di Bogor ... Hal terbukti kali dan teman maupun keluarga berkunjung restaurant , selalu masuk dalam daftar waiting list ... Haha ... Untuk yang membawa kendaraan di restaurant tidak dikenakan biaya parkir sepeser ... ^^

Input: Bakmi konvensional paling mahal jakarta paling enak . Struktur mie nya moist gak blenyek . Sayurnya bersih . Daging/ ayam nya bersih tasty . Ngintip belakang , sanitasinya pake filter air .

Output: Bakmi konvensional paling mahal jakarta paling enak . Struktur mie nya moist gak blenyek . Sayurnya bersih . Daging/ ayam nya bersih tasty . Ngintip belakang , sanitasinya pake filter air .

Input: Makanannya cukup enak . Fish and chipsnya , wedgesnya cukup enak Steak signaturenya 3 Nasi goreng porsinya kecil Aglio olionya 3 Ngga baby chair Tapi satu ruangan yg cukup pas utk keluarga

Input: Pertama kali kesini . Tempatnya untik . Nuansa2 tempo dulu gitu . Tempatnya enak deket panggung jd bs santai dengerin lagu . Sy memesan moccachino nya . Untuk rasanya sendiri not bad . Sy sih suka soalnya kopinya ga terlalu strong coklatnya berasa . Cocok bersantai malam hari mengobrol san dengerin live music .

Output: Pertama kali kesini . Tempatnya untik . Nuansa2 tempo dulu gitu . Tempatnya enak deket panggung jd bs santai dengerin lagu . Sy memesan moccachino nya . Untuk rasanya sendiri not bad . Sy sih suka soalnya kopinya ga terlalu strong coklatnya berasa . Cocok bersantai malam hari mengobrol san dengerin live music .

Input: Enak sih , bebeknya gede.. Nasi bebas ngambilnya haha . Tapi kok makin kesini kayaknya makin biasa . Dulu awalnya enak banget . Trus apa harganya naik ? Haha . Tapi kalo dibandingin Kaleyo , bebek disini lebih enak sih.. Hehe . Oh , tempat makannya lebih manusiawi , ga rame amet kayak Kaleyo . # perbandingan # frontal ? ? ? ?

Output: Enak sih , be


Output: Satenya enak , tongsengnya enak . Untuk harga hhhmmmm ditolerir . Ga semahal djono yg harganya naujubilah . Asiknya disini bisa mesen setengah porsian . Tahu sumedangnya enak ! Dicocol ama sambel kecapnya enak banget ! Tapii tahu goreng 15rb termasuk lumayan harganya , jd kalau ga kepingin banget ga beli deh .

Input: Bakso malang favorit di Depok . Semua jenis bakso pangsitnya enaaak , berasa dagingnya , kuahnya enak . Bisa pilih paket komplit , spesial , biasa , kalo mau pilih sendiri pakai bakso/pangsit mana aja jumlahnya berapa bisa . Satu bakso/pangsit harganya Rp.3000 , bihun mie kuningnya gratis . Tempatnya cukup bersih nyaman .

Output: Bakso malang favorit di Depok . Semua jenis bakso pangsitnya enaaak , berasa dagingnya , kuahnya enak . Bisa pilih paket komplit , spesial , biasa , kalo mau pilih sendiri pakai bakso/pangsit mana aja jumlahnya berapa bisa . Satu bakso/pangsit harganya Rp.3000 , bihun mie kuningnya gratis . Tempatnya cukup bersih nyaman .

Input: Cafe e

#### 3. Object Standardization

Text data often contains words or phrases which are not present in any standard lexical dictionaries. These pieces are not recognized by search engines and models.

Some of the examples are – acronyms, hashtags with attached words, and colloquial slangs. With the help of regular expressions and manually prepared data dictionaries, this type of noise can be fixed, the code below uses a dictionary lookup method to replace social media slangs from a text.

1) Handling Appostrophes

To avoid any word sense disambiguation in text, it is recommended to maintain proper structure in it and to abide by the rules of context free grammar. When apostrophes are used, chances of disambiguation increases.
For example “it’s is a contraction for it is or it has”. All the apostrophes should be converted into standard lexicons.

##### English

In [27]:
appostrophes_dict = {"'s": "is", "'re": "are", "'m": "am", "'ve": "ve", "'d": "would", "'ll": "will", "'t": "ot", "nt": "not"}

In [28]:
joint_review = []

for id, review in enumerate(en_processed_reviews):
    review = "".join(review)
    tokens = word_tokenize(review)
    
    joint_token = [appostrophes_dict[token] if token in appostrophes_dict else token for token in tokens]
    joint_token_str = " ".join(joint_token)
    
    joint_review.append(joint_token_str)
    
en_processed_reviews = joint_review

In [29]:
# display_reviews(en_processed_reviews)

##### Indo

Nothing. There is no appostrophes behaviour in indonesian language

2) Removal of Punctuations

All the punctuation marks according to the priorities should be dealt with. For example: “.”, “,”,”?” are important punctuations that should be retained while others need to be removed.

##### English

In [30]:
punctuations_dict = '''!()-[]{};:'"\,<>./@#$%^&*_~''' # not includes !(),.?

In [31]:
joint_review = []

for id, review in enumerate(en_processed_reviews):
    review = "".join(review)
    processed_review = ""
    for token in review:
        if token not in punctuations_dict:
            processed_review = processed_review + token

    joint_review.append(processed_review)

en_processed_reviews = joint_review

In [32]:
# display_reviews(en_processed_reviews)

##### Indo

In [33]:
punctuations_dict = '''!()-[]{};:'"\,<>./@#$%^&*_~''' # not includes !(),.?

In [34]:
joint_review = []

for id, review in enumerate(id_processed_reviews):
    review = "".join(review)
    processed_review = ""
    for token in review:
        if token not in punctuations_dict:
            processed_review = processed_review + token

    joint_review.append(processed_review)

id_processed_reviews = joint_review

In [35]:
# display_reviews(id_processed_reviews)

3) Removal of whitespace noise

There is a need to remove unneeded whitespace in a sentences like "because it is right  .", "Besides  , there is..."

##### English

In [36]:
joint_review = []

for id, review in enumerate(en_processed_reviews):
    review = "".join(review)
    processed_review = review.lower()
    processed_review = processed_review.replace(" .", ".")
    processed_review = processed_review.replace(" ,", ".")
    processed_review = processed_review.replace("  ", " ")
    processed_review = processed_review.replace("   ", " ")
    processed_review = processed_review.replace("    ", " ")
    
    joint_review.append(processed_review)

en_processed_reviews = joint_review

In [37]:
# display_reviews(en_processed_reviews)

##### Indo

In [38]:
joint_review = []

for id, review in enumerate(id_processed_reviews):
    review = "".join(review)
    processed_review = review.replace(" .", ".")
    processed_review = processed_review.replace(" ,", ".")
    processed_review = processed_review.replace("  ", " ")
    processed_review = processed_review.replace("   ", " ")
    processed_review = processed_review.replace("    ", " ")
    
    joint_review.append(processed_review)

id_processed_reviews = joint_review

In [39]:
# display_reviews(id_processed_reviews)

4) Standardizing words

Sometimes words are not in proper formats. For example: “I looooveee you” should be “I love you”. Simple rules and regular expressions can help solve these cases. Also, remove emoji content.

##### English

In [40]:
joint_review = []

for id, review in enumerate(en_processed_reviews):
    review = "".join(review)
    review = ''.join(''.join(s)[:2] for _, s in itertools.groupby(review))
    tokens = word_tokenize(review)
    
    joint_token = []
    for token in tokens:
        if token not in english_vocab:
            token = ''.join(''.join(s)[:1] for _, s in itertools.groupby(token))
    
        joint_token.append(token)
    
    joint_token_str = " ".join(joint_token)
    
    # remove emoji content
    emoji_pattern = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           "]+", flags=re.UNICODE)

    joint_token_str = emoji_pattern.sub(r'', joint_token_str)
    
    joint_review.append(joint_token_str)

en_processed_reviews = joint_review

In [41]:
# display_reviews(en_processed_reviews)

##### Indo

In [42]:
joint_review = []

for id, review in enumerate(id_processed_reviews):
    review = "".join(review)
    review = ''.join(''.join(s)[:2] for _, s in itertools.groupby(review))
    tokens = word_tokenize(review)
    
    joint_token = []
    for token in tokens:
        if token not in english_vocab:
            token = ''.join(''.join(s)[:1] for _, s in itertools.groupby(token))
    
        joint_token.append(token)
    
    joint_token_str = " ".join(joint_token)   
    
    # remove emoji content
    emoji_pattern = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           "]+", flags=re.UNICODE)

    joint_token_str = emoji_pattern.sub(r'', joint_token_str)
    
    joint_review.append(joint_token_str)

id_processed_reviews = joint_review

In [43]:
# display_reviews(id_processed_reviews)

#### Display both training data

In [44]:
# uncomment this to display the last processed text for english
display_reviews(en_processed_reviews)

1) i lov conceiv i feel lik swiss tradit market the plac amaz the food awesom but opin nee mak changerotation menu ev new menu i chos plac lunch frequ sometim i feel bor menu overal thank march delicy food also nic plac

2) very good del gokil deh enakny high recommend gyut sem ig sap ayam pangang good dessert also good

3) best plac dat someon good amby nic intery dec pric us best hamburg favorit alfredo carbonar print hous seharusny say rat 50 it afford wafl long pric wor anym

4) ches cak nya juar lembut tempatny enak cozy parkirany pun lumay lua

5) be twic waktu itu gue ke sin pas udah rad malem as know wargih in selalu ram gapernah sep

6) gre conceiv rel nam marche the food vary ar many i alway com roast chick pric nt high good intery

7) best bingsu yg pernah gue cob pal mirip sam asliny yg di kore pes patbingsu shav iceny keras bgt milkny dan red beany buk yg kas tap yg hal emang kha koreany bgt untuk bungeopangny sih terlalu banyak tepungny tap buat bingsuny sih recommend ab 


451) be coupl tim food real good street food outdo are u nev expect food wel serv tak tim wait worthfultried philychesesteak sandwich tast perfect sush real good fresh also pho nodl so auth dimsumchicken fet so yummy hakau perfect shumayso gre try sop kepal salmon sapo tahugod

452) the food good seafod tast fresh must try kepit lad hitam go sev tim serv food alway satisfy

453) nev fail comfort food die stub germ resto bar gre way ear din holiday we ord roast saus steak alway del what pleas surpr soup we ord lentil saus soup wel cream spinach soup very luscy good

454) less crowd auth cle plac just eat shaokao tast auth shaokao i eat jakart from appear simil origin shaokao i oft eat chin although us less mal origin on term tast closest savoury i tast subtl hint mal flav it nt us bq sauc lik shaokao plac might suit peopl is tastebud suit min though howev lat try someth new right ? for lov savoury sal dish i recommend origin pork samch cutlef chick for lean sweet sid i suggest try gril

In [45]:
# uncomment this to display the last processed text for indonesian
display_reviews(id_processed_reviews)

1) Sengaja macet2an kesini cuman buat nyobain nasi goreng cakalang orang2 bilang enak Dan emang beneran enak sih nasi gorengnya wkw suasana nya enak buat makan ramai2 gitu

2) Suka sama bebek karna dulu d ajak tmn makan sini eh malah jd ketagihan sama dagingnya yg empuk sambel mentah nya yg dasyat Dulu tempatnya tenda sekarang udh kiosnya kursinya lumayan banyak toilet nya juga Kalo makan bebek selalu order bebek nasi uduk sate rempela sambel mentah ekstra pedas es teh manis sambel mentah nya request pedasnya

3) Tempat dessert kelapa fresh banget Kalo kesini paling suka beli coco pouchnya ngobrol sama temen Harga nya bersahabat banget lumayan banyak isi coco pouch nya Paling suka coco pouch rasa honeydew asanya seger banget bener energy potion

4) Tryin menya sakura ramen for the first time with bunch of my friends First we re bit doubting but the sign in front of resto quite big sayin that ` Japan no 1 Ramen so we decided to give it a try Most of us tryin the tonkotsu ramen the spicy


89) Tempat asik buat nongkrong santai  Tempatnya homey banget harga affordable makananya enak Lebih baik reservasi dulu sih kalo kesini

90) Roti bakarnya enak pke topping gtu apalg pake topping ice cream sama minumnya paling enak sih milkshake ovaltine menu mi kukus pedas enak pedesnya pas bgt ? ? ` im not a Japanese food holic i dont like even never eat Shushi hahaha ga kekinian bgt yah emang ga suka sih but maybe SOMEDAY i will try to eat a piece i ate here because i like its takoyaki and okonomiyaki the taste is quite good but sometines it was too salty and the price was so cheap

91) Gado2 komplit fu yung hay nya enak Tapi sayang suami menungu cukup lama bisa menyantap 2 hidangan tersebut Padahal itu restonya sepi Cuma kami berdua sedang makan siang Untuk harga standar Gak mahal ` I forgot who told me that the Betawi Soup at Sate Eltoro is good and somehow it sticks in my mind Since it s not too far from where I work today I asked my office boy to get it for my lunch Soto Betawi 


609) Murah Salah satu tempat alternatif makan sushi kalau duit tipis sekali Walau murah rasanya oke kok Wala senak sushi ` kelas atas overall pelayanan oke Harga murah Tempatnya bersih Favorit menu volcano roll ❤️

610) Restoran bertema mexican yg di PanglimaPolim menyajikan berbagai makanan nachos burrito quesadilas tortadas harga yg dibilang cukup wajar utk makanan ala mexico Interiornya vintage2 gitu cukup unik sih gatau knp suka aja bikin nyaman Tempat nya nyempil kecil jadi kurang pas kalo mau makan yg rame2 banget enak nyantai Service nya bagus org2 nya friendly Recomended deh para fodies yg newbie sama mexican food

611) Started my morning quite heavily Had their Bef Souvlaki for breakfast and mango apple tea I think It s delicious but was full halfway through The cafe is hidden so you really need to find it but overall a perfect cafe for a get together session Rather a peaceful cafe Awesome

612) Liberica coffee strong smooth and perfect A great spot for a quick pick me up lat

1307) Shisha terbaik pernah gue rasain disini Menunya unik unik pelayananya rapih asepnya tebel rasanya dapet pokoknya musti coba buat shisha addict Kesini selalu beli yg level 0 karna pengen nyobanyoba aja sih Dan pedesnya lebih indomie pake sambelnya Tapi enak porsinya banyak sayuran sosisnya banyak Tempat sempit parkirnya Kalo kesini mendingan bawa pulang aja Bisa via gojek D

1308) Seneng banget kesini cobain laksanya suka banget Tempatnya keren interiornya enak pandang mata pelayananya oke ramah cepat tempatnya terkenal sih bogor jadi kesini weekend puasa tempatnya rame banget sempet kena wl gak lama

1309) Sedihnya kesini tempatnya ga sebagus difotofoto yg seliweran yg pernah gw liat tapi enaklah duduk duduk ucul kalo kesini mendingan sih malem lebih adem Pas kesini sih ga nyobain banyak makananya cm coba corn soup sama leci tea nya rasanya mayan lah deh kapan kapan kesini pagi cobain yg hehe

1310) Pertama kali kesini Tempatnya untik Nuansa2 tempo dulu gitu Tempatnya enak deket 

In [46]:
len(en_labels_corpus), len(en_processed_reviews), len(id_labels_corpus), len(id_processed_reviews)

(1747, 1747, 2118, 2118)

In [47]:
labels_corpus = en_labels_corpus + id_labels_corpus
processed_reviews = en_processed_reviews + id_processed_reviews

#### Train data

In [48]:
food = []
price = []
service = []
ambience = []
for id, label in enumerate(labels_corpus):
    food.append(label[0])
    price.append(label[1])
    service.append(label[2])
    ambience.append(label[3])
    
train_reviews = {'food': food, 'price': price, 'service': service, 'ambience': ambience, 'review': processed_reviews}

In [49]:
df_review = pd.DataFrame(data=train_reviews)    
df_review = df_review[['review', 'food', 'price', 'service', 'ambience']]
df_review.head()

Unnamed: 0,review,food,price,service,ambience
0,i lov conceiv i feel lik swiss tradit market t...,1,0,0,1
1,very good del gokil deh enakny high recommend ...,1,0,0,0
2,best plac dat someon good amby nic intery dec ...,1,-1,0,1
3,ches cak nya juar lembut tempatny enak cozy pa...,1,0,0,1
4,be twic waktu itu gue ke sin pas udah rad male...,1,0,0,0


In [50]:
train_text = df_review['review']
train_labels = df_review[['food', 'price', 'service', 'ambience']]

#### test data

In [51]:
tree = et.parse('./datasets/validation_set.xml')
root = tree.getroot()

In [52]:
test_reviews_corpus = []
test_labels_corpus = []
test_text_corpus = []

# grab all XML contents
for review in root.findall('review'):
    rid = review.get('rid')
    text = review.find('text').text 
    
    label = ""
    for aspects in review.findall('aspects'):
        id = aspects.get('id')
        
        food, price, service, ambience = None, None, None, None
        
        label = (food, price, service, ambience)
        for aspect in aspects.findall('aspect'):
            category = aspect.get('category')
            polarity = aspect.get('polarity')

            if category == "FOOD":
                if polarity == 'POSITIVE': food = 1
                else: food = -1
            elif category == "PRICE":
                if polarity == 'POSITIVE': price = 1
                else: price = -1
            elif category == "SERVICE":
                if polarity == 'POSITIVE': service = 1
                else: service = -1
            elif category == "AMBIENCE":
                if polarity == 'POSITIVE': ambience = 1
                else: ambience = -1

            label = (food, price, service, ambience)

        test_labels_corpus.append(label)           
            
    test_text_corpus.append(text)

In [53]:
for id, _ in enumerate(test_labels_corpus):
    test_reviews_corpus.append([test_labels_corpus[id], test_text_corpus[id]])

In [54]:
food = []
price = []
service = []
ambience = []
for id, label in enumerate(test_labels_corpus):
    food.append(label[0])
    price.append(label[1])
    service.append(label[2])
    ambience.append(label[3])
    
test_reviews = {'food': food, 'price': price, 'service': service, 'ambience': ambience, 'review': test_text_corpus}

In [55]:
df_review_test = pd.DataFrame(data=test_reviews)
df_review_test = df_review_test[['review', 'food', 'price', 'service', 'ambience']]
df_review_test.fillna(0, inplace=True)
df_review_test.head()

Unnamed: 0,review,food,price,service,ambience
0,Iseng banget kesini sama temen karena udah la...,1.0,0.0,1.0,1.0
1,Ke patbingsoo karena pengen coba sojunya . Pe...,0.0,-1.0,0.0,0.0
2,Restoran babi guling a la Bali ini cukup meny...,1.0,0.0,0.0,0.0
3,Cafe dengan konsep unik menjual segala jenis ...,1.0,0.0,1.0,1.0
4,Kalo ke BSD wajib kesini . Ada di lantai 2 br...,1.0,0.0,-1.0,1.0


In [56]:
test_text = df_review_test['review']
test_labels = df_review_test[['food', 'price', 'service', 'ambience']]

## Text to Features

**Text to Features (Feature Engineering on text data)**

1) Syntactical Parsing
- Dependency Grammar
- Part of Speech Tagging

2) Entity Parsing
- Phrase Detection
- Named Entity Recognition
- Topic Modelling
- N-Grams

3) Statistical features
- TF – IDF
- Frequency / Density Features
- Readability Features

4) Word Embeddings

## Text Classification

#### Support Vector Machine

In [57]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import svm
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

In [58]:
# create feature vectors 
vectorizer = TfidfVectorizer(min_df=4, max_df=0.9)

# apply train feature vectors
train_vectors = vectorizer.fit_transform(train_text)

# apply test feature vectors
test_vectors = vectorizer.transform(test_text)

In [59]:
def execute_model(label):
    
    print("--- %s label ---" % label)
    svm_classifier = svm.SVC(kernel='linear') 
    svm_classifier.fit(train_vectors, train_labels[label]) 

    prediction = svm_classifier.predict(test_vectors)

    print("Train Accuracy: ", accuracy_score(train_labels[label], svm_classifier.predict(train_vectors))*100)

    print("Test Accuracy: ", accuracy_score(test_labels[label], prediction)* 100)

    cnf_matrix = confusion_matrix(test_labels[label], prediction)
    print("Confusion matrx: \n",cnf_matrix)


In [60]:
for label in ['food', 'price', 'service', 'ambience']:
    execute_model(label)

--- food label ---
Train Accuracy:  88.5640362225097
Test Accuracy:  86.0
Confusion matrx: 
 [[ 1  0  1]
 [ 1  0  3]
 [ 2  0 42]]
--- price label ---
Train Accuracy:  91.33247089262613
Test Accuracy:  82.0
Confusion matrx: 
 [[ 0  1  3]
 [ 0 32  1]
 [ 0  4  9]]
--- service label ---
Train Accuracy:  90.86675291073739
Test Accuracy:  78.0
Confusion matrx: 
 [[ 0  3  0]
 [ 0 21  0]
 [ 0  8 18]]
--- ambience label ---
Train Accuracy:  87.91720569210867
Test Accuracy:  80.0
Confusion matrx: 
 [[ 0  1  0]
 [ 0 14  4]
 [ 0  5 26]]


#### Naive Bayes Classifier

In [61]:
from textblob.classifiers import NaiveBayesClassifier as NBC
from textblob import TextBlob

training_corpus = []
for id, en_review in enumerate(en_processed_reviews):
    training_corpus.append((en_review, en_labels_corpus[id]))

test_corpus = []

nb_classifier = NBC(training_corpus)

In [62]:
print(('food', 'price', 'service', 'ambience'))
print(nb_classifier.classify("enjoy place fatty tho berry nic dessert myriad text"))

('food', 'price', 'service', 'ambience')
(1, 0, 0, 0)


## Testing Model

In [63]:
tree = et.parse('./datasets/test_set.xml')
root = tree.getroot()

In [64]:
rl_test_reviews_corpus = []
rl_test_text_corpus = []

# grab all XML contents
for review in root.findall('review'):
    rid = review.get('rid')
    text = review.find('text').text 
    
    label = ""        
    rl_test_text_corpus.append(text)

In [65]:
for id, _ in enumerate(rl_test_text_corpus):
    rl_test_reviews_corpus.append(rl_test_text_corpus[id])

In [66]:
rl_test_reviews = {'review': rl_test_reviews_corpus}

In [67]:
df_review_rl_test = pd.DataFrame(data=rl_test_reviews)
df_review_rl_test.head()

Unnamed: 0,review
0,Bakmie jurangmangu ini penyelamat anak kos dar...
1,"It was ALL GOOD. The food, the interior, the p..."
2,Barusan beli Klaud's Soya puding (mungkin itu ...
3,"Menggunakan teknologi terkini, Genki Sushi men..."
4,A very old restaurant that's a favourite of my...


In [68]:
rl_test_text = df_review_rl_test['review']

# apply test feature vectors
rl_test_vectors = vectorizer.transform(rl_test_text)

In [69]:
def predict_rl_test():
    for label in ['food', 'price', 'service', 'ambience']:
        print("--- %s label ---" % label)
        svm_classifier = svm.SVC(kernel='linear') 
        svm_classifier.fit(train_vectors, train_labels[label]) 
        
        prediction = svm_classifier.predict(rl_test_vectors)
        
        df_review_rl_test[label] = prediction
        print("--- predicted!\n")

In [70]:
predict_rl_test()

--- food label ---
--- predicted!

--- price label ---
--- predicted!

--- service label ---
--- predicted!

--- ambience label ---
--- predicted!



In [71]:
df_review_rl_test

Unnamed: 0,review,food,price,service,ambience
0,Bakmie jurangmangu ini penyelamat anak kos dar...,1,0,0,1
1,"It was ALL GOOD. The food, the interior, the p...",1,0,0,1
2,Barusan beli Klaud's Soya puding (mungkin itu ...,1,0,0,0
3,"Menggunakan teknologi terkini, Genki Sushi men...",1,1,0,0
4,A very old restaurant that's a favourite of my...,1,0,0,0
5,"Roti unik, enak dan seruuuu...banyak pilihan r...",1,1,0,0
6,Kecap manis sudah menjadi salah satu bumbu waj...,1,0,0,0
7,"I tried baby chicken, pizza and spaghetti. The...",1,0,0,1
8,"Went here on wednesday around 12 pm, the place...",1,0,0,1
9,Having beer and lite meal is a pleasure when y...,1,0,0,1


In [72]:
df_review_rl_test.columns

Index(['review', 'food', 'price', 'service', 'ambience'], dtype='object')

In [73]:
for col in df_review_rl_test.columns:
    df_review_rl_test.loc[df_review_rl_test[col] == 1, col] = 'POSITIVE'
    df_review_rl_test.loc[df_review_rl_test[col] == 0, col] = ''
    df_review_rl_test.loc[df_review_rl_test[col] == -1, col] = 'NEGATIVE'

In [74]:
# df_review_rl_test.to_json("datasets/predicted_test_set_json")

In [75]:
df_review_rl_test.head()

Unnamed: 0,review,food,price,service,ambience
0,Bakmie jurangmangu ini penyelamat anak kos dar...,POSITIVE,,,POSITIVE
1,"It was ALL GOOD. The food, the interior, the p...",POSITIVE,,,POSITIVE
2,Barusan beli Klaud's Soya puding (mungkin itu ...,POSITIVE,,,
3,"Menggunakan teknologi terkini, Genki Sushi men...",POSITIVE,POSITIVE,,
4,A very old restaurant that's a favourite of my...,POSITIVE,,,


In [120]:
review_dict = df_review_rl_test[0:10].to_dict()

<review rid="1404">
    <text>
    I love the concept. I feel like in swiss traditional market. The place is amazing. The food is awesome. But, in my opinion, they need to make a change/rotation in menu or even new menu. I choose this place for lunch frequently. Sometimes I feel bored with the menu. Overall, thanks Marche for the delicious food, also the nice place.
    </text>
    <aspects id="0">
    <aspect category="FOOD" polarity="POSITIVE"/>
    <aspect category="AMBIENCE" polarity="POSITIVE"/>
    </aspects>
    <aspects id="1">
    <aspect category="FOOD" polarity="POSITIVE"/>
    <aspect category="AMBIENCE" polarity="POSITIVE"/>
    </aspects>
</review>

In [121]:
corpus = ""

In [128]:
for col, val in review_dict.items():
    for idx in range(len(review_dict["review"])):
        for key in ["review", "food", "price", "service", "ambience"]:
            corpus += "<"
            print(review_dict[key][idx])
            print("\n")

Bakmie jurangmangu ini penyelamat anak kos dari kelaparan. Kalo lagi laper banget kesini aja, porsinya gede banget, cukuplah untuk 3 orang normal (kalo gue agak ga normal). Biasa kalo kesini berdua sama temen, terus sepiring berdua karena porsinya gede banget. Tapi lebih sering delivery sih karena males kesana alias mager. Terus disana tempatnya kalo malem rame terus, daripada jauh-jauh kesana ga dapet tempat, mending delivery aja deh. Yang recommended disini Kwetiau Goreng Sapi.    Food 3.5/5.0  Service 3.5/5.0  Ambiance 2.0/5.0    Pinky Larasati


POSITIVE








POSITIVE


It was ALL GOOD. The food, the interior, the people, everything. I have no comparison to the other same-classed restaurants though, but I have to say that this is one of the best buffet place with quite a good deal


POSITIVE








POSITIVE


Barusan beli Klaud's Soya puding (mungkin itu kali namanya ya,haha) waktu sampe ke outlet nya, mbak yang jagain ramah banget, karena belom pernah coba, dia nyaranin cobain