# Reviews Data Processing

written by: Muhammad Angga Muttaqien | muha.muttaqien@gmail.com

## Data Preparation

In [1]:
import os
import re, string, unicodedata
import nltk
import Sastrawi
import contractions
import inflect
import itertools
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from bs4 import BeautifulSoup
from nltk import word_tokenize, sent_tokenize
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer, WordNetLemmatizer

# for processing indonesian text
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory

In [2]:
# my own stemmer
from stemmer import IndonesianStemmer
from stemmer import EnglishStemmer

#### XML Processing

In [3]:
import xml.etree.ElementTree as et

In [4]:
tree = et.parse('./datasets/training_set.xml')
root = tree.getroot()

In [12]:
reviews_corpus = []
labels_corpus = []
text_corpus = []

# grab all XML contents
for review in root.findall('review'):
    rid = review.get('rid')
    text = review.find('text').text 
    
    label = ""
    for aspects in review.findall('aspects'):
        id = aspects.get('id')
        
        food, price, service, ambience = 0, 0, 0, 0
        if id == '0':
            label = (food, price, service, ambience)
            
            for aspect in aspects.findall('aspect'):
                category = aspect.get('category')
                polarity = aspect.get('polarity')
                
                if category == "FOOD":
                    if polarity == 'POSITIVE': food = 1
                    else: food = -1
                elif category == "PRICE":
                    if polarity == 'POSITIVE': price = 1
                    else: price = -1
                elif category == "SERVICE":
                    if polarity == 'POSITIVE': service = 1
                    else: service = -1
                elif category == "AMBIENCE":
                    if polarity == 'POSITIVE': ambience = 1
                    else: ambience = -1

                label = (food, price, service, ambience)
                
            labels_corpus.append(label)
        
            
    text_corpus.append(text)

In [13]:
len(labels_corpus), len(text_corpus)

(3865, 3865)

In [14]:
labels_corpus = labels_corpus[0:]
text_corpus = text_corpus[0:]

In [15]:
def display_reviews(corpus):
    for id, content in enumerate(corpus):
        print("{}) {}\n".format(id+1, content))

In [16]:
display_reviews(labels_corpus)

1) (1, 0, 0, 1)

2) (1, 0, 0, 1)

3) (1, 0, 0, 0)

4) (1, 0, 0, 0)

5) (1, -1, 0, 1)

6) (1, 1, 0, 0)

7) (1, 0, 0, 0)

8) (1, 0, -1, 1)

9) (1, 0, 0, 1)

10) (1, 0, 0, 0)

11) (0, 1, 0, 1)

12) (1, 0, 0, 1)

13) (0, 0, 0, 1)

14) (1, 1, 1, 1)

15) (1, 0, 0, -1)

16) (0, 0, 0, 0)

17) (0, 0, 0, 1)

18) (1, 0, 0, -1)

19) (1, 0, 1, -1)

20) (1, -1, 1, 0)

21) (1, -1, 1, 1)

22) (1, 0, 1, 1)

23) (1, 1, 0, -1)

24) (1, 0, 0, 0)

25) (0, 0, -1, -1)

26) (1, 1, 0, 1)

27) (1, 0, -1, 1)

28) (1, -1, 0, 0)

29) (1, 1, 0, 0)

30) (1, 1, 1, 1)

31) (1, -1, -1, -1)

32) (1, 0, 0, 0)

33) (1, 0, 0, 1)

34) (1, 1, 0, 0)

35) (1, 0, 0, 0)

36) (1, 0, 1, 1)

37) (1, 0, 0, 1)

38) (1, 1, 0, 0)

39) (-1, -1, 0, 0)

40) (0, 0, 0, 0)

41) (1, 0, 0, 1)

42) (1, 0, 0, 1)

43) (-1, -1, 0, 0)

44) (1, 0, 0, 1)

45) (1, 0, 0, 0)

46) (1, 0, 0, 0)

47) (-1, 1, 1, 1)

48) (1, 1, 1, 0)

49) (1, 1, 0, 1)

50) (1, 0, 1, 0)



In [17]:
display_reviews(text_corpus)        

1) I love the concept. I feel like in swiss traditional market. The place is amazing. The food is awesome. But, in my opinion, they need to make a change/rotation in menu or even new menu. I choose this place for lunch frequently. Sometimes I feel bored with the menu.  Overall, thanks Marche for the delicious food, also the nice place.

2) Sengaja macet2an kesini cuman buat nyobain nasi goreng cakalang yang orang2 bilang enak. Dan emang beneran enak sih nasi gorengnya wkkw suasana nya juga enak buat makan ramai2 gitu.

3) Suka sama bebek ini karna dulu d ajak tmn makan di sini, ehh malah jd ketagihan sama dagingnya yg empuk dan sambel mentah nya yg dasyatttt    Dulu tempatnya masih tenda, sekarang udh ada kiosnya, kursinya lumayan banyak ada toilet nya juga..    Kalo makan bebek ini selalu order dua bebek, nasi uduk, sate rempela, sambel mentah ekstra pedas dan es teh manis, sambel mentah nya bisa request pedasnya..

4) Very good and very delish!!! Gokils deh enaknya... Highly Recommen

In [18]:
for id, _ in enumerate(labels_corpus):
    reviews_corpus.append([labels_corpus[id], text_corpus[id]])

In [19]:
display_reviews(reviews_corpus)

1) [(1, 0, 0, 1), 'I love the concept. I feel like in swiss traditional market. The place is amazing. The food is awesome. But, in my opinion, they need to make a change/rotation in menu or even new menu. I choose this place for lunch frequently. Sometimes I feel bored with the menu.  Overall, thanks Marche for the delicious food, also the nice place.']

2) [(1, 0, 0, 1), 'Sengaja macet2an kesini cuman buat nyobain nasi goreng cakalang yang orang2 bilang enak. Dan emang beneran enak sih nasi gorengnya wkkw suasana nya juga enak buat makan ramai2 gitu.']

3) [(1, 0, 0, 0), 'Suka sama bebek ini karna dulu d ajak tmn makan di sini, ehh malah jd ketagihan sama dagingnya yg empuk dan sambel mentah nya yg dasyatttt    Dulu tempatnya masih tenda, sekarang udh ada kiosnya, kursinya lumayan banyak ada toilet nya juga..    Kalo makan bebek ini selalu order dua bebek, nasi uduk, sate rempela, sambel mentah ekstra pedas dan es teh manis, sambel mentah nya bisa request pedasnya..']

4) [(1, 0, 0, 0

#### Splitting english and indonesian training data

In [20]:
english_vocab = set(w.lower() for w in nltk.corpus.words.words())

en_reviews_corpus = []
id_reviews_corpus = []
for id, review in enumerate(reviews_corpus):
    tokens = word_tokenize(review[1])
    added_vocab = ['tau', 'gue', 'saya', 'baru', 'gila', 'ga', 'paling', 'yang'] # manually add indo vocabularies
    
    if(tokens[0].lower() in english_vocab and (tokens[0].lower() not in added_vocab)):
        # print(tokens[0].lower())
        en_reviews_corpus.append([labels_corpus[id]," ".join(tokens)])
    else:
        id_reviews_corpus.append([labels_corpus[id]," ".join(tokens)])
        
print("Total training data: ", len(reviews_corpus))
print("English reviews: ", len(en_reviews_corpus))
print("Indonesian reviews:", len(id_reviews_corpus))

Total training data:  50
English reviews:  22
Indonesian reviews: 28


In [21]:
en_processed_reviews = []
id_processed_reviews = []

In [22]:
display_reviews(id_reviews_corpus)

1) [(1, 0, 0, 1), 'Sengaja macet2an kesini cuman buat nyobain nasi goreng cakalang yang orang2 bilang enak . Dan emang beneran enak sih nasi gorengnya wkkw suasana nya juga enak buat makan ramai2 gitu .']

2) [(1, 0, 0, 0), 'Suka sama bebek ini karna dulu d ajak tmn makan di sini , ehh malah jd ketagihan sama dagingnya yg empuk dan sambel mentah nya yg dasyatttt Dulu tempatnya masih tenda , sekarang udh ada kiosnya , kursinya lumayan banyak ada toilet nya juga.. Kalo makan bebek ini selalu order dua bebek , nasi uduk , sate rempela , sambel mentah ekstra pedas dan es teh manis , sambel mentah nya bisa request pedasnya..']

3) [(1, 1, 0, 0), "Tempat dessert kelapa yang fresh banget ! Kalo kesini paling suka beli coco pouchnya sambil ngobrol '' '' sama temen '' '' .. Harga nya bersahabat banget , dan lumayan banyak isi coco pouch nya.. Paling suka coco pouch rasa honeydew asanya seger bangettt~ bener '' '' energy potion !"]

4) [(1, 0, 0, 0), "Tryin menya sakura ramen for the first time 

#### Splitting both labels

In [23]:
en_labels_corpus = []
id_labels_corpus = []

In [24]:
for corpus in en_reviews_corpus:
    en_labels_corpus.append(corpus[0])
    del corpus[0]

In [25]:
for corpus in id_reviews_corpus:
    id_labels_corpus.append(corpus[0])
    del corpus[0]

## Text Preprocessing

Since, text is the most unstructured form of all the available data, various types of noise are present in it and the data is not readily analyzable without any pre-processing. The entire process of cleaning and standardization of text, making it noise-free and ready for analysis is known as text preprocessing.

It is predominantly comprised of three steps:

1. Noise Removal
2. Lexicon Normalization
3. Object Standardization

#### 1. Noise removal

Any piece of text which is not relevant to the context of the data and the end-output can be specified as the noise. For example – language stopwords (commonly used words of a language – is, am, the, of, in etc), URLs or links, social media entities (mentions, hashtags), punctuations and industry specific words. This step deals with removal of all types of noisy entities present in the text.

In [26]:
# en stopwords
en_stopwords = stopwords.words('english')

# id stopwords
factory = StopWordRemoverFactory()
id_stopwords_remover = factory.create_stop_word_remover()

##### English

In [27]:
for id, review in enumerate(en_reviews_corpus):
    review = "".join(review)
    tokens = word_tokenize(review)
    
    review_list = [i.lower() for i in tokens if i not in en_stopwords]
    review_arr = " ".join(review_list)
    en_processed_reviews.append(review_arr)

In [28]:
# display_reviews(en_processed_reviews)

##### Indo

In [29]:
for id, review in enumerate(id_reviews_corpus):
    review = "".join(review)
    id_processed_reviews.append(id_stopwords_remover.remove(review))

In [31]:
# display_reviews(id_processed_reviews)

#### 2. Lexicon Normalization

Another type of textual noise is about the multiple representations exhibited by single word. For example – “play”, “player”, “played”, “plays” and “playing” are the different variations of the word – “play”, Though they mean different but contextually all are similar. The step converts all the disparities of a word into their normalized form (also known as lemma). Normalization is a pivotal step for feature engineering with text as it converts the high dimensional features (N different features) to the low dimensional space (1 feature), which is an ideal ask for any ML model.

The most common lexicon normalization practices are :

1. Stemming:  Stemming is a rudimentary rule-based process of stripping the suffixes (“ing”, “ly”, “es”, “s” etc) from a word.
2. Lemmatization: Lemmatization, on the other hand, is an organized & step by step procedure of obtaining the root form of the word, it makes use of vocabulary (dictionary importance of words) and morphological analysis (word structure and grammar relations).

#### English

In [32]:
joint_review = []
enStemmer = EnglishStemmer()

for id, review in enumerate(en_processed_reviews):
    review = "".join(review)
    print("Input: %s\n"%review)
    joint_token_str = enStemmer.stem(review)
    joint_token_str = review
    print("Output: %s\n"%joint_token_str)

    joint_review.append(joint_token_str)

en_processed_reviews = joint_review

Input: i love concept . i feel like swiss traditional market . the place amazing . the food awesome . but , opinion , need make change/rotation menu even new menu . i choose place lunch frequently . sometimes i feel bored menu . overall , thanks marche delicious food , also nice place .

Output: i love concept . i feel like swiss traditional market . the place amazing . the food awesome . but , opinion , need make change/rotation menu even new menu . i choose place lunch frequently . sometimes i feel bored menu . overall , thanks marche delicious food , also nice place .

Input: very good delish ! ! ! gokils deh enaknya ... highly recommended . gyutan - semur iga sapi - ayam panggang good desserts also good..

Output: very good delish ! ! ! gokils deh enaknya ... highly recommended . gyutan - semur iga sapi - ayam panggang good desserts also good..

Input: best place date someone . good ambiance nice interior . decent price ( used ) best hamburger favorite alfredo carbonara prince hous

#### Indo

In [33]:
joint_review = []
idStemmer = IndonesianStemmer()

for id, review in enumerate(id_processed_reviews):
    review = "".join(review)
    print("Input: %s\n"%review)
    joint_token_str = idStemmer.stem(review)
    joint_token_str = review
    print("Output: %s\n"%joint_token_str)

    joint_review.append(joint_token_str)

id_processed_reviews = joint_review

Input: Sengaja macet2an kesini cuman buat nyobain nasi goreng cakalang orang2 bilang enak . Dan emang beneran enak sih nasi gorengnya wkkw suasana nya enak buat makan ramai2 gitu .

Output: Sengaja macet2an kesini cuman buat nyobain nasi goreng cakalang orang2 bilang enak . Dan emang beneran enak sih nasi gorengnya wkkw suasana nya enak buat makan ramai2 gitu .

Input: Suka sama bebek karna dulu d ajak tmn makan sini , ehh malah jd ketagihan sama dagingnya yg empuk sambel mentah nya yg dasyatttt Dulu tempatnya tenda , sekarang udh kiosnya , kursinya lumayan banyak toilet nya juga.. Kalo makan bebek selalu order bebek , nasi uduk , sate rempela , sambel mentah ekstra pedas es teh manis , sambel mentah nya request pedasnya..

Output: Suka sama bebek karna dulu d ajak tmn makan sini , ehh malah jd ketagihan sama dagingnya yg empuk sambel mentah nya yg dasyatttt Dulu tempatnya tenda , sekarang udh kiosnya , kursinya lumayan banyak toilet nya juga.. Kalo makan bebek selalu order bebek , nas

#### 3. Object Standardization

Text data often contains words or phrases which are not present in any standard lexical dictionaries. These pieces are not recognized by search engines and models.

Some of the examples are – acronyms, hashtags with attached words, and colloquial slangs. With the help of regular expressions and manually prepared data dictionaries, this type of noise can be fixed, the code below uses a dictionary lookup method to replace social media slangs from a text.

1) Handling Appostrophes

To avoid any word sense disambiguation in text, it is recommended to maintain proper structure in it and to abide by the rules of context free grammar. When apostrophes are used, chances of disambiguation increases.
For example “it’s is a contraction for it is or it has”. All the apostrophes should be converted into standard lexicons.

##### English

In [34]:
appostrophes_dict = {"'s": "is", "'re": "are", "'m": "am", "'ve": "ve", "'d": "would", "'ll": "will", "'t": "ot", "nt": "not"}

In [35]:
joint_review = []

for id, review in enumerate(en_processed_reviews):
    review = "".join(review)
    tokens = word_tokenize(review)
    
    joint_token = [appostrophes_dict[token] if token in appostrophes_dict else token for token in tokens]
    joint_token_str = " ".join(joint_token)
    
    joint_review.append(joint_token_str)
    
en_processed_reviews = joint_review

In [36]:
# display_reviews(en_processed_reviews)

##### Indo

Nothing. There is no appostrophes behaviour in indonesian language

2) Removal of Punctuations

All the punctuation marks according to the priorities should be dealt with. For example: “.”, “,”,”?” are important punctuations that should be retained while others need to be removed.

##### English

In [37]:
punctuations_dict = '''!()-[]{};:'"\,<>./@#$%^&*_~''' # not includes !(),.?

In [38]:
joint_review = []

for id, review in enumerate(en_processed_reviews):
    review = "".join(review)
    processed_review = ""
    for token in review:
        if token not in punctuations_dict:
            processed_review = processed_review + token

    joint_review.append(processed_review)

en_processed_reviews = joint_review

In [39]:
# display_reviews(en_processed_reviews)

##### Indo

In [40]:
punctuations_dict = '''!()-[]{};:'"\,<>./@#$%^&*_~''' # not includes !(),.?

In [41]:
joint_review = []

for id, review in enumerate(id_processed_reviews):
    review = "".join(review)
    processed_review = ""
    for token in review:
        if token not in punctuations_dict:
            processed_review = processed_review + token

    joint_review.append(processed_review)

id_processed_reviews = joint_review

In [42]:
# display_reviews(id_processed_reviews)

3) Removal of whitespace noise

There is a need to remove unneeded whitespace in a sentences like "because it is right  .", "Besides  , there is..."

##### English

In [43]:
joint_review = []

for id, review in enumerate(en_processed_reviews):
    review = "".join(review)
    processed_review = review.lower()
    processed_review = processed_review.replace(" .", ".")
    processed_review = processed_review.replace(" ,", ".")
    processed_review = processed_review.replace("  ", " ")
    processed_review = processed_review.replace("   ", " ")
    processed_review = processed_review.replace("    ", " ")
    
    joint_review.append(processed_review)

en_processed_reviews = joint_review

In [44]:
# display_reviews(en_processed_reviews)

##### Indo

In [45]:
joint_review = []

for id, review in enumerate(id_processed_reviews):
    review = "".join(review)
    processed_review = review.replace(" .", ".")
    processed_review = processed_review.replace(" ,", ".")
    processed_review = processed_review.replace("  ", " ")
    processed_review = processed_review.replace("   ", " ")
    processed_review = processed_review.replace("    ", " ")
    
    joint_review.append(processed_review)

id_processed_reviews = joint_review

In [46]:
# display_reviews(id_processed_reviews)

4) Standardizing words

Sometimes words are not in proper formats. For example: “I looooveee you” should be “I love you”. Simple rules and regular expressions can help solve these cases. Also, remove emoji content.

##### English

In [47]:
joint_review = []

for id, review in enumerate(en_processed_reviews):
    review = "".join(review)
    review = ''.join(''.join(s)[:2] for _, s in itertools.groupby(review))
    tokens = word_tokenize(review)
    
    joint_token = []
    for token in tokens:
        if token not in english_vocab:
            token = ''.join(''.join(s)[:1] for _, s in itertools.groupby(token))
    
        joint_token.append(token)
    
    joint_token_str = " ".join(joint_token)
    
    # remove emoji content
    emoji_pattern = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           "]+", flags=re.UNICODE)

    joint_token_str = emoji_pattern.sub(r'', joint_token_str)
    
    joint_review.append(joint_token_str)

en_processed_reviews = joint_review

In [48]:
# display_reviews(en_processed_reviews)

##### Indo

In [49]:
joint_review = []

for id, review in enumerate(id_processed_reviews):
    review = "".join(review)
    review = ''.join(''.join(s)[:2] for _, s in itertools.groupby(review))
    tokens = word_tokenize(review)
    
    joint_token = []
    for token in tokens:
        if token not in english_vocab:
            token = ''.join(''.join(s)[:1] for _, s in itertools.groupby(token))
    
        joint_token.append(token)
    
    joint_token_str = " ".join(joint_token)   
    
    # remove emoji content
    emoji_pattern = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           "]+", flags=re.UNICODE)

    joint_token_str = emoji_pattern.sub(r'', joint_token_str)
    
    joint_review.append(joint_token_str)

id_processed_reviews = joint_review

In [50]:
# display_reviews(id_processed_reviews)

#### Display both training data

In [51]:
# uncomment this to display the last processed text for english
display_reviews(en_processed_reviews)

1) i love concept i feel like swiss traditional market the place amazing the food awesome but opinion need make changerotation menu even new menu i choose place lunch frequently sometimes i feel bored menu overall thanks marche delicious food also nice place

2) very good delish gokils deh enaknya highly recomended gyutan semur iga sapi ayam pangang good deserts also good

3) best place date someone good ambiance nice interior decent price used best hamburger favorite alfredo carbonara prince house seharusnya saya rate 50 its affordable waffle longer price worth anymore

4) cheese cake nya juara lembut tempatnya enak cozy parkiranya pun lumayan luas

5) been twice waktu itu gue ke sini pas udah rada malem as know wargih ini selalu rame gapernah sepi

6) great concept relatable name marche the food varieties arent many i always come roasted chicken price nt high good interior

7) best bingsu yg pernah gue coba paling mirip sama aslinya yg di korea pesen patbingsu shaved icenya kerasa bg

In [52]:
# uncomment this to display the last processed text for indonesian
display_reviews(id_processed_reviews)

1) Sengaja macet2an kesini cuman buat nyobain nasi goreng cakalang orang2 bilang enak Dan emang beneran enak sih nasi gorengnya wkw suasana nya enak buat makan ramai2 gitu

2) Suka sama bebek karna dulu d ajak tmn makan sini eh malah jd ketagihan sama dagingnya yg empuk sambel mentah nya yg dasyat Dulu tempatnya tenda sekarang udh kiosnya kursinya lumayan banyak toilet nya juga Kalo makan bebek selalu order bebek nasi uduk sate rempela sambel mentah ekstra pedas es teh manis sambel mentah nya request pedasnya

3) Tempat dessert kelapa fresh banget Kalo kesini paling suka beli coco pouchnya ngobrol sama temen Harga nya bersahabat banget lumayan banyak isi coco pouch nya Paling suka coco pouch rasa honeydew asanya seger banget bener energy potion

4) Tryin menya sakura ramen for the first time with bunch of my friends First we re bit doubting but the sign in front of resto quite big sayin that ` Japan no 1 Ramen so we decided to give it a try Most of us tryin the tonkotsu ramen the spicy

In [53]:
len(en_labels_corpus), len(en_processed_reviews), len(id_labels_corpus), len(id_processed_reviews)

(22, 22, 28, 28)

In [124]:
processed_reviews = en_processed_reviews + id_processed_reviews

#### Train data

In [126]:
food = []
price = []
service = []
ambience = []
for id, label in enumerate(en_labels_corpus):
    food.append(label[0])
    price.append(label[1])
    service.append(label[2])
    ambience.append(label[3])
    
train_reviews = {'food': food, 'price': price, 'service': service, 'ambience': ambience, 'review': processed_reviews}

In [55]:
df_review = pd.DataFrame(data=train_reviews)    
df_review = df_review[['review', 'food', 'price', 'service', 'ambience']]
df_review.head()

Unnamed: 0,review,food,price,service,ambience
0,i love concept i feel like swiss traditional m...,1,0,0,1
1,very good delish gokils deh enaknya highly rec...,1,0,0,0
2,best place date someone good ambiance nice int...,1,-1,0,1
3,cheese cake nya juara lembut tempatnya enak co...,1,0,0,1
4,been twice waktu itu gue ke sini pas udah rada...,1,0,0,0


In [56]:
train_text = df_review['review']
train_labels = df_review[['food', 'price', 'service', 'ambience']]

#### test data

In [62]:
food = []
price = []
service = []
ambience = []
for id, label in enumerate(test_labels_corpus):
    food.append(label[0])
    price.append(label[1])
    service.append(label[2])
    ambience.append(label[3])
    
test_reviews = {'food': food, 'price': price, 'service': service, 'ambience': ambience, 'review': test_text_corpus}

In [95]:
df_review_test = pd.DataFrame(data=test_reviews)
df_review_test = df_review_test[['review', 'food', 'price', 'service', 'ambience']]
df_review_test.fillna(0, inplace=True)
df_review_test.head()

Unnamed: 0,review,food,price,service,ambience
0,Iseng banget kesini sama temen karena udah la...,1.0,0.0,1.0,1.0
1,Ke patbingsoo karena pengen coba sojunya . Pe...,0.0,-1.0,0.0,0.0
2,Restoran babi guling a la Bali ini cukup meny...,1.0,0.0,0.0,0.0
3,Cafe dengan konsep unik menjual segala jenis ...,1.0,0.0,1.0,1.0
4,Kalo ke BSD wajib kesini . Ada di lantai 2 br...,1.0,0.0,-1.0,1.0


In [108]:
test_text = df_review_test['review']
test_labels = df_review_test[['food', 'price', 'service', 'ambience']]

## Text to Features

**Text to Features (Feature Engineering on text data)**

1) Syntactical Parsing
- Dependency Grammar
- Part of Speech Tagging

2) Entity Parsing
- Phrase Detection
- Named Entity Recognition
- Topic Modelling
- N-Grams

3) Statistical features
- TF – IDF
- Frequency / Density Features
- Readability Features

4) Word Embeddings

## Text Classification

#### Support Vector Machine

In [109]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import svm
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

In [110]:
# create feature vectors 
vectorizer = TfidfVectorizer(min_df=4, max_df=0.9)

# apply train feature vectors
train_vectors = vectorizer.fit_transform(train_text)

# apply test feature vectors
test_vectors = vectorizer.transform(test_text)

In [120]:
svm_classifier = svm.SVC(kernel='linear') 
svm_classifier.fit(train_vectors, train_labels['food']) 

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [121]:
prediction = svm_classifier.predict(test_vectors)

In [122]:
print("Test Accuracy: ", accuracy_score(test_labels['food'], prediction)* 100)

Test Accuracy:  62.0


#### Naive Bayes Classifier

In [None]:
from textblob.classifiers import NaiveBayesClassifier as NBC
from textblob import TextBlob

training_corpus = []
for id, en_review in enumerate(en_processed_reviews):
    training_corpus.append((en_review, en_labels_corpus[id]))

test_corpus = []

nb_classifier = NBC(training_corpus)

In [None]:
print(('food', 'price', 'service', 'ambience'))
print(nb_classifier.classify("enjoy place fatty tho berry nic dessert myriad text"))

## Testing model

In [59]:
tree = et.parse('./datasets/validation_set.xml')
root = tree.getroot()

In [60]:
test_reviews_corpus = []
test_labels_corpus = []
test_text_corpus = []

# grab all XML contents
for review in root.findall('review'):
    rid = review.get('rid')
    text = review.find('text').text 
    
    label = ""
    for aspects in review.findall('aspects'):
        id = aspects.get('id')
        
        food, price, service, ambience = None, None, None, None
        
        label = (food, price, service, ambience)
        for aspect in aspects.findall('aspect'):
            category = aspect.get('category')
            polarity = aspect.get('polarity')

            if category == "FOOD":
                if polarity == 'POSITIVE': food = 1
                else: food = -1
            elif category == "PRICE":
                if polarity == 'POSITIVE': price = 1
                else: price = -1
            elif category == "SERVICE":
                if polarity == 'POSITIVE': service = 1
                else: service = -1
            elif category == "AMBIENCE":
                if polarity == 'POSITIVE': ambience = 1
                else: ambience = -1

            label = (food, price, service, ambience)

        test_labels_corpus.append(label)           
            
    test_text_corpus.append(text)

In [61]:
for id, _ in enumerate(test_labels_corpus):
    test_reviews_corpus.append([test_labels_corpus[id], test_text_corpus[id]])

In [None]:
# display_reviews(test_labels_corpus)