In [81]:
import os, json, gzip 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import string
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('words')
nltk.download('averaged_perceptron_tagger')
nltk.download('omw-1.4')
from nltk.corpus import wordnet, stopwords

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression

[nltk_data] Downloading package stopwords to /Users/user/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/user/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /Users/user/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package words to /Users/user/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/user/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package omw-1.4 to /Users/user/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


## Data Preparation

In [82]:
# Read the restaurant table from disk and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='datasets/final_data.csv')
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,name,city,cuisine_style,ranking,rating,price_range,number_of_reviews,reviews,restaurant_url,restaurant_id
0,0,McDonald's,Zurich,['American'],1540.0,2.5,$,11,"['Hangover', 'Most expensive McDonalds in the ...",https://www.tripadvisor.com/Restaurant_Review-...,d8796498
1,2,Yumi Hana,Zurich,"['Korean', 'Japanese']",1542.0,3.0,$$ - $$$,50,['What once was a 5 points Restaurant is clo.....,https://www.tripadvisor.com/Restaurant_Review-...,d2070374
2,3,Peking Garden,Zurich,"['Chinese', 'Asian']",1544.0,2.5,$,41,"['Just if no other choice!', 'Ok Chinese food']",https://www.tripadvisor.com/Restaurant_Review-...,d2005395
3,4,Long Huang,Zurich,"['Chinese', 'Vietnamese']",1545.0,2.5,$$ - $$$,14,"['Good', 'As the name implies, long wait and b...",https://www.tripadvisor.com/Restaurant_Review-...,d11963037
4,5,Restaurant Hallo,Zurich,"['European', 'Cafe']",1547.0,3.0,$$ - $$$,21,"['Really just ok', 'BRASILEIROS NÃO SÃO BEM-VI...",https://www.tripadvisor.com/Restaurant_Review-...,d7195130


In [83]:
# Preview only: the result is not assigned, so `df` itself keeps every column.
df.drop(
    columns=[
        'Unnamed: 0',
        'name',
        'city',
        'cuisine_style',
        'ranking',
        'price_range',
        'number_of_reviews',
        'restaurant_url',
    ]
)

Unnamed: 0,rating,reviews,restaurant_id
0,2.5,"['Hangover', 'Most expensive McDonalds in the ...",d8796498
1,3.0,['What once was a 5 points Restaurant is clo.....,d2070374
2,2.5,"['Just if no other choice!', 'Ok Chinese food']",d2005395
3,2.5,"['Good', 'As the name implies, long wait and b...",d11963037
4,3.0,"['Really just ok', 'BRASILEIROS NÃO SÃO BEM-VI...",d7195130
...,...,...,...
71590,2.5,"['Good food and nice experience', 'Disappointi...",d697907
71591,3.5,"['Good service', 'nice atmoshphere']",d1187556
71592,2.0,"['Worst New Year’s Eve experience', 'HORRIBLE']",d939089
71593,3.0,"['Horrible!', 'It was really horrible, I would...",d1551957


## Fundamental preprocessing tasks

In [85]:
stopwords_list = stopwords.words('english')

def ReviewProcessing(df):
  """Add a tokenised, stop-word-free ``review_cleaned`` column to ``df``.

  Steps applied to the ``reviews`` column, in order:
    1. delete every character that is not an ASCII letter, digit or space,
    2. lowercase,
    3. split on single spaces,
    4. drop tokens found in ``stopwords_list``.

  Parameters
  ----------
  df : pandas.DataFrame
      Must contain a ``reviews`` column of strings. Missing values are
      tolerated and produce an empty token list.

  Returns
  -------
  pandas.DataFrame
      The same object that was passed in (it is modified in place), now
      carrying a ``review_cleaned`` column holding a list of tokens per row.
  """
  # Set membership is O(1); the module-level list is left untouched.
  stop_set = set(stopwords_list)

  def _drop_stopwords(tokens):
    # Missing reviews pass through the .str methods as NaN, not as a list.
    if not isinstance(tokens, list):
      return []
    return [tok for tok in tokens if tok not in stop_set]

  df['review_cleaned'] = (
      df['reviews']
      # regex=True must be explicit: from pandas 2.0 the default is a literal
      # match, which would leave punctuation in place without any error.
      .str.replace('[^a-zA-Z0-9 ]', '', regex=True)
      .str.lower()
      .str.split(' ')
      .apply(_drop_stopwords)
  )
  return df

## Lemmatization

In [86]:
def get_wordnet_pos(word):
  """Return the WordNet POS constant for a single ``word``.

  The word is tagged on its own with ``nltk.pos_tag`` and only the first
  letter of the resulting tag is inspected: J -> adjective, N -> noun,
  V -> verb, R -> adverb. Any other tag falls back to noun.
  """
  tagged = nltk.pos_tag([word])
  first_letter = tagged[0][1][0].upper()

  if first_letter == "J":
    return wordnet.ADJ
  if first_letter == "V":
    return wordnet.VERB
  if first_letter == "R":
    return wordnet.ADV
  # "N" and every unrecognised tag are treated as nouns.
  return wordnet.NOUN

lemmatizer = nltk.stem.WordNetLemmatizer()
def get_lemmatize(sent):
  """Tokenise ``sent``, lemmatise each token with its own POS tag, and re-join with spaces."""
  lemmas = []
  for token in nltk.word_tokenize(sent):
    pos = get_wordnet_pos(token)
    lemmas.append(lemmatizer.lemmatize(token, pos))
  return " ".join(lemmas)

In [87]:
# Clean the reviews, collapse each token list back into one string,
# then lemmatise that string into a separate column.
clean_data = ReviewProcessing(df)
clean_data['review_cleaned'] = clean_data['review_cleaned'].map(' '.join)
clean_data['review_cleaned_lemmatized'] = clean_data['review_cleaned'].map(get_lemmatize)
clean_data.head()

  df['review_cleaned'] = df.reviews.str.replace('[^a-zA-Z0-9 ]', '')


Unnamed: 0.1,Unnamed: 0,name,city,cuisine_style,ranking,rating,price_range,number_of_reviews,reviews,restaurant_url,restaurant_id,review_cleaned,review_cleaned_lemmatized
0,0,McDonald's,Zurich,['American'],1540.0,2.5,$,11,"['Hangover', 'Most expensive McDonalds in the ...",https://www.tripadvisor.com/Restaurant_Review-...,d8796498,hangover expensive mcdonalds world,hangover expensive mcdonalds world
1,2,Yumi Hana,Zurich,"['Korean', 'Japanese']",1542.0,3.0,$$ - $$$,50,['What once was a 5 points Restaurant is clo.....,https://www.tripadvisor.com/Restaurant_Review-...,d2070374,5 points restaurant clo beware owner,5 point restaurant clo beware owner
2,3,Peking Garden,Zurich,"['Chinese', 'Asian']",1544.0,2.5,$,41,"['Just if no other choice!', 'Ok Chinese food']",https://www.tripadvisor.com/Restaurant_Review-...,d2005395,choice ok chinese food,choice ok chinese food
3,4,Long Huang,Zurich,"['Chinese', 'Vietnamese']",1545.0,2.5,$$ - $$$,14,"['Good', 'As the name implies, long wait and b...",https://www.tripadvisor.com/Restaurant_Review-...,d11963037,good name implies long wait bad ser,good name implies long wait bad ser
4,5,Restaurant Hallo,Zurich,"['European', 'Cafe']",1547.0,3.0,$$ - $$$,21,"['Really just ok', 'BRASILEIROS NÃO SÃO BEM-VI...",https://www.tripadvisor.com/Restaurant_Review-...,d7195130,really ok brasileiros bemvindos,really ok brasileiros bemvindos


### Train/Test Split and TF-IDF Naive Bayes Classifier

In [94]:
# Features: lemmatised review text. Target: rating cast to int.
# NOTE: astype(int) truncates, so half-star ratings (e.g. 4.5) fall into the
# class below (4) rather than being rounded.
x = clean_data['review_cleaned_lemmatized']
y = clean_data['rating'].astype(int)
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=44,
    stratify=y,
)

In [95]:
from sklearn.naive_bayes import MultinomialNB

# Naive Bayes pipeline: unigram + bigram counts -> TF-IDF weights -> MultinomialNB.
nb = Pipeline(steps=[
    ('vectorize', CountVectorizer(ngram_range=(1, 2))),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

### Linear Support Vector Machine

In [96]:
from sklearn.linear_model import SGDClassifier

# Linear SVM trained with SGD (SGDClassifier's default loss is hinge) on
# TF-IDF-weighted unigram + bigram counts.
# random_state is pinned because SGD shuffles the training data; without a
# seed the fitted model and its scores change on every run. 44 matches the
# seed used for the train/test split.
sgd = Pipeline([('vect', CountVectorizer(ngram_range=(1, 2))),
                ('tfidf', TfidfTransformer()),
                ('clf', SGDClassifier(random_state=44)),
               ])

### Logistic Regression Classifier

In [97]:
from sklearn.linear_model import LogisticRegression

# Logistic regression on TF-IDF-weighted unigram + bigram counts.
# random_state is pinned because the grid search below also tries the 'sag'
# and 'saga' solvers, which shuffle the data; without a seed the tuned scores
# are not reproducible. 44 matches the seed used for the train/test split.
logreg = Pipeline([('vect', CountVectorizer(ngram_range=(1, 2))),
                ('tfidf', TfidfTransformer()),
                ('clf', LogisticRegression(max_iter=500, random_state=44)),
               ])

In [98]:
def _fit_and_report(title, model):
  """Fit ``model`` on the training split, print test-set metrics, return predictions.

  Prints, in order: ``title``, accuracy, the confusion matrix and the
  classification report, all computed on ``X_test`` / ``y_test``.
  """
  model.fit(X_train, y_train)
  predictions = model.predict(X_test)
  print(title)
  print(accuracy_score(y_test, predictions))
  print(confusion_matrix(y_test, predictions))
  print(classification_report(y_test, predictions))
  return predictions

# One call per classifier; the prediction arrays keep their original names.
y_pred_nb = _fit_and_report("Naive Bayes:\n", nb)
y_pred_sgd = _fit_and_report("SGD Classifier:\n", sgd)
y_pred_log = _fit_and_report("Logistic regression:\n", logreg)

ValueError: Unknown label type: (array([4. , 4.5, 4. , ..., 4.5, 4. , 5. ]),)

In [None]:
from sklearn.model_selection import GridSearchCV

# Tune solver and regularisation strength of the logistic-regression pipeline
# with 5-fold cross-validation on accuracy (3 solvers x 3 C values).
grid = [
    {
        'clf__solver': ['lbfgs', 'sag', 'saga'],
        'clf__C': [0.01, 0.1, 1],
    }
]
lr = GridSearchCV(
    estimator=logreg,
    param_grid=grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=1,
)
# fit() returns the fitted search object, so best_model and lr are the same object.
best_model = lr.fit(X_train, y_train)

print(best_model.best_estimator_)
print(best_model.best_score_)

# Score the refitted best estimator on the held-out test split.
y_pred_grid = best_model.predict(X_test)
print(confusion_matrix(y_test, y_pred_grid))
print(classification_report(y_test, y_pred_grid))
print(accuracy_score(y_test, y_pred_grid))

Fitting 5 folds for each of 9 candidates, totalling 45 fits
Pipeline(steps=[('vect', CountVectorizer(ngram_range=(1, 2))),
                ('tfidf', TfidfTransformer()),
                ('clf', LogisticRegression(C=1, max_iter=500, solver='saga'))])
0.7097039209781292
[[   0    2   16   11    0]
 [   0    2  151  140    0]
 [   0    6  862 2558    2]
 [   0    5  545 9284    4]
 [   0    0    5  721    5]]
              precision    recall  f1-score   support

           1       0.00      0.00      0.00        29
           2       0.13      0.01      0.01       293
           3       0.55      0.25      0.34      3428
           4       0.73      0.94      0.82      9838
           5       0.45      0.01      0.01       731

    accuracy                           0.71     14319
   macro avg       0.37      0.24      0.24     14319
weighted avg       0.66      0.71      0.65     14319

0.7090578951044068


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
