Make a model to predict the rating in a review
#### **Dataset used - Zomato reviews**

**Import necessary libraries**

In [None]:
import pandas as pd
import numpy as np
import re         #import regular expression

**Load data**

In [None]:
# Colab-only: opens a file-upload prompt; the chosen file (Zomato_reviews.csv)
# is saved to the working directory so it can be read by name afterwards.
from google.colab import files
uploaded = files.upload()

Saving Zomato_reviews.csv to Zomato_reviews.csv


In [None]:
# Load the raw reviews (columns: rating, review_text).
# NOTE(review): the 'unicode_escape' codec also interprets backslash sequences
# that appear inside the text — confirm this is intended rather than e.g. 'latin-1'.
reviews0 = pd.read_csv("Zomato_reviews.csv", encoding= 'unicode_escape')

In [None]:
reviews0.head()

Unnamed: 0,rating,review_text
0,1.0,"Their service is worst, pricing in menu is dif..."
1,5.0,really appreciate their quality and timing . I...
2,4.0,"Went there on a Friday night, the place was su..."
3,4.0,A very decent place serving good food.\r\nOrde...
4,5.0,One of the BEST places for steaks in the city....


**Remove records where review text is null value**

In [None]:
reviews0.describe(include="all")

Unnamed: 0,rating,review_text
count,27762.0,27748
unique,,10548
top,,good
freq,,278
mean,3.665784,
std,1.284573,
min,1.0,
25%,3.0,
50%,4.0,
75%,5.0,


In [None]:
# Keep only the rows that have review text and renumber them 0..n-1.
# Both calls return new frames, so reviews0 is left untouched.
reviews1 = reviews0.dropna(subset=["review_text"]).reset_index(drop=True)

In [None]:
reviews0.shape, reviews1.shape

((27762, 2), (27748, 2))

**Converting to list for easy manipulation**

In [None]:
# Plain Python list of review strings (the previous `.values` produced a NumPy
# object array, not the list this section describes).
reviews_list = reviews1.review_text.tolist()

In [None]:
len(reviews_list)

27748

**Perform cleanup on data**

Normalizing case

In [None]:
# Lower-case every review so that "Good" and "good" count as the same term.
reviews_lower = list(map(str.lower, reviews_list))

In [None]:
reviews_lower[2:4]

['went there on a friday night, the place was surprisingly empty. interesting menu which is almost fully made of dosas. i had bullseye dosa and cheese masala dosa. the bullseye dosa was really good, with the egg perfectly cooked to a half boiled state. the masala in the cheese masala was good, but the cheese was a bit too chewy for my liking. the chutney was good, the sambar was average. the dishes are reasonably priced.',
 'a very decent place serving good food.\r\nordered chilli fish, chicken & pork sizzler.\r\neverything tasted good but pork could have been slightly better cooked.\r\ntried 2 beverages, both were very sweet.']

Remove extra line breaks from text

In [None]:
# str.split() without arguments splits on any run of whitespace (including
# \r\n), so re-joining with single spaces flattens each review onto one line.
reviews_lower = [
    " ".join(review.split())
    for review in reviews_lower
]

In [None]:
reviews_lower[2:4]

['went there on a friday night, the place was surprisingly empty. interesting menu which is almost fully made of dosas. i had bullseye dosa and cheese masala dosa. the bullseye dosa was really good, with the egg perfectly cooked to a half boiled state. the masala in the cheese masala was good, but the cheese was a bit too chewy for my liking. the chutney was good, the sambar was average. the dishes are reasonably priced.',
 'a very decent place serving good food. ordered chilli fish, chicken & pork sizzler. everything tasted good but pork could have been slightly better cooked. tried 2 beverages, both were very sweet.']

Tokenize

#### Tokenization splits each review into individual words and punctuation marks (tokens).
#### It is the foundation for later steps such as stop-word removal, stemming and lemmatization.

In [None]:
import nltk
from nltk.tokenize import word_tokenize

# word_tokenize needs the Punkt tokenizer data, so fetch it here, before the
# first tokenization call; otherwise a fresh environment fails with LookupError.
nltk.download('punkt')
# Newer NLTK releases look up 'punkt_tab' instead of 'punkt'; fetching both
# keeps the notebook runnable across versions.
nltk.download('punkt_tab')

In [None]:
print(word_tokenize(reviews_lower[0]))

['their', 'service', 'is', 'worst', ',', 'pricing', 'in', 'menu', 'is', 'different', 'from', 'bill', '.', 'they', 'can', 'give', 'you', 'a', 'bill', 'with', 'increased', 'pricing', '.', 'even', 'for', 'serving', 'water', ',', 'menu', ',', 'order', 'you', 'need', 'to', 'call', 'them', '3-4', 'times', 'even', 'on', 'a', 'non', 'busy', 'day', '.']


In [None]:
# word_tokenize relies on NLTK's 'punkt' tokenizer data; on a fresh environment
# this download has to run before the first word_tokenize call.
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
# Tokenize every normalized review; each entry becomes a list of word and
# punctuation tokens. Print the first one as a sanity check.
reviews_tokens = list(map(word_tokenize, reviews_lower))
print(reviews_tokens[0])

['their', 'service', 'is', 'worst', ',', 'pricing', 'in', 'menu', 'is', 'different', 'from', 'bill', '.', 'they', 'can', 'give', 'you', 'a', 'bill', 'with', 'increased', 'pricing', '.', 'even', 'for', 'serving', 'water', ',', 'menu', ',', 'order', 'you', 'need', 'to', 'call', 'them', '3-4', 'times', 'even', 'on', 'a', 'non', 'busy', 'day', '.']


Remove stop words and punctuation

In [None]:
from nltk.corpus import stopwords
from string import punctuation
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# English stop words shipped with NLTK, plus each ASCII punctuation character
# as its own one-character token.
stop_nltk = stopwords.words("english")
stop_punct = [symbol for symbol in punctuation]

In [None]:
print(stop_nltk)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

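Remove negation terms ("no", "not", "don", "won") from the stop words list so that they stay in the reviews; negations carry sentiment (e.g. "not good")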

In [None]:
# Negation words flip the sentiment of a review ("not good"), so they must
# survive stop-word removal: take them out of the stop list.
# Filtering instead of list.remove keeps this cell safe to re-run (and safe if
# a term is absent): list.remove raises ValueError once the term is gone.
stop_nltk = [
    term for term in stop_nltk
    if term not in ("no", "not", "don", "won")
]

In [None]:
"no" in stop_nltk

False

In [None]:
# Everything that del_stop should drop: NLTK stop words (minus the negations),
# single punctuation characters, and a few extra tokens listed explicitly.
stop_final = [*stop_nltk, *stop_punct, "...", "``", "''", "====", "must"]

In [None]:
len(stop_nltk)

175

Define a function to remove stopwords from a tokenized sentence

In [None]:
def del_stop(sent, stop_terms=None):
    """Return the tokens of ``sent`` that are not stop terms, in original order.

    Parameters
    ----------
    sent : iterable of str
        A tokenized sentence/review.
    stop_terms : iterable of str, optional
        Terms to drop. Defaults to the notebook-level ``stop_final`` list.

    Returns
    -------
    list of str
        The retained tokens.
    """
    # Build a set once per call so each token costs one hash lookup rather
    # than a scan of the whole stop list.
    blocked = set(stop_final if stop_terms is None else stop_terms)
    return [term for term in sent if term not in blocked]

In [None]:
del_stop(reviews_tokens[1])

['really',
 'appreciate',
 'quality',
 'timing',
 'tried',
 'thattil',
 'kutti',
 'dosa',
 "'ve",
 'addicted',
 'dosa',
 'really',
 'chutney',
 'really',
 'good',
 'money',
 'worth',
 'much',
 'better',
 'thattukada',
 'try']

In [None]:
# Drop stop words and punctuation from every tokenized review.
reviews_clean = list(map(del_stop, reviews_tokens))

In [None]:
# Rebuild the cleaned text from reviews_tokens rather than from reviews_clean
# itself: joining reviews_clean in place is only correct on the first run — on
# a re-run the entries are already strings and " ".join would space out their
# individual characters.
reviews_clean = [" ".join(del_stop(sent)) for sent in reviews_tokens]
reviews_clean[2:4]

['went friday night place surprisingly empty interesting menu almost fully made dosas bullseye dosa cheese masala dosa bullseye dosa really good egg perfectly cooked half boiled state masala cheese masala good cheese bit chewy liking chutney good sambar average dishes reasonably priced',
 'decent place serving good food ordered chilli fish chicken pork sizzler everything tasted good pork could slightly better cooked tried 2 beverages sweet']

Separate X and y and perform the train/test split

In [None]:
len(reviews_clean)

27748

In [None]:
# Features: cleaned review text. Target: the rating given with each review.
X, y = reviews_clean, reviews1["rating"]

In [None]:
from sklearn.model_selection import train_test_split

# 70-30 split: hold out 30% of the reviews for testing. The fixed random_state
# keeps the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=32,
)

Document term matrix using TfIdf
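#### We use the TF-IDF values of the terms as features, which places every review in a vector space model

##### Each review becomes a vector with one dimension per unique term in the vocabulary, so the vectors are large (and sparse).
The TF-IDF weight of a term is high when it is frequent in a review but rare across the whole collection; these vectors are the features the model is trained on.
`TfidfVectorizer` learns the vocabulary, computes the TF-IDF weights and normalizes the resulting tf-idf representation of each document.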

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
vectorizer = TfidfVectorizer(max_features = 5000)    #instantiate with a maximum of 5000 terms in the vocabulary

In [None]:
len(X_train), len(X_test)    # number of reviews in the train and test splits

(19423, 8325)

In [None]:
X_train_bow = vectorizer.fit_transform(X_train)

In [None]:
X_test_bow = vectorizer.transform(X_test)

In [None]:
X_train_bow.shape, X_test_bow.shape

((19423, 5000), (8325, 5000))

**Model building** - Random Forest Regressor

In [None]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

In [None]:
?RandomForestRegressor

In [None]:
learner_rf = RandomForestRegressor(random_state=52)

In [None]:
learner_rf.fit(X_train_bow, y_train)

RandomForestRegressor(random_state=52)

Make predictions for train set

In [None]:
y_train_preds = learner_rf.predict(X_train_bow)

In [None]:
from sklearn.metrics import mean_squared_error

Model evaluation

In [None]:
mean_squared_error(y_train, y_train_preds)**0.5

0.2501895246121958

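Changing the number of trees (`n_estimators=20`; note that this is fewer than the scikit-learn default of 100 trees used above, so the forest gets smaller, not larger)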

In [None]:
learner_rf = RandomForestRegressor(random_state=42, n_estimators = 20)

In [None]:
%%time
learner_rf.fit(X_train_bow, y_train)

CPU times: user 1min 30s, sys: 118 ms, total: 1min 30s
Wall time: 1min 30s


RandomForestRegressor(n_estimators=20, random_state=42)

In [None]:
y_train_preds = learner_rf.predict(X_train_bow)

In [None]:
mean_squared_error(y_train, y_train_preds)**0.5

0.2611742983331224

Hyper-parameter tuning
##### For the random forest we tune `max_features` and `max_depth` with a grid search (for an SVM, class weights would be one such hyperparameter to tune)

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
?RandomForestRegressor

In [None]:
learner_rf = RandomForestRegressor(random_state=42)

In [None]:
# Parameter grid explored by the grid search: 4 x 3 = 12 candidates.
# max_features=None means "consider all features at every split"; it replaces
# the string "auto", which meant the same for regressors but was deprecated in
# scikit-learn 1.1 and removed in 1.3 (where it raises an error).
param_grid = {
    'max_features': [500, "sqrt", "log2", None],
    'max_depth': [10, 15, 20]
}

In [None]:
# Instantiate the grid search model
grid_search = GridSearchCV(estimator = learner_rf, param_grid = param_grid, 
                          cv = 5, n_jobs = -1, verbose = 1, scoring = "neg_mean_squared_error" )

In [None]:
grid_search.fit(X_train_bow, y_train)

Fitting 5 folds for each of 12 candidates, totalling 60 fits


GridSearchCV(cv=5, estimator=RandomForestRegressor(random_state=42), n_jobs=-1,
             param_grid={'max_depth': [12, 15, 25],
                         'max_features': [500, 'sqrt', 'log2', 'auto']},
             scoring='neg_mean_squared_error', verbose=1)

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
grid_search.best_estimator_

RandomForestRegressor(max_depth=20, max_features=500, random_state=42)

In [None]:
y_train_pred = grid_search.best_estimator_.predict(X_train_bow)

In [None]:
y_test_pred = grid_search.best_estimator_.predict(X_test_bow)

In [None]:
mean_squared_error(y_train, y_train_pred)**0.5

0.6672598904578024

In [None]:
mean_squared_error(y_test, y_test_pred)**0.5

0.7150209370360346

Identify mismatch cases

In [None]:
# Side-by-side table of cleaned review text, actual rating and predicted rating
# for the test set. X_test is a plain list and y_test_pred an array, so the
# frame takes its row labels from y_test.
res_df = pd.DataFrame({'review':X_test, 'rating':y_test, 'rating_pred':y_test_pred})

In [None]:
# Size of the subset where the actual rating exceeds the prediction by 2+ points.
# NOTE(review): this is one-sided — over-predictions (prediction at least 2
# above the actual rating) are not counted; confirm whether that is intended.
res_df.query("rating - rating_pred >= 2").shape

(10, 3)

In [None]:
# Rows where the actual rating exceeds the prediction by 2+ points.
# NOTE(review): this is one-sided — over-predictions (prediction at least 2
# above the actual rating) are not shown; confirm whether that is intended.
res_df.query("rating - rating_pred >= 2")

Unnamed: 0,review,rating,rating_pred
24935,one incredible far east asian meals shizusan s...,5.0,2.184479
24577,'m not dessert person given chance explore swe...,5.0,2.994034
15201,sauce not included,4.0,1.979382
5060,yammi food fast delivered 'm loving,5.0,2.342219
26059,tried biriyani starters authentic taste spicy ...,5.0,2.575845
18600,black cup cafe lovely ambience magnificent pla...,5.0,2.997119
4893,delivered 5,5.0,2.821523
24930,one incredible far east asian meals shizusan s...,5.0,2.184479
19769,food tasty.before delivery bad 's time.im happ...,5.0,2.444245
16916,delivered time really liked food,5.0,2.994818
