In [5]:
import pandas as pd
import numpy as np
import re


**Choosing File for Analysis**

In [6]:
from google.colab import files
uploaded = files.upload()

In [7]:
# Load the reviews CSV from the hard-coded /content path into a DataFrame.
# NOTE(review): 'unicode_escape' also interprets backslash escape sequences
# found inside the review text — confirm this is intended (vs. e.g. 'latin-1').
reviews0 = pd.read_csv("/content/Zomato_reviews.csv", encoding= 'unicode_escape')

In [8]:
reviews0.head()

Unnamed: 0,rating,review_text
0,1.0,"Their service is worst, pricing in menu is dif..."
1,5.0,really appreciate their quality and timing . I...
2,4.0,"Went there on a Friday night, the place was su..."
3,4.0,A very decent place serving good food.\r\nOrde...
4,5.0,One of the BEST places for steaks in the city....


In [9]:
reviews0.describe(include="all")

Unnamed: 0,rating,review_text
count,27762.0,27748
unique,,10548
top,,good
freq,,278
mean,3.665784,
std,1.284573,
min,1.0,
25%,3.0,
50%,4.0,
75%,5.0,


In [10]:
# Keep only the rows that have a review_text and renumber them from 0;
# reviews0 itself is left untouched.
reviews1 = reviews0.dropna(subset=["review_text"]).reset_index(drop=True)

In [11]:
reviews0.shape, reviews1.shape

((27762, 2), (27748, 2))

## **Converting to list for easy manipulation**


In [12]:
# Plain Python list of the review strings. Series.values would return a NumPy
# array, which contradicts both the variable name and the section heading.
reviews_list = reviews1.review_text.tolist()

In [13]:
len(reviews_list)

27748

Text clean up

Normalize the case

Remove stop words

         remove "not", "no" from the stop word list

Remove punctuations

Normalizing case 

**Stop words are a set of commonly used words in a language. For example, in English, “the”, “is” and “and” would easily qualify as stop words. In NLP and text-mining applications, stop words are filtered out because they carry little meaning, allowing applications to focus on the important words instead. Negations such as "no" and "not" are kept here because they carry sentiment.**

In [14]:
reviews_lower = [txt.lower() for txt in reviews_list]

In [15]:
reviews_lower[2:4]

['went there on a friday night, the place was surprisingly empty. interesting menu which is almost fully made of dosas. i had bullseye dosa and cheese masala dosa. the bullseye dosa was really good, with the egg perfectly cooked to a half boiled state. the masala in the cheese masala was good, but the cheese was a bit too chewy for my liking. the chutney was good, the sambar was average. the dishes are reasonably priced.',
 'a very decent place serving good food.\r\nordered chilli fish, chicken & pork sizzler.\r\neverything tasted good but pork could have been slightly better cooked.\r\ntried 2 beverages, both were very sweet.']

In [16]:
reviews_lower = [" ".join(txt.split()) for txt in reviews_lower]

In [17]:
reviews_lower[2:4]

['went there on a friday night, the place was surprisingly empty. interesting menu which is almost fully made of dosas. i had bullseye dosa and cheese masala dosa. the bullseye dosa was really good, with the egg perfectly cooked to a half boiled state. the masala in the cheese masala was good, but the cheese was a bit too chewy for my liking. the chutney was good, the sambar was average. the dishes are reasonably priced.',
 'a very decent place serving good food. ordered chilli fish, chicken & pork sizzler. everything tasted good but pork could have been slightly better cooked. tried 2 beverages, both were very sweet.']

## **Tokenize**

**Tokenization is used in natural language processing to split paragraphs and sentences into smaller units that can be more easily assigned meaning. The first step of the NLP process is gathering the data (a sentence) and breaking it into understandable parts (words)**

In [21]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [22]:
from nltk.tokenize import word_tokenize

In [23]:
print(word_tokenize(reviews_lower[0]))

['their', 'service', 'is', 'worst', ',', 'pricing', 'in', 'menu', 'is', 'different', 'from', 'bill', '.', 'they', 'can', 'give', 'you', 'a', 'bill', 'with', 'increased', 'pricing', '.', 'even', 'for', 'serving', 'water', ',', 'menu', ',', 'order', 'you', 'need', 'to', 'call', 'them', '3-4', 'times', 'even', 'on', 'a', 'non', 'busy', 'day', '.']


In [None]:
import nltk
nltk.download('punkt')

In [25]:
reviews_tokens = [word_tokenize(sent) for sent in reviews_lower]
print(reviews_tokens[0])


['their', 'service', 'is', 'worst', ',', 'pricing', 'in', 'menu', 'is', 'different', 'from', 'bill', '.', 'they', 'can', 'give', 'you', 'a', 'bill', 'with', 'increased', 'pricing', '.', 'even', 'for', 'serving', 'water', ',', 'menu', ',', 'order', 'you', 'need', 'to', 'call', 'them', '3-4', 'times', 'even', 'on', 'a', 'non', 'busy', 'day', '.']


## **Remove stop words and punctuations**

In [26]:
from nltk.corpus import stopwords
from string import punctuation
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [27]:
stop_nltk = stopwords.words("english")
stop_punct = list(punctuation)

In [28]:
print(stop_nltk)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [29]:
# Keep negation words out of the stop-word list so that phrases such as
# "not good" keep their negation after stop-word removal.
# The list is rebuilt with a filter instead of calling list.remove(): remove()
# raises ValueError when the word is already gone, which made this cell fail
# when it was run a second time.
negation_words = {"no", "not", "don", "won"}
stop_nltk = [word for word in stop_nltk if word not in negation_words]

In [30]:
"no" in stop_nltk

False

In [31]:
# Full exclusion list: NLTK stop words, single punctuation characters, and a
# few extra tokens to drop.
stop_final = [*stop_nltk, *stop_punct, "...", "``", "''", "====", "must"]


In [32]:
def del_stop(sent, stop_words=None):
    """Return the tokens of ``sent`` that are not stop words.

    Parameters
    ----------
    sent : iterable of str
        Tokens of a single review.
    stop_words : iterable of str, optional
        Tokens to drop. When omitted, the notebook-level ``stop_final``
        collection is used.

    Returns
    -------
    list of str
        The remaining tokens, in their original order.
    """
    if stop_words is None:
        stop_words = stop_final
    # Set membership is O(1); testing every token against a list re-scans the
    # whole list each time.
    blocked = set(stop_words)
    return [term for term in sent if term not in blocked]

In [33]:
del_stop(reviews_tokens[1])

['really',
 'appreciate',
 'quality',
 'timing',
 'tried',
 'thattil',
 'kutti',
 'dosa',
 "'ve",
 'addicted',
 'dosa',
 'really',
 'chutney',
 'really',
 'good',
 'money',
 'worth',
 'much',
 'better',
 'thattukada',
 'try']

In [34]:
reviews_clean = [del_stop(sent) for sent in reviews_tokens]

In [35]:
# Build the cleaned review strings straight from the token lists so this cell
# is safe to re-run: joining an already-joined string would put a space
# between every character instead.
reviews_clean = [" ".join(del_stop(sent)) for sent in reviews_tokens]
reviews_clean[:2]

['service worst pricing menu different bill give bill increased pricing even serving water menu order need call 3-4 times even non busy day',
 "really appreciate quality timing tried thattil kutti dosa 've addicted dosa really chutney really good money worth much better thattukada try"]

## **Separate X and Y and perform train test split, 70-30**

In [36]:
len(reviews_clean)

27748

In [37]:
X = reviews_clean
y = reviews1.rating

In [38]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30, random_state=42)


## **Document term matrix using TfIdf**

**TF-IDF stands for term frequency-inverse document frequency and it is a measure, used in the fields of information retrieval (IR) and machine learning, that can quantify the importance or relevance of string representations (words, phrases, lemmas, etc) in a document amongst a collection of documents**

In [39]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [40]:
vectorizer = TfidfVectorizer(max_features = 5000)

In [41]:
len(X_train), len(X_test)

(19423, 8325)

In [42]:
X_train_bow = vectorizer.fit_transform(X_train)

In [43]:
X_test_bow = vectorizer.transform(X_test)

In [44]:
X_train_bow.shape, X_test_bow.shape

((19423, 5000), (8325, 5000))

## **Model building**

**Random Forest Regression is a supervised learning algorithm that uses an ensemble learning method for regression. Ensemble learning is a technique that combines the predictions of multiple models to make a more accurate prediction than any single model.**

**Gradient Boosting Regression calculates the difference between the current prediction and the known correct target value. This difference is called the residual. Gradient Boosting Regression then trains a weak model that maps the features to that residual.**

In [45]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor


In [46]:
?RandomForestRegressor

In [47]:
learner_rf = RandomForestRegressor(random_state=42)

In [48]:
learner_rf.fit(X_train_bow, y_train)

RandomForestRegressor(random_state=42)

In [49]:
y_train_preds = learner_rf.predict(X_train_bow)

In [50]:
from sklearn.metrics import mean_squared_error

In [51]:
mean_squared_error(y_train, y_train_preds)**0.5

0.23684233164605095

## **Reducing the number of trees (20 instead of the default 100)**

In [52]:
learner_rf = RandomForestRegressor(random_state=42, n_estimators=20)


In [53]:
%%time
learner_rf.fit(X_train_bow, y_train)

CPU times: user 1min 28s, sys: 77.1 ms, total: 1min 28s
Wall time: 1min 28s


RandomForestRegressor(n_estimators=20, random_state=42)

In [54]:
y_train_preds = learner_rf.predict(X_train_bow)

In [55]:
mean_squared_error(y_train, y_train_preds)**0.5

0.2504450545985135

## **Hyper-parameter tuning**

**GridSearchCV exhaustively evaluates every combination of parameter values in a given grid, scoring each combination with cross-validation, and keeps the best-scoring one. It takes the model and the parameter grid as inputs. Once the best parameter values are found, the corresponding estimator is used to make predictions.**

In [56]:
from sklearn.model_selection import GridSearchCV

In [57]:
#?RandomForestRegressor

In [58]:
learner_rf = RandomForestRegressor(random_state=42)

In [59]:
# Candidate hyper-parameter values for the random forest grid search.
# max_features=None considers every feature at each split. It replaces the
# "auto" alias, which meant the same thing for regressors but was deprecated
# in scikit-learn 1.1 and removed in 1.3.
param_grid = {
    'max_features': [500, "sqrt", "log2", None],
    'max_depth': [10, 15, 20]
}

In [60]:
# 5-fold cross-validated grid search over param_grid, using all CPU cores and
# scored by negative mean squared error (higher is better).
grid_search = GridSearchCV(
    estimator=learner_rf,
    param_grid=param_grid,
    scoring="neg_mean_squared_error",
    cv=5,
    n_jobs=-1,
    verbose=1,
)


In [61]:
grid_search.fit(X_train_bow, y_train)

Fitting 5 folds for each of 12 candidates, totalling 60 fits


GridSearchCV(cv=5, estimator=RandomForestRegressor(random_state=42), n_jobs=-1,
             param_grid={'max_depth': [10, 15, 20],
                         'max_features': [500, 'sqrt', 'log2', 'auto']},
             scoring='neg_mean_squared_error', verbose=1)

In [66]:
from sklearn.model_selection import GridSearchCV

In [68]:
grid_search.cv_results_

{'mean_fit_time': array([  6.67649593,   1.23929586,   0.58996701,  56.9075305 ,
         13.53023577,   2.0161046 ,   0.7875577 ,  96.00638661,
         21.46116977,   3.15537758,   1.10550203, 127.41266508]),
 'std_fit_time': array([5.95140005e-02, 8.05844899e-03, 1.12614905e-02, 7.99941514e-01,
        6.62394026e-01, 3.40037283e-02, 1.13028329e-02, 1.27036633e+00,
        1.26310347e+00, 1.31766567e-01, 1.19956570e-02, 1.82852337e+01]),
 'mean_score_time': array([0.08005643, 0.07539454, 0.07441654, 0.08376241, 0.10951219,
        0.08794513, 0.07920198, 0.1132534 , 0.11121254, 0.1031702 ,
        0.10280061, 0.10157123]),
 'std_score_time': array([0.00252836, 0.00255889, 0.00213693, 0.00672914, 0.03346465,
        0.00540786, 0.00099286, 0.03329114, 0.00723074, 0.00862227,
        0.01633694, 0.01081882]),
 'param_max_depth': masked_array(data=[10, 10, 10, 10, 15, 15, 15, 15, 20, 20, 20, 20],
              mask=[False, False, False, False, False, False, False, False,
              

In [65]:
grid_search.best_estimator_

RandomForestRegressor(max_depth=20, max_features=500, random_state=42)

In [69]:
y_train_pred = grid_search.best_estimator_.predict(X_train_bow)

In [70]:
y_test_pred = grid_search.best_estimator_.predict(X_test_bow)


In [None]:
mean_squared_error(y_train, y_train_pred)**0.5

In [72]:
mean_squared_error(y_test, y_test_pred)**0.5

0.7272534467721806

In [73]:
res_df = pd.DataFrame({'review':X_test, 'rating':y_test, 'rating_pred':y_test_pred})

In [74]:
res_df[(res_df.rating - res_df.rating_pred)>=2].shape

(6, 3)

In [75]:
res_df[(res_df.rating - res_df.rating_pred)>=2]

Unnamed: 0,review,rating,rating_pred
7277,life saviours serving excellent food worst tim...,5.0,2.150063
1818,value money ordered second time,5.0,2.96441
4771,not good,5.0,2.189819
16510,may not polished serving packaging etc never b...,5.0,1.840416
14845,oh memories place first drink bangalore almost...,5.0,2.618844
16515,may not polished serving packaging etc never b...,5.0,1.840416
