# Import necessary libraries

In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd


In [4]:
# Load the restaurant reviews CSV into a DataFrame
dataset = pd.read_csv(filepath_or_buffer='Restaurant_Reviews 2.csv')
# Preview the first five rows (rendered as this cell's output)
dataset.head(5)

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


# Data preprocessing

In [6]:

import re
import nltk

nltk.download('stopwords')

from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# Entries excluded from the stopword list so that negation/contrast words
# survive cleaning. Note: cleaned tokens contain only letters (see the regex
# below), so the "won't" entry can never match a token; it is kept here only
# to mirror the original exclusion list.
KEPT_WORDS = {'not', 'no', 'but', "won't"}

# Build the stemmer and the stopword collection once, not once per review.
ps = PorterStemmer()
# Filtering (instead of list.remove) does not raise if a kept word happens to
# be absent from the installed stopword list.
all_stopwords = [word for word in stopwords.words('english') if word not in KEPT_WORDS]
stopword_set = set(all_stopwords)  # set gives constant-time membership checks

# Initialize corpus to store processed reviews (one cleaned string per row)
corpus = []

# Iterate over the column values directly so the loop does not depend on the
# DataFrame having a default 0..n-1 index.
for raw_review in dataset['Review']:
    # Replace every non-letter with a space, lowercase, and split into tokens
    tokens = re.sub('[^a-zA-Z]', ' ', raw_review).lower().split()
    # Drop stopwords, stem what remains, and re-join into a single string
    corpus.append(' '.join(ps.stem(word) for word in tokens if word not in stopword_set))


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/pritamthakulakshetri/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Feature extraction using Bag of Words

In [7]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words features: keep at most the 1500 most frequent terms of the corpus
cv = CountVectorizer(max_features=1500)
# Dense document-term count matrix, one row per cleaned review
X = cv.fit_transform(corpus).toarray()
# Target labels from the 'Liked' column (closing bracket was missing: SyntaxError)
y = dataset['Liked']


In [8]:

from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=0,
)

# Model selection and training

In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score


# Candidate classifiers as (display name, unfitted estimator) pairs.
# The random forest is seeded so its scores are repeatable across runs,
# matching the seeded train/test split.
models = [
    ('Logistic Regression', LogisticRegression(C=1.)),
    ('Naive Bayes', MultinomialNB()),
    ('Support Vector Machine', SVC(C=1., kernel='rbf')),
    ('Random Forest', RandomForestClassifier(random_state=0))
]


In [11]:

# Fit every candidate on the training split and report its hold-out accuracy
for model_name, model in models:
    # fit() hands back the fitted estimator, so prediction can be chained onto it
    y_pred = model.fit(X_train, y_train).predict(X_test)
    accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
    print(f'{model_name} accuracy: {accuracy}')


Logistic Regression accuracy: 0.78
Naive Bayes accuracy: 0.8
Support Vector Machine accuracy: 0.775
Random Forest accuracy: 0.77


In [12]:

# Model selection using cross-validation
# cross_val_score was used without ever being imported (NameError on a fresh kernel).
from sklearn.model_selection import cross_val_score

# Mean 10-fold accuracy on the training split for every candidate, keyed by name.
cv_mean_accuracy = {
    model_name: np.mean(cross_val_score(model, X_train, y_train, cv=10))
    for model_name, model in models
}

# best_model is consistently the *name* of the winning candidate (it was
# previously initialised to an estimator object and then overwritten with a
# string). max() returns the first maximum, so ties resolve to the earliest
# entry in `models`, same as the strict '>' comparison did.
best_model = max(cv_mean_accuracy, key=cv_mean_accuracy.get)
best_accuracy = cv_mean_accuracy[best_model]

print(f'Best model: {best_model}')
print(f'Accuracy with k-fold cross-validation: {best_accuracy}')


Best model: Logistic Regression
Accuracy with k-fold cross-validation: 0.80375



# Save the best model and vectorizer

In [13]:

import joblib

# Persist the model the artifact name promises: the file is called
# 'logistic_regression_...' and the recorded cross-validation result names
# Logistic Regression as best, but a MultinomialNB was being trained and saved.
# Same hyper-parameters as the 'Logistic Regression' entry in `models`.
classifier = LogisticRegression(C=1.)
classifier.fit(X_train, y_train)

joblib.dump(classifier, 'logistic_regression_NLPreviews.joblib')
joblib.dump(cv, 'vectorizer_reviews.joblib')

# Load the saved model and vectorizer
# NOTE: joblib.load unpickles arbitrary objects; only load files you created.
loaded_model = joblib.load('logistic_regression_NLPreviews.joblib')
loaded_vectorizer = joblib.load('vectorizer_reviews.joblib')

# Make predictions on new reviews.
# The vectorizer's vocabulary was built from cleaned, stopword-filtered,
# stemmed text, so new text has to go through the same steps before
# transform(); otherwise unstemmed words miss the vocabulary.
# Uses `re`, `ps` and `all_stopwords` from the preprocessing cell.
new_text = 'I like the food'
new_tokens = re.sub('[^a-zA-Z]', ' ', new_text).lower().split()
inference_stopwords = set(all_stopwords)
new_clean = ' '.join(ps.stem(word) for word in new_tokens if word not in inference_stopwords)

new_review = loaded_vectorizer.transform([new_clean])
predictions = loaded_model.predict(new_review)

# Display sentiment based on predictions (label 0 = not liked, otherwise liked)
if predictions[0] == 0:
    print('negative sentiment')
else:
    print('positive sentiment')


negative sentiment
