In [2]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier

# Load the dataset (tab-separated; the 'Review' column holds the raw review text)
dataset = pd.read_csv('Restaurant_Reviews_Dataset.tsv', sep='\t')

# Preprocess the text data
nltk.download('stopwords')

# Build the stop-word set and the stemmer once: both are loop-invariant, and
# rebuilding the set for every single word dominated the preprocessing time.
# NOTE(review): NLTK's English stop-word list contains negations such as
# 'not'; dropping them may blur sentiment -- consider whether to keep them.
english_stopwords = set(stopwords.words('english'))
ps = PorterStemmer()

corpus = []
# Iterate over the column itself instead of a hard-coded range(0, 1000), so
# the corpus always has exactly one entry per dataset row and stays aligned
# with the labels regardless of the file's size or index labels.
for raw_review in dataset['Review']:
    # Replace every non-letter with a space, lower-case, split on whitespace
    words = re.sub(r'[^a-zA-Z]', ' ', raw_review).lower().split()
    # Drop stop words, stem what is left, and re-join into one string
    review = ' '.join(
        ps.stem(word) for word in words if word not in english_stopwords
    )
    corpus.append(review)

# Build the label vector (second dataset column) and the feature vector
# (cleaned review strings), then hold out 30% of the rows for testing.
y = dataset.iloc[:, 1].values
X = np.array(corpus)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Bag-of-words encoding: the vocabulary (capped at 1500 terms) is learned
# from the training split only and then applied unchanged to the test split.
cv = CountVectorizer(max_features=1500)
train_counts = cv.fit_transform(X_train)
test_counts = cv.transform(X_test)
X_train_cv = train_counts.toarray()
X_test_cv = test_counts.toarray()

def _fit_and_score(classifier, X_fit, y_fit, X_eval, y_eval):
    """Fit *classifier* and score its predictions on the evaluation data.

    Args:
        classifier: estimator exposing ``fit`` and ``predict``.
        X_fit, y_fit: features and labels used for fitting.
        X_eval, y_eval: features and true labels used for scoring.

    Returns:
        Tuple ``(predictions, accuracy, precision, recall)``.
    """
    classifier.fit(X_fit, y_fit)
    predictions = classifier.predict(X_eval)
    return (
        predictions,
        accuracy_score(y_eval, predictions),
        precision_score(y_eval, predictions),
        recall_score(y_eval, predictions),
    )


def _print_report(title, y_true, y_pred, accuracy, precision, recall):
    """Print the metrics and confusion matrix of one classifier under *title*."""
    print(title)
    # Accuracy is shown as a percentage; precision and recall as fractions
    print("Accuracy:", round(accuracy * 100, 2), "%")
    print("Precision:", round(precision, 2))
    print("Recall:", round(recall, 2))
    print("Confusion Matrix:")
    print(confusion_matrix(y_true, y_pred))


# Train a k-Nearest Neighbors classifier, predict on the test data and
# calculate evaluation metrics
knn_classifier = KNeighborsClassifier(n_neighbors=5)
y_pred_knn, accuracy_knn, precision_knn, recall_knn = _fit_and_score(
    knn_classifier, X_train_cv, y_train, X_test_cv, y_test
)

# Train a Naive Bayes classifier, predict on the test data and calculate
# evaluation metrics
naive_bayes_classifier = MultinomialNB(alpha=0.1)
y_pred_nb, accuracy_nb, precision_nb, recall_nb = _fit_and_score(
    naive_bayes_classifier, X_train_cv, y_train, X_test_cv, y_test
)

# Print evaluation metrics and confusion matrices
_print_report(
    "k-Nearest Neighbors Classifier:",
    y_test, y_pred_knn, accuracy_knn, precision_knn, recall_knn,
)
_print_report(
    "\nNaive Bayes Classifier:",
    y_test, y_pred_nb, accuracy_nb, precision_nb, recall_nb,
)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\04raj\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


k-Nearest Neighbors Classifier:
Accuracy: 64.33 %
Precision: 0.68
Recall: 0.51
Confusion Matrix:
[[117  35]
 [ 72  76]]

Naive Bayes Classifier:
Accuracy: 77.67 %
Precision: 0.78
Recall: 0.77
Confusion Matrix:
[[119  33]
 [ 34 114]]
