In [1]:
# Step 1: Importing the libraries

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from ClassificationModels import test

In [2]:
# Step 2: Importing the dataset
# The reviews are stored tab-separated. quoting=3 is csv.QUOTE_NONE, so quote
# characters inside a review are read as ordinary text rather than delimiters.
dataset = pd.read_csv(
    'Restaurant_Reviews.tsv',
    sep='\t',
    quoting=3,
)

In [3]:
# Step 3: Cleaning the texts
def cleaning_text(dataSet):
    """Build a cleaned, stemmed corpus from the 'Review' column of ``dataSet``.

    Every review has its non-letter characters replaced by spaces, is
    lower-cased and split on whitespace; English stop words are dropped, the
    remaining words are Porter-stemmed and re-joined with single spaces.

    Parameters
    ----------
    dataSet : pandas.DataFrame
        Object whose ``['Review']`` entry yields the review strings when
        iterated (e.g. a DataFrame with a 'Review' column).

    Returns
    -------
    list of str
        One cleaned string per review, in the order the reviews are iterated.
    """
    import re
    import nltk
    nltk.download('stopwords')
    from nltk.corpus import stopwords
    from nltk.stem.porter import PorterStemmer

    ps = PorterStemmer()
    # Build the stop-word set once instead of once per word of every review.
    stop_words = set(stopwords.words('english'))
    non_letters = re.compile('[^a-zA-Z]')

    corpus = []
    # Read from the argument (not a notebook-global frame) and iterate the
    # values directly so the function does not depend on a 0..n-1 index.
    for text in dataSet['Review']:
        words = non_letters.sub(' ', text).lower().split()
        kept = [ps.stem(word) for word in words if word not in stop_words]
        corpus.append(' '.join(kept))
    return corpus

# Step 4: Tokenization
from sklearn.feature_extraction.text import CountVectorizer  # bag-of-words token counter
cv = CountVectorizer(max_features=1500)   # cap the vocabulary at 1500 terms
corpus = cleaning_text(dataset)           # cleaned review strings, one per row
X = cv.fit_transform(corpus).toarray()    # dense document-term count matrix
y = dataset.iloc[:, 1].values             # second column of the frame as the target array

[nltk_data] Downloading package stopwords to /home/hamza/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
# Step 5: Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y,             # feature matrix (independent), labels (dependent)
    test_size=0.25,   # hold out 25% of the rows for testing
    random_state=0,   # fixed seed so the same split is produced on every run
)


In [5]:
# Run the imported `test` helper on the split with every model flag switched on.
# NOTE(review): the keyword spellings (e.g. `logestic_regression`) presumably
# mirror the signature in ClassificationModels.test -- confirm before renaming.
test(
    X_train, X_test, y_train, y_test,
    naive_bayes=True,
    decision_tree=True,
    random_forest=True,
    SVM=True,
    KNN=True,
    logestic_regression=True,
)


Model: Naive Bayes Result
------------------------------------

True Negative: 67
True Positive: 113
False Negative: 20
False Positive: 50

Accuracy (Difference of acutal & predicted values): 53.6
Precision (measuring exactness): 0.6932515337423313
Recall (measuring completeness): 0.849624060150376
F1 Score(compromise between Precision and Recall): 0.7635135135135135


Model: Decision Tree Result
------------------------------------

True Negative: 91
True Positive: 90
False Negative: 43
False Positive: 26

Accuracy (Difference of acutal & predicted values): 72.8
Precision (measuring exactness): 0.7758620689655172
Recall (measuring completeness): 0.6766917293233082
F1 Score(compromise between Precision and Recall): 0.7228915662650602


Model: Random Forest Result
------------------------------------

True Negative: 95
True Positive: 69
False Negative: 64
False Positive: 22

Accuracy (Difference of acutal & predicted values): 76.0
Precision (measuring exactness): 0.7582417582417582
Rec