In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
import re
import nltk
nltk.download("stopwords")
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\muzammil\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [8]:
# Load the food-review dataset; the path is relative to the notebook's working directory.
# NOTE(review): the frame displayed below carries an "Unnamed: 0" column, which looks
# like an index that was written into the CSV — confirm, and consider index_col=0.
data = pd.read_csv("NLP-food-review.csv")

In [9]:
# Lazily-filled cache for the objects preprocess() needs on every call.
_PREPROCESS_RESOURCES = {}


def _preprocess_resources():
    """Return the (stop-word set, stemmer) pair, building it on first use."""
    if not _PREPROCESS_RESOURCES:
        # frozenset gives O(1) membership tests; the raw word list would be
        # scanned linearly for every token.
        _PREPROCESS_RESOURCES["stop_words"] = frozenset(stopwords.words("english"))
        _PREPROCESS_RESOURCES["stemmer"] = PorterStemmer()
    return _PREPROCESS_RESOURCES["stop_words"], _PREPROCESS_RESOURCES["stemmer"]


def preprocess(text):
    """Normalise one review into a space-separated string of stemmed tokens.

    Steps, in order:
      1. Delete every character that is not an ASCII letter or a space.
         Characters are removed, not replaced by a space, so "don't"
         becomes "dont" and text around a newline or tab is joined.
      2. Lower-case the result.
      3. Split on whitespace, drop English stop words, and Porter-stem
         the remaining words.

    The stop-word list and the stemmer are created once and reused.
    Previously both were re-evaluated for every single token.

    Parameters
    ----------
    text : str
        Raw review text. Non-string input (e.g. a missing value) is not
        handled and makes ``re.sub`` raise ``TypeError``.

    Returns
    -------
    str
        The surviving stems joined by single spaces; "" if nothing survives.
    """
    stop_words, stemmer = _preprocess_resources()
    text = re.sub('[^A-Za-z ]', "", text)
    text = text.lower()
    return " ".join(
        stemmer.stem(word) for word in text.split() if word not in stop_words
    )

In [10]:
# Show the raw frame as loaded, before the review text is preprocessed.
data

Unnamed: 0,review,reaction
0,Service is friendly and inviting.,1
1,Awesome service and food.,1
2,Waitress was a little slow in service.,0
3,"Come hungry, leave happy and stuffed!",1
4,Horrible - don't waste your time and money.,0
...,...,...
995,This was my first time and I can't wait until ...,1
996,Great service and food.,1
997,I paid the bill but did not tip because I felt...,0
998,The one down note is the ventilation could use...,0


In [11]:
# Replace each raw review with its cleaned, stop-word-free, stemmed form.
# NOTE: the cell reads and writes the same column, so running it a second time
# feeds already-processed text through preprocess() again.
data['review'] = data['review'].map(preprocess)

## Text to numeric

In [12]:
from sklearn.feature_extraction.text import CountVectorizer

In [17]:
# Bag-of-words encoding of the preprocessed reviews.
# NOTE(review): the vocabulary is learned from every row, i.e. before the
# train/test split performed later in the notebook.
cv = CountVectorizer()
term_counts = cv.fit_transform(data['review'])

# Dense copy of the sparse count matrix.
features = term_counts.toarray()

# Same counts as a DataFrame whose columns are the vocabulary terms.
pf = pd.DataFrame(data=features, columns=cv.get_feature_names_out())

In [22]:
##Model building
from sklearn.metrics import classification_report

In [23]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    pf,
    data['reaction'],
    test_size=0.2,
    random_state=1,
)

In [24]:
# Candidate classifiers, fitted and scored on the same split.
# The tree is seeded (same seed as the train/test split) because scikit-learn
# permutes features at each split; without a seed its metrics can change
# from run to run.
models = [DecisionTreeClassifier(random_state=1), LogisticRegression()]

# NOTE: the loop variable `model` stays bound to the last estimator in `models`
# after the loop, and the inference cell further down uses it.
for model in models:
    model.fit(x_train, y_train)
    print(model, ":::")
    print(classification_report(y_test, model.predict(x_test)))

DecisionTreeClassifier() :::
              precision    recall  f1-score   support

           0       0.66      0.79      0.72        85
           1       0.82      0.70      0.76       115

    accuracy                           0.74       200
   macro avg       0.74      0.75      0.74       200
weighted avg       0.75      0.74      0.74       200

LogisticRegression() :::
              precision    recall  f1-score   support

           0       0.73      0.75      0.74        85
           1       0.81      0.79      0.80       115

    accuracy                           0.78       200
   macro avg       0.77      0.77      0.77       200
weighted avg       0.78      0.78      0.78       200



In [25]:
#inference

In [26]:
# Classify one unseen review with the same preprocessing and vocabulary used in training.
new_text = "The pizza is filthy"
preprocess_text = preprocess(new_text)
preprocess_vector = cv.transform([preprocess_text]).toarray()

# The models were fitted on a DataFrame with one named column per vocabulary term.
# Giving the prediction input the same column names keeps the layout explicit and
# avoids scikit-learn's feature-name mismatch warning for bare arrays.
preprocess_frame = pd.DataFrame(preprocess_vector, columns=cv.get_feature_names_out())

# Pick the estimator explicitly instead of relying on the loop variable left over
# from the training cell. models[-1] is that same object, the last one fitted.
inference_model = models[-1]

# A word absent from the training vocabulary contributes nothing to the vector,
# so the prediction then rests only on the remaining known words.
inference_model.predict(preprocess_frame)



array([1], dtype=int64)