In [4]:
import pandas as pd

# Load the dataset with column names
df = pd.read_csv('Yelp_Restaurant_Reviews.csv')
df.head()

Unnamed: 0,Company name,Location,Rating,Review Text
0,sidney dairy barn,sidney,5,All I can say is they have very good ice cream...
1,sidney dairy barn,sidney,4,Nice little local place for ice cream.My favor...
2,sidney dairy barn,sidney,5,A delicious treat on a hot day! Staff was very...
3,sidney dairy barn,sidney,4,This was great service and a fun crew! I got t...
4,sidney dairy barn,sidney,5,This is one of my favorite places to get ice c...


In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

In [5]:
sentiment = []
for i in range(len(df['Rating'])):
    if df['Rating'].iloc[i] < 3:
        sentiment.append("negative")
    elif df['Rating'].iloc[i] == 3:
        sentiment.append("neutral")
    else:
        sentiment.append("positive")
df['Sentiment'] = sentiment

In [6]:
tfidfVectorizer = TfidfVectorizer(max_features=1000)
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

preprocessor = ColumnTransformer(
    transformers=[
        ('text', tfidfVectorizer, 'Review Text'),
        ('cat', categorical_transformer, ['Company name', 'Location'])
    ])

X = df[['Company name','Location','Review Text']]
y = df['Sentiment']

In [7]:
#Train test split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3, random_state=42)

### Building the model

In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler

In [9]:
model = make_pipeline(preprocessor, LogisticRegression(multi_class='ovr', solver='lbfgs', max_iter=1000))

In [10]:
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [11]:
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:")
print(classification_report(y_test, y_pred))
#print("AUC Score:", roc_auc_score(y_test, model.predict_proba(X_test)[:,1]))

Accuracy: 0.8544144747863964
Classification Report:
              precision    recall  f1-score   support

    negative       0.75      0.62      0.68       774
     neutral       0.56      0.20      0.29       615
    positive       0.88      0.98      0.93      4580

    accuracy                           0.85      5969
   macro avg       0.73      0.60      0.63      5969
weighted avg       0.83      0.85      0.83      5969



### Building Model - Random Forest

In [20]:
randomForestClassifierModel = make_pipeline(preprocessor, RandomForestClassifier())

In [21]:
randomForestClassifierModel.fit(X_train, y_train)
y_pred = randomForestClassifierModel.predict(X_test)

In [22]:
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:")
print(classification_report(y_test, y_pred))
#print("AUC Score:", roc_auc_score(y_test, randomForestClassifierModel.predict_proba(X_test)[:,1]))

Accuracy: 0.7992963645501759
Classification Report:
              precision    recall  f1-score   support

    negative       0.79      0.25      0.38       774
     neutral       0.80      0.01      0.03       615
    positive       0.80      1.00      0.89      4580

    accuracy                           0.80      5969
   macro avg       0.80      0.42      0.43      5969
weighted avg       0.80      0.80      0.73      5969

