In [1]:
import pandas as pd
import re

# Read the raw Yelp review export; the CSV header row supplies the column names
RAW_REVIEWS_CSV = 'Yelp Restaurant Reviews.csv'
df = pd.read_csv(RAW_REVIEWS_CSV)

def clean_url(url):
    """Return ``url`` without a trailing ``-<digits>`` suffix.

    Only a dash followed by one or more digits at the very end of the string
    is removed; a URL that does not end that way is returned unchanged.

    Parameters
    ----------
    url : str
        The URL (or URL slug) to clean.

    Returns
    -------
    str
        The URL with the trailing numeric suffix stripped.
    """
    trailing_number = re.compile(r'-\d+$')
    return trailing_number.sub('', url)

# Normalise every URL by removing its trailing numeric suffix, if any
raw_urls = df['Yelp URL']
df['Yelp URL'] = raw_urls.apply(clean_url)

In [2]:
def extract_last_word(url):
    """Return the final hyphen-separated word of the last path segment of ``url``.

    The text after the last ``/`` is taken as the slug, and the text after the
    slug's last ``-`` is returned. When a separator is absent the whole
    remaining text is used, so the result never contains ``/`` or ``-``.

    Parameters
    ----------
    url : str
        The URL to inspect.

    Returns
    -------
    str
        The last word of the slug (for a slug ending in ``las-vegas`` this is
        ``'vegas'``).
    """
    slug = url.rpartition('/')[2]   # everything after the final '/'
    return slug.rpartition('-')[2]  # everything after the final '-'

# Derive each review's 'Location' from the last hyphen-separated word of its URL
df['Location'] = df['Yelp URL'].apply(extract_last_word)

# No 'las-vegas' substitution is attempted here: extract_last_word has already
# split on '-', so that substring cannot occur in 'Location'. Rows whose URL ends
# in 'las-vegas' carry the value 'vegas' after this step.

In [3]:
# Distinct locations before the Las Vegas entries are normalised
location_column = df['Location']
print("Before Update:", location_column.unique())

# A value that is exactly 'vegas' is the tail of a 'las-vegas' URL; expand it to 'las vegas'
df['Location'] = location_column.str.replace('^vegas$', 'las vegas', regex=True)

# Distinct locations after the update
print("After Update:", df['Location'].unique())


Before Update: ['sidney' 'chandler' 'vegas' 'lakewood' 'pittsburgh' 'charlotte' 'phoenix'
 'scottsdale' 'cleveland' 'madison' 'champaign' 'urbana']
After Update: ['sidney' 'chandler' 'las vegas' 'lakewood' 'pittsburgh' 'charlotte'
 'phoenix' 'scottsdale' 'cleveland' 'madison' 'champaign' 'urbana']


In [4]:
def extract_establishment_name(url):
    """Return the establishment name encoded in a Yelp URL slug.

    The slug is the text after the last ``/``. Its hyphen-separated words are
    the business name followed by the city; the city is removed and the
    remaining words are joined with spaces.

    The two-word city ``las-vegas`` is recognised only at the END of the slug.
    Occurrences of that text inside the business name are left untouched, so
    ``las-vegas-grill-las-vegas`` yields ``'las vegas grill'``.

    Parameters
    ----------
    url : str
        URL whose last path segment looks like ``<name-words>-<city>``.

    Returns
    -------
    str
        The business name with spaces between words; an empty string when the
        slug holds nothing but the city.
    """
    # The slug is the LAST path segment of the URL
    slug = url.split('/')[-1]
    words = slug.split('-')

    # Drop the trailing city: two tokens for 'las vegas', one token otherwise
    city_token_count = 2 if words[-2:] == ['las', 'vegas'] else 1
    name_words = words[:-city_token_count]

    return ' '.join(name_words)

# Build the 'Company name' column from the business part of each URL
yelp_urls = df['Yelp URL']
df['Company name'] = yelp_urls.apply(extract_establishment_name)

In [5]:
# Keep only the modelling columns, in this order. Selecting by name already
# discards 'Yelp URL' and 'Date', so no separate in-place drop is needed and
# re-running the cell right away does not fail on already-removed columns.
# .copy() yields an independent frame, so columns added or dropped later do not
# act on a selection of the original (avoids SettingWithCopyWarning on pandas
# versions that emit it).
df = df[['Company name', 'Location', 'Rating', 'Review Text']].copy()

# Display the first rows to verify the result
print(df.head())

# Save the cleaned data set next to the notebook
df.to_csv('Final Yelp Restaurant Reviews.csv', index=False)

        Company name Location  Rating  \
0  sidney dairy barn   sidney       5   
1  sidney dairy barn   sidney       4   
2  sidney dairy barn   sidney       5   
3  sidney dairy barn   sidney       4   
4  sidney dairy barn   sidney       5   

                                         Review Text  
0  All I can say is they have very good ice cream...  
1  Nice little local place for ice cream.My favor...  
2  A delicious treat on a hot day! Staff was very...  
3  This was great service and a fun crew! I got t...  
4  This is one of my favorite places to get ice c...  


## Preprocessing the data 

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

In [7]:
# Turn each star rating into a sentiment label:
#   below 3 -> 'negative', exactly 3 -> 'neutral', anything else -> 'positive'
sentiment = [
    'negative' if rating < 3 else 'neutral' if rating == 3 else 'positive'
    for rating in df['Rating']
]


In [8]:
# Attach the labels as a new column; `sentiment` is a plain list holding one
# label per row, so the values are assigned by position.
df['Sentiment'] = sentiment

In [9]:
# The star rating has been encoded as 'Sentiment'; remove the original column in place.
df.drop(columns=['Rating'], inplace=True)

In [10]:
# Sanity check: list the distinct labels present in the new column
sentiment_column = df['Sentiment']
sentiment_column.unique()

array(['positive', 'negative', 'neutral'], dtype=object)

In [11]:
# Preview the first five rows of the labelled frame
df.head()

Unnamed: 0,Company name,Location,Review Text,Sentiment
0,sidney dairy barn,sidney,All I can say is they have very good ice cream...,positive
1,sidney dairy barn,sidney,Nice little local place for ice cream.My favor...,positive
2,sidney dairy barn,sidney,A delicious treat on a hot day! Staff was very...,positive
3,sidney dairy barn,sidney,This was great service and a fun crew! I got t...,positive
4,sidney dairy barn,sidney,This is one of my favorite places to get ice c...,positive


In [12]:
# Model inputs (business, city, review text) and the label to predict
y = df['Sentiment']
X = df[['Company name','Location','Review Text']]

# TF-IDF over the review text, with the vocabulary capped at 1000 terms
tfidfVectorizer = TfidfVectorizer(max_features=1000)
# One-hot encoding for the categorical columns; categories not seen during
# fitting are ignored instead of raising an error
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# 'Review Text' is passed as a plain string (not a list) so the vectorizer
# receives a 1-D column of documents; the categorical columns are passed as a list.
text_spec = ('text', tfidfVectorizer, 'Review Text')
categorical_spec = ('cat', categorical_transformer, ['Company name', 'Location'])
preprocessor = ColumnTransformer(transformers=[text_spec, categorical_spec])


In [13]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible.
# NOTE(review): the split is not stratified although the classes look imbalanced — confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)


### Building the model - Logistic Regression

In [14]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler

In [18]:
# Logistic-regression pipeline: preprocessing followed by a one-vs-rest classifier.
# NOTE(review): the `multi_class` argument is deprecated in recent scikit-learn
# releases (1.5+) — confirm against the installed version.
model = make_pipeline(
    preprocessor,
    LogisticRegression(multi_class='ovr', solver='lbfgs', max_iter=1000),
)

In [19]:
# Fit on the training split, then predict labels for the held-out split
y_pred = model.fit(X_train, y_train).predict(X_test)

In [21]:
# Score the logistic-regression predictions on the held-out split
logreg_accuracy = accuracy_score(y_test, y_pred)
logreg_report = classification_report(y_test, y_pred)

print("Accuracy:", logreg_accuracy)
print("Classification Report:")
print(logreg_report)
# AUC is not reported: the target has three classes, so a binary-style score taken
# from column 1 of predict_proba does not apply. roc_auc_score would need the full
# probability matrix together with an explicit `multi_class` setting.

Accuracy: 0.8545820070363545
Classification Report:
              precision    recall  f1-score   support

    negative       0.75      0.62      0.68       774
     neutral       0.55      0.19      0.29       615
    positive       0.88      0.98      0.93      4580

    accuracy                           0.85      5969
   macro avg       0.73      0.60      0.63      5969
weighted avg       0.83      0.85      0.83      5969



### Building the model - Random Forest

In [22]:
# Random-forest pipeline built on the same preprocessing step.
# The forest is seeded so that the reported metrics are reproducible from run to
# run (an unseeded forest gives different results each time); 42 matches the
# seed used for the train/test split.
randomForestClassifierModel = make_pipeline(
    preprocessor,
    RandomForestClassifier(random_state=42),
)

In [23]:
# Fit the random forest on the training split and predict the held-out split.
# Note: this overwrites the `y_pred` produced by the logistic-regression model.
y_pred = randomForestClassifierModel.fit(X_train, y_train).predict(X_test)

In [24]:
# Score the random-forest predictions on the held-out split
forest_accuracy = accuracy_score(y_test, y_pred)
forest_report = classification_report(y_test, y_pred)

print("Accuracy:", forest_accuracy)
print("Classification Report:")
print(forest_report)
# AUC is not reported: the target has three classes, so a binary-style score taken
# from column 1 of predict_proba does not apply. roc_auc_score would need the full
# probability matrix together with an explicit `multi_class` setting.

Accuracy: 0.8041547997989613
Classification Report:
              precision    recall  f1-score   support

    negative       0.79      0.29      0.43       774
     neutral       0.46      0.01      0.02       615
    positive       0.81      1.00      0.89      4580

    accuracy                           0.80      5969
   macro avg       0.68      0.43      0.45      5969
weighted avg       0.77      0.80      0.74      5969

