In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

In [None]:
# Load your Yelp dataset
df = pd.read_csv('yelp.csv')

In [None]:
# Select only the 'text' and 'stars' columns
df = df[['text', 'stars']]

In [None]:
df.head()

Unnamed: 0,text,stars
0,My wife took me here on my birthday for breakf...,5
1,I have no idea why some people give bad review...,5
2,love the gyro plate. Rice is so good and I als...,4
3,"Rosie, Dakota, and I LOVE Chaparral Dog Park!!...",5
4,General Manager Scott Petello is a good egg!!!...,5


In [None]:
# Label encoding: Convert stars to sentiment (1 = Happy, 0 = Sad)
df['sentiment'] = df['stars'].apply(lambda x: 1 if x >= 3 else 0)

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   text       10000 non-null  object
 1   stars      10000 non-null  int64 
 2   sentiment  10000 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 234.5+ KB


In [None]:
df.head()

Unnamed: 0,text,stars,sentiment
0,My wife took me here on my birthday for breakf...,5,1
1,I have no idea why some people give bad review...,5,1
2,love the gyro plate. Rice is so good and I als...,4,1
3,"Rosie, Dakota, and I LOVE Chaparral Dog Park!!...",5,1
4,General Manager Scott Petello is a good egg!!!...,5,1


In [None]:
df.sentiment.value_counts()

Unnamed: 0_level_0,count
sentiment,Unnamed: 1_level_1
1,8324
0,1676


In [None]:
# Feature: text, Label: sentiment
X = df['text']
y = df['sentiment']

In [None]:
# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Convert text to numerical data using TF-IDF Vectorizer
tfidf = TfidfVectorizer(stop_words='english', max_features=5000)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

In [None]:
# Build a Logistic Regression model
model = LogisticRegression()

In [None]:
# Train the model
model.fit(X_train_tfidf, y_train)

In [None]:
# Make predictions on the test set
y_pred = model.predict(X_test_tfidf)

In [None]:
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

In [None]:
# Output the accuracy and classification report
print(f"Accuracy: {accuracy:.4f}")
print("Classification Report:\n", report)

Accuracy: 0.8890
Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.32      0.47       308
           1       0.89      0.99      0.94      1692

    accuracy                           0.89      2000
   macro avg       0.89      0.66      0.70      2000
weighted avg       0.89      0.89      0.87      2000

