In [None]:
# Import Libraries
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')

# Load Data
# The path is relative, so yelp.csv has to sit in the current working
# directory. The rest of the script reads the 'text' and 'stars' columns.
REVIEWS_CSV = 'yelp.csv'
df = pd.read_csv(REVIEWS_CSV)

_ENGLISH_STOPWORDS = None  # filled on first use by _english_stopwords()


def _english_stopwords():
    """Return NLTK's English stopwords as a frozenset, building it only once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def clean_text(text, stop_words=None):
    """Lowercase *text*, strip punctuation and drop stopwords.

    Args:
        text: Raw review text. ``None`` and float NaN (a missing CSV cell)
            are treated as empty text; any other non-string value is
            converted with ``str()`` first.
        stop_words: Optional collection of lowercase words to remove.
            Defaults to NLTK's English stopword list.

    Returns:
        The surviving words joined by single spaces ('' if none survive).
    """
    if stop_words is None:
        # Previously stopwords.words('english') was called once per word,
        # rebuilding and linearly scanning the list every time. A cached
        # frozenset gives the same membership answers with O(1) lookups.
        stop_words = _english_stopwords()
    elif not isinstance(stop_words, (set, frozenset)):
        stop_words = frozenset(stop_words)

    if not isinstance(text, str):
        if text is None or (isinstance(text, float) and text != text):
            return ''
        text = str(text)

    # Remove everything that is neither a word character nor whitespace.
    # NOTE(review): stopwords that contain an apostrophe (e.g. "don't")
    # cannot match after this step because the text becomes "dont".
    text = re.sub(r'[^\w\s]', '', text.lower())
    return ' '.join(word for word in text.split() if word not in stop_words)

# Normalise every review once; the cleaned text is the input to TF-IDF.
df['cleaned_text'] = df['text'].apply(clean_text)
y = df['stars']

# Split the cleaned text BEFORE fitting the vectorizer. Fitting TF-IDF on
# the full corpus would let the vocabulary and IDF weights see the test
# reviews, leaking test information into training and inflating the report.
text_train, text_test, y_train, y_test = train_test_split(
    df['cleaned_text'], y, test_size=0.2, random_state=42
)

# Feature Extraction: learn vocabulary/IDF from the training reviews only,
# then apply that fitted transform unchanged to the held-out reviews.
vectorizer = TfidfVectorizer(max_features=5000)
X_train = vectorizer.fit_transform(text_train)
X_test = vectorizer.transform(text_test)
# Features for every row under the train-fitted vectorizer; kept so that
# any later code referring to `X` keeps working.
X = vectorizer.transform(df['cleaned_text'])

# Train Model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Evaluate on the held-out 20% of reviews.
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))
