In [None]:
import re

import nltk
import pandas as pd
from nltk.corpus import stopwords
# word_tokenize is exported by nltk.tokenize (nltk.stem does not provide it).
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

# One-time setup: uncomment to fetch the NLTK data this notebook relies on.
# nltk.download('stopwords')
# nltk.download('punkt')  # tokenizer models for word_tokenize ('punkt_tab' on newer NLTK releases)

In [None]:
# Data loading: read the reviews CSV (path is relative to the working directory).
df = pd.read_csv(filepath_or_buffer='reviews.csv')

In [None]:
# Data Cleaning
def _english_stop_words():
  """Return NLTK's English stop words as a frozenset, loading them only once."""
  if not hasattr(_english_stop_words, 'cache'):
    _english_stop_words.cache = frozenset(stopwords.words('english'))
  return _english_stop_words.cache


def clean_text(text, stop_words=None, tokenizer=None):
  """Normalise one review comment for modelling.

  Characters other than ASCII letters and whitespace are dropped, the text is
  lowercased and tokenised, and stop words are removed.

  Args:
    text: Raw comment. Anything that is not a ``str`` (for example a missing
      value) is treated as an empty comment.
    stop_words: Optional collection of lowercase words to discard. Defaults to
      NLTK's English stop-word list, which is loaded once and reused.
    tokenizer: Optional callable that maps a string to an iterable of tokens.
      Defaults to NLTK's ``word_tokenize``.

  Returns:
    The remaining tokens joined by single spaces, or ``''`` when ``text`` is
    not a string.
  """
  if not isinstance(text, str):
    return ''
  if stop_words is None:
    # Reuse a cached set instead of re-reading the corpus for every row.
    stop_words = _english_stop_words()
  if tokenizer is None:
    tokenizer = word_tokenize

  letters_only = re.sub(r'[^a-zA-Z\s]', '', text)  # drop punctuation, digits, symbols
  tokens = tokenizer(letters_only.lower())
  # Join the filtered tokens (not the unfiltered text) so that the stop-word
  # removal actually shows up in the returned value.
  return ' '.join(token for token in tokens if token not in stop_words)

# Populate the cleaned column by running clean_text over each raw comment.
df['cleaned_comments'] = df['comments'].map(clean_text)

In [None]:
# Data Preparation
# Keep only reviews that still have text after cleaning. Stripping before the
# comparison also drops whitespace-only comments, which a bare `!= ''` check keeps.
df = df[df['cleaned_comments'].str.strip() != '']
X = df['cleaned_comments']
y = df['rating']

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Text Preprocessing
# English stop words from NLTK, held in a set for fast membership checks.
stop_words = {*stopwords.words('english')}

def preprocess_text(text):
  """Remove stop words from ``text`` and return the kept tokens space-joined.

  Tokens are compared in lowercase against the module-level ``stop_words``
  set; tokens that are kept retain their original casing.
  """
  kept = []
  for token in word_tokenize(text):
    if token.lower() in stop_words:
      continue
    kept.append(token)
  return ' '.join(kept)

# Run the same stop-word removal over the training and the test text.
X_train, X_test = (split.apply(preprocess_text) for split in (X_train, X_test))

In [None]:
# Model Training
# Logistic-regression classifier and TF-IDF vectorizer, both with default settings.
model = LogisticRegression()
vectorizer = TfidfVectorizer()

# Fit the vectorizer on the training text only, then reuse it for the test text.
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

model.fit(X_train_vec, y_train)

In [None]:
# Test Set Evaluation
# Predict ratings for the held-out reviews and print the per-class metrics.
y_pred = model.predict(X_test_vec)
print(classification_report(y_true=y_test, y_pred=y_pred))