In [None]:
# 1. Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string

# Fetch the NLTK resources the preprocessing step relies on.
# Recent NLTK releases (3.9+) load the Punkt sentence model from 'punkt_tab',
# while older releases use 'punkt'; requesting both keeps word_tokenize working
# on either. nltk.download skips resources that are already up to date.
for resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource)


In [None]:
# 2. Load dataset
# Reads the reviews and encodes the text sentiment label as 1 (positive) / 0 (negative).
LABEL_TO_ID = {'positive': 1, 'negative': 0}

data = pd.read_csv("dataset.csv")

# Normalise case and surrounding whitespace so variants such as "Positive " are
# still recognised instead of silently becoming NaN.
normalised_labels = data['sentiment'].astype(str).str.strip().str.lower()
encoded_labels = normalised_labels.map(LABEL_TO_ID)

# Fail here with a readable message rather than later inside model.fit, which
# rejects NaN targets with a far less specific error.
unmapped = encoded_labels.isna()
if unmapped.any():
    bad_values = sorted(data.loc[unmapped, 'sentiment'].astype(str).unique())
    raise ValueError(
        f"{int(unmapped.sum())} row(s) have an unrecognised sentiment label: {bad_values}; "
        f"expected one of {sorted(LABEL_TO_ID)}"
    )

data['sentiment'] = encoded_labels.astype(int)
data.head()


In [None]:
# 3. Preprocessing function
stop_words = set(stopwords.words('english'))
# Individual punctuation characters, used for a per-character check below.
punctuation_chars = set(string.punctuation)


def preprocess(text):
    """Lower-case and tokenize a review, dropping stop words and punctuation.

    Parameters
    ----------
    text : str
        Raw review text. Any non-string value (e.g. NaN from a missing CSV
        cell) is treated as an empty review.

    Returns
    -------
    str
        The remaining tokens joined by single spaces; '' when nothing is left.
    """
    if not isinstance(text, str):
        return ''

    tokens = word_tokenize(text.lower())
    # A token counts as punctuation when *every* character is punctuation.
    # Testing `t in string.punctuation` would be a substring check against one
    # long string, which lets multi-character tokens emitted by word_tokenize
    # (such as "``", "''", "..." or "--") slip through.
    kept = [
        t for t in tokens
        if t not in stop_words and not all(ch in punctuation_chars for ch in t)
    ]
    return ' '.join(kept)


data['cleaned'] = data['review'].apply(preprocess)
data[['review', 'cleaned']].head()


In [None]:
# 4. TF-IDF vectorization
tfidf = TfidfVectorizer()
X = tfidf.fit_transform(data['cleaned'])
y = data['sentiment']


In [None]:
# 5. Train/Test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
# 6. Logistic Regression
# Train a default-configured classifier on the training features
# (fit returns the estimator itself), then label the held-out reviews.
model = LogisticRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)


In [None]:
# 7. Accuracy
# Fraction of held-out reviews whose predicted label equals the true label.
acc = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {acc:.2f}")


In [None]:
# 8. Confusion Matrix
# Pin the label order explicitly: without `labels`, confusion_matrix only
# includes classes that actually occur in y_test / y_pred, so a one-class
# result would yield a 1x1 matrix that no longer matches the tick labels.
class_ids = [0, 1]
class_names = ['Negative', 'Positive']
cm = confusion_matrix(y_test, y_pred, labels=class_ids)

# Rows are actual classes, columns are predicted classes.
fig, ax = plt.subplots()
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
ax.set_title("Confusion Matrix")
plt.show()


In [None]:
# 9. Visualize informative words
# Rank the vocabulary by the fitted model's first coefficient row: the largest
# weights push a prediction towards label 1, the smallest towards label 0.
feature_names = np.array(tfidf.get_feature_names_out())
coefs = model.coef_[0]

# Ascending order of weights, computed once and sliced at both ends.
ascending_order = np.argsort(coefs)
top_negative = feature_names[ascending_order[:10]]
top_positive = feature_names[ascending_order[-10:]]

# top_positive is stored in ascending order, so reverse it to print the
# strongest word first; top_negative already starts with the strongest word.
print("Top Positive Words:", top_positive[::-1])
print("Top Negative Words:", top_negative)