# In [None]:
# Step 1: Import required libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import nltk

# Download NLTK stopwords (if you plan to use it)
nltk.download('stopwords')
from nltk.corpus import stopwords

# Step 2: Load the data

# Read the movie review CSV from disk into a pandas DataFrame.
df = pd.read_csv("data/movie_reviews.csv")

# Preview the first few rows of the dataset. A bare `df.head()` expression is
# only rendered when it is the last statement of a notebook cell, so the
# preview is printed explicitly to make it visible in every execution mode.
print(df.head())

# Step 3: Data Preprocessing

# Report how many values are missing in each column. The counts are printed
# because a bare expression is discarded when this code runs as a script.
print(df.isnull().sum())

# Remove rows with a missing review or sentiment. Restricting the check to
# these two columns keeps rows whose only gaps are in columns that are not
# used below; `.copy()` makes the result an independent frame so the column
# assignments that follow cannot trigger a chained-assignment warning.
df = df.dropna(subset=['review', 'sentiment']).copy()

# Convert sentiment labels to numeric values (positive -> 1, negative -> 0).
# Labels are normalised first so variants such as "Positive " still map.
sentiment_codes = {'positive': 1, 'negative': 0}
normalised_labels = df['sentiment'].astype(str).str.strip().str.lower()
df['sentiment'] = normalised_labels.map(sentiment_codes)

# Any label outside the mapping becomes NaN after `.map`, which the
# classifier cannot be trained on. Drop those rows (reporting how many) and
# store the target as integers rather than floats.
unmapped_rows = int(df['sentiment'].isna().sum())
if unmapped_rows:
    print(f"Dropping {unmapped_rows} rows with unrecognised sentiment labels")
df = df.dropna(subset=['sentiment']).astype({'sentiment': int})

# Step 4: Text Preprocessing

# Convert reviews to lowercase to ensure uniformity
df['review'] = df['review'].str.lower()

# Remove punctuation and special characters: every character that is not a
# letter, digit or whitespace is deleted. The pattern is a raw string so that
# `\s` reaches the regex engine unchanged instead of being parsed as an
# (invalid) Python string escape.
df['review'] = df['review'].str.replace(r'[^a-zA-Z0-9\s]', '', regex=True)

# Remove stopwords (optional). The list is turned into a set once so each
# membership test inside the per-review filter is a constant-time lookup.
# NOTE(review): punctuation has already been stripped at this point, so a
# contraction such as "don't" is now "dont" and will not match stop-word
# entries that contain an apostrophe - confirm whether that is intended.
stop_words = set(stopwords.words('english'))


def _remove_stop_words(text):
    """Return *text* with every whitespace-separated stop word removed."""
    return ' '.join(word for word in text.split() if word not in stop_words)


df['review'] = df['review'].apply(_remove_stop_words)

# Step 5: Visualizing the data

# Draw one bar per sentiment value showing how many reviews carry it, title
# the axes the bars were drawn on, and display the figure.
ax = sns.countplot(data=df, x='sentiment')
ax.set_title('Sentiment Distribution')
plt.show()

# Step 6: Train-Test Split and Feature Extraction (Text Vectorization)

# Separate the cleaned review text (X) from the numeric target labels (y)
X = df['review']
y = df['sentiment']

# Split the raw text into training and test sets (80-20 split) BEFORE any
# vectorisation. Fitting TF-IDF on the full corpus would let the test
# reviews influence the vocabulary and IDF weights the model is trained on
# (data leakage), which makes the evaluation below overly optimistic.
X_train_text, X_test_text, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Create a TfidfVectorizer limited to the 5000 highest-ranked terms, learn
# its vocabulary and IDF weights from the training reviews only, and reuse
# that fitted state to encode the held-out reviews.
vectorizer = TfidfVectorizer(max_features=5000)
X_train = vectorizer.fit_transform(X_train_text)
X_test = vectorizer.transform(X_test_text)

# Full-corpus matrix, kept so any later code that still refers to `X_tfidf`
# keeps working. It is encoded with the training-fitted vectorizer and must
# not be used to refit it.
X_tfidf = vectorizer.transform(X)

# Step 8: Build and Train the Model

# Create a Logistic Regression classifier with scikit-learn's default
# settings and fit it on the training features and labels in one expression;
# `fit` returns the estimator itself, so `model` is the trained classifier.
model = LogisticRegression().fit(X_train, y_train)

# Step 9: Model Evaluation

# Predict on the test set
y_pred = model.predict(X_test)

# Overall fraction of test reviews that were classified correctly
print("Accuracy:", accuracy_score(y_test, y_pred))

# Confusion Matrix
# The class order is pinned with `labels=` so the matrix is always 2x2 with
# row/column 0 = negative and 1 = positive. Without it, scikit-learn uses
# only the labels that actually occur, and the hard-coded tick names could
# end up attached to the wrong (or missing) rows and columns.
class_ids = [0, 1]
class_names = ["Negative", "Positive"]
cm = confusion_matrix(y_test, y_pred, labels=class_ids)
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=class_names,
    yticklabels=class_names,
)
# Rows of the matrix are the true classes and columns the predicted ones.
plt.xlabel("Predicted label")
plt.ylabel("True label")
plt.title("Confusion Matrix")
plt.show()

# Classification Report (per-class precision, recall and F1)
print("Classification Report:")
print(classification_report(y_test, y_pred))
