# Setup & Dependencies

In [None]:
!python.exe -m pip install --upgrade pip
!pip3 install -r ../../requirements.txt

In [None]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import re
import warnings

# Fetch the NLTK resources used later: the stopword list and the WordNet corpus.
for nltk_resource in ('stopwords', 'wordnet'):
    nltk.download(nltk_resource)

# Silence all warnings for the rest of the session.
warnings.filterwarnings('ignore')

- Importing essential libraries for text processing (nltk), machine learning (sklearn), data visualization (seaborn, matplotlib), and data manipulation (pandas, numpy).
- Downloading the NLTK stopword list and the WordNet corpus (used by the lemmatizer) to preprocess text data.
- Suppressing unnecessary warnings for better readability.

# Reading and Displaying the Dataset

In [None]:
# Load the IMDb reviews CSV, report its (rows, columns) shape, and preview the first 10 rows
dataset_path = '../assets/datasets/IMDb_Dataset.csv'
df = pd.read_csv(dataset_path)
print(df.shape)
df.head(10)

In [None]:
# Descriptive statistics: per-column summary as computed by DataFrame.describe()
df.describe()

In [None]:
# Number of reviews in each sentiment class
df['sentiment'].value_counts()

In [None]:
# Reformatting
# Guard on the original column name so that re-running this cell is a no-op.
# Without the guard, a second run would skip the rename silently and then map
# the already-converted 0/1 labels through the 'positive' comparison, turning
# every label into 0.
if 'sentiment' in df.columns:
    # Rename 'sentiment' column to 'label'
    df = df.rename(columns={'sentiment': 'label'})
    # Convert labels to binary values: 'positive' -> 1, anything else -> 0
    df['label'] = df['label'].eq('positive').astype(int)
df.head(10)

- Load the IMDb dataset containing movie reviews and their corresponding sentiments.
- Print the shape of the dataset to understand the number of rows and columns.
- Display the first 10 rows to preview the dataset.

# Data Preparation

In [8]:
# Cleaning
def clean_text(text):
    """Normalize a raw review string before tokenization.

    Steps, in order: drop URLs, replace HTML tags with a space, drop every
    character that is not an ASCII letter, digit, or whitespace, and lowercase
    the result.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    str
        The cleaned, lowercased text. Runs of whitespace are not collapsed.
    """
    # Remove URLs
    text = re.sub(r'http\S+', '', text)
    # Replace HTML tags with a space rather than nothing: tags such as
    # "<br />" often sit directly between two words, and deleting them
    # outright would glue those words into one bogus token.
    text = re.sub(r'<.*?>', ' ', text)
    # Remove non-letters and non-digits
    text = re.sub(r"[^a-zA-Z0-9\s]", "", text)
    return text.lower()

# Clean every review in place of the raw text
df['review'] = df['review'].apply(clean_text)

Removes URLs, HTML tags, and special characters, and converts the text to lowercase for uniformity.

In [None]:
# Remove stopwords
# Bind the set to its own name instead of rebinding `stopwords`: overwriting the
# imported NLTK corpus reader makes this cell fail with AttributeError
# ('set' object has no attribute 'words') as soon as it is run a second time.
stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    """Return `text` without the whitespace-separated tokens found in `stop_words`.

    The remaining tokens are re-joined with single spaces.
    """
    return " ".join(word for word in text.split() if word not in stop_words)

df.review = df.review.apply(remove_stopwords)

df.head(10)

Define a function remove_stopwords to filter out common words (e.g., "the", "and") that do not contribute to sentiment.

In [None]:
# Lemmatize
lemmatizer = WordNetLemmatizer()

def lemmatize_text(text):
    """Lemmatize each whitespace-separated token of `text` and re-join with single spaces."""
    lemmas = map(lemmatizer.lemmatize, text.split())
    return ' '.join(lemmas)

df['review'] = df['review'].apply(lemmatize_text)

df.head(10)

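Initialize the WordNet lemmatizer to reduce words to their base forms (e.g., "movies" → "movie"). Because `lemmatize` is called without a part-of-speech tag, every token is treated as a noun, so verb forms such as "running" are left unchanged.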

# Modeling

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable
train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)

# Separate the review text (model input) from the label (target) in each subset
train_texts, train_labels = train_data['review'], train_data['label']
test_texts, test_labels = test_data['review'], test_data['label']

print(train_texts.shape, test_texts.shape)
train_texts.head(10)


- Split the dataset into training (80%) and testing (20%) subsets.
- Shuffle the data with a fixed random state to ensure reproducibility.
- Display the shape of the train and test subsets and preview some training reviews.

In [None]:
# TF-IDF features over unigrams and bigrams, capped at 5000 features
tfidf = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
# Fit the vectorizer on the training texts only, then reuse it on the test texts
X_train = tfidf.fit_transform(train_texts)
X_test = tfidf.transform(test_texts)

# Logistic regression classifier with max_iter raised to 500
lr = LogisticRegression(max_iter=500)
lr.fit(X_train, train_labels)

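Use TF-IDF vectorization to convert text data into numerical features. Generate unigrams and bigrams with a maximum of 5,000 features. The vectorizer is fitted on the training texts only and then applied to the test texts, after which a logistic regression classifier is trained on the training features.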

In [None]:
# Predict on the held-out features, then print accuracy and the per-class report
y_pred = lr.predict(X_test)
test_accuracy = accuracy_score(test_labels, y_pred)
per_class_report = classification_report(test_labels, y_pred)
print("Logistic Regression Accuracy:", test_accuracy)
print("Classification Report:\n", per_class_report)

# Graphs

In [None]:

y_pred = lr.predict(X_test)

# Name the classes explicitly (label 0 -> Negative, label 1 -> Positive) rather
# than slicing the first two keys of the report dict: the lookup no longer
# depends on key order, and the x-axis shows readable names that match the
# confusion-matrix labels instead of '0' / '1'.
categories = ['Negative', 'Positive']
report = classification_report(
    test_labels, y_pred, target_names=categories, output_dict=True
)

# One value per class for each metric, in the same order as `categories`
precision = [report[cat]['precision'] for cat in categories]
recall = [report[cat]['recall'] for cat in categories]
f1_score = [report[cat]['f1-score'] for cat in categories]

# Grouped bars: three bars of width `width` centred on each class position
x = np.arange(len(categories))
width = 0.25

fig, ax = plt.subplots(figsize=(8, 5))

bars1 = ax.bar(x - width, precision, width, label='Precision', color='skyblue')
bars2 = ax.bar(x, recall, width, label='Recall', color='orange')
bars3 = ax.bar(x + width, f1_score, width, label='F1 Score', color='green')

ax.set_xlabel('Sentiment')
ax.set_ylabel('Scores')
ax.set_title('Logistic Regression - Precision, Recall, and F1 Scores by Sentiment')
ax.set_xticks(x)
ax.set_xticklabels(categories)
ax.legend()

# Print each bar's value (two decimals) just above it
for bars in [bars1, bars2, bars3]:
    ax.bar_label(bars, fmt='%.2f', padding=3)

plt.tight_layout()
plt.show()

Visualize precision, recall, and F1 scores for positive and negative sentiments using a bar chart.

In [None]:
# Confusion matrix of true vs. predicted labels on the test set
conf_matrix = confusion_matrix(test_labels, y_pred)

# The same class names label both axes of the heatmap
sentiment_names = ['Negative', 'Positive']

plt.figure(figsize=(6, 4))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=sentiment_names,
    yticklabels=sentiment_names,
)
plt.title("Logistic Regression - Confusion Matrix")
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.show()

Plot the confusion matrix to visualize the model's performance in classifying positive and negative sentiments.





