# Import Required Libraries
Import necessary libraries such as numpy, pandas, sklearn, torch, and mlflow.

In [None]:
# Import Required Libraries

import numpy as np  # For numerical operations
import pandas as pd  # For data manipulation
from sklearn.model_selection import train_test_split  # For splitting the dataset
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer  # For feature extraction
from sklearn.metrics import accuracy_score, classification_report  # For model evaluation
import torch  # For deep learning model development
import torch.nn as nn  # For neural network modules
import torch.optim as optim  # For optimization algorithms
from torch.utils.data import DataLoader, Dataset  # For data loading and batching
import mlflow  # For logging and tracking experiments
import mlflow.pytorch  # For logging PyTorch models in MLflow

# Load and Validate Dataset
Read the 'IMDB Dataset.csv' from the 'data/raw' directory and validate its contents.

In [None]:
# Load and Validate Dataset

# Define the path to the dataset
dataset_path = 'data/raw/IMDB Dataset.csv'

# Load the dataset using pandas
df = pd.read_csv(dataset_path)

# Ensure the dataset contains the expected columns before anything reads them.
# The check runs first because the statements below index df['sentiment'];
# an explicit raise is used because `assert` is removed when Python runs
# with -O, which would silently disable the validation.
expected_columns = ['review', 'sentiment']
missing_columns = [column for column in expected_columns if column not in df.columns]
if missing_columns:
    raise ValueError(
        f"Dataset does not contain the expected columns. Missing: {missing_columns}"
    )

# Display the first few rows of the dataset to understand its structure
df.head()

# Check for any missing values in the dataset
missing_values = df.isnull().sum()
print("Missing values in each column:\n", missing_values)

# Validate the dataset by checking its shape and basic statistics
print("Dataset shape:", df.shape)
print("Dataset statistics:\n", df.describe())

# Check the distribution of the target variable
print("Distribution of target variable:\n", df['sentiment'].value_counts())

# Display the data types of each column
print("Data types of each column:\n", df.dtypes)

# Data Preprocessing
Clean, preprocess, and tokenize the text data using pandas and sklearn.

In [None]:
# Data Preprocessing

# Convert the target variable to binary format.
# The mapping is computed into a temporary first: Series.map yields NaN for
# any value missing from the dict, so an unexpected label (or re-running this
# cell on already-encoded data) would otherwise overwrite the column with NaN
# without any error.
sentiment_codes = df['sentiment'].map({'positive': 1, 'negative': 0})
if sentiment_codes.isnull().any():
    raise ValueError(
        "Column 'sentiment' contains values other than 'positive' and 'negative'."
    )
df['sentiment'] = sentiment_codes

# Display the first few rows to verify the conversion
df.head()

# Clean the text data by removing HTML tags, special characters, and converting to lowercase
import re

def clean_text(text):
    """Normalize a raw review string before vectorization.

    HTML tags are replaced with a single space instead of being deleted, so
    that words separated only by markup (for example
    ``"good.<br /><br />bad"``) are not fused into one token. Every character
    that is not an ASCII letter or whitespace is then removed, and the result
    is lowercased.

    Args:
        text: Raw review text.

    Returns:
        The cleaned, lowercase text. Runs of whitespace are left as they are.
    """
    text = re.sub(r'<.*?>', ' ', text)  # Tags become a separator, not nothing
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Drop punctuation, digits, symbols
    return text.lower()

# Replace each raw review with its cleaned form
df['review'] = df['review'].map(clean_text)

# Display the first few rows to verify the cleaning process
df.head()

# Bag-of-words counts: English stop words removed, vocabulary capped at 5000
vectorizer = CountVectorizer(max_features=5000, stop_words='english')
bow_sparse = vectorizer.fit_transform(df['review'])
X = bow_sparse.toarray()  # dense copy of the document-term matrix

# Report the size of the count matrix
print("Shape of tokenized data:", X.shape)

# Hold out 20% of the rows for testing; the seed is fixed for repeatability
X_train, X_test, y_train, y_test = train_test_split(
    X,
    df['sentiment'],
    test_size=0.2,
    random_state=42,
)

# Report the sizes of both splits
print("Training set shape:", X_train.shape, y_train.shape)
print("Testing set shape:", X_test.shape, y_test.shape)

# Feature Engineering
Create features suitable for text classification tasks, such as TF-IDF vectors.

In [None]:
# Feature Engineering

# TF-IDF features: English stop words removed, vocabulary capped at 5000.
# NOTE(review): the vectorizer is fitted on every review, including the rows
# that end up in the test split created below.
tfidf_vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
tfidf_sparse = tfidf_vectorizer.fit_transform(df['review'])
X_tfidf = tfidf_sparse.toarray()  # dense copy of the TF-IDF matrix

# Report the size of the TF-IDF matrix
print("Shape of TF-IDF data:", X_tfidf.shape)

# Hold out 20% of the rows for testing; the seed is fixed for repeatability
X_train_tfidf, X_test_tfidf, y_train_tfidf, y_test_tfidf = train_test_split(
    X_tfidf,
    df['sentiment'],
    test_size=0.2,
    random_state=42,
)

# Report the sizes of both TF-IDF splits
print("Training set shape (TF-IDF):", X_train_tfidf.shape, y_train_tfidf.shape)
print("Testing set shape (TF-IDF):", X_test_tfidf.shape, y_test_tfidf.shape)

# Model Development
Build and train models, including a baseline model and a deep learning model that leverages the on-prem GPU using PyTorch.

In [None]:
# Model Development

# Baseline Model: Logistic Regression
from sklearn.linear_model import LogisticRegression

# Select the MLflow experiment that the runs below are recorded under
mlflow.set_experiment("Text Classification Pipeline")

# One MLflow run covers fitting, scoring and logging of the baseline
with mlflow.start_run(run_name="Logistic Regression Baseline"):
    # Fit on the TF-IDF training split (fit returns the estimator itself)
    lr_model = LogisticRegression(max_iter=1000).fit(X_train_tfidf, y_train_tfidf)

    # Predict the held-out split and score the predictions
    y_pred_lr = lr_model.predict(X_test_tfidf)
    accuracy_lr = accuracy_score(y_test_tfidf, y_pred_lr)
    report_lr = classification_report(y_test_tfidf, y_pred_lr)

    # Record the metric, the text report and the fitted model in the run
    mlflow.log_metric("accuracy", accuracy_lr)
    mlflow.log_text(report_lr, "classification_report.txt")
    mlflow.sklearn.log_model(lr_model, "logistic_regression_model")

    # Echo the results to the notebook output
    print("Logistic Regression Model Accuracy:", accuracy_lr)
    print("Classification Report:\n", report_lr)

# Deep Learning Model: Simple Neural Network using PyTorch
class TextClassificationDataset(Dataset):
    """Map-style dataset pairing feature rows with integer class labels.

    Labels are converted to a NumPy array on construction so that
    ``__getitem__`` always looks them up by position. A pandas Series with an
    integer index that is not ``0..n-1`` (for example a column slice produced
    by a shuffled split) would otherwise be indexed by label, which raises
    ``KeyError`` or returns the label of a different row.

    Args:
        texts: Indexable collection of feature rows; ``texts[i]`` must be
            convertible to a float tensor.
        labels: Sequence of integer class labels, one per feature row.
    """

    def __init__(self, texts, labels):
        self.texts = texts
        # Positional storage: drops any pandas index carried by `labels`.
        self.labels = np.asarray(labels)

    def __len__(self):
        """Return the number of samples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``(features, label)`` tensors for the sample at position ``idx``."""
        features = torch.tensor(self.texts[idx], dtype=torch.float32)
        label = torch.tensor(self.labels[idx], dtype=torch.long)
        return features, label

# Wrap the TF-IDF splits as datasets
train_dataset = TextClassificationDataset(texts=X_train_tfidf, labels=y_train_tfidf)
test_dataset = TextClassificationDataset(texts=X_test_tfidf, labels=y_test_tfidf)

# Batches of 32; only the training loader shuffles its samples
train_loader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=32, shuffle=False)

# Define the neural network model
class SimpleNN(nn.Module):
    """Feed-forward classifier: input_dim -> 128 -> 64 -> 2.

    Both hidden layers are followed by ReLU and dropout (p=0.5). The output
    layer is returned without any activation applied.

    Args:
        input_dim: Number of input features per sample.
    """

    def __init__(self, input_dim):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 2)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        """Return the two-unit output of the final linear layer for ``x``."""
        hidden = self.dropout(self.relu(self.fc1(x)))
        hidden = self.dropout(self.relu(self.fc2(hidden)))
        return self.fc3(hidden)

# Build the network (one input unit per TF-IDF column), the loss and the optimizer
input_dim = X_train_tfidf.shape[1]
model = SimpleNN(input_dim)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Use CUDA when PyTorch reports it as available, otherwise stay on the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

# One MLflow run covers training, test-set scoring and logging of the network
with mlflow.start_run(run_name="Simple Neural Network"):
    # Training: 10 passes over the training loader
    num_epochs = 10
    for epoch in range(num_epochs):
        model.train()  # training mode, so dropout is active
        running_loss = 0.0
        for batch_features, batch_targets in train_loader:
            batch_features = batch_features.to(device)
            batch_targets = batch_targets.to(device)

            optimizer.zero_grad()
            batch_loss = criterion(model(batch_features), batch_targets)
            batch_loss.backward()
            optimizer.step()
            running_loss += batch_loss.item()

        # Mean of the per-batch losses for this epoch
        avg_loss = running_loss / len(train_loader)
        print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}")
        mlflow.log_metric("loss", avg_loss, step=epoch)

    # Scoring: collect predictions and true labels over the test loader
    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for batch_features, batch_targets in test_loader:
            logits = model(batch_features.to(device))
            predicted = torch.max(logits, 1).indices  # index of the larger output
            all_preds.extend(predicted.cpu().numpy())
            all_labels.extend(batch_targets.cpu().numpy())

    # Accuracy and per-class report on the test split
    accuracy_nn = accuracy_score(all_labels, all_preds)
    report_nn = classification_report(all_labels, all_preds)

    # Record the metric, the text report and the trained model in the run
    mlflow.log_metric("accuracy", accuracy_nn)
    mlflow.log_text(report_nn, "classification_report.txt")
    mlflow.pytorch.log_model(model, "simple_nn_model")

    # Echo the results to the notebook output
    print("Simple Neural Network Model Accuracy:", accuracy_nn)
    print("Classification Report:\n", report_nn)

# Model Evaluation
Evaluate the models with standard metrics such as accuracy, precision, recall, and F1-score.

In [None]:
# Model Evaluation

# Logistic regression: rescore the stored test predictions with named classes
print("Evaluating Logistic Regression Model...")
accuracy_lr = accuracy_score(y_test_tfidf, y_pred_lr)
report_lr = classification_report(
    y_test_tfidf,
    y_pred_lr,
    target_names=['negative', 'positive'],
)

print("Logistic Regression Model Accuracy:", accuracy_lr)
print("Classification Report:\n", report_lr)

# Neural network: run the test loader again in eval mode, without gradients
print("Evaluating Simple Neural Network Model...")
model.eval()
all_preds, all_labels = [], []
with torch.no_grad():
    for batch_features, batch_targets in test_loader:
        logits = model(batch_features.to(device))
        predicted = torch.max(logits, 1).indices  # index of the larger output
        all_preds.extend(predicted.cpu().numpy())
        all_labels.extend(batch_targets.cpu().numpy())

accuracy_nn = accuracy_score(all_labels, all_preds)
report_nn = classification_report(
    all_labels,
    all_preds,
    target_names=['negative', 'positive'],
)

print("Simple Neural Network Model Accuracy:", accuracy_nn)
print("Classification Report:\n", report_nn)

# Record both models' accuracy and reports in a dedicated MLflow run
with mlflow.start_run(run_name="Model Evaluation"):
    mlflow.log_metric("Logistic Regression Accuracy", accuracy_lr)
    mlflow.log_text(report_lr, "logistic_regression_classification_report.txt")
    mlflow.log_metric("Simple Neural Network Accuracy", accuracy_nn)
    mlflow.log_text(report_nn, "simple_nn_classification_report.txt")

# Inference on New Data
Demonstrate predictions on new/unseen data using the trained models.

In [None]:
# Inference on New Data

# Hand-written example reviews used to demonstrate inference
new_reviews = [
    "The movie was fantastic! I really enjoyed it.",
    "Absolutely terrible. It was a waste of time.",
    "An average film with some good moments.",
    "I loved the acting and the storyline was gripping.",
    "Not my cup of tea. I found it quite boring."
]

# Clean the examples with the same function applied to the dataset reviews
new_reviews_cleaned = list(map(clean_text, new_reviews))

# Reuse the already-fitted TF-IDF vectorizer (transform only, no refit)
new_reviews_tfidf = tfidf_vectorizer.transform(new_reviews_cleaned).toarray()

# Class 1 is reported as 'positive'; any other prediction as 'negative'
sentiment_names = {1: 'positive'}

# Logistic regression predictions
print("Logistic Regression Model Predictions:")
lr_predictions = lr_model.predict(new_reviews_tfidf)
lr_predictions_labels = [
    sentiment_names.get(int(pred), 'negative') for pred in lr_predictions
]
for review, label in zip(new_reviews, lr_predictions_labels):
    print(f"Review: {review}\nPredicted Sentiment: {label}\n")

# Neural network predictions (eval mode, no gradients)
print("Simple Neural Network Model Predictions:")
model.eval()
new_reviews_tensor = torch.tensor(new_reviews_tfidf, dtype=torch.float32).to(device)
with torch.no_grad():
    nn_outputs = model(new_reviews_tensor)
    nn_predictions = torch.max(nn_outputs, 1).indices
nn_predictions_labels = [
    sentiment_names.get(int(pred), 'negative') for pred in nn_predictions.cpu().numpy()
]
for review, label in zip(new_reviews, nn_predictions_labels):
    print(f"Review: {review}\nPredicted Sentiment: {label}\n")

# Record each example and both models' predicted labels in an MLflow run
with mlflow.start_run(run_name="Inference on New Data"):
    labelled_reviews = zip(new_reviews, lr_predictions_labels, nn_predictions_labels)
    for i, (review, lr_label, nn_label) in enumerate(labelled_reviews):
        mlflow.log_text(review, f"review_{i}.txt")
        mlflow.log_param(f"lr_prediction_{i}", lr_label)
        mlflow.log_param(f"nn_prediction_{i}", nn_label)

# MLflow Logging and Tracking
Log and track every step, from data preprocessing to inference, using the MLflow platform.

In [None]:
# MLflow Logging and Tracking

# Select the MLflow experiment that the runs below are recorded under
mlflow.set_experiment("Text Classification Pipeline")

# One MLflow run covers fitting, scoring and logging of the baseline
with mlflow.start_run(run_name="Logistic Regression Baseline"):
    # Fit on the TF-IDF training split (fit returns the estimator itself)
    lr_model = LogisticRegression(max_iter=1000).fit(X_train_tfidf, y_train_tfidf)

    # Predict the held-out split and score the predictions
    y_pred_lr = lr_model.predict(X_test_tfidf)
    accuracy_lr = accuracy_score(y_test_tfidf, y_pred_lr)
    report_lr = classification_report(y_test_tfidf, y_pred_lr)

    # Record the metric, the text report and the fitted model in the run
    mlflow.log_metric("accuracy", accuracy_lr)
    mlflow.log_text(report_lr, "classification_report.txt")
    mlflow.sklearn.log_model(lr_model, "logistic_regression_model")

    # Echo the results to the notebook output
    print("Logistic Regression Model Accuracy:", accuracy_lr)
    print("Classification Report:\n", report_lr)

# Deep Learning Model: Simple Neural Network using PyTorch
class TextClassificationDataset(Dataset):
    """Map-style dataset pairing feature rows with integer class labels.

    Labels are converted to a NumPy array on construction so that
    ``__getitem__`` always looks them up by position. A pandas Series with an
    integer index that is not ``0..n-1`` (for example a column slice produced
    by a shuffled split) would otherwise be indexed by label, which raises
    ``KeyError`` or returns the label of a different row.

    Args:
        texts: Indexable collection of feature rows; ``texts[i]`` must be
            convertible to a float tensor.
        labels: Sequence of integer class labels, one per feature row.
    """

    def __init__(self, texts, labels):
        self.texts = texts
        # Positional storage: drops any pandas index carried by `labels`.
        self.labels = np.asarray(labels)

    def __len__(self):
        """Return the number of samples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``(features, label)`` tensors for the sample at position ``idx``."""
        features = torch.tensor(self.texts[idx], dtype=torch.float32)
        label = torch.tensor(self.labels[idx], dtype=torch.long)
        return features, label

# Wrap the TF-IDF splits as datasets
train_dataset = TextClassificationDataset(texts=X_train_tfidf, labels=y_train_tfidf)
test_dataset = TextClassificationDataset(texts=X_test_tfidf, labels=y_test_tfidf)

# Batches of 32; only the training loader shuffles its samples
train_loader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=32, shuffle=False)

# Define the neural network model
class SimpleNN(nn.Module):
    """Feed-forward classifier: input_dim -> 128 -> 64 -> 2.

    Both hidden layers are followed by ReLU and dropout (p=0.5). The output
    layer is returned without any activation applied.

    Args:
        input_dim: Number of input features per sample.
    """

    def __init__(self, input_dim):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 2)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        """Return the two-unit output of the final linear layer for ``x``."""
        hidden = self.dropout(self.relu(self.fc1(x)))
        hidden = self.dropout(self.relu(self.fc2(hidden)))
        return self.fc3(hidden)

# Build the network (one input unit per TF-IDF column), the loss and the optimizer
input_dim = X_train_tfidf.shape[1]
model = SimpleNN(input_dim)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Use CUDA when PyTorch reports it as available, otherwise stay on the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

# One MLflow run covers training, test-set scoring and logging of the network
with mlflow.start_run(run_name="Simple Neural Network"):
    # Training: 10 passes over the training loader
    num_epochs = 10
    for epoch in range(num_epochs):
        model.train()  # training mode, so dropout is active
        running_loss = 0.0
        for batch_features, batch_targets in train_loader:
            batch_features = batch_features.to(device)
            batch_targets = batch_targets.to(device)

            optimizer.zero_grad()
            batch_loss = criterion(model(batch_features), batch_targets)
            batch_loss.backward()
            optimizer.step()
            running_loss += batch_loss.item()

        # Mean of the per-batch losses for this epoch
        avg_loss = running_loss / len(train_loader)
        print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}")
        mlflow.log_metric("loss", avg_loss, step=epoch)

    # Scoring: collect predictions and true labels over the test loader
    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for batch_features, batch_targets in test_loader:
            logits = model(batch_features.to(device))
            predicted = torch.max(logits, 1).indices  # index of the larger output
            all_preds.extend(predicted.cpu().numpy())
            all_labels.extend(batch_targets.cpu().numpy())

    # Accuracy and per-class report on the test split
    accuracy_nn = accuracy_score(all_labels, all_preds)
    report_nn = classification_report(all_labels, all_preds)

    # Record the metric, the text report and the trained model in the run
    mlflow.log_metric("accuracy", accuracy_nn)
    mlflow.log_text(report_nn, "classification_report.txt")
    mlflow.pytorch.log_model(model, "simple_nn_model")

    # Echo the results to the notebook output
    print("Simple Neural Network Model Accuracy:", accuracy_nn)
    print("Classification Report:\n", report_nn)

# Model Evaluation

# Logistic regression: rescore the stored test predictions with named classes
print("Evaluating Logistic Regression Model...")
accuracy_lr = accuracy_score(y_test_tfidf, y_pred_lr)
report_lr = classification_report(
    y_test_tfidf,
    y_pred_lr,
    target_names=['negative', 'positive'],
)

print("Logistic Regression Model Accuracy:", accuracy_lr)
print("Classification Report:\n", report_lr)

# Neural network: run the test loader again in eval mode, without gradients
print("Evaluating Simple Neural Network Model...")
model.eval()
all_preds, all_labels = [], []
with torch.no_grad():
    for batch_features, batch_targets in test_loader:
        logits = model(batch_features.to(device))
        predicted = torch.max(logits, 1).indices  # index of the larger output
        all_preds.extend(predicted.cpu().numpy())
        all_labels.extend(batch_targets.cpu().numpy())

accuracy_nn = accuracy_score(all_labels, all_preds)
report_nn = classification_report(
    all_labels,
    all_preds,
    target_names=['negative', 'positive'],
)

print("Simple Neural Network Model Accuracy:", accuracy_nn)
print("Classification Report:\n", report_nn)

# Record both models' accuracy and reports in a dedicated MLflow run
with mlflow.start_run(run_name="Model Evaluation"):
    mlflow.log_metric("Logistic Regression Accuracy", accuracy_lr)
    mlflow.log_text(report_lr, "logistic_regression_classification_report.txt")
    mlflow.log_metric("Simple Neural Network Accuracy", accuracy_nn)
    mlflow.log_text(report_nn, "simple_nn_classification_report.txt")

# Inference on New Data

# Hand-written example reviews used to demonstrate inference
new_reviews = [
    "The movie was fantastic! I really enjoyed it.",
    "Absolutely terrible. It was a waste of time.",
    "An average film with some good moments.",
    "I loved the acting and the storyline was gripping.",
    "Not my cup of tea. I found it quite boring."
]

# Clean the examples with the same function applied to the dataset reviews
new_reviews_cleaned = list(map(clean_text, new_reviews))

# Reuse the already-fitted TF-IDF vectorizer (transform only, no refit)
new_reviews_tfidf = tfidf_vectorizer.transform(new_reviews_cleaned).toarray()

# Class 1 is reported as 'positive'; any other prediction as 'negative'
sentiment_names = {1: 'positive'}

# Logistic regression predictions
print("Logistic Regression Model Predictions:")
lr_predictions = lr_model.predict(new_reviews_tfidf)
lr_predictions_labels = [
    sentiment_names.get(int(pred), 'negative') for pred in lr_predictions
]
for review, label in zip(new_reviews, lr_predictions_labels):
    print(f"Review: {review}\nPredicted Sentiment: {label}\n")

# Neural network predictions (eval mode, no gradients)
print("Simple Neural Network Model Predictions:")
model.eval()
new_reviews_tensor = torch.tensor(new_reviews_tfidf, dtype=torch.float32).to(device)
with torch.no_grad():
    nn_outputs = model(new_reviews_tensor)
    nn_predictions = torch.max(nn_outputs, 1).indices
nn_predictions_labels = [
    sentiment_names.get(int(pred), 'negative') for pred in nn_predictions.cpu().numpy()
]
for review, label in zip(new_reviews, nn_predictions_labels):
    print(f"Review: {review}\nPredicted Sentiment: {label}\n")

# Record each example and both models' predicted labels in an MLflow run
with mlflow.start_run(run_name="Inference on New Data"):
    labelled_reviews = zip(new_reviews, lr_predictions_labels, nn_predictions_labels)
    for i, (review, lr_label, nn_label) in enumerate(labelled_reviews):
        mlflow.log_text(review, f"review_{i}.txt")
        mlflow.log_param(f"lr_prediction_{i}", lr_label)
        mlflow.log_param(f"nn_prediction_{i}", nn_label)

# Documentation and Explanations
Provide detailed documentation, including explanations for workflow logic, key decisions, MLflow configurations, and encountered challenges.

In [None]:
# Documentation and Explanations

"""
## Workflow Logic and Design

This notebook demonstrates a comprehensive end-to-end machine learning pipeline for a text classification task using the "IMDB Dataset.csv" dataset. The pipeline includes the following steps:

1. **Data Loading**: Reading and validating the dataset to ensure it is suitable for analysis.
2. **Data Preprocessing**: Cleaning, preprocessing, and tokenizing the text data to prepare it for feature extraction.
3. **Feature Engineering**: Creating features suitable for text classification tasks using techniques like CountVectorizer and TF-IDF.
4. **Model Development**: Building and training models, including a baseline logistic regression model and a deep learning model using PyTorch.
5. **Evaluation**: Evaluating the models using standard metrics such as accuracy and classification reports.
6. **Inference**: Demonstrating predictions on new/unseen data.
7. **MLflow Logging and Tracking**: Logging and tracking every step of the pipeline using MLflow for reproducibility and monitoring.

## Key Decisions

- **Data Cleaning**: HTML tags and every character other than ASCII letters and whitespace (punctuation, digits, and other symbols) were removed from the text data, and all text was converted to lowercase to standardize the input.
- **Feature Extraction**: Both CountVectorizer and TF-IDF were used to create features from the text data. TF-IDF was chosen for model training due to its ability to capture the importance of words in the context of the entire dataset.
- **Model Selection**: A logistic regression model was chosen as the baseline due to its simplicity and interpretability. A simple neural network was implemented using PyTorch to leverage the on-prem GPU for deep learning.
- **Evaluation Metrics**: Accuracy and classification reports were used to evaluate the models, providing insights into their performance on the test set.
- **MLflow Integration**: MLflow was used to log and track experiments, including metrics, models, and artifacts, ensuring reproducibility and easy monitoring of the pipeline.

## MLflow Configurations

- **Experiment Initialization**: The MLflow experiment was initialized with the name "Text Classification Pipeline".
- **Run Management**: Separate MLflow runs were created for each model and stage of the pipeline, allowing for organized tracking of experiments.
- **Logging Metrics and Artifacts**: Metrics such as accuracy and loss were logged for each model. Classification reports and trained models were also logged as artifacts.
- **GPU Utilization**: The deep learning model was configured to utilize the on-prem GPU if available, ensuring efficient training.

## Challenges and Resolutions

- **Data Imbalance**: The dataset had a balanced distribution of positive and negative reviews, so no additional steps were needed to address data imbalance.
- **Text Preprocessing**: Cleaning and preprocessing text data can be challenging due to the presence of various special characters and HTML tags. A robust cleaning function was implemented to handle these issues.
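- **Model Training on GPU**: Ensuring the deep learning model utilized the on-prem GPU required careful configuration and testing. The notebook selects the CUDA device when `torch.cuda.is_available()` returns true and otherwise falls back to the CPU, so the model is trained on the GPU whenever one is present, which significantly reduces training time.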
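- **MLflow Integration**: Integrating MLflow in a secure, on-prem environment required configuring the MLflow tracking server and ensuring all logs and artifacts were stored locally. This notebook does not call `mlflow.set_tracking_uri`, so the tracking location must be supplied outside the notebook (for example through the `MLFLOW_TRACKING_URI` environment variable); otherwise MLflow falls back to its default local store.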

## Conclusion

This notebook provides a detailed and reproducible pipeline for text classification using the "IMDB Dataset.csv" dataset. By leveraging MLflow for logging and tracking, the pipeline ensures transparency and reproducibility of experiments. The use of both traditional machine learning and deep learning models demonstrates the flexibility and scalability of the pipeline for various text classification tasks.
"""