In [1]:
### Note that for this project, I referenced code from Professor Tallman an Daniel Bourke from his freeCodeCamp.com video! https://www.youtube.com/channel/UCr8O8l5cCX85Oem1d18EezQ

import torch

device = 'cpu'
gpu_count = 0

# Check for GPU availability
gpu_available = torch.cuda.is_available()
print(f'GPU Availability: {gpu_available}')

if gpu_available:
    device = 'cuda'
    gpu_count = torch.cuda.device_count()
    print(f'Number of GPUs available: {gpu_count}')
    print(f'Number of CPU cores: {torch.get_num_threads()}')
    
    # Print each GPU's name
    for i in range(gpu_count):
        print(f'GPU {i}: {torch.cuda.get_device_name(i)}')

# Print selected device
print(f'Device in use: {device}')

GPU Availability: True
Number of GPUs available: 1
Number of CPU cores: 8
GPU 0: NVIDIA GeForce RTX 3060 Ti
Device in use: cuda


In [2]:
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.preprocessing import StandardScaler

test_data = pd.read_csv(r"data\test_data.csv.gzip", compression='gzip')
train_data = pd.read_csv(r"data\train_data.csv.gzip", compression='gzip')

df = pd.concat([test_data, train_data], ignore_index=True)

X = df.loc[:, df.columns != 'model_labels']
y = df["model_labels"]
y = y.map({'pos': 1, 'neg': 0})


scaler = StandardScaler()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

# X_train = scaler.fit_transform(X_train)
# X_test = scaler.transform(X_test)

print("Shape of X_train:", X_train.shape)
print("Shape of X_test:", X_test.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of y_test:", y_test.shape)

Shape of X_train: (37500, 10000)
Shape of X_test: (12500, 10000)
Shape of y_train: (37500,)
Shape of y_test: (12500,)


In [3]:
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

# Convert to PyTorch tensors
X_train_tensor = torch.tensor(X_train.to_numpy(), dtype=torch.float32)
X_test_tensor = torch.tensor(X_test.to_numpy(), dtype=torch.float32)

# Ensure y_train_tensor and y_test_tensor are integer class labels (1D)
y_train_tensor = torch.tensor(y_train.to_numpy(), dtype=torch.float32)  # Long type for class indices
y_test_tensor = torch.tensor(y_test.to_numpy(), dtype=torch.float32)

BATCH_SIZE = 64
# Create PyTorch datasets and data loaders
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

# Define the PyTorch model
class SentimentAnalysisModel(nn.Module):
    def __init__(self, input_shape, num_classes):
        super().__init__()
        
        self.relu = nn.ReLU()
        
        # had to add dropout and batch normalization to stop it from overfitting!
        self.layer_stack = nn.Sequential(
            nn.Linear(input_shape, 256),
            nn.BatchNorm1d(256),
            self.relu,
            nn.Linear(256, 512),
            nn.Dropout(p=0.5),
            nn.BatchNorm1d(512),
            self.relu,
            nn.Linear(512, 256),
            nn.BatchNorm1d(256),
            self.relu,
            nn.Linear(256, 1),
            nn.Dropout(p=0.2)
        )
        
    
    def forward(self, x):
        return self.layer_stack(x)

In [4]:
num_classes = torch.unique(y_train_tensor).size(0)

model = SentimentAnalysisModel(
    input_shape=X_train_tensor.shape[1], 
    num_classes=num_classes
).to(device)

In [5]:
loss_fn = nn.BCEWithLogitsLoss()

optimizer = torch.optim.Adam(params=model.parameters(),
                            lr=0.001)

In [6]:
def loss_has_converged(loss, min_delta=0.03, stable_epoch_count=2):
    '''
    Determines whether the learning loss has converged to a stable value
    '''
    
    # These variables persist across function calls!
    if not hasattr(loss_has_converged, "counter"):
        loss_has_converged.counter = 0
    if not hasattr(loss_has_converged, "best_loss"):
        loss_has_converged.best_loss = float('inf')

    # Is the loss continuing to improve (e.g., get smaller)?
    if loss_has_converged.best_loss - loss > min_delta:
        loss_has_converged.best_loss = loss
        loss_has_converged.counter = 0

    # Loss has not improved; 'patience' in a row means convergence
    else:
        loss_has_converged.counter += 1
        if loss_has_converged.counter >= stable_epoch_count:
            return True
    
    return False

In [7]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

def train_step(model: torch.nn.Module,
               data_loader: torch.utils.data.DataLoader,
               loss_fn: torch.nn.Module,
               optimizer: torch.optim.Optimizer,
               accuracy_fn,
               device: torch.device = device):
    '''
    Performs a single round of learning with the training data, updating
    the weights and biases after it is completed according to the given
    loss function and optimization object.
    '''
    train_loss, train_acc = 0, 0

    model.train()

    # Add a loop to loop through the training batches
    for batch, (X, y) in enumerate(data_loader):
        # Put data on gpu
        X, y = X.to(device), y.to(device)

        # 1. Forward pass
        y_pred = model(X)

        # 2. Calculate loss and accuracy
        loss = loss_fn(y_pred.squeeze(), y)
        train_loss += loss

        # convert logits to probabilities then to 0 or 1
        y_pred_labels = torch.round(torch.sigmoid(y_pred))

        train_acc += accuracy_fn(y_true=y.cpu().detach().numpy(),
                                y_pred=y_pred_labels.cpu().detach().numpy())

        # 3. Optimizer zero grad
        optimizer.zero_grad()

        # 4. Loss backward
        loss.backward()

        # 5. Optimizer step
        optimizer.step()

    # Divide total train loss and acc by length of train dataloader
    train_loss /= len(data_loader)
    train_acc /= len(data_loader)
    print(f"Train loss: {train_loss:.5f} | Train acc: {train_acc:.2f}%")

    return train_loss

In [8]:
def test_step(model: torch.nn.Module,
              data_loader: torch.utils.data.DataLoader, 
              loss_fn: torch.nn.Module,
              accuracy_fn,
              device: torch.device = device):
    """Performs a testing loop step on model going over data_loader."""
    test_loss, test_acc = 0, 0
    
    # Put the model in eval mode
    model.eval()

    # Turn on inference mode context manager
    with torch.inference_mode():
        for X, y in data_loader:
            # Send the data to the target device
            X, y = X.to(device), y.to(device)

            # 1. Forward pass (outputs raw logits)
            test_pred = model(X)

            # 2. Calculate the loss/acc
            test_loss += loss_fn(test_pred.squeeze(), y)

            # convert logits to probabilities then to 0 or 1
            test_pred_labels = torch.round(torch.sigmoid(test_pred))

            test_acc += accuracy_fn(y_true=y.cpu().detach().numpy(),
                                    y_pred=test_pred_labels.cpu().detach().numpy()) 

        # Adjust metrics and print out
        test_loss /= len(data_loader)
        test_acc /= len(data_loader)
        print(f"Test loss: {test_loss:.5f} | Test acc: {test_acc:.2f}%\n")

In [9]:
# torch.manual_seed(42)

epochs = 50

# Create a optimization and evaluation loop using train_step() and test_step()
for epoch in range(epochs):
  print(f"Epoch: {epoch}\n----------")

  loss = train_step(model=model,
             data_loader=train_loader,
             loss_fn=loss_fn,
             optimizer=optimizer,
             accuracy_fn=accuracy_score,
             device=device)
  
  test_step(model=model,
            data_loader=test_loader,
            loss_fn=loss_fn,
            accuracy_fn=accuracy_score,
            device=device)
  
  if loss_has_converged(loss):
        print(f"Early stopping triggered at epoch {epoch+1}!")
        break

Epoch: 0
----------
Train loss: 0.39896 | Train acc: 0.79%
Test loss: 0.28364 | Test acc: 0.89%

Epoch: 1
----------
Train loss: 0.29654 | Train acc: 0.84%
Test loss: 0.27978 | Test acc: 0.89%

Epoch: 2
----------
Train loss: 0.24995 | Train acc: 0.86%
Test loss: 0.32697 | Test acc: 0.87%

Epoch: 3
----------
Train loss: 0.21641 | Train acc: 0.87%
Test loss: 0.37655 | Test acc: 0.87%

Epoch: 4
----------
Train loss: 0.19878 | Train acc: 0.88%
Test loss: 0.41436 | Test acc: 0.88%

Epoch: 5
----------
Train loss: 0.18232 | Train acc: 0.88%
Test loss: 0.42924 | Test acc: 0.87%

Epoch: 6
----------
Train loss: 0.17728 | Train acc: 0.89%
Test loss: 0.43894 | Test acc: 0.87%

Epoch: 7
----------
Train loss: 0.17095 | Train acc: 0.89%
Test loss: 0.47880 | Test acc: 0.87%

Early stopping triggered at epoch 8!


In [10]:
def eval_model(model: torch.nn.Module,
               data_loader: torch.utils.data.DataLoader,
               loss_fn: torch.nn.Module, 
               accuracy_fn,
               device=device):
    """Returns a dictionary containing the results of model predicting on data_loader."""
    loss, acc = 0, 0
    model.eval()
    with torch.inference_mode():
        for X, y in data_loader:
            # Make our data device agnostic
            X, y = X.to(device), y.to(device)
            # Make predictions
            y_pred = model(X)

            # Accumulate the loss and acc values per batch
            loss += loss_fn(y_pred.squeeze(), y)

            # convert logits to probabilities then to 0 or 1
            y_pred_labels = torch.round(torch.sigmoid(y_pred))

            acc += accuracy_fn(y_true=y.cpu().detach().numpy(),
                                y_pred=y_pred_labels.cpu().detach().numpy())

        # Scale loss and acc to find the average loss/acc per batch
        loss /= len(data_loader)
        acc /= len(data_loader)

    return {"model_name": model.__class__.__name__,
            "model_loss": loss.item(),
            "model_acc": acc}

In [11]:
model_results = eval_model(model=model,
                            data_loader=test_loader,
                            loss_fn=loss_fn,
                            accuracy_fn=accuracy_score,
                            device=device)
model_results

{'model_name': 'SentimentAnalysisModel',
 'model_loss': 0.47879886627197266,
 'model_acc': 0.8711734693877551}

In [12]:
# save the models parameters for later use!
torch.save(model.state_dict(), "model.pth")