# Build neural network model from scratch

Predict summaries using an LSTM built from scratch with PyTorch.

In [1]:
# don't need to run this code if not on Google Colab
# Mount Google Drive
# from google.colab import drive
# drive.mount('/content/drive')

In [2]:
# !git clone https://github.com/p-parks/AAI-590_Capstone.git

In [3]:
# !pip install evaluate
# !pip install rouge_score

In [4]:
# %% Import necessary libraries
import pandas as pd
import numpy as np
import time
import torch
from torch import nn, optim
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


## Load the dataset

In [5]:
# Folder holding the CSV files read below (relative to the working directory).
dataset_dir = 'datasets'
# The whole dataset plus separately stored train and test CSVs; later cells
# read the 'text' and 'summary' columns from each frame.
df = pd.read_csv(f'{dataset_dir}/podcast_with_summary.csv')
df_train = pd.read_csv(f'{dataset_dir}/podcast_with_summary_train.csv')
df_test = pd.read_csv(f'{dataset_dir}/podcast_with_summary_test.csv')

# df = pd.read_csv('/content/drive/MyDrive/ColabNotebooks/AAI590_Capstone/Datasets/podcast_with_summary.csv')
# df_train = pd.read_csv('/content/drive/MyDrive/ColabNotebooks/AAI590_Capstone/Datasets/podcast_with_summary_train.csv')
# df_test = pd.read_csv('/content/drive/MyDrive/ColabNotebooks/AAI590_Capstone/Datasets/podcast_with_summary_test.csv')

In [6]:
# The lengths of input and output must be the same for our model: it emits one
# prediction per input position, so texts and summaries are both padded or
# truncated to this many tokens.
max_length = 1024

# Use Hugging Face tokenizer
model_name = "bert-base-uncased"  # Replace with a model suitable for tokenization
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_text(texts, max_length):
    """Encode a list of strings with the notebook-level ``tokenizer``.

    Each text is truncated and padded to ``max_length`` tokens and the result
    is returned as PyTorch tensors (``return_tensors="pt"``).

    Args:
        texts: List of strings to encode.
        max_length: Token length applied to every encoded sequence.

    Returns:
        Whatever the tokenizer call returns; other cells read its
        ``'input_ids'`` and ``'attention_mask'`` entries.
    """
    encode_options = dict(
        max_length=max_length,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    )
    encoded_batch = tokenizer(texts, **encode_options)
    return encoded_batch

# Tokenize inputs and summaries
# input_tokens = tokenize_text(df['text_short'].tolist(), max_length)
# summary_tokens = tokenize_text(df['summary'].tolist(), max_length)

# X = input_tokens['input_ids']
# Y = summary_tokens['input_ids']

In [7]:
# Tokenize inputs and summaries of the whole dataset (in the cells shown here
# these are only used for the shape check and the decode sanity check).
input_tokens = tokenize_text(df['text'].tolist(), max_length)
summary_tokens = tokenize_text(df['summary'].tolist(), max_length)

# Tokenize the train and test splits; every sequence is padded/truncated to
# max_length tokens by tokenize_text.
train_input_tokens = tokenize_text(df_train['text'].tolist(), max_length)
train_summary_tokens = tokenize_text(df_train['summary'].tolist(), max_length)
test_input_tokens = tokenize_text(df_test['text'].tolist(), max_length)
test_summary_tokens = tokenize_text(df_test['summary'].tolist(), max_length)

# Token-id tensors for the whole dataset.
X = input_tokens['input_ids']
Y = summary_tokens['input_ids']

In [8]:
# from sklearn.model_selection import train_test_split

# X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.1, random_state=42)
# The split comes from the separate train/test CSVs rather than from
# train_test_split; only the token ids are kept as model inputs and targets.
X_train = train_input_tokens['input_ids']
Y_train = train_summary_tokens['input_ids']
X_test = test_input_tokens['input_ids']
Y_test = test_summary_tokens['input_ids']

In [9]:
#Create PyTorch Dataset to load data
class TextSummaryDataset(Dataset):
    """Index-aligned pairing of input sequences with their target sequences.

    Item ``i`` is the tuple ``(inputs[i], targets[i])``. The reported length
    is taken from ``inputs`` alone; the two collections are stored as given,
    without copying or length checks.
    """

    def __init__(self, inputs, targets):
        self.inputs, self.targets = inputs, targets

    def __len__(self):
        sample_count = len(self.inputs)
        return sample_count

    def __getitem__(self, idx):
        source = self.inputs[idx]
        target = self.targets[idx]
        return source, target

# Wrap the token-id tensors so DataLoader can batch (input, target) pairs.
train_dataset = TextSummaryDataset(X_train, Y_train)
test_dataset = TextSummaryDataset(X_test, Y_test)

# I experimented with different batch sizes and found 32 to be the best
# Only the training loader shuffles; the test loader keeps dataset order.
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32)

In [10]:
# Re-bind X and Y to the whole-dataset token ids (the same tensors that were
# assigned right after tokenization).
X = input_tokens['input_ids']
Y = summary_tokens['input_ids']

# Verify dimensions
print(f"Input shape: {X.shape}, Target shape: {Y.shape}")

Input shape: torch.Size([319, 1024]), Target shape: torch.Size([319, 1024])


In [11]:
# Decode the first input and the first summary back to text as a sanity check
# on the tokenization (special and padding tokens are included in the output).
print(tokenizer.decode(X[0]))
print(tokenizer.decode(Y[0]))

[CLS] as part of mit course 6s099, artificial general intelligence, i ' ve gotten the chance to sit down with max tegmark. he is a professor here at mit. he ' s a physicist, spent a large part of his career studying the mysteries of our cosmological universe. but he ' s also studied and delved into the beneficial possibilities and the existential risks of artificial intelligence. amongst many other things, he is the cofounder of the future of life institute, author of two books, both of which i highly recommend. first, our mathematical universe. second is life 3. 0. he ' s truly an out of the box thinker and a fun personality, so i really enjoy talking to him. if you ' d like to see more of these videos in the future, please subscribe and also click the little bell icon to make sure you don ' t miss any videos. also, twitter, linkedin, agi. mit. edu if you wanna watch other lectures or conversations like this one. better yet, go read max ' s book, life 3. 0. chapter seven on goals is m

## Build the LSTM

In [12]:

class LSTMSummarizer(nn.Module):
    """Bidirectional LSTM that emits vocabulary logits for every input position.

    Pipeline: token ids -> embedding -> bidirectional LSTM -> linear layer over
    the concatenated forward/backward hidden states.

    Args:
        vocab_size: Number of tokens in the vocabulary; also the size of the
            output logits.
        embedding_dim: Size of each token embedding vector.
        hidden_units: LSTM hidden size per direction.
        padding_idx: Token id whose embedding is kept at zero. When ``None``
            (the default) the pad id of the notebook-level ``tokenizer`` is
            used, which is what the original constructor always did. Passing
            it explicitly lets the class be built without that global.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_units, padding_idx=None):
        super().__init__()
        if padding_idx is None:
            # Backward-compatible default: fall back to the global tokenizer.
            padding_idx = tokenizer.pad_token_id
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=padding_idx)
        self.lstm = nn.LSTM(embedding_dim, hidden_units, batch_first=True, bidirectional=True)
        # Forward and backward states are concatenated, hence hidden_units * 2.
        self.fc = nn.Linear(hidden_units * 2, vocab_size)

    def forward(self, x):
        """Return logits of shape (batch, seq_len, vocab_size) for token ids ``x``.

        ``x`` is expected as (batch, seq_len) because the LSTM is created with
        ``batch_first=True``.
        """
        embedded = self.embedding(x)
        lstm_out, _ = self.lstm(embedded)
        out = self.fc(lstm_out)  # Predict for each time step
        return out

# Size of each token embedding vector.
embedding_dim = 128
# LSTM hidden size per direction (the model is bidirectional).
hidden_units = 256
# Output layer width: one logit per tokenizer vocabulary entry.
vocab_size = tokenizer.vocab_size
# Place the model on the GPU when one is available, otherwise on the CPU.
model = LSTMSummarizer(vocab_size, embedding_dim, hidden_units).to('cuda' if torch.cuda.is_available() else 'cpu')

In [13]:
from torch.optim.lr_scheduler import StepLR
# I was trying to use mixed precision training but was unable to get it to work
# (autocast and GradScaler are imported but not used in the cells shown here)
from torch.cuda.amp import autocast, GradScaler

# Define optimizer and loss function
# Padding positions in the targets are excluded from the loss via ignore_index.
criterion = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)
optimizer = optim.Adam(model.parameters(), lr=1e-4)

# Training loop
# Same device selection rule that was used when the model was created.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# NOTE(review): StepLR multiplies the learning rate by gamma every step_size
# epochs. Starting from 1e-4 with step_size=5 and gamma=0.1, the rate is 1e-8
# after 20 epochs and keeps shrinking, so a 100-epoch run makes essentially no
# progress after the first few decays. This lines up with the loss plateau in
# the training log. Consider a larger step_size or a gentler gamma.
scheduler = StepLR(optimizer, step_size=5, gamma=0.1)

def train_model(model, train_loader, criterion, optimizer, scheduler, epochs=10, max_grad_norm=None):
    """Train ``model`` for ``epochs`` passes over ``train_loader``.

    After every epoch the scheduler is stepped once and the mean batch loss is
    printed. Nothing is returned.

    Args:
        model: Module mapping a batch of token ids to per-position logits.
        train_loader: Iterable of ``(inputs, targets)`` batches.
        criterion: Loss called as ``criterion(flat_logits, flat_targets)``.
        optimizer: Optimizer over the model's parameters.
        scheduler: Learning-rate scheduler, stepped once per epoch.
        epochs: Number of passes over ``train_loader``.
        max_grad_norm: When given, gradients are clipped to this total norm
            after every backward pass and before the optimizer step. The
            default ``None`` disables clipping, matching the original loop.
    """
    # Use the device the model already lives on instead of a notebook-level
    # global, so the function does not depend on hidden state.
    model_device = next(model.parameters()).device

    model.train()
    for epoch in range(epochs):
        epoch_loss = 0
        for inputs, targets in tqdm(train_loader):
            # Move inputs and targets to the model's device
            inputs, targets = inputs.to(model_device), targets.to(model_device)

            optimizer.zero_grad()

            outputs = model(inputs)

            # Flatten to (positions, vocab) and (positions,) for the loss.
            outputs = outputs.view(-1, outputs.size(-1))
            targets = targets.view(-1)

            loss = criterion(outputs, targets)
            loss.backward()

            # Clipping only has an effect here, between backward() and step(),
            # because this is the only point where fresh gradients exist.
            if max_grad_norm is not None:
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=max_grad_norm)

            optimizer.step()

            epoch_loss += loss.item()

        scheduler.step()

        # max(..., 1) avoids a ZeroDivisionError when the loader is empty.
        print(f"Epoch {epoch + 1}/{epochs}, Loss: {epoch_loss / max(len(train_loader), 1):.4f}")



In [14]:
# NOTE(review): this clip call runs once, before any backward pass, so it
# cannot affect the gradients produced during training. For clipping to take
# effect it has to happen inside the training loop, between loss.backward()
# and optimizer.step().
torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
# `epochs` is also read later by evaluate_model to name its output files.
epochs = 100
train_model(model, train_loader, criterion, optimizer, scheduler, epochs=epochs)

100%|██████████| 8/8 [00:02<00:00,  3.24it/s]


Epoch 1/100, Loss: 10.3167


100%|██████████| 8/8 [00:01<00:00,  5.90it/s]


Epoch 2/100, Loss: 10.2759


100%|██████████| 8/8 [00:01<00:00,  6.14it/s]


Epoch 3/100, Loss: 10.2321


100%|██████████| 8/8 [00:01<00:00,  6.14it/s]


Epoch 4/100, Loss: 10.1772


100%|██████████| 8/8 [00:01<00:00,  5.96it/s]


Epoch 5/100, Loss: 10.1001


100%|██████████| 8/8 [00:01<00:00,  6.15it/s]


Epoch 6/100, Loss: 10.0352


100%|██████████| 8/8 [00:01<00:00,  6.18it/s]


Epoch 7/100, Loss: 10.0214


100%|██████████| 8/8 [00:01<00:00,  6.08it/s]


Epoch 8/100, Loss: 10.0067


100%|██████████| 8/8 [00:01<00:00,  6.17it/s]


Epoch 9/100, Loss: 9.9908


100%|██████████| 8/8 [00:01<00:00,  6.63it/s]


Epoch 10/100, Loss: 9.9743


100%|██████████| 8/8 [00:01<00:00,  6.37it/s]


Epoch 11/100, Loss: 9.9634


100%|██████████| 8/8 [00:01<00:00,  6.16it/s]


Epoch 12/100, Loss: 9.9617


100%|██████████| 8/8 [00:01<00:00,  6.06it/s]


Epoch 13/100, Loss: 9.9596


100%|██████████| 8/8 [00:01<00:00,  6.12it/s]


Epoch 14/100, Loss: 9.9582


100%|██████████| 8/8 [00:01<00:00,  6.10it/s]


Epoch 15/100, Loss: 9.9566


100%|██████████| 8/8 [00:01<00:00,  6.11it/s]


Epoch 16/100, Loss: 9.9553


100%|██████████| 8/8 [00:01<00:00,  6.07it/s]


Epoch 17/100, Loss: 9.9553


100%|██████████| 8/8 [00:01<00:00,  6.13it/s]


Epoch 18/100, Loss: 9.9551


100%|██████████| 8/8 [00:01<00:00,  6.15it/s]


Epoch 19/100, Loss: 9.9549


100%|██████████| 8/8 [00:01<00:00,  6.15it/s]


Epoch 20/100, Loss: 9.9547


100%|██████████| 8/8 [00:01<00:00,  6.44it/s]


Epoch 21/100, Loss: 9.9548


100%|██████████| 8/8 [00:01<00:00,  6.16it/s]


Epoch 22/100, Loss: 9.9548


100%|██████████| 8/8 [00:01<00:00,  6.16it/s]


Epoch 23/100, Loss: 9.9547


100%|██████████| 8/8 [00:01<00:00,  6.12it/s]


Epoch 24/100, Loss: 9.9545


100%|██████████| 8/8 [00:01<00:00,  6.16it/s]


Epoch 25/100, Loss: 9.9546


100%|██████████| 8/8 [00:01<00:00,  6.15it/s]


Epoch 26/100, Loss: 9.9547


100%|██████████| 8/8 [00:01<00:00,  6.09it/s]


Epoch 27/100, Loss: 9.9545


100%|██████████| 8/8 [00:01<00:00,  6.10it/s]


Epoch 28/100, Loss: 9.9547


100%|██████████| 8/8 [00:01<00:00,  6.13it/s]


Epoch 29/100, Loss: 9.9546


100%|██████████| 8/8 [00:01<00:00,  6.05it/s]


Epoch 30/100, Loss: 9.9543


100%|██████████| 8/8 [00:01<00:00,  6.10it/s]


Epoch 31/100, Loss: 9.9547


100%|██████████| 8/8 [00:01<00:00,  6.18it/s]


Epoch 32/100, Loss: 9.9546


100%|██████████| 8/8 [00:01<00:00,  5.97it/s]


Epoch 33/100, Loss: 9.9545


100%|██████████| 8/8 [00:01<00:00,  6.09it/s]


Epoch 34/100, Loss: 9.9547


100%|██████████| 8/8 [00:01<00:00,  6.09it/s]


Epoch 35/100, Loss: 9.9548


100%|██████████| 8/8 [00:01<00:00,  6.08it/s]


Epoch 36/100, Loss: 9.9548


100%|██████████| 8/8 [00:01<00:00,  6.03it/s]


Epoch 37/100, Loss: 9.9548


100%|██████████| 8/8 [00:01<00:00,  6.12it/s]


Epoch 38/100, Loss: 9.9546


100%|██████████| 8/8 [00:01<00:00,  6.13it/s]


Epoch 39/100, Loss: 9.9547


100%|██████████| 8/8 [00:01<00:00,  6.10it/s]


Epoch 40/100, Loss: 9.9546


100%|██████████| 8/8 [00:01<00:00,  6.03it/s]


Epoch 41/100, Loss: 9.9545


100%|██████████| 8/8 [00:01<00:00,  6.16it/s]


Epoch 42/100, Loss: 9.9545


100%|██████████| 8/8 [00:01<00:00,  6.09it/s]


Epoch 43/100, Loss: 9.9546


100%|██████████| 8/8 [00:01<00:00,  6.09it/s]


Epoch 44/100, Loss: 9.9544


100%|██████████| 8/8 [00:01<00:00,  6.09it/s]


Epoch 45/100, Loss: 9.9546


100%|██████████| 8/8 [00:01<00:00,  6.10it/s]


Epoch 46/100, Loss: 9.9547


100%|██████████| 8/8 [00:01<00:00,  6.09it/s]


Epoch 47/100, Loss: 9.9545


100%|██████████| 8/8 [00:01<00:00,  6.02it/s]


Epoch 48/100, Loss: 9.9546


100%|██████████| 8/8 [00:01<00:00,  6.18it/s]


Epoch 49/100, Loss: 9.9545


100%|██████████| 8/8 [00:01<00:00,  6.09it/s]


Epoch 50/100, Loss: 9.9546


100%|██████████| 8/8 [00:01<00:00,  6.09it/s]


Epoch 51/100, Loss: 9.9547


100%|██████████| 8/8 [00:01<00:00,  6.09it/s]


Epoch 52/100, Loss: 9.9547


100%|██████████| 8/8 [00:01<00:00,  6.09it/s]


Epoch 53/100, Loss: 9.9544


100%|██████████| 8/8 [00:01<00:00,  6.19it/s]


Epoch 54/100, Loss: 9.9546


100%|██████████| 8/8 [00:01<00:00,  6.09it/s]


Epoch 55/100, Loss: 9.9547


100%|██████████| 8/8 [00:01<00:00,  6.08it/s]


Epoch 56/100, Loss: 9.9547


100%|██████████| 8/8 [00:01<00:00,  6.21it/s]


Epoch 57/100, Loss: 9.9547


100%|██████████| 8/8 [00:01<00:00,  6.18it/s]


Epoch 58/100, Loss: 9.9546


100%|██████████| 8/8 [00:01<00:00,  6.10it/s]


Epoch 59/100, Loss: 9.9546


100%|██████████| 8/8 [00:01<00:00,  6.11it/s]


Epoch 60/100, Loss: 9.9546


100%|██████████| 8/8 [00:01<00:00,  6.09it/s]


Epoch 61/100, Loss: 9.9546


100%|██████████| 8/8 [00:01<00:00,  6.07it/s]


Epoch 62/100, Loss: 9.9547


100%|██████████| 8/8 [00:01<00:00,  6.17it/s]


Epoch 63/100, Loss: 9.9547


100%|██████████| 8/8 [00:01<00:00,  6.12it/s]


Epoch 64/100, Loss: 9.9546


100%|██████████| 8/8 [00:01<00:00,  6.11it/s]


Epoch 65/100, Loss: 9.9548


100%|██████████| 8/8 [00:01<00:00,  6.07it/s]


Epoch 66/100, Loss: 9.9546


100%|██████████| 8/8 [00:01<00:00,  6.16it/s]


Epoch 67/100, Loss: 9.9546


100%|██████████| 8/8 [00:01<00:00,  6.10it/s]


Epoch 68/100, Loss: 9.9546


100%|██████████| 8/8 [00:01<00:00,  6.03it/s]


Epoch 69/100, Loss: 9.9547


100%|██████████| 8/8 [00:01<00:00,  6.21it/s]


Epoch 70/100, Loss: 9.9545


100%|██████████| 8/8 [00:01<00:00,  6.36it/s]


Epoch 71/100, Loss: 9.9547


100%|██████████| 8/8 [00:01<00:00,  7.12it/s]


Epoch 72/100, Loss: 9.9547


100%|██████████| 8/8 [00:01<00:00,  7.00it/s]


Epoch 73/100, Loss: 9.9546


100%|██████████| 8/8 [00:01<00:00,  7.18it/s]


Epoch 74/100, Loss: 9.9545


100%|██████████| 8/8 [00:01<00:00,  6.48it/s]


Epoch 75/100, Loss: 9.9544


100%|██████████| 8/8 [00:01<00:00,  7.00it/s]


Epoch 76/100, Loss: 9.9547


100%|██████████| 8/8 [00:01<00:00,  6.78it/s]


Epoch 77/100, Loss: 9.9546


100%|██████████| 8/8 [00:01<00:00,  6.82it/s]


Epoch 78/100, Loss: 9.9547


100%|██████████| 8/8 [00:01<00:00,  6.84it/s]


Epoch 79/100, Loss: 9.9545


100%|██████████| 8/8 [00:01<00:00,  6.46it/s]


Epoch 80/100, Loss: 9.9547


100%|██████████| 8/8 [00:01<00:00,  7.10it/s]


Epoch 81/100, Loss: 9.9546


100%|██████████| 8/8 [00:01<00:00,  6.66it/s]


Epoch 82/100, Loss: 9.9547


100%|██████████| 8/8 [00:01<00:00,  6.26it/s]


Epoch 83/100, Loss: 9.9546


100%|██████████| 8/8 [00:01<00:00,  6.13it/s]


Epoch 84/100, Loss: 9.9547


100%|██████████| 8/8 [00:01<00:00,  6.18it/s]


Epoch 85/100, Loss: 9.9548


100%|██████████| 8/8 [00:01<00:00,  6.11it/s]


Epoch 86/100, Loss: 9.9547


100%|██████████| 8/8 [00:01<00:00,  6.13it/s]


Epoch 87/100, Loss: 9.9545


100%|██████████| 8/8 [00:01<00:00,  6.14it/s]


Epoch 88/100, Loss: 9.9547


100%|██████████| 8/8 [00:01<00:00,  6.05it/s]


Epoch 89/100, Loss: 9.9547


100%|██████████| 8/8 [00:01<00:00,  6.08it/s]


Epoch 90/100, Loss: 9.9546


100%|██████████| 8/8 [00:01<00:00,  6.05it/s]


Epoch 91/100, Loss: 9.9546


100%|██████████| 8/8 [00:01<00:00,  6.06it/s]


Epoch 92/100, Loss: 9.9547


100%|██████████| 8/8 [00:01<00:00,  6.10it/s]


Epoch 93/100, Loss: 9.9547


100%|██████████| 8/8 [00:01<00:00,  6.18it/s]


Epoch 94/100, Loss: 9.9549


100%|██████████| 8/8 [00:01<00:00,  6.09it/s]


Epoch 95/100, Loss: 9.9548


100%|██████████| 8/8 [00:01<00:00,  6.28it/s]


Epoch 96/100, Loss: 9.9547


100%|██████████| 8/8 [00:01<00:00,  6.70it/s]


Epoch 97/100, Loss: 9.9547


100%|██████████| 8/8 [00:01<00:00,  6.68it/s]


Epoch 98/100, Loss: 9.9545


100%|██████████| 8/8 [00:01<00:00,  6.74it/s]


Epoch 99/100, Loss: 9.9546


100%|██████████| 8/8 [00:01<00:00,  6.69it/s]

Epoch 100/100, Loss: 9.9546





## Run inference

In [15]:
import os

# Ensure the directory exists
output_dir = "./results/pytorch"
os.makedirs(output_dir, exist_ok=True)

# Upper bound on the number of tokens run_inference generates per call.
output_length = 200

def run_inference(text):
    """Generate a summary for ``text`` by sampling one token at a time.

    Uses the notebook-level ``model``, ``tokenizer``, ``device``,
    ``max_length`` and ``output_length``. The text is tokenized to
    ``max_length`` ids; on each step the model is run on the whole current
    sequence, a token is sampled from the softmax of the last position's
    logits, and that token is appended to the sequence. Generation stops after
    ``output_length`` tokens or as soon as the padding token is sampled.

    Args:
        text: Source text to summarize.

    Returns:
        The generated token ids decoded to a string with special tokens
        skipped.
    """
    model.eval()
    # Named `encoded` so the notebook-level `input_tokens` is not shadowed.
    # Only the ids are needed: the model takes no attention mask.
    encoded = tokenize_text([text], max_length)
    current_input = encoded['input_ids'].to(device)

    generated_tokens = []

    with torch.no_grad():
        for _ in range(output_length):
            logits = model(current_input)  # Shape: (batch_size, seq_len, vocab_size)
            # NOTE(review): tokenize_text pads to max_length, so for texts
            # shorter than max_length the first step reads the logits of a
            # padding position.
            next_token_logits = logits[:, -1, :]  # Get logits for the last token

            # Using argmax causes the model to generate the same text repeatedly
            # next_token_id = torch.argmax(next_token_logits, dim=-1).item()

            # Use multinomial sampling to generate a diverse set of outputs
            next_token_probs = torch.nn.functional.softmax(next_token_logits, dim=-1)
            next_token_id = torch.multinomial(next_token_probs, num_samples=1).item()

            if next_token_id == tokenizer.pad_token_id:  # Stop at padding token
                break

            generated_tokens.append(next_token_id)

            # Prepare input for the next iteration
            next_token = torch.tensor([[next_token_id]], device=device)
            current_input = torch.cat((current_input, next_token), dim=1)

    return tokenizer.decode(generated_tokens, skip_special_tokens=True)

In [16]:
# test_df = pd.read_csv(f'{dataset_dir}/podcast_with_summary_test.csv')
# read at the beginning of the notebook
test_df = df_test


# print some of the inference results
# head(5) + iterrows() picks rows by position, so this works for any index
# and does not fail when the test set has fewer than five rows.
for _, sample in test_df.head(5).iterrows():
    # test_text = sample['text_short']
    test_text = sample['text']
    reference_summary = sample['summary']
    predicted_summary = run_inference(test_text)
    print(f"Test Text: {test_text}")
    print(f"Reference Summary: {reference_summary}")
    print(f"Predicted Summary: {predicted_summary}")
    print("")

Test Text: The following is a conversation with Andrew Ng, one of the most impactful educators, researchers, innovators, and leaders in artificial intelligence and technology space in general. He cofounded Coursera and Google Brain, launched Deep Learning AI, Landing AI, and the AI Fund, and was the chief scientist at Baidu. As a Stanford professor and with Coursera and Deep Learning AI, he has helped educate and inspire millions of students, including me. This is the Artificial Intelligence Podcast. If you enjoy it, subscribe on YouTube, give it five stars on Apple Podcast, support it on Patreon, or simply connect with me on Twitter at Lex Friedman, spelled F R I D M A N. As usual, I'll do one or two minutes of ads now and never any ads in the middle that can break the flow of the conversation. I hope that works for you and doesn't hurt the listening experience. This show is presented by Cash App, the number one finance app in the App Store. When you get it, use code LEXPODCAST. Cash 

## Generate metrics

In [17]:
# import sys
# sys.path.append('/content/drive/MyDrive/ColabNotebooks/AAI590_Capstone')

In [18]:
# from SharedUtils import evaluate_and_save_metrics
import SharedUtils
from SharedUtils import evaluate_and_save_metrics

# Evaluate the model
def evaluate_model(df, model, name):
    """Generate a summary for every row of ``df``, score them and save results.

    For each row, ``run_inference`` is called on the ``'text'`` column and
    timed. The reference and predicted summaries, plus the accumulated
    inference time, are passed to ``evaluate_and_save_metrics``. The returned
    ROUGE and BLEU results are printed, and the summary pairs are written to
    ``./results/pytorch/{name}/{epochs}_epochs_summaries.csv``.

    NOTE(review): ``model`` is only switched to eval mode here. The
    predictions come from ``run_inference``, which uses the notebook-level
    ``model``, so passing a different model does not change what is evaluated.

    Args:
        df: DataFrame with ``'text'`` and ``'summary'`` columns.
        model: Module to put into eval mode (see the note above).
        name: Label for this evaluation run; used as the output sub-folder.
    """
    model.eval()
    reference_summaries = []
    predicted_summaries = []
    total_time = 0

    for _, row in df.iterrows():
        # test_text = row['text_short']
        test_text = row['text']
        reference_summary = row['summary']

        # Time only the generation call, not the bookkeeping around it.
        start_time = time.time()
        predicted_summary = run_inference(test_text)
        end_time = time.time()
        total_time += end_time - start_time

        reference_summaries.append(reference_summary)
        predicted_summaries.append(predicted_summary)

    model_name = "pytorch"
    # `epochs` is the notebook-level training epoch count.
    filename = f"{epochs}_epochs"
    rouge_results, bleu_results = evaluate_and_save_metrics(
        model_name,
        name,
        filename,
        reference_summaries,
        predicted_summaries,
        total_time
    )

    print(rouge_results)
    print(bleu_results)

    results_df = pd.DataFrame({
        'summary': reference_summaries,
        'summary_tuned': predicted_summaries
    })
    # Create the per-run folder explicitly so to_csv cannot fail on a missing
    # directory; harmless if the folder already exists.
    summaries_dir = f"./results/{model_name}/{name}"
    os.makedirs(summaries_dir, exist_ok=True)
    results_df.to_csv(f"{summaries_dir}/{filename}_summaries.csv")

    print(f"Evaluation completed for {name}.")
    print(f"Total time (seconds): {total_time}")
    print(f"Total time (minutes): {total_time / 60}")

In [19]:
# Evaluate on test set; evaluate_model writes the generated summaries CSV
# under ./results/pytorch/test_dataset/.
evaluate_model(test_df, model, "test_dataset")

{'rouge1': 0.0005358558826444643, 'rouge2': 0.0, 'rougeL': 0.0005358558826444643, 'rougeLsum': 0.0005358558826444643}
{'bleu': 0.0, 'precisions': [0.0005307855626326964, 0.0, 0.0, 0.0], 'brevity_penalty': 1.0, 'length_ratio': 8.410714285714286, 'translation_length': 11304, 'reference_length': 1344}
Evaluation completed for test_dataset.
Total time (seconds): 147.57917070388794
Total time (minutes): 2.459652845064799


In [20]:
# Evaluate on whole dataset
# NOTE(review): df is the full CSV, which presumably also contains the rows
# used for training, so these scores are not a held-out estimate - confirm.
# whole_df = pd.read_csv(f'{dataset_dir}/podcast_with_summary.csv')
whole_df = df
evaluate_model(whole_df, model, "whole_dataset")

{'rouge1': 0.0010961623286089876, 'rouge2': 0.0, 'rougeL': 0.001096490312742265, 'rougeLsum': 0.001089408191199038}
{'bleu': 0.0, 'precisions': [0.0004978574349673725, 0.0, 0.0, 0.0], 'brevity_penalty': 1.0, 'length_ratio': 8.334469472436277, 'translation_length': 56241, 'reference_length': 6748}
Evaluation completed for whole_dataset.
Total time (seconds): 745.0472040176392
Total time (minutes): 12.417453400293986
