Charles Rothbaum Machine Learning Midterm - problem 3

In [2]:
# In this problem we will create a NN to write Shakespeare
# plays.
# The training data is included in the Training Data 
# subfolder, and was taken from here:
# https://github.com/karpathy/char-rnn/blob/master/data/tinyshakespeare/input.txt
#
# The idea for the project came from this awesome blog post:
# https://karpathy.github.io/2015/05/21/rnn-effectiveness/ 
# I highly recommend you read it. It is great.

# Packages

import torch
import numpy as np
import matplotlib
from torch.utils.data import DataLoader, TensorDataset

In [3]:
### -------- Import Data and Data Preprocessing -------- ###
# you must include the appropriate data preprocessing steps

# Load the whole training corpus into memory. The `with` block closes the file
# handle as soon as the text has been read.
# NOTE(review): no encoding is passed, so the platform default is used - confirm
# that matches the data file.
with open('Training Data/3-RNN_input.txt', 'r') as corpus_file:
    text = corpus_file.read()

# Vocabulary: every distinct character in the corpus, in sorted (lexicographic) order.
characters = sorted(set(text))

# Map each character to an int representing its position in `characters`.
char_to_int = {c: i for i, c in enumerate(characters)}

# Convert the whole dataset into integers.
numerical_text = [char_to_int[c] for c in text]

# Break the numerical text into overlapping windows: a window starts at every
# position that still has `seq_length` characters plus one following character
# (the label), so there are len(numerical_text) - seq_length windows.
seq_length = 100
num_sequences = len(numerical_text) - seq_length
if num_sequences <= 0:
    raise ValueError(
        f"Corpus has {len(numerical_text)} characters; more than "
        f"seq_length={seq_length} are needed to build a training sequence."
    )

# Size of the vocabulary (also the width of a one-hot character vector).
num_characters = len(characters)


In [4]:
# Convert numerical_text into a 3D array of input sequences, where each sequence
# is a (seq_length x num_characters) matrix of one-hot encoded characters, and a
# 2D array of targets, where each target is the one-hot vector of the character
# that follows the sequence.
# NOTE(review): the dense input array has num_sequences * seq_length *
# num_characters entries, and the float32 tensor below uses 4 bytes per entry -
# confirm this fits in memory for the corpus being used.
input_sequences = np.zeros((num_sequences, seq_length, num_characters), dtype=bool)
targets = np.zeros((num_sequences, num_characters), dtype=bool)

# Integer array view of the corpus so whole slices can be used as index arrays.
encoded_text = np.asarray(numerical_text, dtype=np.int64)
sequence_ids = np.arange(num_sequences)

# Sequence i holds characters i .. i + seq_length - 1, so at time step t the
# character for every sequence is encoded_text[t + i]. Setting one time step for
# all sequences at once replaces the per-character Python loop.
for t in range(seq_length):
    input_sequences[sequence_ids, t, encoded_text[t:t + num_sequences]] = True

# The label of sequence i is the character right after it: encoded_text[i + seq_length].
targets[sequence_ids, encoded_text[seq_length:seq_length + num_sequences]] = True


# Convert to PyTorch tensors so the data-loading utilities can be used.
input_sequences_tensor = torch.tensor(input_sequences, dtype=torch.float32)
targets_tensor = torch.tensor(targets, dtype=torch.float32)
dataset = TensorDataset(input_sequences_tensor, targets_tensor)

In [5]:
# Wrap the dataset in a DataLoader that serves shuffled mini-batches.

batch_size, shuffle = 64, True
dataloader = DataLoader(dataset=dataset, batch_size=batch_size, shuffle=shuffle)


In [6]:
### ------------ Exploratory Data Analysis ------------- ###
# Output two pieces of information that you found 
# informative as well as a print statement of why they
# assisted you in choosing your model parameters

In [13]:
### ---------------- Model Definition ------------------ ###
# Use an LSTM

class LSTM(torch.nn.Module):
    """Single-layer LSTM followed by a linear layer applied to the last time step.

    Args:
        input_size: Number of features per time step of the input.
        hidden_size: Size of the LSTM hidden (and cell) state.
        num_classes: Number of output scores produced per sequence.
    """

    def __init__(self, input_size, hidden_size, num_classes):
        super().__init__()
        self.hidden_size = hidden_size
        # batch_first=True: input is (batch_size, seq_length, input_size)
        self.lstm = torch.nn.LSTM(input_size, hidden_size, batch_first=True)
        self.linear = torch.nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        """Return one score vector per sequence in the batch.

        Args:
            x: Tensor of shape (batch_size, seq_length, input_size).

        Returns:
            Tensor of shape (batch_size, num_classes): the linear layer applied
            to the LSTM output at the final time step.
        """
        # Unpacking three values keeps the requirement that x is 3-dimensional.
        batch_size, _, _ = x.size()

        # Zero initial hidden and cell state. new_zeros creates them with the
        # same dtype and device as x, so they stay compatible with the input
        # when the model is moved or cast (e.g. model.double()).
        h0 = x.new_zeros(1, batch_size, self.hidden_size)
        c0 = x.new_zeros(1, batch_size, self.hidden_size)

        # Forward propagate LSTM; out: (batch_size, seq_length, hidden_size)
        out, _ = self.lstm(x, (h0, c0))

        # Decode the hidden state of the last time step
        return self.linear(out[:, -1, :])


In [14]:
### --------- Optimizer and Loss Definition ------------ ###
# Output a print statement supporting your optimizer and 
# loss function choices

# Seed torch's global RNG before the model is constructed.
torch.manual_seed(42)

# Everything runs on the CPU.
device = torch.device("cpu")

# One-hot characters in, one score per character out, so the input width and
# the number of classes are both the vocabulary size.
model = LSTM(
    input_size=num_characters,
    hidden_size=50,
    num_classes=num_characters,
)
model.to(device)

# Cross-entropy over the character scores, optimised with Adam.
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=0.001)

In [15]:
### ---------------- Training pt I --------------------- ###
# Train 10 epochs
num_epochs = 10  # Define the number of epochs to train for

for epoch in range(num_epochs):
    model.train()  # Set the model to training mode
    total_loss = 0

    # Batch-local names are used so the loop does not overwrite the
    # notebook-level `targets` array built in the one-hot encoding cell.
    for batch_inputs, batch_targets in dataloader:
        batch_inputs = batch_inputs.to(device)
        batch_targets = batch_targets.to(device)

        optimizer.zero_grad()  # reset the gradients from the last iteration

        # Forward pass
        outputs = model(batch_inputs)

        # Compute the loss. The targets are one-hot rows; argmax over dim 1
        # turns each row into the class index passed to the criterion.
        loss = criterion(outputs, batch_targets.argmax(dim=1))

        # Backward pass and optimize
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Report the mean of the per-batch losses for this epoch.
    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {total_loss/len(dataloader)}')


        

Epoch 1/10, Loss: 2.148104415577688
Epoch 2/10, Loss: 1.8115809551579287


In [None]:
### ---------------- Testing pt I ---------------------- ###
# Write an essay with a minimum of 2,000 characters and 
# save the output as a PDF named "RNN_pt1.pdf"

In [None]:
### --------------- Training pt II --------------------- ###
# Train an ADDITIONAL 100 epochs

In [None]:
### --------------- Testing pt II ---------------------- ###
# Write an essay with a minimum of 2,000 characters and 
# save the output as a PDF named "RNN_pt2.pdf"

In [None]:
### -------------- Training pt III --------------------- ###
# Train until you can get it to write a good essay. Take
# advantage of the fact that pytorch doesn't reset your model
# unless you reinstantiate it in the "Model Definition" cell
#
# If after 3 hours it still doesn't make a meaningful essay
# then change some hyperparameters and try again. You can 
# look to the blog post for hyperparameter inspiration.

In [None]:
### -------------- Testing pt III ---------------------- ###
# Write an essay with a minimum of 2,000 characters and 
# save the output as a PDF named "RNN_pt3.pdf"
#
# Output a print statement commenting on whether or not you
# enjoyed this problem and why or why not.