In [None]:
from helpers import *
from model import CharRNN
from generate import *
from collections import defaultdict, Counter
from surprisal import compute_word_surprisal
from itertools import product
from train import run_char_rnn
#from ngram import 
import json
import math
import matplotlib.pyplot as plt
import numpy as np
import requests
import re
import random

In [None]:
# Registry filled by get_ids(): landing-page URL -> [book_id, save_path].
book_info = {}

# Project Gutenberg landing pages of the books to download.
# Each URL appears exactly once: book_info is keyed by URL, so a repeated
# entry could never add another book.
urls = ['https://www.gutenberg.org/ebooks/863',
        'https://www.gutenberg.org/ebooks/69087',
        'https://www.gutenberg.org/ebooks/61262',
        'https://www.gutenberg.org/ebooks/1155',
        'https://www.gutenberg.org/ebooks/58866',
        'https://www.gutenberg.org/ebooks/72824',
        'https://www.gutenberg.org/ebooks/61168',
        'https://www.gutenberg.org/ebooks/66446',
        'https://www.gutenberg.org/ebooks/67173',
        'https://www.gutenberg.org/ebooks/67160',
        'https://www.gutenberg.org/ebooks/65238']

In [None]:
def get_ids(urls, info=None):
    """Register each Project Gutenberg URL with its book id and default file name.

    The last path segment of every URL is taken as the book id, and the entry
    ``url -> [book_id, f"{book_id}.txt"]`` is written (replacing any existing
    entry for that URL).

    Args:
        urls: iterable of landing-page URLs such as
            ``https://www.gutenberg.org/ebooks/863``.
        info: dict to fill. Defaults to the notebook-level ``book_info``.
    """
    if info is None:
        info = book_info
    for url in urls:
        # Ignore a trailing slash so ".../ebooks/863/" still yields "863"
        # instead of an empty id.
        book_id = url.rstrip('/').split('/')[-1]
        info[url] = [book_id, f"{book_id}.txt"]

In [None]:
# Populate book_info from the URL list, then display it as the cell output.
get_ids(urls)
book_info

In [None]:
def file_name(text):
    """Turn a title into a string that can be used as a file name.

    The characters < > : " / | ? * and the backslash are removed, surrounding
    whitespace is trimmed, and every remaining space becomes an underscore.
    """
    without_reserved = re.sub(r'[<>:"/\\|?*]', '', text)
    return without_reserved.strip().replace(" ", "_")

In [None]:
def download_gutenberg_text(timeout=30):
    """Download the plain-text edition of every book registered in ``book_info``.

    For each entry the text is fetched from the Gutenberg cache URL built from
    the book id. A file name is derived from the first line of the text (the
    part after "eBook of " when present, otherwise the whole line), the text is
    written to ``<title>.txt`` as UTF-8, and the entry's save path in
    ``book_info`` is updated. Failed requests are reported with ``print`` and
    the loop moves on to the next book.

    Args:
        timeout: seconds to wait on the server before a request is abandoned.
    """
    for page_url, (book_id, save_path) in book_info.items():
        url = f"https://www.gutenberg.org/cache/epub/{book_id}/pg{book_id}.txt"
        try:
            # requests never times out unless told to; a stalled connection
            # would otherwise hang the notebook.
            response = requests.get(url, timeout=timeout)
            response.raise_for_status()
            lines = response.text.splitlines()
            # An empty body has no first line, and a leading byte-order mark
            # is not whitespace, so strip() alone would leave it in the name.
            first_line = lines[0].lstrip("\ufeff").strip() if lines else ""
            if "eBook of " in first_line:
                title = first_line.split("eBook of ")[1]
            else:
                title = first_line
            title = file_name(title)
            # Keep the registered default path when no usable title was found,
            # rather than writing to a file literally named ".txt".
            if title:
                save_path = f"{title}.txt"
            book_info[page_url][1] = save_path
            with open(save_path, 'w', encoding='utf-8') as file:
                file.write(response.text)
            print(f"Text successfully downloaded and saved to {save_path}")
        except requests.exceptions.RequestException as e:
            print(f"Failed to download text for book ID {book_id}: {e}")

In [None]:
# Fetch every registered book; the save paths are relative, so files land in the working directory.
download_gutenberg_text()

In [None]:
import glob

# NOTE(review): "/christie" is an absolute path at the filesystem root, while the
# download cell writes into the working directory — confirm this is where the
# texts actually live.
input_dir = "/christie"

output_file = "christie.txt"

# glob returns matches in arbitrary order; sorting makes the combined corpus
# identical from run to run.
txt_files = sorted(glob.glob(f"{input_dir}/*.txt"))

# Use an explicit encoding on both sides: the download cell saves texts as UTF-8,
# and the platform default encoding may be something else.
with open(output_file, "w", encoding="utf-8") as outfile:
    for file in txt_files:
        with open(file, "r", encoding="utf-8") as infile:
            outfile.write(infile.read())
            # Newline separator so one book does not run into the next.
            outfile.write("\n")

In [None]:
from ngram_model import ngram_pipeline

# Run the n-gram pipeline (n=3) on the Shakespeare text and print the metrics and
# sample it returns.
# NOTE(review): val_split=0.8 here vs validation_split=0.2 for the RNN runs — confirm
# whether this argument is the training share or the validation share.
file_path = "shakespeare.txt"  
ngram = ngram_pipeline(file_path, seed="Where", n=3, length=100, val_split=0.8)

# Access the results
print(f"Validation Loss (N-Gram): {ngram['validation_loss']}")
print(f"Validation Perplexity: {ngram['validation_perplexity']}")
print("Generated Text:")
print(ngram['generated_text'])

In [None]:
# Search space: every list below is crossed with all the others.
hyperparams = dict(
    model_types=["gru"],
    hidden_sizes=[128, 512],
    n_layers=[2, 3],
    learning_rates=[0.001, 0.01],
    shuffle=[True, False],
    n_epochs=[1000, 2000],
    chunk_len=[200],
    batch_size=[100],
    print_every=[100],
)

# Cartesian product of the value lists, re-keyed by hyperparameter name
# (1 * 2 * 2 * 2 * 2 * 2 * 1 * 1 * 1 = 32 combinations).
keys = tuple(hyperparams.keys())
values = tuple(hyperparams.values())
hyperparam_combinations = [dict(zip(keys, v)) for v in product(*values)]
results = []

# Train one model per combination and collect whatever run_char_rnn returns.
# NOTE(review): the collected results are not written to disk here, while the
# following cell reads data.json — confirm where that file is produced.
for combo in hyperparam_combinations:
    print(f"\nRunning with hyperparameters: {combo}")
    # The grid uses plural key names; map them onto run_char_rnn's arguments.
    run_kwargs = {
        "filename": "shakespeare.txt",
        "model": combo["model_types"],
        "n_epochs": combo["n_epochs"],
        "print_every": combo["print_every"],
        "hidden_size": combo["hidden_sizes"],
        "n_layers": combo["n_layers"],
        "learning_rate": combo["learning_rates"],
        "chunk_len": combo["chunk_len"],
        "batch_size": combo["batch_size"],
        "shuffle": combo["shuffle"],
        "cuda": True,
        "validation_split": 0.2,
    }
    result = run_char_rnn(**run_kwargs)
    results.append(result)

In [None]:
import json

# Load results from data.json (presumably the saved output of the sweep — verify).
# This rebinds `results`, replacing any list built in memory by an earlier cell.
with open("data.json", "r") as file:
    results = json.load(file)

In [None]:
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import numpy as np

# Red-to-violet colormap sampled at num_colors evenly spaced points, giving one colour
# per model curve. 32 equals the number of combinations in the hyperparameter grid.
rainbow_cmap = mcolors.LinearSegmentedColormap.from_list("rainbow", ["red", "orange", "yellow", "green", "blue", "indigo", "violet"])
num_colors = 32
rainbow_colors = [rainbow_cmap(i) for i in np.linspace(0, 1, num_colors)]

In [None]:
# Spacing, in epochs, assumed between consecutive validation-loss entries.
print_every = 100


def _finish_loss_figure(title):
    """Label, decorate, save (as "<title>.png") and show the current figure."""
    plt.xlabel("Epochs")
    plt.ylabel("Loss")
    plt.title(title)
    # Legend sits outside the axes, to the right of the plot.
    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.grid(True)
    plt.tight_layout()
    plt.savefig(f"{title}.png")
    plt.show()


# Training loss: one value per entry of train_losses, x-axis numbered from 1.
plt.figure(figsize=(10, 6))
for i, result in enumerate(results):
    train_losses = result['train_losses']
    epochs = list(range(1, len(train_losses) + 1))
    plt.plot(
        epochs,
        train_losses,
        label=f"Model {i+1} Train Loss",
        color=rainbow_colors[i % num_colors],
        alpha=0.7,
    )
_finish_loss_figure("Training Losses Over Time for All Models")

# Validation loss: points placed at print_every, 2 * print_every, ...
plt.figure(figsize=(10, 6))
for i, result in enumerate(results):
    val_losses = result['val_losses']
    last_epoch = len(result['train_losses'])
    validation_epochs = list(range(print_every, last_epoch + 1, print_every))
    plt.plot(
        validation_epochs,
        val_losses,
        label=f"Model {i+1} Val Loss",
        color=rainbow_colors[i % num_colors],
        linestyle='--',
        alpha=0.7,
    )
_finish_loss_figure("Validation Losses Over Time for All Models")

In [None]:
# Final train / validation loss of every model, in the same order as `results`.
final_train_losses = []
final_val_losses = []
for dct in results:
    final_train_losses.append(dct['final_train_loss'])
    final_val_losses.append(dct['final_val_loss'])

plt.figure(figsize=(8, 6))
plt.scatter(final_train_losses, final_val_losses, c='blue', alpha=0.7)
plt.xlabel("Final Train Loss")
plt.ylabel("Final Validation Loss")
plt.title("Final Train Loss vs Final Validation Loss for All Models")
plt.grid(True)

# Tag each point with its 1-based model number.
for number, (x, y) in enumerate(zip(final_train_losses, final_val_losses), start=1):
    plt.text(x, y, str(number), fontsize=9, ha='right')

plt.tight_layout()
plt.savefig("Final Train Loss vs Final Validation Loss for All Models.png")
plt.show()

In [None]:
# Score every run by final train loss + final validation loss (lower is better).
results_with_sums = []
for d in results:
    scored = dict(d)
    scored["sum_loss"] = d["final_train_loss"] + d["final_val_loss"]
    results_with_sums.append(scored)

sorted_results = sorted(results_with_sums, key=lambda entry: entry["sum_loss"])
top_8_results_a = sorted_results[:8]

# Drop the temporary score again so the kept dicts look like the loaded ones.
top_8_results = []
for scored in top_8_results_a:
    trimmed = dict(scored)
    del trimmed["sum_loss"]
    top_8_results.append(trimmed)


In [None]:
# Retrain each of the top configurations with an LSTM in place of the GRU.
# All eight are used: starting at index 2 would skip the two best configurations
# and leave fewer than the sixteen models the following cell pools together.
top_8_lstm = []
for dct in top_8_results:
    result = run_char_rnn(
        filename="shakespeare.txt",
        model='lstm',
        # Same training length as the source run (assumes one train loss per epoch).
        n_epochs=len(dct['train_losses']),
        print_every=100,
        hidden_size=dct["hidden_size"],
        n_layers=dct["n_layers"],
        learning_rate=dct["learning_rate"],
        chunk_len=200,
        batch_size=100,
        shuffle=dct["shuffle"],
        cuda=True,
        validation_split=0.2,
    )
    top_8_lstm.append(result)


In [None]:
# Pool the LSTM reruns with the runs they were derived from.
sixteen_gru_lstm = top_8_lstm + top_8_results

# Score each pooled run by final train loss + final validation loss.
results_with_sums = []
for d in sixteen_gru_lstm:
    scored = dict(d)
    scored["sum_loss"] = d["final_train_loss"] + d["final_val_loss"]
    results_with_sums.append(scored)

sorted_sums = sorted(results_with_sums, key=lambda entry: entry["sum_loss"])
top_8_results_b = sorted_sums[:8]

# Keep the eight lowest scores, without the temporary "sum_loss" key.
top_8_results_overall = []
for scored in top_8_results_b:
    trimmed = dict(scored)
    del trimmed["sum_loss"]
    top_8_results_overall.append(trimmed)

In [None]:
# Epochs between recorded validation losses; 100 is the print_every value used for
# the training runs (assumes validation is logged every print_every epochs — confirm).
val_interval = 100

# 2 x 4 grid: one panel per model in top_8_results_overall.
fig, axes = plt.subplots(2, 4, figsize=(20, 10))
for i, result in enumerate(top_8_results_overall):
    ax = axes[i // 4, i % 4]
    n_epochs = len(result['train_losses'])
    # Same x-axis convention as the all-models loss plots: epochs are numbered from 1
    # and validation points sit at val_interval, 2 * val_interval, ... Starting both
    # axes at 0 would draw the validation curve one interval too early.
    epochs = list(range(1, n_epochs + 1))
    val_epochs = list(range(val_interval, n_epochs + 1, val_interval))
    ax.plot(epochs, result['train_losses'], label='Train Loss', color='blue')
    ax.plot(val_epochs, result['val_losses'], label='Val Loss', color='red', marker='o', linestyle='dashed')
    ax.set_title(f"Model {i+1}")
    ax.set_xlabel("Epochs")
    ax.set_ylabel("Loss")
    ax.legend()
plt.tight_layout()
plt.savefig("Training vs Validation Loss for Top 8 Models.png")
plt.show()

In [None]:
# Final train / validation loss of the selected models, in ranking order.
final_train_losses = []
final_val_losses = []
for dct in top_8_results_overall:
    final_train_losses.append(dct['final_train_loss'])
    final_val_losses.append(dct['final_val_loss'])

plt.figure(figsize=(8, 6))
plt.scatter(final_train_losses, final_val_losses, c='blue', alpha=0.7)
plt.xlabel("Final Train Loss")
plt.ylabel("Final Validation Loss")
plt.title("Final Train Loss vs Final Validation Loss for Top 8 Models")
plt.grid(True)

# Tag each point with its 1-based position in top_8_results_overall.
for number, (x, y) in enumerate(zip(final_train_losses, final_val_losses), start=1):
    plt.text(x, y, str(number), fontsize=9, ha='right')

plt.tight_layout()
plt.savefig("Final Train Loss vs Final Validation Loss for Top 8 Models.png")
plt.show()

In [None]:
# Training-loss series of each selected model, in ranking order.
losses = [dct['train_losses'] for dct in top_8_results_overall]

In [None]:
def calculate_perplexity(loss):
    """Return exp(loss), computed element-wise with NumPy.

    Args:
        loss: a number or a sequence of numbers (anything np.array accepts).
            Reading the result as a perplexity assumes the losses are
            natural-log cross-entropies — TODO confirm against the training loss.

    Returns:
        The NumPy result of applying exp to the converted input.
    """
    return np.exp(np.array(loss))

In [None]:
# One perplexity series per loss series, in the same order as `losses`.
perplexities = [calculate_perplexity(loss) for loss in losses]

In [None]:
plt.figure(figsize=(8, 6))
# One blue curve per selected model; x runs from 1 to the number of training-loss entries.
for i, result in enumerate(top_8_results_overall):
    n_points = len(result['train_losses'])
    plt.plot(list(range(1, n_points + 1)), perplexities[i], color='blue', alpha=0.7)
plt.xlabel("Epochs")
plt.ylabel("Perplexity")
plt.title("Perplexity over Time of Top 8 Models")
plt.tight_layout()
plt.savefig("Perplexity over Epochs.png")
plt.show()

In [None]:
# Build a ".pt" file name for each selected model from its hyperparameters
# (presumably mirrors the naming used when the checkpoints were saved — verify).
# The loop variable must not be called `results`: that name holds the list of all
# runs read by earlier cells, and a loop would rebind it to a single dict.
checkpoint_paths = []
for top_result in top_8_results_overall:
    save_loc = (
        f"shakespeare_{top_result['model_type']}_h{top_result['hidden_size']}_l{top_result['n_layers']}"
        f"_shuf{top_result['shuffle']}_lr{top_result['learning_rate']}_e{len(top_result['train_losses'])}.pt"
    )
    checkpoint_paths.append(save_loc)

for path in checkpoint_paths:
    print(path)