### Project: Generating William Blake's Verses with Neural Networks

In [329]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
from torch import nn, optim
from torch.optim import lr_scheduler
from torch.utils.data import Dataset, DataLoader
from torch.utils.data.sampler import SubsetRandomSampler
import random

In [128]:
from typing import List, Callable, Tuple

### Data Preparation

1. Read File
2. Split lines
3. Remove undesired lines (empty lines, titles, the beginning of the file)

In [261]:
def _filter(X: List[str]) -> List[str]:
    '''
    1. Creates a list of filter functions to remove undesired strings.
    2. Apply recursively each filter in the list X of strings, emptying the list of filters.
    3. Returns a list of filtered strings.
    '''
    filters = [lambda x: x != '',
               lambda x: not x.isupper(),
               lambda x: not ('    ' in x)]
    
    def apply_filters(x: str, _filters: List[Callable]) -> str:
        return apply_filters(list(filter(_filters.pop(), x)), _filters) if len(_filters) > 1 else list(filter(_filters.pop(), x))
    
    return apply_filters(X, filters)

def replace_all(X: str, replace_list: List[str]) -> str:
    '''
    Removes every occurrence of each substring in replace_list from X.

    The substrings are removed one after another, starting with the LAST
    element of replace_list and ending with the first; the order matters when
    removing one substring can create or destroy a match for another.

    replace_list is only read: it is left untouched, so the same list can be
    reused across calls.

    Returns the cleaned string (X itself when replace_list is empty).
    '''
    for unwanted in reversed(replace_list):
        X = X.replace(unwanted, '')
    return X

def substitute(Y: List[str], X: List[str]) -> List[str]:
    '''
    Cleans every string of X with replace_all and appends the results to Y.

    The strings of X are processed from the last one to the first, so the
    cleaned strings appear in REVERSED order after the elements of Y.

    Runs as a plain loop (no recursion), so the length of X is not bounded by
    the interpreter's recursion limit. Neither Y nor X is modified; a new list
    is returned.
    '''
    unwanted = ['   ', ',', '.', ':', ';', '—', '‘', '”', '’', '-', ')', '(']
    # replace_all receives its own copy of the list on every call, so this works
    # even with an implementation of replace_all that consumes its argument.
    cleaned = [replace_all(verse, list(unwanted)) for verse in reversed(X)]
    return Y + cleaned

In [262]:
# Read the whole file; the context manager closes the handle as soon as the text is loaded.
with open("/home/roboto/Documents/GitHub/tutorials/data/WilliamBlake.txt", 'r') as blake_file:
    raw_text = blake_file.read()

# 1. Split the raw text into lines
# 2. Filter the lines with the '_filter' function
# 3. Drop the first 13 and the last 257 filtered lines, which aren't verses
# 4. Process each verse to remove extra spaces, commas, etc. with the 'substitute' function
#    (the returned list is reversed)
data = substitute([], _filter(raw_text.split('\n'))[13:-257])
data[:5]

['And wish to lead others when they should be led',
 'And feelthey know not what but care',
 'They stumble all night over bones of the dead',
 'How many have fallen there!',
 'Tangled roots perplex her ways']

4. Build dataset class
5. Build dataloaders

In [316]:
# Quick check of string_to_idx: each character is mapped to its index in char_to_id.
# NOTE: string_to_idx is defined in a later cell, so that cell has to be run before this one.
string_to_idx('abc')

[0, 1, 2]

In [326]:
import string

# Alphabet: the ASCII letters followed by a few extra symbols.
all_letters = string.ascii_letters + '“éè!? '
# One slot more than the alphabet size
# (presumably reserved for an end-of-sequence marker; TODO confirm).
n_letters   = 1 + len(all_letters)
# Two-way lookup tables between a character and its position in all_letters.
char_to_id = {letter: position for position, letter in enumerate(all_letters)}
id_to_char = dict(enumerate(all_letters))

def encode(idx: int, vector_length: int = n_letters) -> torch.Tensor:
    '''
    Builds a one-hot vector.

    Returns a 1-D tensor of vector_length zeros with a single 1 at position idx.
    vector_length defaults to the value n_letters had when this function was defined.
    '''
    one_hot = torch.zeros(vector_length)
    one_hot[idx] = 1
    return one_hot

def char_to_vector(char: str) -> torch.Tensor:
    '''
    One-hot encodes a single character.

    Looks the character up in char_to_id (KeyError if it is not there) and
    passes the resulting id to encode.
    '''
    letter_id = char_to_id[char]
    return encode(letter_id)

def vector_to_char(encoding: torch.Tensor) -> str:
    '''
    Decodes a one-hot vector back into its character.

    Finds the position where encoding equals 1 and returns the matching entry
    of id_to_char.
    '''
    hot_positions = np.where(encoding == 1)[0]
    # .item() only succeeds when exactly one position is hot.
    return id_to_char[hot_positions.item()]

def string_to_vector(string: str) -> torch.Tensor:
    '''
    One-hot encodes a whole string.

    Each character is encoded with char_to_vector and the resulting vectors are
    stacked along a new first dimension, one row per character.
    '''
    return torch.stack([char_to_vector(letter) for letter in string], 0)

def vector_to_string(encoding: torch.Tensor) -> str:
    '''
    Decodes a batch of one-hot encoded sentences and returns the FIRST sentence.

    Encoding(batch_size, sentence_length, n_letters)

    Each sentence is decoded from its own slice of the batch, one character per
    row through vector_to_char, so a batch with more than one sentence is
    decoded correctly. Only the first decoded sentence is returned, which keeps
    the return type a plain str.

    Raises IndexError when the batch is empty.
    '''
    strings = []
    for i in range(encoding.shape[0]):
        sentence = encoding[i, :, :]
        strings.append(''.join([vector_to_char(one_hot) for one_hot in sentence]))

    return strings[0]
#     return strings # TODO: return every sentence once callers work with batches

def string_to_idx(string: str) -> List[int]:
    '''
    Converts a string into the list of its character ids.

    Every character is looked up in char_to_id (KeyError if it is not there);
    the ids are returned in the same order as the characters.
    '''
    return [char_to_id[letter] for letter in string]

def idx_to_string(indexes: List[int]) -> str:
    '''
    Converts a list of character ids back into a string.

    Every id is looked up in id_to_char and the characters are concatenated in order.
    '''
    letters = []
    for position in indexes:
        letters.append(id_to_char[position])
    return ''.join(letters)

In [340]:
class VersesDataset(Dataset):
    '''
    In-memory dataset of (input, target) pairs built from a list of verses.

    For each verse:
      - input  is string_to_vector applied to the verse without its last character;
      - target is string_to_idx applied to the verse without its first character.
    So the target at position i is the id of the character that follows input position i.
    '''
    def __init__(self, verses: List[str]):
        super().__init__()
        # All samples are built eagerly and kept in memory.
        self.samples = []
        self.generate_samples(verses)

    def generate_samples(self, verses: List[str]):
        '''Appends one (input, target) pair per verse to self.samples.'''
        for verse in verses:
            pair = (string_to_vector(verse[:-1]), string_to_idx(verse[1:]))
            self.samples.append(pair)

    def __len__(self):
        '''Number of stored samples.'''
        return len(self.samples)

    def __getitem__(self, idx):
        '''Returns the (input, target) pair stored at position idx.'''
        return self.samples[idx]
               
def build_data(data: List[str], batch_size: int = 1, test_split: float = 0.1,
               shuffle = True, seed = 42):
    '''
    Builds the verses dataset and its train/test dataloaders.

    Parameters
    ----------
    data:       list of verses; one dataset sample is created per verse.
    batch_size: batch size used by both dataloaders.
    test_split: fraction of the samples assigned to the test set
                (int(test_split * len(data)) samples).
    shuffle:    when True the sample indexes are randomly permuted before being
                split into train and test; when False the first indexes go to
                the test set and the remaining ones to the train set.
    seed:       seed of the permutation, so the same seed always gives the same
                split. Pass None for a different split on every call.

    Returns
    -------
    (dataset, dataloaders) where dataloaders is {'Train': ..., 'Test': ...}.
    Both loaders read from the same dataset through a SubsetRandomSampler, so
    the order of the samples inside each split is still random on every pass.
    '''
    dataset = VersesDataset(data)

    indexes = list(range(len(data)))
    if shuffle:
        # A private generator makes the split reproducible for a given seed
        # without touching the state of the global `random` module.
        random.Random(seed).shuffle(indexes)

    n_test = int(test_split * len(data))
    test_indexes, train_indexes = indexes[:n_test], indexes[n_test:]

    train_sampler = SubsetRandomSampler(train_indexes)
    test_sampler  = SubsetRandomSampler(test_indexes)

    train_loader = DataLoader(dataset, batch_size = batch_size, sampler = train_sampler)
    test_loader  = DataLoader(dataset, batch_size = batch_size, sampler = test_sampler)

    dataloaders = {'Train': train_loader,
                   'Test': test_loader}

    return dataset, dataloaders

def tensor_to_int(xs):
    '''Returns a list with the .item() value of every element of xs.'''
    values = []
    for x in xs:
        values.append(x.item())
    return values

In [339]:
# Build the dataset and both dataloaders with the default settings (batch_size = 1).
# NOTE: dataloaders with a batch size > 1 are not supported yet.
dataset, dataloaders = build_data(data)
train_dataloader, test_dataloader = dataloaders['Train'], dataloaders['Test']

# Draw one training sample: print its input tensor, then its target converted to plain ints.
sample = next(iter(train_dataloader))
print(sample[0])
print(tensor_to_int(sample[1]))

# TO-DO: build a method to transform 

tensor([[[0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [1., 0., 0.,  ..., 0., 0., 0.],
         ...,
         [0., 0., 0.,  ..., 0., 1., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]]])
[4, 0, 19, 4, 3, 57, 8, 13, 57, 2, 14, 12, 15, 0, 13, 8, 4, 18, 57, 19, 7, 4, 24, 57, 18, 8, 19, 57, 22, 8, 19, 7, 57, 17, 0, 3, 8, 0, 13, 2, 4, 57, 0, 11, 11, 57, 19, 7, 4, 8, 17, 57, 14, 22, 13]


### A Study of Recurrent Neural Networks