In [0]:
!pip install boltons -q

In [0]:
from pathlib import Path
from textwrap import wrap

import numpy as np
import pandas as pd
from boltons.iterutils import windowed
from tqdm import tqdm, tqdm_notebook
from tqdm.notebook import tqdm as notebook_tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import optim
from torch.utils.data import Dataset, DataLoader
from torch.utils.data.dataset import random_split
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

In [4]:
# Run on the GPU when PyTorch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
device

device(type='cuda')

In [6]:
from google_drive_downloader import GoogleDriveDownloader as gdd

DATA_PATH = 'data/weight_loss/articles.jsonl'

# Skip the download when DATA_PATH already exists, so re-running this cell
# does not fetch and unzip the archive again.
corpus_on_disk = Path(DATA_PATH).is_file()
if not corpus_on_disk:
    gdd.download_file_from_google_drive(
        unzip=True,
        dest_path='data/weight_loss/weight_loss_articles.zip',
        file_id='1mafPreWzE-FyLI0K-MUsXPcnUI0epIcI',
    )

Downloading 1mafPreWzE-FyLI0K-MUsXPcnUI0epIcI into data/weight_loss/weight_loss_articles.zip... Done.
Unzipping...Done.


In [0]:
def load_data(path, sequence_length=125, n_articles=100, random_state=None):
    """Load a random subset of articles and cut them into character windows.

    Reads ``path`` with ``pandas.read_json``, draws ``n_articles`` entries
    from the ``text`` column, lower-cases them and applies
    ``boltons.iterutils.windowed`` with ``sequence_length`` to every article.

    Args:
        path: Location of the JSON file; it must provide a ``text`` column.
        sequence_length: Window size handed to ``windowed``.
        n_articles: How many articles to sample. Defaults to 100, the value
            that used to be hard-coded.
        random_state: Optional seed forwarded to ``Series.sample`` so the
            same articles are drawn on every run. ``None`` (the default)
            keeps the sampling unseeded, as before.

    Returns:
        A flat list holding the windows of all sampled articles, in the
        order the articles were sampled.
    """
    texts = (
        pd.read_json(path)
        .text
        .sample(n_articles, random_state=random_state)
        .str.lower()
        .tolist()
    )
    # Flatten directly: one entry per window, article after article.
    return [
        window
        for text in texts
        for window in windowed(text, sequence_length)
    ]


def get_unique_chars(sequences):
    """Return the set of distinct items found across all ``sequences``."""
    seen = set()
    for sequence in sequences:
        seen.update(sequence)
    return seen


def create_char2idx(sequences):
    """Map every distinct character in ``sequences`` to its rank in sorted order.

    Indices start at 0 and follow ``sorted()`` of the unique characters.
    """
    ordered_chars = sorted(get_unique_chars(sequences))
    return dict(zip(ordered_chars, range(len(ordered_chars))))


def encode_sequence(sequence, char2idx):
    """Translate ``sequence`` into a list of indices looked up in ``char2idx``.

    Raises:
        KeyError: If a character of ``sequence`` is missing from ``char2idx``.
    """
    indices = []
    for char in sequence:
        indices.append(char2idx[char])
    return indices


def encode_sequences(sequences, char2idx):
    """Encode every sequence with ``encode_sequence`` and collect the results.

    A notebook progress bar is shown while the sequences are processed.

    Args:
        sequences: Iterable of character sequences.
        char2idx: Mapping from character to integer index.

    Returns:
        ``np.ndarray`` built from the list of encoded sequences; it is
        two-dimensional (one row per sequence) when all sequences share the
        same length.
    """
    # `tqdm.tqdm_notebook` is deprecated and emits a warning on every call;
    # `tqdm.notebook.tqdm` is the replacement that warning recommends.
    return np.array([
        encode_sequence(sequence, char2idx)
        for sequence in notebook_tqdm(sequences)
    ])


class Sequences(Dataset):
    """Dataset of index-encoded character sequences for next-character prediction.

    Args:
        path: File handed to ``load_data``.
        sequence_length: Length of every loaded sequence; each item exposes
            ``sequence_length - 1`` input positions and as many targets.

    Attributes set by ``__init__``: ``sequences`` (raw character sequences),
    ``char2idx`` / ``idx2char`` (vocabulary mappings), ``vocab_size`` and
    ``encoded`` (array of index-encoded sequences).
    """

    def __init__(self, path, sequence_length=125):
        # Read from the `path` argument; the module-level DATA_PATH used to be
        # read here, which silently ignored whatever the caller passed in.
        self.sequences = load_data(path, sequence_length=sequence_length)
        self.char2idx = create_char2idx(self.sequences)
        self.idx2char = {idx: char for char, idx in self.char2idx.items()}
        # The mapping has one entry per distinct character, so its size is the
        # vocabulary size; this avoids scanning all sequences a second time.
        self.vocab_size = len(self.char2idx)
        self.encoded = encode_sequences(self.sequences, self.char2idx)

    def __getitem__(self, i):
        """Return ``(inputs, targets)`` for row ``i``; targets are the inputs shifted by one."""
        return self.encoded[i, :-1], self.encoded[i, 1:]

    def __len__(self):
        """Number of encoded sequences."""
        return len(self.encoded)

In [19]:
dataset = Sequences(DATA_PATH, sequence_length=125)
# The sequences are stored in article order, one window after another;
# shuffle so a batch is not drawn from a single contiguous stretch of text.
train_loader = DataLoader(dataset, batch_size=4096, shuffle=True)
# Last expression of the cell, so the number of sequences is displayed.
len(dataset)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=0.0, max=248123.0), HTML(value='')))




In [0]:
class RNN(nn.Module):
    """Character-level GRU language model that is advanced one step per call.

    An embedding layer feeds a GRU whose output is projected to one logit per
    vocabulary entry.

    Args:
        vocab_size: Number of distinct tokens (embedding rows and output logits).
        embedding_dimension: Size of each token embedding.
        hidden_size: Size of the GRU hidden state.
        n_layers: Number of stacked GRU layers.
        device: Stored as ``self.device`` for backward compatibility. The
            hidden state created by ``init_hidden`` is placed wherever the
            parameters currently live, so it no longer depends on this value.
    """

    def __init__(
        self,
        vocab_size,
        embedding_dimension=100,
        hidden_size=128,
        n_layers=1,
        device='cpu',
    ):
        super().__init__()
        self.n_layers = n_layers
        self.hidden_size = hidden_size
        self.device = device

        self.encoder = nn.Embedding(vocab_size, embedding_dimension)
        self.rnn = nn.GRU(
            embedding_dimension,
            hidden_size,
            num_layers=n_layers,
            batch_first=True,
        )
        self.decoder = nn.Linear(hidden_size, vocab_size)

    def init_hidden(self, batch_size):
        """Return a random initial hidden state of shape ``(n_layers, batch_size, hidden_size)``.

        The tensor is drawn with ``torch.randn`` and then moved to the device
        and dtype of the model's parameters, so it stays usable after the
        model has been moved (``.to(...)``) or cast (``.double()``), even if
        that differs from the ``device`` given at construction time.
        """
        reference = next(self.parameters())
        hidden = torch.randn(self.n_layers, batch_size, self.hidden_size)
        return hidden.to(device=reference.device, dtype=reference.dtype)

    def forward(self, input_, hidden):
        """Advance the model by a single time step.

        Args:
            input_: Token indices for the current step, one per batch element.
            hidden: Hidden state from the previous step (or ``init_hidden``).

        Returns:
            Tuple ``(output, hidden)``: the logits over the vocabulary for
            each batch element and the updated hidden state.
        """
        encoded = self.encoder(input_)
        # The GRU is batch_first, so add a length-1 sequence axis at dim 1
        # for this single step and drop it again before decoding.
        output, hidden = self.rnn(encoded.unsqueeze(1), hidden)
        output = self.decoder(output.squeeze(1))
        return output, hidden

In [0]:
model = RNN(vocab_size=dataset.vocab_size, device=device).to(device)

criterion = nn.CrossEntropyLoss()

# Only parameters that still require gradients are handed to the optimizer.
optimizer = optim.Adam(
    (param for param in model.parameters() if param.requires_grad),
    lr=0.001,
)

In [22]:
# Show the architecture, then list the names of all trainable parameters.
print(model)
print()
print('Trainable parameters:')
trainable_lines = []
for name, parameter in model.named_parameters():
    if parameter.requires_grad:
        trainable_lines.append(' * ' + name)
print('\n'.join(trainable_lines))

RNN(
  (encoder): Embedding(60, 100)
  (rnn): GRU(100, 128, batch_first=True)
  (decoder): Linear(in_features=128, out_features=60, bias=True)
)

Trainable parameters:
 * encoder.weight
 * rnn.weight_ih_l0
 * rnn.weight_hh_l0
 * rnn.bias_ih_l0
 * rnn.bias_hh_l0
 * decoder.weight
 * decoder.bias


In [23]:
model.train()
train_losses = []
for epoch in range(50):
    # `tqdm.tqdm_notebook` is deprecated; `tqdm.notebook.tqdm` is its replacement.
    progress_bar = notebook_tqdm(train_loader, leave=False)
    losses = []
    for inputs, targets in progress_bar:
        # Move the whole batch to the device once instead of transferring
        # one column per time step inside the inner loop.
        inputs = inputs.to(device)
        targets = targets.to(device)
        n_steps = inputs.size(1)

        batch_size = inputs.size(0)
        hidden = model.init_hidden(batch_size)

        model.zero_grad()

        # Feed the sequence one character at a time and sum the per-step losses.
        loss = 0
        for char_idx in range(n_steps):
            output, hidden = model(inputs[:, char_idx], hidden)
            loss += criterion(output, targets[:, char_idx])

        loss.backward()

        optimizer.step()

        # Report the loss averaged over the time steps of this batch.
        avg_loss = loss.item() / n_steps

        progress_bar.set_description(f'Loss: {avg_loss:.3f}')

        losses.append(avg_loss)

    # Mean of the per-batch averages; len(losses) is the number of batches.
    epoch_loss = sum(losses) / len(losses)
    train_losses.append(epoch_loss)

    tqdm.write(f'Epoch #{epoch + 1}\tTrain Loss: {epoch_loss:.3f}')

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  after removing the cwd from sys.path.


HBox(children=(FloatProgress(value=0.0, max=61.0), HTML(value='')))

Epoch #1	Train Loss: 2.884


HBox(children=(FloatProgress(value=0.0, max=61.0), HTML(value='')))

Epoch #2	Train Loss: 2.298


HBox(children=(FloatProgress(value=0.0, max=61.0), HTML(value='')))

Epoch #3	Train Loss: 2.121


HBox(children=(FloatProgress(value=0.0, max=61.0), HTML(value='')))

Epoch #4	Train Loss: 1.996


HBox(children=(FloatProgress(value=0.0, max=61.0), HTML(value='')))

Epoch #5	Train Loss: 1.900


HBox(children=(FloatProgress(value=0.0, max=61.0), HTML(value='')))

Epoch #6	Train Loss: 1.823


HBox(children=(FloatProgress(value=0.0, max=61.0), HTML(value='')))

Epoch #7	Train Loss: 1.762


HBox(children=(FloatProgress(value=0.0, max=61.0), HTML(value='')))

Epoch #8	Train Loss: 1.712


HBox(children=(FloatProgress(value=0.0, max=61.0), HTML(value='')))

Epoch #9	Train Loss: 1.670


HBox(children=(FloatProgress(value=0.0, max=61.0), HTML(value='')))

Epoch #10	Train Loss: 1.633


HBox(children=(FloatProgress(value=0.0, max=61.0), HTML(value='')))

Epoch #11	Train Loss: 1.601


HBox(children=(FloatProgress(value=0.0, max=61.0), HTML(value='')))

Epoch #12	Train Loss: 1.573


HBox(children=(FloatProgress(value=0.0, max=61.0), HTML(value='')))

Epoch #13	Train Loss: 1.548


HBox(children=(FloatProgress(value=0.0, max=61.0), HTML(value='')))

Epoch #14	Train Loss: 1.525


HBox(children=(FloatProgress(value=0.0, max=61.0), HTML(value='')))

Epoch #15	Train Loss: 1.505


HBox(children=(FloatProgress(value=0.0, max=61.0), HTML(value='')))

Epoch #16	Train Loss: 1.487


HBox(children=(FloatProgress(value=0.0, max=61.0), HTML(value='')))

Epoch #17	Train Loss: 1.470


HBox(children=(FloatProgress(value=0.0, max=61.0), HTML(value='')))

Epoch #18	Train Loss: 1.455


HBox(children=(FloatProgress(value=0.0, max=61.0), HTML(value='')))

Epoch #19	Train Loss: 1.441


HBox(children=(FloatProgress(value=0.0, max=61.0), HTML(value='')))

Epoch #20	Train Loss: 1.429


HBox(children=(FloatProgress(value=0.0, max=61.0), HTML(value='')))

Epoch #21	Train Loss: 1.417


HBox(children=(FloatProgress(value=0.0, max=61.0), HTML(value='')))

Epoch #22	Train Loss: 1.406


HBox(children=(FloatProgress(value=0.0, max=61.0), HTML(value='')))

Epoch #23	Train Loss: 1.395


HBox(children=(FloatProgress(value=0.0, max=61.0), HTML(value='')))

Epoch #24	Train Loss: 1.386


HBox(children=(FloatProgress(value=0.0, max=61.0), HTML(value='')))

Epoch #25	Train Loss: 1.377


HBox(children=(FloatProgress(value=0.0, max=61.0), HTML(value='')))

Epoch #26	Train Loss: 1.368


HBox(children=(FloatProgress(value=0.0, max=61.0), HTML(value='')))

Epoch #27	Train Loss: 1.360


HBox(children=(FloatProgress(value=0.0, max=61.0), HTML(value='')))

Epoch #28	Train Loss: 1.353


HBox(children=(FloatProgress(value=0.0, max=61.0), HTML(value='')))

Epoch #29	Train Loss: 1.346


HBox(children=(FloatProgress(value=0.0, max=61.0), HTML(value='')))

Epoch #30	Train Loss: 1.339


HBox(children=(FloatProgress(value=0.0, max=61.0), HTML(value='')))

Epoch #31	Train Loss: 1.333


HBox(children=(FloatProgress(value=0.0, max=61.0), HTML(value='')))

Epoch #32	Train Loss: 1.327


HBox(children=(FloatProgress(value=0.0, max=61.0), HTML(value='')))

Epoch #33	Train Loss: 1.321


HBox(children=(FloatProgress(value=0.0, max=61.0), HTML(value='')))

Epoch #34	Train Loss: 1.315


HBox(children=(FloatProgress(value=0.0, max=61.0), HTML(value='')))

Epoch #35	Train Loss: 1.310


HBox(children=(FloatProgress(value=0.0, max=61.0), HTML(value='')))

Epoch #36	Train Loss: 1.304


HBox(children=(FloatProgress(value=0.0, max=61.0), HTML(value='')))

Epoch #37	Train Loss: 1.299


HBox(children=(FloatProgress(value=0.0, max=61.0), HTML(value='')))

Epoch #38	Train Loss: 1.295


HBox(children=(FloatProgress(value=0.0, max=61.0), HTML(value='')))

Epoch #39	Train Loss: 1.290


HBox(children=(FloatProgress(value=0.0, max=61.0), HTML(value='')))

Epoch #40	Train Loss: 1.285


HBox(children=(FloatProgress(value=0.0, max=61.0), HTML(value='')))

Epoch #41	Train Loss: 1.281


HBox(children=(FloatProgress(value=0.0, max=61.0), HTML(value='')))

Epoch #42	Train Loss: 1.277


HBox(children=(FloatProgress(value=0.0, max=61.0), HTML(value='')))

Epoch #43	Train Loss: 1.273


HBox(children=(FloatProgress(value=0.0, max=61.0), HTML(value='')))

Epoch #44	Train Loss: 1.269


HBox(children=(FloatProgress(value=0.0, max=61.0), HTML(value='')))

Epoch #45	Train Loss: 1.265


HBox(children=(FloatProgress(value=0.0, max=61.0), HTML(value='')))

Epoch #46	Train Loss: 1.261


HBox(children=(FloatProgress(value=0.0, max=61.0), HTML(value='')))

Epoch #47	Train Loss: 1.258


HBox(children=(FloatProgress(value=0.0, max=61.0), HTML(value='')))

Epoch #48	Train Loss: 1.254


HBox(children=(FloatProgress(value=0.0, max=61.0), HTML(value='')))

Epoch #49	Train Loss: 1.251


HBox(children=(FloatProgress(value=0.0, max=61.0), HTML(value='')))

Epoch #50	Train Loss: 1.248


In [25]:
def pretty_print(text):
    """Print ``text`` with each newline-separated paragraph wrapped by ``textwrap.wrap``."""
    wrapped_paragraphs = (
        '\n'.join(wrap(paragraph)) + '\n'
        for paragraph in text.split('\n')
    )
    print(''.join(wrapped_paragraphs))


temperature = 1.0

model.eval()
seed = '\n'
generated_chars = []
with torch.no_grad():
    batch_size = 1
    hidden = model.init_hidden(batch_size)
    last_char = dataset.char2idx[seed]
    # Sample 1000 characters, feeding each sampled character back in.
    for _ in range(1000):
        output, hidden = model(torch.tensor([last_char], device=device), hidden)

        # Softmax yields the same sampling weights as exp(logits / T) but is
        # numerically stable: a raw exp() can overflow to inf for large
        # logits or a small temperature, which breaks torch.multinomial.
        distribution = torch.softmax(output.squeeze() / temperature, dim=-1)
        guess = torch.multinomial(distribution, 1).item()

        last_char = guess
        generated_chars.append(dataset.idx2char[guess])

# Join once at the end instead of growing a string inside the loop.
text = ''.join(generated_chars)
pretty_print(text)

you can make you should be success, you know the kinds of much
restually losing weight meat", are tenjoy in other suffer and means to
on your diet...
lifquestem, sleen a sucquante to lose weight.
hold-
4.. a fight. nutrition around your the down the betic and weight to
eat a lean your metabolism, choose durious nupretablion people weight
loss is to a diets that these are allow you eat restrick you'll
starvce iptull you ciluse i tentrately a great? it want of the same
weight consume learn and even the ridunt to that simply closside
distake of away because of at very usually yo diet start rulate your
adgeing your pocess. which should dry dwering you are effective to you
can also udying, any unfortual will fiber plans. yes of sumptions with
the don't being achfingers sugar in avoid niftive if you can eat all
market snack find posting the calorie diet is fory cheat as make some
of a give you aurately. having a long.
revember that maying a required for your cycle frop at your body and
moder