<a href="https://colab.research.google.com/github/mohanpartha/ML_preprocessing/blob/master/Great_expect.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [27]:
# Great Expectation book
import requests
content = requests.get("https://www.gutenberg.org/files/1400/1400-0.txt").text
with open("great_expectation.txt", "w", encoding="utf-8") as f:
  f.write(content)

In [28]:
print(content[:1000])

ï»¿The Project Gutenberg EBook of Great Expectations, by Charles Dickens

This eBook is for the use of anyone anywhere in the United States and most
other parts of the world at no cost and with almost no restrictions
whatsoever.  You may copy it, give it away or re-use it under the terms of
the Project Gutenberg License included with this eBook or online at
www.gutenberg.org.  If you are not located in the United States, you'll have
to check the laws of the country where you are located before using this ebook.

Title: Great Expectations

Author: Charles Dickens

Release Date: July, 1998 [EBook #1400]
[Most recently updated: April 27, 2020]

Language: English

Character set encoding: UTF-8

*** START OF THIS PROJECT GUTENBERG EBOOK GREAT EXPECTATIONS ***




Produced by An Anonymous Volunteer, and David Widger

[Illustration]




Great Expectations

[1867 Edition]

by Charles Dickens


Contents

 Chapter I.
 Chapter II.
 Chapter III.
 Chapter

In [29]:
from string import punctuation

import numpy as np
from sklearn.model_selection import train_test_split

import torch
from torch.utils.data import Dataset, DataLoader
from torch import nn
from torch.nn import functional as F

In [30]:
sequence_length = 100
batch_size = 128

# dataset file path
FILE_PATH = "great_expectation.txt"

# read the data
text = open(FILE_PATH, encoding="utf-8").read()

# remove caps, comment this code if you want uppercase characters as well
text = text.lower()
# remove punctuation
text = text.translate(str.maketrans("", "", punctuation))

In [31]:
text



In [32]:
# print some stats
n_chars = len(text)
vocab = ''.join(sorted(set(text)))
print("unique_chars:", vocab)
n_unique_chars = len(vocab)
print("Number of characters:", n_chars)
print("Number of unique characters:", n_unique_chars)

unique_chars: 	
 0123456789abcdefghijklmnopqrstuvwxyzª´»¿âãï
Number of characters: 1005708
Number of unique characters: 52


In [33]:
print('\ufeff')

﻿


In [34]:
# print some stats
n_chars = len(text)
vocab = ''.join(sorted(set(text)))
print("unique_chars:", vocab)
n_unique_chars = len(vocab)
print("Number of characters:", n_chars)
print("Number of unique characters:", n_unique_chars)

unique_chars: 	
 0123456789abcdefghijklmnopqrstuvwxyzª´»¿âãï
Number of characters: 1005708
Number of unique characters: 52


In [35]:
vocab

'\t\n 0123456789abcdefghijklmnopqrstuvwxyz\x80\x94\x98\x99\x9c\x9dª´»¿âãï'

In [36]:
# dictionary that converts characters to integers
char2int = {c: i for i, c in enumerate(vocab)}
# dictionary that converts integers to characters
int2char = {i: c for i, c in enumerate(vocab)}

In [37]:
# convert all text into integers
encoded_text = np.array([char2int[c] for c in text])

In [38]:
# tokenize  input_seq, target_seq
def create_sequence_data(text, sequence_length):
  input_seq = []
  target_seq = []
  for idx in range(0, len(text), sequence_length):
    st_idx = idx
    end_idx = st_idx + sequence_length + 1
    if end_idx > len(text):
      # Exclude last slice that may not be full
      continue
    input_seq.append(text[st_idx:end_idx-1])
    target_seq.append(text[st_idx+1:end_idx])
  return input_seq, target_seq

In [39]:
input_seq, target_seq = create_sequence_data(encoded_text, sequence_length=sequence_length)

In [40]:
train_seq, valid_seq, train_targets, valid_targets = train_test_split(input_seq, target_seq, test_size=0.3)

In [41]:
def one_hot_encode(sequence, vocab_size):
    # Creating a multi-dimensional array of zeros with the desired output shape
    # (Sequence Length, One-Hot Encoding Size)
    seq_length = len(sequence)
    output = np.zeros((seq_length, vocab_size), dtype=np.float32)

    for seq in range(seq_length):
      output[seq, sequence[seq]] = 1

    return output
    

In [42]:
class TextGenDataset(Dataset):
  def __init__(self, text_seq, target_seq, seq_length, vocab_size):
    super().__init__()
    self.text_seq = text_seq
    self.target_seq = target_seq
    self.seq_length = seq_length
    self.vocab_size = vocab_size

  def __getitem__(self, idx):
    return one_hot_encode(self.text_seq[idx], self.vocab_size), self.target_seq[idx]

  def __len__(self):
    return len(self.text_seq)

train_ds = TextGenDataset(train_seq, train_targets, sequence_length, len(vocab))
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=False, drop_last=True)

valid_ds = TextGenDataset(valid_seq, valid_targets, sequence_length, len(vocab))
valid_dl = DataLoader(valid_ds, batch_size=batch_size, drop_last=True)

In [43]:
class Model(nn.Module):
    def __init__(self, input_size, output_size, hidden_dim, n_layers):
        super(Model, self).__init__()

        # Defining some parameters
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers

        #Defining the layers
        # RNN Layer
        self.rnn = nn.GRU(input_size, hidden_dim, n_layers, batch_first=True)   
        # Fully connected layer
        self.fc = nn.Linear(hidden_dim, output_size)
    
    def forward(self, x, hidden=None):
        
        batch_size = x.size(0)
        # Passing in the input and hidden state into the model and obtaining outputs
        out, hidden = self.rnn(x, hidden)
        
        # Reshaping the outputs such that it can be fit into the fully connected layer
        out = out.contiguous().view(-1, self.hidden_dim)
        out = self.fc(out)
        return out, hidden.detach()

In [44]:
# Instantiate the model with hyperparameters
model = Model(input_size=len(vocab), output_size=len(vocab), hidden_dim=512, n_layers=3)

# Define hyperparameters
n_epochs = 100
lr=0.001
device = "cuda" if torch.cuda.is_available() else "cpu"

# Define Loss, Optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

In [45]:
# Training Run
def train(model, train_dl, criterion, optimizer, batch_size, device=None, n_epochs=100):
  if device is None:
    device = "cuda" if torch.cuda.is_available() else "cpu"
  model = model.to(device)
  model.train()
  criterion = criterion.to(device)
  for epoch in range(1, n_epochs + 1):
    hidden = None
    n_batches = 0
    losses = 0
    for input_seq, target_seq in train_dl:
      input_seq, target_seq = input_seq.to(device), target_seq.to(device)
      optimizer.zero_grad() # Clears existing gradients from previous epoch
      output, hidden = model(input_seq, hidden)
      loss = criterion(output, target_seq.view(-1).long())
      loss.backward() # Does backpropagation and calculates gradients
      optimizer.step() # Updates the weights accordingly
      # hidden.detach_()
      losses += loss.item()
      n_batches += 1
    
    if epoch%10 == 0:
      print('Epoch: {}/{}.............'.format(epoch, n_epochs), end=' ')
      print("Loss: {:.4f}".format(losses / n_batches))
  return model

In [46]:
model = train(model, train_dl, criterion, optimizer, batch_size, device, n_epochs=100)

Epoch: 10/100............. Loss: 1.3457
Epoch: 20/100............. Loss: 1.1505
Epoch: 30/100............. Loss: 1.0339
Epoch: 40/100............. Loss: 0.9424
Epoch: 50/100............. Loss: 0.8717
Epoch: 60/100............. Loss: 0.7699
Epoch: 70/100............. Loss: 0.6867
Epoch: 80/100............. Loss: 0.6294
Epoch: 90/100............. Loss: 0.5676
Epoch: 100/100............. Loss: 0.5363


In [47]:
model.load_state_dict(torch.load("model.pt"))

FileNotFoundError: ignored

In [48]:
def predict(model, hidden, character, char2int, int2char, device):  
    # One-hot encoding our input to fit into the model
    # print(character)
    character = np.array([char2int[c] for c in character])
    # print(character)
    character = one_hot_encode(character, vocab_size=len(char2int))
    # print(character.shape)
    character = torch.from_numpy(character).unsqueeze(0).to(device)
    with torch.no_grad():
      out, hidden = model(character)
    # print(hidden.size())
    prob = nn.functional.softmax(out[-1], dim=0).data
    # Taking the class with the highest probability score from the output
    char_ind = torch.max(prob, dim=0)[1].item()

    return int2char[char_ind], hidden

In [49]:
def sample(model, char2int, int2char, out_len, start='hey', device=None):
    model.eval() # eval mode
    if device is None:
      device = "cuda" if torch.cuda.is_available() else "cpu"    
    model.to(device)
    start = start.lower()
    # First off, run through the starting characters
    chars = [ch for ch in start]
    size = out_len - len(chars)
    # Now pass in the previous characters and get a new one
    hidden = None
    for ii in range(size):
        # print(chars)
        char, hidden = predict(model, hidden, chars, char2int, int2char, device)
        chars.append(char)

    return ''.join(chars)

In [52]:
sample(model, char2int, int2char, out_len=150, start="my fathers family ")

'my fathers family i common and i saw\nher cannot dress myself and smistened and when i came to mine myself when\ni saw him the land was to be apprentice'