# Char-based text generation with LSTM

In [None]:
from collections import Counter

import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import pandas as pd
import re

In [None]:
# Mount Google Drive at /content/drive so later cells can read files stored
# there. The google.colab module is only importable inside Google Colab.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
from pathlib import Path

from sqlalchemy import create_engine as ce, inspect

# database from https://www.kaggle.com/raynardj/starter-classic-english-literature

# SQLite file holding the corpus, read from the mounted Google Drive.
DATA = Path('/content/drive/MyDrive/EnglishLiterature/books.db')

# Open the database; the last expression lists its tables as the cell output.
engine = ce('sqlite:///' + str(DATA))
inspector = inspect(engine)
inspector.get_table_names()

['authors', 'book_file', 'book_original', 'books', 'text_files']

In [None]:
# Load each table into a DataFrame exactly once (the authors table used to be
# read twice, which only cost an extra round-trip to the database).
books_df = pd.read_sql('books', con=engine)
authors_df = pd.read_sql('authors', con=engine)
text_files = pd.read_sql('text_files', con=engine)

# authors_df.death[authors_df.death < 10000].max()
# text_files.fmt.value_counts()

# Show the 30 most recently born authors, skipping rows whose birth year is
# 10000 or more.
authors_df[authors_df.born < 10000].sort_values(by='born', ascending=False).head(30)

Unnamed: 0,index,author,born,death
82,82,F.scott Fitzgerald,1896,1940
84,84,Frances Hodgson Burnett,1894,1924
4,4,Alan Seeger,1888,1916
173,173,Katherine Mansfield,1888,1923
230,230,Rupert C.brooke,1887,1915
168,168,Joyce Kilmer,1886,19187
134,134,Hugh Lofting,1886,1947
236,236,Sara Teasdale,1884,1933
160,160,John Maynard Keynes,1883,1946
117,117,Hendrik Van Loon,1882,1944


In [None]:
# ``declarative_base`` is provided by ``sqlalchemy.orm`` since SQLAlchemy 1.4;
# importing it from ``sqlalchemy.ext.declarative`` is deprecated. Fall back to
# the old location only when the new one is unavailable.
try:
    from sqlalchemy.orm import declarative_base
except ImportError:  # SQLAlchemy < 1.4
    from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy import Column, Integer, String, Text, ForeignKey
from sqlalchemy.orm import relationship

# Declarative base class that every ORM model in this file derives from.
Base = declarative_base()

def getSession(engine):
    """Create and return a new ORM session bound to *engine*."""
    from sqlalchemy.orm import sessionmaker

    session_factory = sessionmaker()
    return session_factory(bind=engine)

class authorModel(Base):
    """ORM mapping for one row of the ``authors`` table.

    ``born`` and ``death`` hold years. A value of 9999 or more, or a missing
    (NULL) value, is reported as ``"No Record"`` by the ``born_year`` and
    ``death_year`` properties.
    """
    __tablename__ = 'authors'
    index = Column(Integer, primary_key = True)
    author = Column(Text)
    born = Column(Integer)
    death = Column(Integer)

    def __repr__(self):
        return "<Author: %s, %s to %s>"%(self.author,self.born_year,self.death_year)

    @staticmethod
    def _year_or_no_record(year):
        """Return *year*, or ``"No Record"`` when it is NULL or >= 9999."""
        # The columns are nullable, and comparing None with an int raises
        # TypeError in Python 3, so NULL has to be checked first.
        if year is None or year >= 9999:
            return "No Record"
        return year

    @property
    def born_year(self):
        """Birth year, or ``"No Record"`` when it is unknown."""
        return self._year_or_no_record(self.born)

    @property
    def death_year(self):
        """Death year, or ``"No Record"`` when it is unknown."""
        return self._year_or_no_record(self.death)

class chapterModel(Base):
    """ORM mapping for the ``text_files`` table: one stored text per row."""

    __tablename__ = "text_files"

    index = Column(Integer, primary_key=True)
    fmt = Column(Text)   # Format
    text = Column(Text)  # Text content

    def __repr__(self):
        return "book file:%s" % (self.index,)
    
class bookModel(Base):
    """ORM mapping for the ``books`` table, with a link to the book's author."""

    __tablename__ = "books"

    book_id = Column(Integer, primary_key=True)
    bookname = Column(Text)
    cate1 = Column(Text)
    # Foreign key to the primary key of the authors table.
    author_id = Column(Integer, ForeignKey(authorModel.index))
    author = relationship(authorModel)

    def __repr__(self):
        return "<Book: %s>" % (self.bookname,)
    
class bookChapterModel(Base):
    """ORM mapping for ``book_file``, the table linking books to text files."""

    __tablename__ = "book_file"

    index = Column(Integer, primary_key=True)
    # Each row pairs one text file with one book.
    file_id = Column(Integer, ForeignKey(chapterModel.index))
    book_id = Column(Integer, ForeignKey(bookModel.book_id))
    file = relationship(chapterModel)
    book = relationship(bookModel)
    chapter = Column(Text())

    def __repr__(self):
        described = (self.book, self.file, self.chapter)
        return "Book:%s with File:%s, Chapter:%s" % described
    
# Relationships attached after the class bodies, because each one refers to a
# model class that is declared further down than the class receiving it.
# book <-> book_file rows
bookModel.maps = relationship(bookChapterModel)
# book <-> text files, going through the "book_file" link table
bookModel.chapters = relationship(chapterModel, secondary = "book_file")
# text file <-> books, going through the same link table
chapterModel.books = relationship(bookModel, secondary = "book_file")
# text file <-> book_file rows
chapterModel.maps = relationship(bookChapterModel)
# author <-> books
authorModel.books = relationship(bookModel)

In [None]:
# Open an ORM session on the corpus database.
sess = getSession(engine)
# Preview five authors. LIMIT is applied in SQL so the whole table is not
# loaded into memory just to be sliced afterwards.
sess.query(authorModel).limit(5).all()

[<Author: Adam Ferguson, 1723 to 1816>,
 <Author: Adam Smith  , 1723 to 1790>,
 <Author: Aeschylus, 525 to No Record>,
 <Author: Aesop, No Record to No Record>,
 <Author: Alan Seeger, 1888 to 1916>]

In [None]:
def searchAuthor(kw):
    """Return the rows of ``authors_df`` whose author name matches *kw*.

    *kw* is passed unchanged to ``Series.str.contains``, so pandas' default
    pattern semantics apply. Rows with a missing author name are treated as
    non-matching (``na=False``); without that the mask contains NaN and the
    boolean indexing fails.
    """
    return authors_df[authors_df.author.str.contains(kw, na=False)]

def searchBookByAuthor(kw):
    """Return the rows of ``books_df`` written by authors matching *kw*.

    Authors are looked up with ``searchAuthor``; books are then selected by
    comparing ``books_df.author_id`` with the matching authors' ``index``
    column.
    """
    # Use the authors' ``index`` column (the key ``author_id`` refers to in the
    # schema) instead of the DataFrame row labels, which only coincide with it
    # as long as the frame has not been filtered, sorted or re-indexed.
    author_ids = searchAuthor(kw)['index']
    return books_df[books_df.author_id.isin(author_ids)]

NAME = 'Jack London'  # 'Leo Tolstoy'  # 'Zane Gray'

author_books = searchBookByAuthor(NAME)
print(author_books)

# A book id is not a row position in ``text_files``: books are linked to their
# text files through the ``book_file`` table (book_id -> file_id, where file_id
# refers to the ``index`` column of ``text_files``). Indexing ``text_files``
# positionally with a book id returns texts of unrelated books.
book_file_df = pd.read_sql('book_file', con=engine)
author_file_ids = book_file_df.loc[
    book_file_df.book_id.isin(author_books.book_id), 'file_id'
]
author_files = text_files[text_files['index'].isin(author_file_ids)]

# Formats of all files belonging to the author's books.
print(list(author_files.fmt))

# Word count of every plain-text file of the author.
for file_text in author_files.loc[author_files.fmt == 'txt', 'text']:
    print(len(file_text.split()))

      book_id  ...               cate1
53         53  ...  English Literature
114       114  ...  English Literature
126       126  ...  English Literature
277       277  ...  English Literature
281       281  ...  English Literature
336       336  ...  English Literature
353       353  ...  English Literature
376       376  ...  English Literature
629       629  ...  English Literature
638       638  ...  English Literature
655       655  ...  English Literature
709       709  ...  English Literature
763       763  ...  English Literature
774       774  ...  English Literature
814       814  ...  English Literature
816       816  ...  English Literature
868       868  ...  English Literature
878       878  ...  English Literature
929       929  ...  English Literature
942       942  ...  English Literature
957       957  ...  English Literature
986       986  ...  English Literature
1065     1065  ...  English Literature
1077     1077  ...  English Literature

[24 rows x 4 columns]
['

In [None]:

# Collect the plain-text files that belong to the selected author's books.
# Books are linked to text files through the ``book_file`` table (book_id ->
# file_id, where file_id refers to the ``index`` column of ``text_files``);
# using a book id as a row position in ``text_files`` picks unrelated texts.
book_file_df = pd.read_sql('book_file', con=engine)
author_book_ids = searchBookByAuthor(NAME).book_id
author_file_ids = book_file_df.loc[
    book_file_df.book_id.isin(author_book_ids), 'file_id'
]
author_txt_files = text_files[
    text_files['index'].isin(author_file_ids) & (text_files.fmt == 'txt')
]

# Drop blank lines inside each text, then concatenate the texts in the row
# order of ``text_files`` (joined once instead of repeated string ``+=``).
text_sample = ''.join(
    re.sub(r'\n\s*\n', '\n', file_text) for file_text in author_txt_files.text
)

# Collapse runs of spaces into a single space.
text_sample = re.sub(r' +', ' ', text_sample)
print(text_sample[:1000])

# test_author = sess.query(authorModel).filter_by(index=64).first()
# test_author


 420 BC
 THE CLOUDS
 by Aristophanes
 anonymous translator
 CHARACTERS IN THE PLAY
 STREPSIADES
 PHIDIPPIDES
 SERVANT OF STREPSIADES
 DISCIPLES OF SOCRATES
 SOCRATES
 JUST DISCOURSE
 UNJUST DISCOURSE
 PASIAS, a Money-lender
 AMYNIAS, another Money-lender
 CHORUS OF CLOUDS
CLOUDS
 (SCENE:-In the background are two houses, that of Strepsiades and
 that of Socrates, the Thoughtery. The latter is small and dingy;
 the in, terior of the former is shown and two beds are seen, each
 occupied.)
 STREPSIADES (sitting up)
 GREAT gods! will these nights never end? will daylight never come?
I heard the cock crow long ago and my slaves are snoring still! Ah! Ah!
It wasn't like this formerly. Curses on the war! has it not done
me ills enough? Now I may not even chastise my own slaves. Again
there's this brave lad, who never wakes the whole long night, but,
wrapped in his five coverlets, farts away to his heart's content.
(He lies down) Come! let me nestle in well and snore too, if it be
possible...

In [None]:
# Number of whitespace-separated tokens in the assembled corpus.
len(text_sample.split())

499369

In [None]:
# TRAIN_TEXT_FILE_PATH = '/content/drive/MyDrive/EnglishLiterature/books.db'

# with open(TRAIN_TEXT_FILE_PATH) as text_file:
#     text_sample = text_file.readlines()
# text_sample = ' '.join(text_sample)

# Put one space after every newline, so each line after the first starts with
# a space.
text_sample = text_sample.replace('\n', '\n ')

def text_to_seq(text_sample):
    """Encode *text_sample* as an array of character ids.

    Characters are numbered by descending frequency (id 0 is the most common
    one); characters with equal counts keep their first-seen order. The
    vocabulary is printed as a side effect.

    Returns:
        A tuple ``(sequence, char_to_idx, idx_to_char)``: the NumPy array of
        ids for every character of the text, and the two lookup dicts.
    """
    ranked = Counter(text_sample).most_common()
    sorted_chars = [symbol for symbol, _count in ranked]
    print(sorted_chars)

    idx_to_char = dict(enumerate(sorted_chars))
    char_to_idx = {symbol: position for position, symbol in idx_to_char.items()}
    sequence = np.array([char_to_idx[symbol] for symbol in text_sample])

    return sequence, char_to_idx, idx_to_char

# Encode the whole corpus once; the id sequence and both lookup tables are
# module-level names used by the training and generation cells.
sequence, char_to_idx, idx_to_char = text_to_seq(text_sample)

[' ', 'e', 't', 'a', 'o', 'n', 'i', 's', 'h', 'r', 'd', 'l', 'u', '\n', 'c', 'm', 'f', 'w', 'g', 'y', ',', 'p', 'b', '.', 'v', 'k', '"', 'I', '-', 'T', 'A', 'S', "'", ';', 'H', 'E', 'M', 'C', 'x', 'P', 'B', 'L', '!', 'O', 'W', 'D', '?', 'R', 'G', 'q', 'F', 'N', 'j', 'Y', 'z', ':', 'U', 'J', 'K', 'V', '_', '(', ')', '1', '|', 'Z', '8', '/', '0', '\\', '2', '[', ']', '3', 'X', 'Q', '7', '9', '>', '<', '4', '5', '6', '*', '=', '+', '^', '#', '&', '{', '}', '`']


In [None]:
SEQ_LEN = 256
BATCH_SIZE = 16

def get_batch(sequence, seq_len=None, batch_size=None):
    """Sample a random batch of (input, target) character windows.

    Args:
        sequence: 1-D sequence of integer character ids.
        seq_len: Length of each sampled window; defaults to ``SEQ_LEN``.
        batch_size: Number of windows to sample; defaults to ``BATCH_SIZE``.

    Returns:
        A tuple ``(trains, targets)`` of LongTensors, both shaped
        ``(batch_size, seq_len - 1, 1)``. ``targets`` is ``trains`` shifted
        one position to the right in the source sequence.

    Raises:
        ValueError: If *sequence* is shorter than ``seq_len``.
    """
    # Resolve defaults at call time so later changes to the module-level
    # constants are still honoured.
    seq_len = SEQ_LEN if seq_len is None else seq_len
    batch_size = BATCH_SIZE if batch_size is None else batch_size

    last_start = len(sequence) - seq_len
    if last_start < 0:
        raise ValueError(
            f'sequence of length {len(sequence)} is shorter than seq_len={seq_len}'
        )

    trains = []
    targets = []
    for _ in range(batch_size):
        # randint's upper bound is exclusive; +1 makes the final window
        # reachable and lets a sequence of exactly seq_len items work.
        batch_start = np.random.randint(0, last_start + 1)
        chunk = torch.as_tensor(
            sequence[batch_start: batch_start + seq_len], dtype=torch.long
        )
        trains.append(chunk[:-1].view(-1, 1))
        targets.append(chunk[1:].view(-1, 1))
    return torch.stack(trains, dim=0), torch.stack(targets, dim=0)

In [None]:
def evaluate(model, char_to_idx, idx_to_char, start_text=' ', prediction_len=200, temp=0.3):
    """Generate text by sampling characters from *model* one at a time.

    The model is first run over *start_text*; its prediction at the last
    prompt position yields the first generated character, and every sampled
    character is then fed back to obtain the next one. This function does not
    change the model's train/eval mode; callers are expected to have called
    ``model.eval()``.

    Args:
        model: Module exposing ``init_hidden()`` and
            ``forward(x, hidden) -> (logits, hidden)``.
        char_to_idx: Mapping from character to integer id.
        idx_to_char: Mapping from integer id to character.
        start_text: Non-empty prompt; it is included in the returned text.
        prediction_len: Number of characters to generate after the prompt.
        temp: Positive softmax temperature; logits are divided by it.

    Returns:
        *start_text* followed by ``prediction_len`` generated characters.

    Raises:
        ValueError: If *start_text* is empty or *temp* is not positive.
        KeyError: If *start_text* contains a character missing from
            *char_to_idx*.
    """
    if not start_text:
        raise ValueError('start_text must contain at least one character')
    if temp <= 0:
        raise ValueError('temp must be positive')

    # Run on the device holding the model's weights instead of relying on a
    # module-level ``device`` variable.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device('cpu')

    hidden = model.init_hidden()
    idx_input = [char_to_idx[char] for char in start_text]
    train = torch.LongTensor(idx_input).view(-1, 1, 1).to(model_device)
    pieces = [start_text]

    # Inference only: without no_grad every step would extend the autograd
    # graph through the recurrent state and memory would grow with length.
    with torch.no_grad():
        # The output at the last prompt position already predicts the first
        # new character, so the last prompt character is not fed a second time.
        output, hidden = model(train, hidden)

        for step in range(prediction_len):
            if step:
                # Advance the model by the character sampled in the last step.
                output, hidden = model(inp, hidden)

            # Work in float64 and renormalise: np.random.choice rejects
            # probabilities whose sum drifts from 1 by float32 rounding.
            output_logits = output[-1].view(-1).double() / temp
            p_next = F.softmax(output_logits, dim=-1).cpu().numpy()
            p_next = p_next / p_next.sum()

            top_index = int(np.random.choice(len(char_to_idx), p=p_next))
            pieces.append(idx_to_char[top_index])
            inp = torch.LongTensor([top_index]).view(-1, 1, 1).to(model_device)

    return ''.join(pieces)

In [None]:
class TextRNN(nn.Module):
    """Character-level language model: embedding -> LSTM -> dropout -> linear.

    Args:
        input_size: Vocabulary size; also the number of output logits.
        hidden_size: Size of the LSTM hidden and cell states.
        embedding_size: Size of each character embedding.
        n_layers: Number of stacked LSTM layers.
    """

    def __init__(self, input_size, hidden_size, embedding_size, n_layers=1):
        super().__init__()

        self.input_size = input_size
        self.hidden_size = hidden_size
        self.embedding_size = embedding_size
        self.n_layers = n_layers

        self.encoder = nn.Embedding(self.input_size, self.embedding_size)
        self.lstm = nn.LSTM(self.embedding_size, self.hidden_size, self.n_layers)
        self.dropout = nn.Dropout(0.2)
        self.fc = nn.Linear(self.hidden_size, self.input_size)

    def forward(self, x, hidden):
        """Return per-position logits and the updated LSTM state.

        Args:
            x: LongTensor of character ids shaped ``(seq_len, batch, 1)``.
            hidden: ``(h, c)`` tuple as produced by ``init_hidden``.

        Returns:
            ``(logits, (h, c))`` with logits shaped
            ``(seq_len, batch, input_size)``.
        """
        # The embedding appends a feature axis after the singleton axis of x;
        # squeeze(2) removes that singleton so the LSTM receives
        # (seq_len, batch, embedding_size).
        x = self.encoder(x).squeeze(2)
        out, (ht1, ct1) = self.lstm(x, hidden)
        out = self.dropout(out)
        x = self.fc(out)
        return x, (ht1, ct1)

    def init_hidden(self, batch_size=1):
        """Return zero ``(h, c)`` states shaped ``(n_layers, batch_size, hidden_size)``.

        The states are created with the device and dtype of the model's own
        weights, so the method no longer depends on a module-level ``device``
        variable. They are plain constants: the initial state is not a trained
        parameter, so it does not need ``requires_grad``.
        """
        weight = self.fc.weight
        state_shape = (self.n_layers, batch_size, self.hidden_size)
        return (weight.new_zeros(state_shape), weight.new_zeros(state_shape))

In [None]:
# Train on the GPU when one is available.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = TextRNN(
    input_size=len(idx_to_char),
    hidden_size=128,
    embedding_size=128,
    n_layers=2,
)
model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-2, amsgrad=True)
# Halve the learning rate when the reported mean loss stops improving.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, patience=5, verbose=True, factor=0.5
)

# Each "epoch" here is a single optimisation step on one random batch.
n_epochs = 5000
loss_avg = []

for epoch in range(n_epochs):
    model.train()

    # get_batch returns batch-first tensors; move the sequence axis to the
    # front, which is the layout the model consumes.
    train, target = get_batch(sequence)
    train = train.permute(1, 0, 2).to(device)
    target = target.permute(1, 0, 2).to(device)
    hidden = model.init_hidden(BATCH_SIZE)

    output, hidden = model(train, hidden)
    # The criterion takes logits as (batch, classes, seq) and targets as
    # (batch, seq).
    loss = criterion(output.permute(1, 2, 0), target.squeeze(-1).permute(1, 0))

    loss.backward()
    optimizer.step()
    optimizer.zero_grad()

    loss_avg.append(loss.item())
    if len(loss_avg) < 50:
        continue

    # Every 50 steps: report the mean loss, let the scheduler react to it,
    # and print a generated sample.
    mean_loss = np.mean(loss_avg)
    print(f'Loss: {mean_loss}')
    scheduler.step(mean_loss)
    loss_avg = []
    model.eval()
    predicted_text = evaluate(model, char_to_idx, idx_to_char)
    print(predicted_text)

Loss: 2.7397478246688842
 of the the the the the the the the here the she the the of of in the and the and and and san be and on the the the mere and and the the bere the the the and of a the and the and the the and of the sar
Loss: 2.1498679971694945
 hour farcan, the the rest of the dead the more and and the sot the protien that the sone the moress of the the prother the pard a brother the to the porite of the mong of the seer the as the of the pr
Loss: 2.017552065849304
 And the see the for the pother the man the some and the and the some and the scanger of the sears of the so the beat of the song to the seppent with the seare of the seeper the stant his to the some a
Loss: 1.9384882354736328
 were consile of the men and the presting to the cander a spear the for the conce the some the poing to have the conting of the seart in the some the carred the to the concentry of the his compers of t
Loss: 1.8877447032928467
 To me and comperious the sent to the said the to more to the pool a

KeyboardInterrupt: ignored

In [None]:
# Switch to inference mode before sampling.
model.eval()

# Generate one sample per target length; each is saved to its own file on the
# mounted drive and echoed to the notebook output.
for len_ in (500, 800, 1000, 2000, 3000, 5000, 10000, 20000):
    predicted_text = evaluate(
        model, char_to_idx, idx_to_char,
        start_text='. ', prediction_len=len_, temp=0.3,
    )
    with open('drive/MyDrive/2021coursework/bot_generated_texts/bot_london_len_{}.txt'.format(len_), 'w') as f:
        f.write(predicted_text + '\n')
    # The sample followed by one blank line.
    print(predicted_text, end='\n\n')

. It was a considers, and all the late of the Come, the enderstanded the property of the belied of the property of the way and saw he was the little property of the compless the decried the property, and the world with the present was a man was and the sides and been not that should not a species and stander of the true into the first real did not the same the had not no property of the truth and subbit of the place of the place in the way and shall be seemed to strange and strange of the arms of 

. PHIDIPPIDES
  What she said a dispeared of the sapposes of the same and shall the bange to the same to him the compless in the person. The species of the contrative in the brother the property of the disposition of the strange the rade of the bed and spective and man the present of the property of the stream, and the spection of the stright of a man and shall go that he had been distersters to the man in the little stander to seemed the man the subject in the present of the subbit of the p

In [None]:
# File that receives the trained weights.
model_path = 'model'
# The second argument of torch.save is the destination. Passing the model
# object itself, as before, raised the AttributeError seen in the output.
torch.save(model.state_dict(), model_path)

AttributeError: ignored

In [None]:
# load model
# Rebuild the network with the hyper-parameters used for training, then
# restore the saved weights. (The previous template placeholders
# TheModelClass / args / kwargs / PATH were never defined in this notebook.)
the_model = TextRNN(
    input_size=len(idx_to_char),
    hidden_size=128,
    embedding_size=128,
    n_layers=2,
)
# map_location lets weights saved on a GPU be loaded on a CPU-only machine.
the_model.load_state_dict(torch.load(model_path, map_location=device))
the_model.to(device)
# Inference mode: disables dropout.
the_model.eval()