# DeepShake Initial Working

In [None]:
import re

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [None]:
# Device string; presumably intended for torch tensors/modules — it is not
# read by any of the visible cells.
device = 'cpu'
# Exclusive upper bound passed to torch.randint when the dummy input tensor
# is sampled further down.
line_length = 10

## Prelim: Simple finetune on small corpus

In [None]:
# Checkpoint name for the planned fine-tune.
# NOTE(review): the visible cells load "t5-base" and never read this variable.
model_to_use = 'distilbert-base-uncased'

In [None]:
# we are just going to do sequence to sequence as a first cut: repeatedly pick a line of a sonnet at random, 
# generate some noise, and train the model to turn the noise into the line.

In [None]:
# first pass: use a T5 translation prompt (the cells below load t5-base)

In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

In [None]:
# Load the pretrained "t5-base" checkpoint as a sequence-to-sequence model.
model = AutoModelForSeq2SeqLM.from_pretrained("t5-base")

In [None]:
# Load the tokenizer that matches the "t5-base" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("t5-base")

In [None]:
# Encode a task-prefixed prompt into token ids, returned as a PyTorch tensor.
inputs = tokenizer.encode("translate English to French: Shall I compare thee to a summer's day", return_tensors="pt")

In [None]:
# Generate with beam search (4 beams), capped at max_length=40.
outputs = model.generate(inputs, max_length=40, num_beams=4, early_stopping=True)

In [None]:
# Decode the first generated sequence back to text and print it.
print(tokenizer.decode(outputs[0]))

In [None]:
# Random integer tensor of shape (1, 1) with values drawn from [0, line_length).
# NOTE(review): this rebinds the builtin `input` for the rest of the session;
# consider renaming once it is confirmed nothing else relies on this name.
input = torch.randint(line_length, (1, 1), dtype=torch.long)

## Load lines as corpus

In [None]:
def read_poetry_into_lines(filename):
    """Read a poetry text file and return its lines as lists of words.

    Every character that is neither a word character nor whitespace is
    removed before splitting. Blank lines, and lines whose cased characters
    are all upper case, are dropped.

    Args:
        filename: Path of the text file to read.

    Returns:
        A list with one entry per kept line, in file order; each entry is the
        list of whitespace-separated words on that line.
    """
    # The context manager closes the file even if reading raises.
    with open(filename, 'rt') as file:
        text = file.read()
    text = re.sub(r'[^\w\s]', '', text)
    # split() without an argument collapses runs of whitespace, so removing
    # punctuation such as " - " cannot leave empty-string tokens behind.
    return [
        line.split()
        for line in text.split('\n')
        if line.strip() and not line.isupper()
    ]

In [None]:
def std_feed_forward(line_number):
    """Run one corpus line through the model as a translation prompt.

    The words of ``shake_sonnet_lines[line_number]`` are joined with spaces,
    prefixed with the translation task string, encoded, passed to
    ``model.generate`` (beam search, 4 beams, max_length=20), and the first
    generated sequence is decoded and returned.
    """
    sonnet_text = " ".join(shake_sonnet_lines[line_number])
    prompt_ids = tokenizer.encode(
        "translate English to French: " + sonnet_text,
        return_tensors="pt",
    )
    generated = model.generate(
        prompt_ids,
        max_length=20,
        num_beams=4,
        early_stopping=True,
    )
    first_sequence = generated[0]
    return tokenizer.decode(first_sequence)

In [None]:
# Load the sonnets as a list of word lists (one inner list per kept line).
shake_sonnet_lines = read_poetry_into_lines('../data/shakespeare-sonnets.txt')

In [None]:
# Inspect a single corpus line (as a list of words).
shake_sonnet_lines[105]

In [None]:
# Run corpus line 105 through the model and show the decoded output.
std_feed_forward(105)

In [None]:
# Build one task-prefixed prompt string per corpus line.
all_inputs = [f"translate English to French: {' '.join(line)}" for line in shake_sonnet_lines]

In [None]:
# Preview the first ten prompts.
all_inputs[:10]

In [None]:
# Tokenize the first ten prompts as one padded batch of PyTorch tensors.
# Calling the tokenizer directly replaces the deprecated batch_encode_plus
# and returns the same kind of encoding object.
all_encoded = tokenizer(all_inputs[:10], padding=True, return_tensors="pt")

In [None]:
# Check the shape of the padded token-id tensor.
all_encoded['input_ids'].shape

In [None]:
# Beam-search generation (4 beams, max_length=30) over the padded batch.
# NOTE(review): no attention_mask is passed explicitly — confirm that
# generate() treats the padding positions as intended.
outputs = model.generate(all_encoded.input_ids, max_length = 30, num_beams=4, early_stopping=True)

In [None]:
# Decode every generated sequence in the batch back to text.
tokenizer.batch_decode(outputs)

### Set up "translator" training

## Language GAN approach

Core approach: LanguageGAN, borrowing some ideas from ColdGAN and CTRLModel, and eventually using either an input class or a cycle loss to train it toward a certain style. Elements:

* _Generator_: Noise-based transformer with max sequence length 10 (= 10 words, the maximum length of a sonnet line if every word is one syllable). Trained using REINFORCE on an implicit policy to generate the next token.
* _Discriminator_: Also a transformer architecture. Trained in the usual way.


In [None]:
class Generator(nn.Module):
    """Generator network for the language GAN (work in progress).

    Only the input projection, a linear map from ``vocab_size`` to
    ``projection_size``, is defined so far.

    Args:
        vocab_size: Input feature size of the projection layer.
        projection_size: Output feature size of the projection layer.
        batch_size: Stored on the instance; not used by any layer yet.
        input_dropout: Accepted for interface stability; currently unused.
        output_dropout: Accepted for interface stability; currently unused.
        repetition_penalty: Stored on the instance; not used yet.
        temperature: Stored on the instance; not used yet.
    """

    def __init__(self, vocab_size=5092,
                 projection_size=512,
                 batch_size=512,
                 input_dropout=0.1,
                 output_dropout=0.1,
                 repetition_penalty=1.2,
                 temperature=0.3):
        super().__init__()
        self.vocab_size = vocab_size
        # Honour the constructor argument instead of a hard-coded 512.
        self.projection_size = projection_size
        self.batch_size = batch_size
        self.repetition_penalty = repetition_penalty
        self.temperature = temperature

        self.input_proj = nn.Linear(self.vocab_size, self.projection_size)

        # TODO: the remaining layers are not defined yet; input_dropout and
        # output_dropout are accepted but not applied anywhere.

    def init_weights(self):
        """Re-initialise the input projection weights uniformly in [-0.5, 0.5]."""
        initrange = 0.5
        # nn.init.uniform_ runs without autograd tracking; calling
        # weight.uniform_() directly on a parameter that requires grad raises.
        nn.init.uniform_(self.input_proj.weight, -initrange, initrange)
        

In [None]:
class Discriminator(nn.Module):
    """Placeholder discriminator: defines no layers and prints a marker on construction."""

    def __init__(self):
        super().__init__()
        print('Fake!')

In [None]:
# Instantiate the generator with its default hyperparameters.
gen = Generator()

In [None]:
# Instantiate the placeholder discriminator (its constructor prints 'Fake!').
disc = Discriminator()