In [1]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

In [2]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

In [3]:
from datasets import load_dataset

In [4]:
from deepshake import experiment, generate_lines

In [5]:
from ds_utils import show_gpu, read_poetry_into_lines

In [6]:
import re
import json
from difflib import SequenceMatcher
from collections import Counter

In [None]:
# Map each split name to its JSON file, then load both splits in a single call.
data_files = {
    "train": "./data/shakespeare_sonnets_train.json",
    "validation": "./data/shakespeare_sonnets_eval.json",
}
datasets = load_dataset("json", data_files=data_files)

In [None]:
## Turn tiny shakespeare into what we want
# One-off data preparation. Flip `create_data` to True to rebuild the
# line-level train/eval files from the `tiny_shakespeare` dataset.

create_data = False

if create_data:
    shakeds = load_dataset("tiny_shakespeare", split="train")
    lines = shakeds[0]['text'].split('\n')
    # Matches a line that consists only of ASCII letters followed by a colon.
    character = re.compile("^[a-zA-Z]*:$")
    # Keep lines that do not match `character`, are not blank, and have more than three words.
    actual_lines = [
        line
        for line in lines
        if character.match(line) is None
        and len(line.strip()) > 0
        and len(line.split()) > 3
    ]
    lines_to_write = [{"text": line} for line in actual_lines]

    # Everything except the final 1000 records goes to the training file ...
    with open("./data/shakespeare_lines_train.json", "w") as fout:
        fout.writelines(json.dumps(line) + "\n" for line in lines_to_write[:-1000])

    # ... and the final 1000 records go to the evaluation file.
    with open("./data/shakespeare_lines_eval.json", "w") as fout:
        fout.writelines(json.dumps(line) + "\n" for line in lines_to_write[-1000:])

In [None]:
# Target device for the model and the encoded prompts in later cells.
# This cell must run before the model-loading cell, which reads `device`.
device = 'cuda'
# NOTE(review): show_gpu comes from ds_utils; presumably it reports current GPU usage under the given label — confirm.
show_gpu('Initial use:')

In [9]:
# Load the GPT-2 LM-head weights stored in ./gpt_long_train and move the model to `device`.
# `device` must already be defined by an earlier cell; otherwise this line raises NameError.
model = GPT2LMHeadModel.from_pretrained("./gpt_long_train").to(device)

NameError: name 'device' is not defined

Note: possible pretrained variants with AutoModelForSeq2SeqLM:
* T5
* MT5
* Bart (incl. MBart)
* LED
* Blenderbot
* Pegasus
* Marian

In [None]:
# model, tokenizer = load_model(AutoModelForSeq2SeqLM, AutoTokenizer, 't5_base_20k_1line', backup_tokenizer_name='t5-base', device='cuda')

In [None]:
# model = GPT2LMHeadModel.from_pretrained("gpt2").to(device)

In [None]:
# Load the stock GPT-2 tokenizer and reuse its end-of-sequence token as the
# padding token. The pad token has to be set on the loaded *instance*:
# `GPT2Tokenizer.eos_token_id` read off the class is a property object, not a
# token id, so passing it as `pad_token_id=` never configured padding.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

In [None]:
def print_gpt_output(output_tensor):
    """Print the first sequence in `output_tensor` as text under an "Output:" banner.

    Only `output_tensor[0]` is decoded, using the notebook-level `tokenizer`
    with special tokens skipped.
    """
    # Banner: the word "Output:" followed by a rule of 100 dashes on the next line.
    print("Output:", '-' * 100, sep="\n")
    first_sequence = output_tensor[0]
    print(tokenizer.decode(first_sequence, skip_special_tokens=True))

In [None]:
# shake_sonnet_lines = read_poetry_into_lines('./data/shakespeare-sonnets.txt')

In [None]:
# shake_sonnet_lines[0]

In [None]:
# Encode the fixed prompt as a PyTorch tensor ('pt') on `device`; the generation cells below reuse it.
input_ids = tokenizer.encode('Shakespeare: ', return_tensors='pt').to(device)

In [None]:
# Baseline generation with max_length=50 and no sampling or beam-search arguments
# (presumably greedy decoding, as the variable name suggests — confirm against the model's generation config).
greedy_output = model.generate(input_ids, max_length=50)

In [None]:
# Show the decoded baseline generation.
print_gpt_output(greedy_output)

In [None]:
# Beam-search variant: 5 beams, early stopping, and no_repeat_ngram_size=2.
print_gpt_output(model.generate(input_ids, max_length=140, num_beams=5, early_stopping=True, no_repeat_ngram_size=2))
# print_gpt_output(model.generate(input_ids, max_length=140, num_beams=2))

In [None]:
# Sampling variant with top_k=50; the output varies from run to run because do_sample=True.
print_gpt_output(model.generate(input_ids, do_sample=True, max_length=140, top_k=50))

In [None]:
# Sampling variant with top_p=0.92 and top_k=0 (presumably top_k=0 switches off top-k filtering so only top-p applies — confirm).
print_gpt_output(model.generate(input_ids, do_sample=True, max_length=50, top_p=0.92, top_k=0))

In [None]:
# Hard-coded sample text in which verse lines are separated by the literal "<newline>" marker.
output = "If thou wilt thou art belovd of many<newline>Then why dost thou use words thine own<newline>And give invention a try<newline>While I in thy verses do compile<newline>That Tome of thyself doth publish every where<newline>Thou art streaking with untouchd shame<newline>And yet love knows it is a greater grief<newline>To bear loves wrong than hates known injury<newline>Lascivious grace in whom all ill well shows<newline>Kill me with spites yet we must not be foesThose pretty wrongs that liberty commits"
# Break the text at each marker and print one verse line per output line.
as_poem = output.split("<newline>")
print(*as_poem, sep="\n")

In [None]:
import random

In [None]:
# Scratch check: random.random() returns a float in [0.0, 1.0); the sonnet helper below uses it as a coin toss.
random.random()

In [None]:
def generate_attempted_sonnet(init_word="Shakespeare: ", max_length=140):
    """Generate text from the prompt `init_word` and print it one verse line per row.

    A coin toss picks the decoding strategy for this call: beam search
    (num_beams=5, early_stopping=True, no_repeat_ngram_size=2) or sampling
    (do_sample=True, top_k=50). The decoded text is split on the literal
    "<newline>" marker and the pieces are printed on separate lines.

    Args:
        init_word: Prompt text used to seed generation.
        max_length: Forwarded to `model.generate` as `max_length`.

    Returns:
        None; the result is only printed.
    """
    input_ids = tokenizer.encode(init_word, return_tensors='pt').to(device)

    # human opinion indicates these args to generate are yielding best results (will do more systematic search next)
    if random.random() > 0.5:
        strategy_kwargs = {"num_beams": 5, "early_stopping": True, "no_repeat_ngram_size": 2}
    else:
        strategy_kwargs = {"do_sample": True, "top_k": 50}

    # Honour the caller's `max_length`; it used to be hard-coded to 140 in both
    # branches, so the parameter had no effect.
    output_tensor = model.generate(input_ids, max_length=max_length, **strategy_kwargs)

    output_result = tokenizer.decode(output_tensor[0], skip_special_tokens=True)
    as_poem = output_result.split("<newline>")
    print("\n".join(as_poem))

In [None]:
# Try the helper with its default prompt and length; the output varies between runs because the strategy is chosen at random.
generate_attempted_sonnet()

## Running experiments

In [None]:
# Generation settings. `temperature` is a numeric scaling factor, so it is
# stored as a float here; it was previously the string "2".
model_gen_args = {"temperature": 2.0, "min_length": 20, "repetition_penalty": 5.0}

# Training overrides handed to `experiment(...)` as `custom_training_args`.
custom_training_args = {
    "num_train_epochs": 10,
    "adafactor": True,
    "learning_rate": 1e-4,
}

In [None]:
# Hyper-parameter grid: each combination of the three lists is one experiment.
training_pair_sizes = [20000, 40000, 80000, 200000]
lines_per_pair = [1, 2, 4, 8]
input_lengths = [5, 10, 20, 40]

name_base = "experiment_run_210307"
# Safety switch: the grid only runs when this is flipped to True.
run_experiments = False

if run_experiments:
    for pair_size in training_pair_sizes:
        for lines_pp in lines_per_pair:
            for i_length in input_lengths:
                # Label layout: <name_base>_<input length>_<lines per pair>_<training pairs>
                experiment_name = "_".join(
                    map(str, (name_base, i_length, lines_pp, pair_size))
                )
                sonnets = experiment(
                    label=experiment_name,
                    model_base=AutoModelForSeq2SeqLM,
                    tokenizer_base=AutoTokenizer,
                    pretrained_name="t5-base",
                    custom_training_args=custom_training_args,
                    lines_per_pair=lines_pp,
                    number_training_pairs=pair_size,
                    number_validation_pairs=1000,
                    input_length=i_length,
                    add_eos_token_to_labels=True,
                    verbose=False,
                )
                print(experiment_name, " done")

In [None]:
# Reference corpus: whatever read_poetry_into_lines returns for the sonnets file
# (presumably one entry per verse line — confirm against ds_utils).
shake_sonnet_lines = read_poetry_into_lines('./data/shakespeare-sonnets.txt')
# One SequenceMatcher per reference line, with the reference as the second sequence and an
# empty placeholder as the first; the candidate line is supplied later through set_seq1.
shake_sonnet_matchers = [SequenceMatcher(None, '', shake_line) for shake_line in shake_sonnet_lines]

In [None]:
def get_max_seq_match(line):
    """Return the highest SequenceMatcher ratio between `line` and any reference line.

    Each matcher in `shake_sonnet_matchers` gets `line` installed as its first
    sequence and is then asked for its ratio; the largest ratio is returned.
    Raises ValueError when there are no matchers.
    """
    def ratio_against(matcher):
        # Point the matcher at the candidate, then score it against the stored reference.
        matcher.set_seq1(line)
        return matcher.ratio()

    return max(list(map(ratio_against, shake_sonnet_matchers)))

In [None]:
# Load the lines written by one experiment run.
experiment_results = read_poetry_into_lines('./results/written_lines/experiment_run_210307_20_2_200000')


def is_poem_title(line):
    """Return True for a one-element line whose single element is numeric."""
    return len(line) == 1 and line[0].isnumeric()


# Drop the title lines so that only generated verse lines remain.
experiment_results = [
    candidate for candidate in experiment_results if not is_poem_title(candidate)
]

In [None]:
def is_plagiarized(line, threshold=0.8):
    """Return True when `line` is strictly more similar than `threshold` to some reference line.

    The similarity is the value returned by `get_max_seq_match(line)`. The
    default of 0.8 was picked by eye as roughly the point where a line is so
    similar that it counts as a repeat. Only whole-line similarity is checked
    for now; lines that are plagiarized only on phrases are not detected.
    """
    best_ratio = get_max_seq_match(line)
    if best_ratio > threshold:
        return True
    return False

In [None]:
# Sanity check for is_plagiarized: split a hand-written line into a list of words and test it.
test_line = 'And yet methinks I had astronomy'.split(' ')
# sh_line_index = shake_sonnet_lines.index(test_line)
# NOTE(review): orig_line is not read again in this notebook; index 183 is presumably the
# reference line that test_line was adapted from — confirm.
orig_line = shake_sonnet_lines[183]
print(is_plagiarized(test_line))

In [None]:
# Tally lines whose first element is 'And' and lines flagged by is_plagiarized.
number_and = sum(1 for line in experiment_results if line[0] == 'And')
number_plagiarized = sum(1 for line in experiment_results if is_plagiarized(line))

print('Number of lines generated: ', len(experiment_results))
print('Number start with and: ', number_and)
print('Number plagiarized: ', number_plagiarized)

In [None]:
def compute_and_plus_plague(experiment_base, input_length, lines_per_pair, number_pairs):
    """Summarise the generated lines of one experiment run.

    The results file is looked up at
    ./results/written_lines/<experiment_base>_<input_length>_<lines_per_pair>_<number_pairs>.

    Args:
        experiment_base: Common name prefix of the experiment batch.
        input_length: Input-length setting of the run.
        lines_per_pair: Lines-per-pair setting of the run.
        number_pairs: Number of training pairs of the run.

    Returns:
        A tuple (number of lines, number of lines starting with 'And',
        number of lines flagged by is_plagiarized). Title lines (a single
        numeric element) are excluded from all three counts. Returns
        (0, 0, 0) when the results file does not exist.
    """
    # Imported here because the notebook never imports `isfile`; without this
    # the function raised NameError on its first call.
    from os.path import isfile

    file_name = f"./results/written_lines/{experiment_base}_{input_length}_{lines_per_pair}_{number_pairs}"
    if not isfile(file_name):
        return 0, 0, 0

    def is_poem_title(line):
        return len(line) == 1 and line[0].isnumeric()

    generated_lines = [
        line for line in read_poetry_into_lines(file_name) if not is_poem_title(line)
    ]
    # `line and ...` keeps an empty line from raising IndexError on line[0].
    number_and = sum(1 for line in generated_lines if line and line[0] == 'And')
    number_plagiarized = sum(1 for line in generated_lines if is_plagiarized(line))
    return len(generated_lines), number_and, number_plagiarized

In [None]:
# Build one summary record per grid point (training size x lines per pair x input length).
experiment_results = []
name_base = "experiment_run_210307"
# compute_and_plus_plague(name_base, 20, 2, 200000)

for pair_size in training_pair_sizes:
    for lines_pp in lines_per_pair:
        for i_length in input_lengths:
            number_generated, number_start_with_and, number_plagiarized = (
                compute_and_plus_plague(name_base, i_length, lines_pp, pair_size)
            )
            experiment_results.append(
                dict(
                    training_size=pair_size,
                    lines_per_pair=lines_pp,
                    input_length=i_length,
                    number_lines_gen=number_generated,
                    number_with_and=number_start_with_and,
                    number_plagiarized=number_plagiarized,
                )
            )
            # One dot per processed configuration as a lightweight progress indicator.
            print('.', end='')

In [None]:
# pandas is not imported anywhere earlier in this notebook, so `pd` raised
# NameError on a fresh kernel; import it here before first use.
import pandas as pd

# One row per experiment configuration, built from the list of summary dicts.
er_df = pd.DataFrame(experiment_results)

In [None]:
# Keep only configurations with at least one generated line; compute_and_plus_plague
# reports 0 lines when a results file is missing, so those rows are dropped here.
er_df = er_df[er_df.number_lines_gen > 0]

In [None]:
# Number of configurations left after the filter.
len(er_df)

In [None]:
# Display the summary table.
er_df

In [None]:
# Persist the summary table as CSV; note that the target path has no file extension.
er_df.to_csv('results/experiment_batch_210307')