In [2]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

In [3]:
from deepshake import experiment

In [9]:
from ds_utils import show_gpu, read_poetry_into_lines

In [76]:
import pandas as pd
from difflib import SequenceMatcher
from os.path import isfile

In [5]:
# Target device name; it is not referenced again by any of the cells visible here.
device = 'cuda'
# Report GPU usage before any model is loaded. Judging by the captured output,
# show_gpu prints the label followed by a usage percentage -- confirm in ds_utils.
show_gpu('Initial use:')


Initial use: 0.0% (0 out of 16280)


Note: pretrained model families that can be loaded with `AutoModelForSeq2SeqLM`:
* T5
* MT5
* Bart (incl. MBart)
* LED (`LEDConfig`)
* Blenderbot
* Pegasus
* Marian

In [6]:
# model, tokenizer = load_model(AutoModelForSeq2SeqLM, AutoTokenizer, 't5_base_20k_1line', backup_tokenizer_name='t5-base', device='cuda')

## Running experiments

In [7]:
# Keyword arguments intended for text generation.
# `temperature` must be a float: Hugging Face's temperature warper rejects any
# value that is not a strictly positive float, so the former string "2" would
# raise as soon as these arguments were passed to `generate`.
model_gen_args = {"temperature": 2.0, "min_length": 20, "repetition_penalty": 5.0}

# Overrides applied on top of the default training arguments for every run.
custom_training_args = {
    "num_train_epochs": 10,
    "adafactor": True,
    "learning_rate": 1e-4
}

In [8]:
# Hyperparameter grid: every combination of these three lists is one experiment.
training_pair_sizes = [20000, 40000, 80000, 200000]
lines_per_pair = [1, 2, 4, 8]
input_lengths = [5, 10, 20, 40]

name_base = "experiment_run_210307"
# Flip to True to actually run the whole grid; left off so that re-running
# the notebook does not launch the experiments again.
run_experiments = False

# Flatten the grid up front, keeping the original nesting order
# (training size outermost, input length innermost). Empty when disabled.
experiment_grid = [
    (pair_size, lines_pp, i_length)
    for pair_size in training_pair_sizes
    for lines_pp in lines_per_pair
    for i_length in input_lengths
] if run_experiments else []

for pair_size, lines_pp, i_length in experiment_grid:
    # The label encodes the configuration; later cells rebuild file names from it.
    experiment_name = f"{name_base}_{i_length}_{lines_pp}_{pair_size}"
    sonnets = experiment(
        label=experiment_name,
        model_base=AutoModelForSeq2SeqLM,
        tokenizer_base=AutoTokenizer,
        pretrained_name="t5-base",
        custom_training_args=custom_training_args,
        lines_per_pair=lines_pp,
        number_training_pairs=pair_size,
        number_validation_pairs=1000,
        input_length=i_length,
        add_eos_token_to_labels=True,
        verbose=False
    )
    print(experiment_name, " done")

In [57]:
# Reference corpus: the original sonnets, one entry per line of verse.
shake_sonnet_lines = read_poetry_into_lines('./data/shakespeare-sonnets.txt')
# One matcher per reference line. The reference goes in as the second sequence
# (`b`), and the first sequence is a placeholder replaced for each candidate.
shake_sonnet_matchers = [
    SequenceMatcher(isjunk=None, a='', b=reference_line)
    for reference_line in shake_sonnet_lines
]

In [68]:
def get_max_seq_match(line):
    """Return the highest similarity ratio between `line` and any reference line.

    Each matcher in `shake_sonnet_matchers` already holds one reference line as
    its second sequence; `line` is installed as the first sequence and the
    resulting `ratio()` values are compared.
    """
    def _ratio_against(reference_matcher):
        # Swap in the candidate line, then score it against the stored reference.
        reference_matcher.set_seq1(line)
        return reference_matcher.ratio()

    return max(map(_ratio_against, shake_sonnet_matchers))

In [69]:
def is_poem_title(line):
    """True for a line made of a single, numeric element (a poem-number heading)."""
    return len(line) == 1 and line[0].isnumeric()


# Load the generated lines for one experiment and drop the poem-number headings.
experiment_results = [
    line
    for line in read_poetry_into_lines('./results/written_lines/experiment_run_210307_20_2_200000')
    if not is_poem_title(line)
]

In [70]:
def is_plagiarized(line, threshold=0.8):
    """Return True when `line` is close enough to a reference line to count as a copy.

    The default threshold of 0.8 was picked by eye as roughly the point where a
    generated line is so similar that it is effectively a repeat. Only
    whole-line similarity is checked for now; lines that lift individual
    phrases are not detected.
    """
    best_ratio = get_max_seq_match(line)
    return best_ratio > threshold

In [71]:
# Spot check: a line expected to be flagged as copied from the reference corpus.
test_line = 'And yet methinks I had astronomy'.split(' ')
# Reference line kept for manual comparison; index 183 was chosen by hand.
orig_line = shake_sonnet_lines[183]
flagged_as_copy = is_plagiarized(test_line)
print(flagged_as_copy)

True


In [73]:
# Headline counts for the single experiment loaded above.
lines_starting_with_and = [line for line in experiment_results if line[0] == 'And']
lines_flagged_as_copies = [line for line in experiment_results if is_plagiarized(line)]
number_and = len(lines_starting_with_and)
number_plagiarized = len(lines_flagged_as_copies)

print('Number of lines generated: ', len(experiment_results))
print('Number start with and: ', number_and)
print('Number plagiarized: ', number_plagiarized)

Number of lines generated:  140
Number start with and:  134
Number plagiarized:  110


In [82]:
def compute_and_plus_plague(experiment_base, input_length, lines_per_pair, number_pairs):
    """Summarise the generated lines of one experiment configuration.

    The results file is looked up under ``./results/written_lines/`` using the
    name ``{experiment_base}_{input_length}_{lines_per_pair}_{number_pairs}``.

    Returns:
        A tuple ``(number_of_lines, number_starting_with_and, number_plagiarized)``.
        ``(0, 0, 0)`` is returned when no results file exists for the
        configuration, so callers can tell that it produced no output.
    """
    file_name = f"./results/written_lines/{experiment_base}_{input_length}_{lines_per_pair}_{number_pairs}"
    if not isfile(file_name):
        return 0, 0, 0

    def _is_poem_title(line):
        # A single numeric element is a poem-number heading, not generated verse.
        return len(line) == 1 and line[0].isnumeric()

    generated_lines = [
        line for line in read_poetry_into_lines(file_name) if not _is_poem_title(line)
    ]
    # The length guard keeps an empty line from raising IndexError on `line[0]`;
    # such a line still counts towards the total but can never start with 'And'.
    number_and = sum(1 for line in generated_lines if len(line) > 0 and line[0] == 'And')
    number_plagiarized = sum(1 for line in generated_lines if is_plagiarized(line))
    return len(generated_lines), number_and, number_plagiarized

In [87]:
# From here on `experiment_results` holds one summary dict per configuration
# (it previously held the raw lines of a single experiment).
experiment_results = []
name_base = "experiment_run_210307"

for pair_size in training_pair_sizes:
    for lines_pp in lines_per_pair:
        for i_length in input_lengths:
            (number_generated,
             number_start_with_and,
             number_plagiarized) = compute_and_plus_plague(name_base, i_length, lines_pp, pair_size)
            summary_row = {
                'training_size': pair_size,
                'lines_per_pair': lines_pp,
                'input_length': i_length,
                'number_lines_gen': number_generated,
                'number_with_and': number_start_with_and,
                'number_plagiarized': number_plagiarized
            }
            experiment_results.append(summary_row)
            # One dot per configuration as a lightweight progress indicator.
            print('.', end='')

................................................................

In [89]:
# Build the summary table: one row per configuration, columns taken from the dict keys.
er_df = pd.DataFrame(experiment_results)

In [91]:
# Keep only configurations that produced output; a zero line count marks a
# configuration for which no results were recorded.
er_df = er_df.loc[er_df["number_lines_gen"] > 0]

In [92]:
# Number of configurations remaining after the filter above.
len(er_df)

55

In [93]:
# Display the filtered summary table.
er_df

Unnamed: 0,training_size,lines_per_pair,input_length,number_lines_gen,number_with_and,number_plagiarized
0,20000,1,5,140,113,20
1,20000,1,10,140,111,19
2,20000,1,20,140,121,27
3,20000,1,40,140,112,27
4,20000,2,5,140,114,15
5,20000,2,10,140,111,21
6,20000,2,20,140,124,23
7,20000,2,40,140,123,15
8,20000,4,5,140,118,26
9,20000,4,10,140,116,28


In [96]:
# Persist the summary table as CSV. The target path has no file extension, and
# to_csv runs with its defaults, so pandas also writes the row index column.
er_df.to_csv('results/experiment_batch_210307')