In [3]:
# Mount Google Drive inside the Colab runtime (forcing a fresh mount).
from google.colab import drive
drive.mount("/content/drive", force_remount=True)
# Root folder used by later cells for the input CSVs and the saved checkpoint.
drive_path = "/content/drive/MyDrive"

Mounted at /content/drive


In [4]:
from tqdm import tqdm

In [5]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.14.1-py3-none-any.whl (3.4 MB)
[K     |████████████████████████████████| 3.4 MB 15.9 MB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 53.5 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.2.1-py3-none-any.whl (61 kB)
[K     |████████████████████████████████| 61 kB 754 kB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 57.7 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 57.5 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers


In [6]:
# NOTE(review): Drive is already mounted by the first cell of the notebook, so
# this second mount is redundant (the output below confirms it is a no-op).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [7]:
import csv
import os
import random

import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm, trange
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AdamW, get_linear_schedule_with_warmup

In [None]:
### Prepare data
# Load the speech corpus and keep only speeches shorter than 350 words.
df = pd.read_csv(f"{drive_path}/filtered.csv")
# .copy() detaches the filtered rows from the frame returned by read_csv, so the
# column assignment below does not trigger pandas' SettingWithCopyWarning.
df = df[df["word_count"] < 350].copy()
# Lower-case every speech before it is tokenised.
df["speech"] = df["speech"].str.lower()

In [None]:
# Train test split
def split_test_set(test_set, n_words=200):
  """Split every speech into a prompt part and a held-out ending.

  The last ``n_words`` whitespace-separated words of each ``speech`` are stored
  in a new ``True_end_speech`` column, and ``speech`` keeps only the words that
  come before them. A speech with at most ``n_words`` words therefore ends up
  with an empty ``speech`` and its whole text in ``True_end_speech``.

  Args:
    test_set: DataFrame with a string column named ``speech``.
    n_words: Number of trailing words to hold out (default 200).

  Returns:
    A new DataFrame; the frame passed in is left untouched.

  Raises:
    ValueError: If ``n_words`` is not positive (a slice of ``[-0:]`` would
      silently select the whole speech instead of nothing).
  """
  if n_words <= 0:
    raise ValueError("n_words must be a positive integer")

  # Work on a copy so the caller's frame (possibly the training data) is not
  # modified behind its back.
  result = test_set.copy()
  # Tokenise on whitespace once and reuse the word lists for both columns.
  words = result['speech'].str.split()
  result['True_end_speech'] = words.str[-n_words:].apply(' '.join)
  result['speech'] = words.str[:-n_words].apply(' '.join)
  return result

In [None]:
# Choose (up to) 30000 samples to train; random_state fixes the selection.
# DataFrame.sample raises ValueError when n exceeds the number of rows, so the
# request is capped at the size of the filtered frame.
df = df.sample(n=min(30000, len(df)), random_state=10)

In [None]:
# Create a very small test set to compare generated text with the reality.
# The held-out rows must be a *subset* of df: binding test_set to df itself
# would make the exclusion below remove every row and leave nothing to train on.
# NOTE(review): 200 rows is an assumed size for "very small" - adjust as needed.
test_set = df.sample(n=min(200, len(df)), random_state=10).copy()
# Keep only the rows that were not held out for training.
df = df.loc[~df.index.isin(test_set.index)]
# Split each held-out speech into a prompt and its true ending.
test_set = split_test_set(test_set)

In [None]:
class Speech(Dataset):
    """Tokenised speeches for GPT-2 fine-tuning.

    Every text is wrapped as ``<|{control_code}|>{text}<|endoftext|>`` and
    encoded with the GPT-2 tokenizer into a 1-D tensor of token ids.

    Args:
        control_code: Tag interpolated into the ``<|...|>`` prefix of every
            sample. It is formatted into an f-string, so pass a short string;
            any other object contributes its full ``str()`` representation.
        truncate: When True, keep only the first 20000 encoded samples.
        gpt2_type: Name handed to ``GPT2Tokenizer.from_pretrained``.
        max_length: Maximum number of *characters* (not tokens) kept from each
            text before it is encoded.
        texts: Optional iterable of strings to encode. When omitted, the
            notebook-level ``df['speech']`` column is used, which is what the
            class always read before this parameter existed.
    """

    def __init__(self, control_code, truncate=False, gpt2_type="gpt2", max_length=1024, texts=None):

        self.tokenizer = GPT2Tokenizer.from_pretrained(gpt2_type)
        self.speech = []

        if texts is None:
            # Backward-compatible default: fall back to the global DataFrame.
            texts = df['speech']

        for row in tqdm(texts):
            self.speech.append(torch.tensor(
                self.tokenizer.encode(f"<|{control_code}|>{row[:max_length]}<|endoftext|>")
            ))
        if truncate:
            self.speech = self.speech[:20000]
        self.speech_count = len(self.speech)

    def __len__(self):
        """Return the number of encoded samples."""
        return self.speech_count

    def __getitem__(self, item):
        """Return the token-id tensor of sample ``item``."""
        return self.speech[item]

# control_code is interpolated into the "<|...|>" prefix of every sample, so it
# must be a short tag: passing the whole df['speech'] Series would format the
# Series' printed representation into each training example.
dataset = Speech("speech", truncate=False, gpt2_type="gpt2")

In [None]:
# Get the tokenizer and model: both are loaded from the pretrained 'gpt2'
# checkpoint; model1 is the instance handed to train() further down.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model1 = GPT2LMHeadModel.from_pretrained('gpt2')

# Sequence packing: several short samples are concatenated into one longer
# input so that each forward pass carries more tokens.
def pack_tensor(new_tensor, packed_tensor, max_seq_len):
    """Try to merge ``new_tensor`` into the running ``packed_tensor``.

    Both tensors are indexed along dimension 1, which is treated as the
    sequence axis.

    Returns:
        A ``(tensor, carry_on, remainder)`` tuple:
        * nothing packed yet -> ``(new_tensor, True, None)``;
        * combined length exceeds ``max_seq_len`` ->
          ``(packed_tensor, False, new_tensor)`` (nothing merged);
        * otherwise -> ``(merged, True, None)``, where ``merged`` is
          ``new_tensor`` followed by ``packed_tensor`` without its first column.
    """
    if packed_tensor is None:
        return new_tensor, True, None

    combined_len = new_tensor.size(1) + packed_tensor.size(1)
    if combined_len <= max_seq_len:
        merged = torch.cat((new_tensor, packed_tensor[:, 1:]), dim=1)
        return merged, True, None

    return packed_tensor, False, new_tensor

In [None]:
def train(
    dataset, model, tokenizer,
    batch_size=16, epochs=5, lr=2e-5,
    max_seq_len=400, warmup_steps=200,
    gpt2_type="gpt2", output_dir=".", output_prefix="wreckgar",
    test_mode=False,save_model_on_epoch=False,
):
    """Fine-tune ``model`` on ``dataset`` with sequence packing and gradient accumulation.

    Samples are drawn one at a time, packed with ``pack_tensor`` into inputs of
    at most ``max_seq_len`` tokens, and used as both input and labels. Gradients
    are accumulated over ``batch_size`` packed inputs before each optimizer step.

    Args:
        dataset: Dataset yielding 1-D tensors of token ids.
        model: Language model called as ``model(input_ids, labels=input_ids)``
            whose first output is the loss.
        tokenizer: Unused; kept so existing callers keep working.
        batch_size: Number of packed inputs accumulated per optimizer step.
        epochs: Number of passes over ``dataset``.
        lr: Peak learning rate for AdamW.
        max_seq_len: Maximum length of a packed input. A single sample that is
            already longer is still trained on as-is (``pack_tensor`` never
            truncates).
        warmup_steps: Linear warm-up steps of the learning-rate schedule.
        gpt2_type: Unused; kept for backward compatibility.
        output_dir: Directory for per-epoch checkpoints.
        output_prefix: File-name prefix of per-epoch checkpoints.
        test_mode: Unused; kept for backward compatibility.
        save_model_on_epoch: When True, write ``{output_prefix}-{epoch}.pt``
            into ``output_dir`` after every epoch.

    Returns:
        The trained model (on the device used for training).
    """
    # Fall back to CPU so the function still runs on a runtime without a GPU.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    model.train()

    train_dataloader = DataLoader(dataset, batch_size=1, shuffle=True)
    last_idx = len(train_dataloader) - 1

    optimizer = AdamW(model.parameters(), lr=lr)
    # The linear schedule needs a real step budget: with a negative total the
    # learning-rate factor is clamped to 0 as soon as warm-up ends. Packing can
    # only reduce the number of forward passes, so one pass per sample is a safe
    # upper bound on the number of optimizer steps.
    steps_per_epoch = max(1, (len(train_dataloader) + batch_size - 1) // batch_size)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=warmup_steps,
        num_training_steps=steps_per_epoch * epochs,
    )

    def _apply_update():
        """Apply the accumulated gradients and advance the LR schedule."""
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

    loss = 0
    pending = 0  # packed inputs back-propagated since the last optimizer step

    for epoch in range(epochs):

        print(f"Training epoch {epoch}")
        print(loss)
        input_tensor = None  # packing never spans an epoch boundary
        for idx, entry in enumerate(tqdm(train_dataloader)):
            (input_tensor, carry_on, remainder) = pack_tensor(entry, input_tensor, max_seq_len)
            is_last = idx == last_idx

            # Keep packing until the buffer overflows or the data runs out.
            if carry_on and not is_last:
                continue

            micro_batches = [input_tensor]
            if is_last and remainder is not None:
                # Nothing is left to pack with: train on the overflow sample
                # directly instead of dropping it.
                micro_batches.append(remainder)
                remainder = None

            for micro_batch in micro_batches:
                micro_batch = micro_batch.to(device)
                outputs = model(micro_batch, labels=micro_batch)
                step_loss = outputs[0]
                step_loss.backward()
                loss = step_loss.item()

                pending += 1
                if pending == batch_size:
                    _apply_update()
                    pending = 0

            # The sample that did not fit seeds the next packed input.
            input_tensor = remainder

        # Flush gradients of an incomplete accumulation window so they are
        # reflected in the checkpoint and in the returned model.
        if pending:
            _apply_update()
            pending = 0

        if save_model_on_epoch:
            os.makedirs(output_dir, exist_ok=True)
            torch.save(
                model.state_dict(),
                os.path.join(output_dir, f"{output_prefix}-{epoch}.pt"),
            )
    return model

In [None]:
#Run the train loop and save the model to drive
# train() is called with its default hyper-parameters; only the weights
# (state_dict) are written to Drive, not the full model object.
model1 = train(dataset, model1, tokenizer)
torch.save(model1.state_dict(), f"{drive_path}/model_no_trunc.pt")

## After Building Model

In [8]:
# Rebuild the base GPT-2 architecture and load the fine-tuned weights into it.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')
# map_location="cpu": the checkpoint was saved from a model living on the GPU,
# and without remapping torch.load fails on a runtime that has no CUDA device.
model.load_state_dict(torch.load(f"{drive_path}/model_no_trunc.pt", map_location="cpu"))
# Switch to inference mode (disables dropout).
model.eval()

Downloading:   0%|          | 0.00/0.99M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/523M [00:00<?, ?B/s]

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )


In [None]:
# New test data
# NOTE(review): this rebinds `df`, which earlier cells used for the training
# speeches; from here on it holds the prompts (a 'prompt' column is read below).
df = pd.read_csv(f"{drive_path}/additional_prompts.csv")

In [None]:
def generate(
    model,
    tokenizer,
    prompt,
    entry_count=10,
    entry_length=100, #maximum number of new tokens per entry
    top_p=0.8,
    temperature=1.,
):
    """Sample ``entry_count`` continuations of ``prompt`` with top-p sampling.

    Each continuation is grown one token at a time: the logits of the last
    position are divided by ``temperature``, the lowest-probability tokens
    beyond a cumulative probability of ``top_p`` are masked out, and the next
    token is drawn from what remains. An entry stops at the end-of-text token
    or after ``entry_length`` new tokens.

    Args:
        model: Language model whose first output, when called with token ids
            only, is the logits tensor.
        tokenizer: Object providing ``encode(text)`` and ``decode(ids)``.
        prompt: Text every entry starts from; must encode to at least one token.
        entry_count: Number of entries to generate.
        entry_length: Maximum number of generated tokens (not words) per entry.
        top_p: Cumulative-probability threshold of the sampling filter.
        temperature: Logit divisor; non-positive values are treated as 1.0.

    Returns:
        List of decoded strings (prompt included). Entries that hit
        ``entry_length`` without producing end-of-text get ``<|endoftext|>``
        appended so every entry ends the same way.

    Raises:
        ValueError: If ``prompt`` encodes to no tokens.
    """
    model.eval()
    generated_list = []

    filter_value = -float("Inf")

    # Run on whatever device the model already lives on.
    device = next(model.parameters()).device
    # Encode the stop marker once instead of on every generated token.
    eos_ids = tokenizer.encode("<|endoftext|>")

    prompt_ids = tokenizer.encode(prompt)
    if not prompt_ids:
        # An empty id list would become an empty float tensor and fail deep
        # inside the model with an unrelated error.
        raise ValueError("prompt must encode to at least one token")
    prompt_tensor = torch.tensor(prompt_ids, dtype=torch.long, device=device).unsqueeze(0)

    with torch.no_grad():

        for _ in trange(entry_count):

            entry_finished = False
            generated = prompt_tensor.clone()

            for _ in range(entry_length):
                # No labels at inference time: only the logits are needed.
                logits = model(generated)[0]
                logits = logits[:, -1, :] / (temperature if temperature > 0 else 1.0)

                sorted_logits, sorted_indices = torch.sort(logits, descending=True)
                cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)

                # Mark tokens past the top_p mass; shifting by one keeps the
                # first token that crosses the threshold, and the most likely
                # token is never removed.
                sorted_indices_to_remove = cumulative_probs > top_p
                sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[
                    ..., :-1
                ].clone()
                sorted_indices_to_remove[..., 0] = False

                indices_to_remove = sorted_indices[sorted_indices_to_remove]
                logits[:, indices_to_remove] = filter_value

                next_token = torch.multinomial(F.softmax(logits, dim=-1), num_samples=1)
                generated = torch.cat((generated, next_token), dim=1)

                if next_token.item() in eos_ids:
                    entry_finished = True
                    break

            output_text = tokenizer.decode(generated[0].tolist())
            if not entry_finished:
                output_text = f"{output_text}<|endoftext|>"
            generated_list.append(output_text)

    return generated_list

#Function to generate multiple sentences. Test data should be a dataframe
def speech_generation(test_data):
  """Generate one speech for every row of ``test_data``.

  Reads the ``prompt`` column row by row and calls ``generate`` with the
  notebook-level ``model`` (moved to the CPU) and ``tokenizer``, asking for a
  single entry per prompt.

  Returns:
    A list with one element per row; each element is the list returned by
    ``generate`` for that row's prompt.
  """
  return [
      generate(model.to('cpu'), tokenizer, test_data['prompt'].iloc[row], entry_count=1)
      for row in range(len(test_data))
  ]

# Run the functions to generate the speeches: one generated entry per prompt
# row of df.
generated_speech = speech_generation(df)

100%|██████████| 1/1 [00:44<00:00, 44.84s/it]
100%|██████████| 1/1 [00:39<00:00, 39.71s/it]
100%|██████████| 1/1 [00:20<00:00, 20.68s/it]
100%|██████████| 1/1 [00:38<00:00, 38.14s/it]
100%|██████████| 1/1 [00:38<00:00, 38.19s/it]
100%|██████████| 1/1 [00:42<00:00, 42.38s/it]
100%|██████████| 1/1 [00:08<00:00,  8.89s/it]
100%|██████████| 1/1 [00:00<00:00,  1.04it/s]
100%|██████████| 1/1 [00:40<00:00, 40.53s/it]
100%|██████████| 1/1 [00:02<00:00,  2.15s/it]


In [None]:
# generate() returns a one-element list per prompt (entry_count=1), so unwrap it.
speeches=[x[0] for x in generated_speech]
df['speeches'] = speeches
# Write the file only after the generated speeches are attached; exporting
# first would produce a CSV without the 'speeches' column.
df.to_csv('error_analysis.csv')

In [None]:
# Display the generated speech stored alongside each prompt.
df['speeches']

0    I stand here today humbled by the task before ...
1    Today I say to you that the challenges we face...
2    I want to thank my partner in this journey. Ke...
3    I welcome you all to this grand event. I have ...
4    Good morning, ladies and gentlemen. We will be...
5    My name is Mark Dorsey, and I em elated to sta...
6    I dream of a world where we have pride in our ...
7    We must be united to end racism as well as gen...
8    I want to start by congratulating Laurel on he...
9    What a day to be alive."\n\n(Representational ...
Name: speeches, dtype: object

In [None]:
# Display the raw generation output: a list holding one single-element list
# per prompt.
generated_speech

[['I stand here today humbled by the task before us, grateful for the trust you have bestowed, and hopeful that our prayers and support may be able to keep us from entering the public domain."\n\nThe pledge to keep quiet on the steps of the courthouse buildings in downtown Columbus after the bombing has been something that has come from everything. "I am happy to join others in expressing gratitude for the prayers and support from our friends in the religious community of the downtown area," the official said. "In what they called a martyr\'s sacrifice. in what they called the "landing of the lam<|endoftext|>'],
 ['Today I say to you that the challenges we face are real.\n\n-30-\n\nThe day after the events of last week is the day the world will remember\n\n-41-\n\neverything we learned about the dark web and the internet\n\n-42-\n\nwe will honor those who have served the public for generations.\n\n-45-\n\nwe will defend our democracy and will defend freedom from terrorists.\n\n-48-\n\n