This notebook was written by Charles Pierse and obtained from
https://github.com/cdpierse/script_buddy_v2/blob/master/script_buddy/script_generation.ipynb
It has been adapted to generate scripts for the TV show Friends.

In [2]:
!pip install transformers



In [3]:
import torch
from torch.utils.data import Dataset
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AdamW, get_linear_schedule_with_warmup
import numpy as np
import os
import random

In [4]:
# Directory where the fine-tuned weights, config and vocabulary are written.
output_dir = "./models/"
# exist_ok=True makes this cell safe to re-run and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(output_dir, exist_ok=True)

In [5]:
# Train on the GPU when one is visible to PyTorch; otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [6]:
# Start from the pretrained 'gpt2-medium' tokenizer and weights, and place the
# model on the selected device.
# NOTE(review): the original comment said this "should be output_dir" —
# presumably to resume from a checkpoint previously saved there; confirm
# before switching the source.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2-medium')
model = GPT2LMHeadModel.from_pretrained('gpt2-medium').to(device)

In [7]:
# Path (relative to this notebook) of the text file handed to ScriptData.
FILE_PATH = os.path.join(
    "..",
    "Data",
    "parsedTranscripts",
    "allEpisodes.txt",
)


.


In [8]:
from language_modelling import ScriptData

In [9]:
# Build the project's ScriptData dataset from the transcript file and serve it
# through a shuffling DataLoader that yields 4 items per batch.
dataset = ScriptData(tokenizer=tokenizer, file_path=FILE_PATH)
script_loader = DataLoader(dataset=dataset, batch_size=4, shuffle=True)

In [10]:
# Training hyperparameters.
BATCH_SIZE = 1         # loader batches accumulated before each optimizer step
EPOCHS = 1             # full passes over the DataLoader
LEARNING_RATE = 2e-5   # peak learning rate given to AdamW
# NOTE(review): the original note here read "DOE 10000" — presumably a reminder
# to use 10000 warmup steps for a full training run; confirm.
WARMUP_STEPS = 10


In [11]:
# Make sure the model is on the training device and in training mode. Moving a
# module to the device it already lives on is a no-op, so re-running is safe.
model = model.to(device)
model.train()

optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)

# The training loop performs one optimizer/scheduler step for every BATCH_SIZE
# batches drawn from script_loader, so this is the number of scheduler steps
# the whole run will take.
# The previous value of -1 made the linear-decay factor negative, which the
# schedule clamps to 0: the learning rate dropped to zero as soon as warmup
# ended and the model stopped learning.
total_training_steps = max(1, (EPOCHS * len(script_loader)) // BATCH_SIZE)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=WARMUP_STEPS,
    num_training_steps=total_training_steps,
)

# Running counters consumed by the training loop.
script_count = 0   # loader batches accumulated since the last optimizer step
sum_loss = 0.0     # loss summed since the last progress report
batch_count = 0    # optimizer steps taken since the last progress report

In [12]:
# Fine-tuning loop.
#
# Gradients are accumulated over BATCH_SIZE loader batches before each
# optimizer/scheduler step. After every 200 optimizer steps the summed loss is
# printed and one sample is generated so progress can be inspected by eye.
for epoch in range(EPOCHS):
    print(f"EPOCH {epoch} started" + '=' * 30)
    for script in script_loader:
        # Move the batch to the device once and reuse it as both the input and
        # the labels (previously it was transferred twice per step).
        script = script.to(device)
        outputs = model(script, labels=script)

        loss = outputs[0]
        loss.backward()
        # .item() keeps the running total a plain Python float instead of a
        # tensor living on the training device.
        sum_loss += loss.item()

        script_count += 1
        if script_count == BATCH_SIZE:
            script_count = 0
            batch_count += 1
            optimizer.step()
            scheduler.step()
            # The optimizer was built from model.parameters(), so this clears
            # every gradient; a second model.zero_grad() is redundant.
            optimizer.zero_grad()

        if batch_count == 200:
            # Switch to eval mode for sampling, then back to train mode below.
            model.eval()
            print(f"sum loss {sum_loss}")
            sample_outputs = model.generate(
                bos_token_id=random.randint(1, 30000),  # random start token
                do_sample=True,
                top_k=50,
                max_length=1000,
                top_p=0.95,
                num_return_sequences=1,
            )

            print("Output:\n" + 100 * '-')
            for i, sample_output in enumerate(sample_outputs):
                print(f"{i}: {tokenizer.decode(sample_output, skip_special_tokens=True)}")

            # Start a fresh reporting window.
            batch_count = 0
            sum_loss = 0.0
            model.train()



RuntimeError: CUDA out of memory. Tried to allocate 64.00 MiB (GPU 0; 4.00 GiB total capacity; 3.02 GiB already allocated; 28.42 MiB free; 3.08 GiB reserved in total by PyTorch)

In [None]:
from transformers import CONFIG_NAME, WEIGHTS_NAME

# Destination paths inside output_dir, using the library's standard file names.
output_model_file, output_config_file = (
    os.path.join(output_dir, file_name)
    for file_name in (WEIGHTS_NAME, CONFIG_NAME)
)

# Write the weights, the model config and the tokenizer vocabulary into output_dir.
# NOTE(review): model.save_pretrained(output_dir) and
# tokenizer.save_pretrained(output_dir) are the library's one-call way to do
# this — consider switching.
torch.save(model.state_dict(), output_model_file)
model.config.to_json_file(output_config_file)
tokenizer.save_vocabulary(output_dir)

In [None]:
# Reload the tokenizer and model from the files stored in output_dir.
tokenizer = GPT2Tokenizer.from_pretrained(output_dir)
model = GPT2LMHeadModel.from_pretrained(output_dir)

In [None]:
# Encode the prompt as a PyTorch tensor and place it on whatever device the
# model currently lives on, so generation works on both CPU and GPU models.
input_ids = tokenizer.encode('         He kisses her softly and takes out his gun.         ', return_tensors='pt').to(model.device)

In [None]:
# Switch to inference mode. The trailing semicolon stops the notebook from
# displaying the returned module's full repr as the cell output.
model.eval();

In [None]:
# Generate three continuations of the prompt with 5-beam search.
#
# The earlier call also passed top_p=0.85, but nucleus filtering is only
# applied when do_sample=True, so it had no effect here; it is dropped so the
# call states what actually happens.
# NOTE(review): if sampled (non-deterministic) output was intended, add
# do_sample=True together with top_p instead.
sample_outputs = model.generate(
    input_ids=input_ids,
    num_beams=5,
    max_length=1000,
    num_return_sequences=3,
    # GPT-2 has no pad token; naming EOS explicitly matches the library's
    # fallback and avoids the warning emitted on every call.
    pad_token_id=tokenizer.eos_token_id,
)

print("Output:\n" + 100 * '-')
for i, sample_output in enumerate(sample_outputs):
    print(f"{i}: {tokenizer.decode(sample_output, skip_special_tokens=True)}")