In [310]:
import json
import numpy as np
from transformers import GPT2Tokenizer
import re
from sklearn.model_selection import train_test_split
from random import randint

In [257]:
f = open("data/StarTrek_scripts/all_scripts_raw.json")
json_file = json.load(f)
f.close()
#start with TOS: might be more manageable
TOS_scripts=json_file['TOS']
print(TOS_scripts['episode 0'][:1000])







The Star Trek Transcripts - The Cage



The
Cage
Unaired
pilot






 [Bridge]

SPOCK: Check the circuit. 
TYLER: All operating, sir. 
SPOCK: It can't be the screen then. Definitely something out there,
Captain, headed this way. 
TYLER: It could be these meteorites. 
ONE: No, it's something else. There's still something out there. 
TYLER: It's coming at the speed of light, collision course. The
meteorite beam has not deflected it, Captain.
ONE: Evasive manoeuvres, sir?
PIKE: Steady as we go.
GARISON: It's a radio wave, sir. We're passing through an old-style
distress signal.
PIKE: They were keyed to cause interference and attract attention this
way.
GARISON: A ship in trouble making a forced landing, sir. That's it. No
other message.
TYLER: I have a fix. It comes from the Talos star group.
ONE: We've no ships or Earth colonies that far out.
SPOCK: Their call letters check with a survey expedition. SS Columbia.
It disappeared in that region approximately eight


### General plan

I want to generate a star trek script. The model will be some kind of transformer. The input is a series of tokens, I'll start with 128 tokens (needs padded in case the input is shorter). The output is the next word, i.e. input: [The, quick, brown], output: [quick, brown, fox]

To make this, that means I need to:
- parse the scripts, remove line breaks and things.

    - Also need to remove episode title at beginning and copyright stuff at the end.
    - Probably should add a character for stage direction, or perhaps if I keep the colons such that kirk: is distinct from kirk.
    
- Create segments of input tokens in batches of 128

- Embed the wordings

- Split train/test data

- Create model and train
    


In [351]:
#functions to remove metadata and add stage direction tokens to script

def add_special_tokens(script):
    # Replace character names
    script = re.sub(r'\n([A-Z ]+):', r' <CHAR> \1:', script) #adds <CHAR> token any time theres a new line followed by "<CAPITALLETTERS>:"
    # You can add more substitutions here for stage directions or other special tokens
    script = re.sub(r'[\[\{]([^\]\}]+)[\]\}]', r' <LOC> \1 <LOC>', script) #add <LOC> token to indicate location
    script = re.sub(r'\(([^)]+)\)', r' <SD> \1 <SD>', script)
    return script

def remove_metadata(script):
    # Find the position of the 17th newline character
    start_pos = -1
    for _ in range(17):
        start_pos = script.find('\n', start_pos + 1)
        
    # Slice the string from the character after the 8th newline
    if start_pos != -1:
        script = script[start_pos + 1:]
    
    # Find the position of "<Back"
    pos = script.find("<Back")

    # If found, cut off everything past that point
    if pos != -1:
        script = script[:pos]
    return script

def process_names(text):
    unique_names = set()

    # Function to replace "<CHAR> NAME:" with "<CHAR> Name:"
    def char_replacer(match):
        name = match.group(1)
        if name.lower() == "mccoy": #McCoy needs special treatment due to unique capitalization
            name = "McCoy"
        else:
            name = name.capitalize()
        unique_names.add(f"{name}")
        return f"<CHAR> {name}"
    
    # Replace names after "<CHAR>"
    text = re.sub(r'<CHAR>\s+([A-Z]{2,})', char_replacer, text)
    # Function to replace all other instances of unique names
    def name_replacer(match):
        name = match.group(0)
        if name == "MCCOY":
            return " McCoy"
        return name.capitalize() if name.upper() in unique_names else name

    # Replace all other instances of unique names
    text = re.sub(r'\b[A-Z]{2,}\b', name_replacer, text)

    return text, unique_names

def preprocess_script(script):
    
    script=add_special_tokens(remove_metadata(script))
    script, names =process_names(script)
    script=script.replace('\n', ' ')
    script=script.replace('\r', ' ')
    # Replace multiple spaces with a single space
    script = re.sub(' +', ' ', script)
    script += "<END>"
    script = script.strip()
    return script, names

for i in range(10):
    # print(TOS_scripts['episode '+str(i)][:1100])
    
    script, names=preprocess_script(TOS_scripts['episode '+str(i)])
    print(script[:1000])
    print(names)
    # print('\n')
    

<LOC> Bridge <LOC> <CHAR> Spock: Check the circuit. <CHAR> Tyler: All operating, sir. <CHAR> Spock: It can't be the screen then. Definitely something out there, Captain, headed this way. <CHAR> Tyler: It could be these meteorites. <CHAR> One: No, it's something else. There's still something out there. <CHAR> Tyler: It's coming at the speed of light, collision course. The meteorite beam has not deflected it, Captain. <CHAR> One: Evasive manoeuvres, sir? <CHAR> Pike: Steady as we go. <CHAR> Garison: It's a radio wave, sir. We're passing through an old-style distress signal. <CHAR> Pike: They were keyed to cause interference and attract attention this way. <CHAR> Garison: A ship in trouble making a forced landing, sir. That's it. No other message. <CHAR> Tyler: I have a fix. It comes from the Talos star group. <CHAR> One: We've no ships or Earth colonies that far out. <CHAR> Spock: Their call letters check with a survey expedition. SS Columbia. It disappeared in that region approximately 

In [352]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
script, names=preprocess_script(TOS_scripts['episode 62'])
custom_tokens=list(names)
for token in ["<LOC>", "<CHAR>", "<SD>", "<END>"]:
    custom_tokens.append(token)
new_tokens = [token for token in custom_tokens if token not in tokenizer.get_vocab()]
print(new_tokens)
# Add the new tokens to the tokenizer
tokenizer.add_tokens(new_tokens)

# Print the new vocabulary size
print("Number of added tokens: ", len(new_tokens))

def check_in_vocab(word_to_check):
    word_version=[vocab_word for vocab_word in tokenizer.get_vocab() 
                  if vocab_word.lower() == word_to_check.lower() 
                  or vocab_word.lower() == ("Ġ" + word_to_check).lower()]
    if word_version:
        print(f"Versions of the word '{word_to_check}' in the vocabulary: {', '.join(word_version)}")
    else:
        print(f"The word '{word_to_check}' is not in the vocabulary.")
check_in_vocab('kirk')
# Don't forget to resize the model embeddings to match the new vocabulary size
# model.resize_token_embeddings(len(tokenizer))

['Mara', 'Sulu', 'Bald', 'McCoy', 'Uhura', 'Spock', 'Chekov', 'Klingon', 'Kang', 'Kirk', '<LOC>', '<CHAR>', '<SD>', '<END>']
Number of added tokens:  14
Versions of the word 'kirk' in the vocabulary: ĠKirk, Kirk


A problem I've discovered: I can add ĠKirk to the vocabulary, but the pretrained embeddings would prefer to use "ĠK", "irk". I might want to consider allowing the embeddings to be trainable, since GPT's embeddings have been trained on a general corpus, so they might not perfectly align with the specific nuances of my TV show scripts.

In [355]:
from collections import Counter

# Tokenize a large sample of your text
tokens = tokenizer.tokenize(script)

# Split the tokens into chunks of 128
token_chunks = [tokens[i:i + 128] for i in range(0, len(tokens), 128)]

# You may need to pad the last chunk if it's not 128 tokens long
last_chunk = token_chunks[-1]
if len(last_chunk) < 128:
    last_chunk = last_chunk + [tokenizer.pad_token] * (128 - len(last_chunk))
    token_chunks[-1] = last_chunk

print(token_chunks[1])
# # Convert chunks to input IDs
# input_ids_chunks = [tokenizer.convert_tokens_to_ids(chunk) for chunk in token_chunks]
# # Count the frequency of each token
# token_counts = Counter(tokens)
# print(script[:500])
# print([token for token, count in token_counts.items()])
# # Identify tokens that might be special
# # potential_special_tokens = [token for token, count in token_counts.items() if some_condition(token, count)]
print(script[:500])

Using pad_token, but it is not set yet.


['.', '<CHAR>', 'Kirk', ':', 'ĠAn', 'Ġentire', 'Ġhuman', 'Ġcolony', ',', 'Ġa', 'Ġwhole', 'Ġsettlement', '.', 'ĠOne', 'Ġhundred', 'Ġmen', ',', 'Ġwomen', 'Ġand', 'Ġchildren', '.', 'ĠWho', 'Ġdid', 'Ġit', '?', 'ĠAnd', 'Ġwhy', '?', '<SD>', 'commun', 'icator', 'Ġbe', 'ep', '<SD>', 'Kirk', 'here', '.', 'ĠSP', 'OCK', '<LOC>', 'OC', '<LOC>', ':', 'Spock', 'here', ',', 'ĠCaptain', '.', '<LOC>', 'Bridge', '<LOC>', '<CHAR>', 'Spock', ':', 'ĠSens', 'ors', 'Ġhave', 'Ġpicked', 'Ġup', 'Ġa', 'Klingon', 'ship', ',', 'Ġclosing', 'Ġfast', '.', 'ĠK', 'IR', 'K', '<LOC>', 'OC', '<LOC>', ':', 'ĠDef', 'lect', 'ors', 'Ġon', '.', 'ĠCondition', 'ĠRed', '.', 'ĠProtect', 'Ġyourselves', '.', '<LOC>', 'Planet', 'Ġsurface', '<LOC>', '<CHAR>', 'Kirk', ':', 'ĠTotal', 'Ġreply', 'Ġif', 'Ġattacked', '.', 'ĠSo', 'Ġthat', "'s", 'Ġthe', 'Ġanswer', '.', 'Klingon', 's', '.', '<LOC>', 'Bridge', '<LOC>', '<CHAR>', 'Sulu', ':', 'ĠTrouble', 'Ġaboard', 'Ġthe', 'Klingon', 'ship', '.', 'ĠEvidence', 'Ġof', 'Ġexplosions', ',', 'Ġmassive

#### Some thoughts on training and test
I want to tokenize all the scripts and make chunks of 128 tokens used to predict the next word. But if I take words 0:128, then 1:129, and so on, the data will be highly correlated. This means I can't just randomly take 20\% of these chunks out for test data. Instead, I'll split at the episode level. I have 80 episodes- I'll arbitrarily assign 16 episodes to test data, and set those scripts aside.

### Process scripts and chunk into training and test sets

In [356]:
scripts=[TOS_scripts['episode '+str(i)] for i in range(len(TOS_scripts))]
random_state=42
train_scripts, test_scripts = train_test_split(scripts, test_size=0.2, random_state=random_state)
train_scripts, val_scripts = train_test_split(train_scripts, test_size=1/8, random_state=random_state)  # 10% of 80% = 1/8

# Initialize a set to hold unique new tokens
unique_new_tokens = set()

# Preprocess the scripts and collect new tokens
processed_scripts = []
for script in train_scripts:
    processed_text, new_tokens = preprocess_script(script)
    processed_scripts.append(processed_text)
    unique_new_tokens.update(new_tokens)
new_tokens=list(unique_new_tokens)
for token in ["<LOC>", "<CHAR>", "<SD>", "<END>"]:
    new_tokens.append(token)
add_tokens = [token for token in new_tokens if token not in tokenizer.get_vocab()]
# Add unique new tokens to the tokenizer
tokenizer.add_tokens(list(add_tokens))

# # Tokenize all the processed scripts
tokenized_scripts = [tokenizer.tokenize(script) for script in processed_scripts]

def create_chunks(tokenized_scripts, chunk_size=129, stride=2,padded_chunks_per_script=2000):
    #create overlapping chunks of 128 tokens to predict the next word
    chunks = []

    for tokenized_script in tokenized_scripts:
        for i in range(0, len(tokenized_script) - chunk_size + 1, stride):
            chunk = tokenized_script[i:i + chunk_size]
            chunks.append(chunk)
        #The user might not provide full 128 words, so lets augment using random padded sequences.
        for _ in range(padded_chunks_per_script):
            start_index = randint(0, len(tokenized_script) - 2) # -2 to leave room for at least one token
            random_length = randint(1, chunk_size - 1) # Choose a random length less than chunk_size
            end_index = start_index + random_length
            # Select the random chunk
            chunk = tokenized_script[start_index:end_index]
            # Pad the chunk to the desired length
            padding_needed = chunk_size - len(chunk)
            pad_token = tokenizer.pad_token_id # or whatever your padding token is
            chunk += [pad_token] * padding_needed

            chunks.append(chunk)
    return chunks

train_chunks=create_chunks(tokenized_scripts)
train_X=np.array([tokenizer.convert_tokens_to_ids(chunk[:-1]) for chunk in train_chunks])
train_y=np.array([tokenizer.convert_tokens_to_ids(chunk[1:]) for chunk in train_chunks])

processed_val_scripts=[]
for script in val_scripts:
    processed_text, new_tokens = preprocess_script(script)
    processed_val_scripts.append(processed_text)
tokenized_val_scripts = [tokenizer.tokenize(script) for script in processed_val_scripts]
val_chunks=create_chunks(tokenized_val_scripts)
val_X=np.array([tokenizer.convert_tokens_to_ids(chunk[:-1]) for chunk in val_chunks])
val_y=np.array([tokenizer.convert_tokens_to_ids(chunk[1:]) for chunk in val_chunks])

processed_test_scripts=[]
for script in test_scripts:
    processed_text, new_tokens = preprocess_script(script)
    processed_test_scripts.append(processed_text)
tokenized_test_scripts = [tokenizer.tokenize(script) for script in processed_test_scripts]
test_chunks=create_chunks(tokenized_test_scripts)
test_X=np.array([tokenizer.convert_tokens_to_ids(chunk[:-1]) for chunk in test_chunks])
test_y=np.array([tokenizer.convert_tokens_to_ids(chunk[1:]) for chunk in test_chunks])
print('train shape: ',np.shape(train_X))
print('val shape: ',np.shape(val_X))
print('test shape: ',np.shape(test_X))
# # Convert to input IDs
# input_ids_chunks = [tokenizer.convert_tokens_to_ids(chunk) for chunk in input_sequences]
# target_ids_chunks = [tokenizer.convert_tokens_to_ids(chunk) for chunk in target_sequences]




train shape:  (333099, 128)
val shape:  (46081, 128)
test shape:  (92269, 128)


In [358]:
print(train_X[8])
print(train_y[8])

[3574 3264 4058 11 356 821 10868 510 257 6264 17087 6737 11 262 869 7475
 286 257 8837 543 468 587 4814 329 625 734 10675 13 7731 1194 3668 4074
 1752 12774 503 286 262 16161 355 356 14765 284 466 30 1867 3022 284 340
 503 612 30 1148 428 617 6509 484 1053 1364 2157 30 50267 33 3796 278 2119
 50267 50268 50262 25 3406 1445 11 8599 13 50268 50266 25 775 815 423
 29842 416 783 13 383 10290 531 484 1549 869 13 50268 50262 25 314 1183
 423 345 2198 76 515 534 1306 1445 13 50268 50266 25 8192 314 1683 4750
 345 711 257 845 42010 983 286 19780 11 43438 50262 30 50268 50262 25 5686]
[3264 4058 11 356 821 10868 510 257 6264 17087 6737 11 262 869 7475 286
 257 8837 543 468 587 4814 329 625 734 10675 13 7731 1194 3668 4074 1752
 12774 503 286 262 16161 355 356 14765 284 466 30 1867 3022 284 340 503
 612 30 1148 428 617 6509 484 1053 1364 2157 30 50267 33 3796 278 2119
 50267 50268 50262 25 3406 1445 11 8599 13 50268 50266 25 775 815 423
 29842 416 783 13 383 10290 531 484 1549 869 13 50268 50262