In [1]:
import json


def load_jsonl(file_path):
    data = []
    with open(file_path, 'r') as file:
        for line in file:
            try:
                json_object = json.loads(line)
                data.append(json_object)
            except json.JSONDecodeError:
                print(f"Skipping invalid JSON line: {line.strip()}")
                continue
    return data

dataset_path = "./cleaned_transcripts/season_1_episode_1_cleaned_transcript.jsonl"
dataset = load_jsonl(dataset_path)

In [2]:
from transformers import AutoTokenizer

model_name = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

  from .autonotebook import tqdm as notebook_tqdm


In [None]:

# simple prompt template
prompt_template = lambda example:  f"{example['character']}: {example['line']}"


# tokenizer
def preprocess(example):
    prompt = prompt_template(example)
    tokens = tokenizer(prompt, truncation=True, padding="max_length", max_length=256)

    # Create labels by shifting input_ids
    input_ids = tokens["input_ids"]
    labels = input_ids[1:] + [tokenizer.pad_token_id]  # Shift left and pad last token
    
    tokens["labels"] = labels  # Add labels to dataset
    return tokens

tokenized_dataset = [preprocess(v) for v in dataset]

In [13]:
tokenized_dataset_train = tokenized_dataset[:400]
tokenized_dataset_val = tokenized_dataset[400:]

In [14]:
tokenized_dataset_train[1]["input_ids"][:3]

[50, 2788, 25]

In [15]:
from transformers import AutoModelForCausalLM

# Load the model
model = AutoModelForCausalLM.from_pretrained(model_name)

In [23]:
from transformers import TrainingArguments, Trainer

# Define training arguments
training_args = TrainingArguments(
    output_dir="./fine_tuned_model",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=5,
    logging_dir="./logs",
    logging_steps=500,
    save_total_limit=2,
    push_to_hub=False
)

# Trainer setup
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset_train,
    eval_dataset=tokenized_dataset_val,
    tokenizer=tokenizer,
)

  trainer = Trainer(


In [24]:
# Start training
trainer.train()

Epoch,Training Loss,Validation Loss
1,No log,0.423284
2,No log,0.424432
3,No log,0.428447
4,No log,0.436472
5,0.301800,0.442867


TrainOutput(global_step=500, training_loss=0.3018184204101563, metrics={'train_runtime': 509.627, 'train_samples_per_second': 3.924, 'train_steps_per_second': 0.981, 'total_flos': 261292032000000.0, 'train_loss': 0.3018184204101563, 'epoch': 5.0})

In [25]:
from transformers import pipeline

# Load fine-tuned model
generator = pipeline("text-generation", model="./fine_tuned_model/checkpoint-500", tokenizer=tokenizer)

# Prompt the model
character = "Al"
prompt = f"{character}:"
output = generator(prompt, max_length=512, num_return_sequences=2)
print(output[0]["generated_text"])


Device set to use mps:0


Al:. don it to, partner
