# Code to Fine-Tune GPT2 Model
## This file aims to produce a fine-tuned GPT2 model that can predict Rotten Tomatoes scores based on movie scripts.
### Produced by Meghan O'Keefe, Lily Scott, Daisy Li
#### * Please note that ChatGPT itself was used as help to generate the tools to produce this model.

In [1]:
# Import libraries
import pandas as pd
from sklearn.model_selection import train_test_split
import torch
from transformers import GPT2Tokenizer, GPT2ForSequenceClassification, Trainer, TrainingArguments

In [2]:
# Read the table of movies and their Rotten Tomatoes scores
df = pd.read_csv('modified_all_rt_scores.csv')

# Collect the raw script text for every title, trying each script folder in order.
# A title whose file is in neither folder is recorded as None.
script_texts = []
for title in df['IMSDB_Title']:
    for folder in ['Saved_Scripts_Raw/scripts-1', 'Saved_Scripts_Raw/scripts-2']:
        try:
            with open(f'{folder}/{title}.txt', 'r', encoding='utf-8') as file:
                text = file.read()
        except FileNotFoundError:
            # Not in this folder; try the next one
            continue
        script_texts.append(text)
        break
    else:
        # The loop ran out of folders without a break: no script for this title
        script_texts.append(None)

# Attach the scripts and keep only the rows that actually have one
df['Script'] = script_texts
df = df.dropna(subset=['Script'])

In [3]:
# Match the categories to class numbers so they are not in string form
category_to_label = {
    'rotten': 0,
    'fresh': 1,
    'certified fresh': 2,
}

# Map every row's category to its class number. Categories that are missing or
# not in the mapping come back as NaN, which would turn the whole column into
# floats and hand non-integer labels to the classifier.
mapped_labels = df['CriticScoreCategory'].map(category_to_label)
has_label = mapped_labels.notna()
if not has_label.all():
    print(f'Dropping {int((~has_label).sum())} rows with an unrecognised CriticScoreCategory')

# Create a new column with the mapped data. assign() builds a new frame instead of
# writing into the result of the earlier dropna(), which avoids SettingWithCopyWarning.
df = df.loc[has_label].assign(CriticScoreLabel=mapped_labels.loc[has_label].astype(int))

# Split the data into training/testing; the fixed seed makes the split repeatable
train_df, eval_df = train_test_split(df, test_size=0.25, random_state=42)

# Print the df to check all is in order
df


Unnamed: 0.1,Unnamed: 0,IMSDB_Title,RT_Title,CriticScore,AudienceScore,CriticScoreCategory,Script,CriticScoreLabel
1,2,alien-3,ALIEN-3,48.0,47.0,rotten,\n\t\t\tAlien III\n\n\t\tScreenplay by John Fa...,0
2,4,american-milkshake,MILKSHAKE,0.0,42.0,rotten,\n\n\n\n\n\n\n\n\n ...,0
3,6,american-werewolf-in-london,AN-AMERICAN-WEREWOLF-IN-LONDON,89.0,85.0,certified fresh,"\n\n\n\n\n""An American Werewolf in London"" -- ...",2
4,10,austin-powers---international-man-of-mystery,AUSTIN-POWERS:-INTERNATIONAL-MAN-OF-MYSTERY,73.0,77.0,fresh,\n\n\n\n\n\n\nAustin Powers: International Man...,1
5,11,austin-powers---the-spy-who-shagged-me,AUSTIN-POWERS:-THE-SPY-WHO-SHAGGED-ME,53.0,71.0,rotten,\n\n\n\n\n\n\nAUSTIN POWERS: THE SPY WHO SHAGG...,0
...,...,...,...,...,...,...,...,...
1269,1332,Yes-Man,Yes-Man,46.0,66.0,rotten,\n \n\n \n\n \n\n ...,0
1270,1333,You-Can-Count-On-Me,You-Can-Count-On-Me,95.0,88.0,certified fresh,"\n ""YOU CAN C...",2
1273,1336,Zero-Dark-Thirty,Zero-Dark-Thirty,91.0,80.0,certified fresh,\n\n\n\n\n \n\n ...,2
1274,1337,Zerophilia,Zerophilia,25.0,61.0,rotten,\n\n\n\n\n\n\n\n\n ...,0


In [4]:
# Load the pretrained GPT-2 tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# GPT-2 does not have a predefined pad token, so the EOS token doubles as padding
tokenizer.pad_token = tokenizer.eos_token

# Settings shared by both splits: truncate to at most 1024 tokens and pad
tokenize_kwargs = dict(truncation=True, padding=True, max_length=1024)

# Tokenize the training and evaluation scripts (similar to the tokenizing used in traditional_models.ipynb)
train_encodings = tokenizer(train_df['Script'].tolist(), **tokenize_kwargs)
eval_encodings = tokenizer(eval_df['Script'].tolist(), **tokenize_kwargs)


In [5]:
# Simple class that helps to put the encodings and labels into a form the GPT2 model can understand
class ScriptDataset(torch.utils.data.Dataset):
    """Serve tokenizer encodings and their labels as one dict of tensors per example."""

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # One example per label
        return len(self.labels)

    def __getitem__(self, idx):
        # Pick the idx-th entry of every encoding field, then add the label
        example = {}
        for name, values in self.encodings.items():
            example[name] = torch.tensor(values[idx])
        example['labels'] = torch.tensor(self.labels[idx])
        return example

# Pair each split's encodings with its class labels
train_labels = train_df['CriticScoreLabel'].tolist()
eval_labels = eval_df['CriticScoreLabel'].tolist()
train_dataset = ScriptDataset(encodings=train_encodings, labels=train_labels)
eval_dataset = ScriptDataset(encodings=eval_encodings, labels=eval_labels)


In [None]:
# Initialize the tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# Explicitly set the padding token in line with already established
# (GPT-2 has no pad token of its own, so the end-of-text token is reused)
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '<|endoftext|>'})

# Instantiate model with three output classes
model = GPT2ForSequenceClassification.from_pretrained('gpt2', num_labels=3)
model.resize_token_embeddings(len(tokenizer))

# The classification head classifies from the last non-padding token, so the
# model config has to know which token id is padding.
model.config.pad_token_id = tokenizer.pad_token_id

# Suggestion to help with processing/computing power issues
# NOTE(review): the Trainer may still move the model to the device selected by
# TrainingArguments -- confirm this keeps training on the CPU if that is required.
model.to('cpu')


# Metric callback so that 'accuracy' exists in the evaluation results; without it
# metric_for_best_model='accuracy' has nothing to track.
def compute_accuracy(pred):
    """Return the share of examples whose arg-max prediction equals the label."""
    predicted_classes = pred.predictions.argmax(-1)
    return {'accuracy': float((predicted_classes == pred.label_ids).mean())}


# Define training arguments and continue as before
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=4,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    save_strategy='epoch',
    evaluation_strategy='epoch',
    load_best_model_at_end=True,
    metric_for_best_model='accuracy',
    # Higher accuracy is better; False here would keep the worst checkpoint
    greater_is_better=True,
)

# Initialize trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_accuracy,
)

# Train model
trainer.train()


In [None]:
from sklearn.metrics import accuracy_score

# Metric callback for the Trainer: report plain classification accuracy
def compute_metrics(pred):
    """Return the accuracy of the arg-max predictions against the true labels."""
    predicted_classes = pred.predictions.argmax(-1)
    return {'accuracy': accuracy_score(pred.label_ids, predicted_classes)}

# Build a Trainer around the same model, this time with the metric callback attached
evaluation_setup = {
    'model': model,  # make sure your model is specified here
    'args': training_args,
    'train_dataset': train_dataset,
    'eval_dataset': eval_dataset,
    'compute_metrics': compute_metrics,
}
trainer = Trainer(**evaluation_setup)

# Run evaluation on the held-out set and show the resulting metrics
eval_result = trainer.evaluate()
print(eval_result)

In [None]:
# Save the model so that it does not have to be trained again!
# The tokenizer is written to the same directory so the checkpoint can be
# reloaded together with the pad-token setup it was trained with.
save_directory = './my_model_directory4'
model.save_pretrained(save_directory)
tokenizer.save_pretrained(save_directory)