# Pip Installs I needed to Perform

In [4]:
#!pip install transformers
#!pip install torch

# Imports

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import re
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import torch
from transformers import DistilBertTokenizer
from sklearn.preprocessing import LabelEncoder

# Training data reading and preprocessing

In [None]:
# Load the tab-separated training set. The file has no header row, so the
# column names are supplied explicitly.
data = pd.read_csv(
    '/kaggle/input/train-set/train.txt',
    sep='\t',
    header=None,
    names=['Title', 'Origin', 'Genre', 'Director', 'Plot'],
)

# Show how many plots there are per genre.
print("Genre value counts", data['Genre'].value_counts(), sep="\n")

In [4]:
def preprocess_text(text):
    """Normalise the whitespace of a plot string.

    Leading and trailing whitespace is stripped and every internal run of
    whitespace characters (spaces, tabs, newlines) is collapsed into a single
    space.

    Args:
        text: The raw plot text.

    Returns:
        The cleaned text.
    """
    # Raw string: '\s' inside a plain literal is an invalid escape sequence,
    # which Python reports as a DeprecationWarning (SyntaxWarning on 3.12+).
    return re.sub(r'\s+', ' ', text.strip())

# Whitespace-normalised copy of every plot; the raw 'Plot' column is kept.
data['Clean_Plot'] = data['Plot'].map(preprocess_text)

# BERT

In [None]:
# Fine-tuning on CPU is very slow: make sure this prints "Using device: cuda".
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

## Encode and train-test split

In [None]:
# Map genre names to integer class ids (kept in `encoder` to decode later).
encoder = LabelEncoder()
data['GenreEncoded'] = encoder.fit_transform(data['Genre'])

# Stratified 80/20 split with a fixed seed. Each part gets a fresh 0..n-1
# index so that integer indexing lines up with row positions afterwards.
X_train, X_test, y_train, y_test = (
    part.reset_index(drop=True)
    for part in train_test_split(
        data['Clean_Plot'],
        data['GenreEncoded'],
        test_size=0.2,
        random_state=42,
        stratify=data['GenreEncoded'],
    )
)
print("done")

In [8]:
# # Custom Dataset altered to deal with my idea for the chunking strategy, but that didn't go well -- Future Work

# class MovieGenreDataset(torch.utils.data.Dataset):
#     def __init__(self, encodings, labels, plot_indices):
#         self.encodings = encodings
#         self.labels = labels
#         self.plot_indices = plot_indices

#     def __getitem__(self, idx):
#         item = {key: val[idx] for key, val in self.encodings.items()}
#         # No need to add labels here if using Trainer with labels in encodings
#         return item

#     def __len__(self):
#         return len(self.encodings['input_ids'])


In [None]:
# Truncation-based tokenization (labelled "OLD VERSION" in earlier drafts, but
# it is the version that actually runs): plots longer than 512 tokens are cut
# down to 512 tokens.

tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')


def tokenize_data(texts, max_length=512):
    """Tokenize a collection of plots with the module-level `tokenizer`.

    Args:
        texts: Object exposing `.tolist()` (e.g. a pandas Series) that yields
            the plot strings.
        max_length: Maximum number of tokens kept per plot.

    Returns:
        The tokenizer output, requested as PyTorch tensors with padding and
        truncation enabled.
    """
    plots = texts.tolist()
    return tokenizer(
        plots,
        max_length=max_length,
        truncation=True,
        padding=True,
        return_tensors='pt',
    )


train_encodings = tokenize_data(X_train)
test_encodings = tokenize_data(X_test)


## Custom Dataset Class

In [None]:
class MovieGenreDataset(torch.utils.data.Dataset):
    """Torch dataset pairing tokenizer encodings with encoded genre labels.

    Args:
        encodings: Mapping from field name (e.g. 'input_ids') to an indexable
            collection holding one row per example.
        labels: Integer class labels, one per example. May be a list, a NumPy
            array or a pandas Series (with any index).
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def _label_at(self, idx):
        """Return the label stored at *position* `idx`."""
        # `series[int]` is a label-based lookup, so it fails (or returns the
        # wrong row) whenever the Series index is not exactly 0..n-1.
        # `.iloc` is always positional.
        if hasattr(self.labels, 'iloc'):
            return self.labels.iloc[idx]
        return self.labels[idx]

    def __getitem__(self, idx):
        """Return the encoding fields of example `idx` plus its 'labels' tensor."""
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self._label_at(idx), dtype=torch.long)
        return item

    def __len__(self):
        """Number of examples, taken from the number of labels."""
        return len(self.labels)

# Wrap the tokenized splits together with their encoded genres.
train_dataset = MovieGenreDataset(encodings=train_encodings, labels=y_train)
test_dataset = MovieGenreDataset(encodings=test_encodings, labels=y_test)

## Useful to see how many plots exceed 512 tokens

In [11]:
# ## TODO remove at the end (?)

# tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# def count_tokens(texts, tokenizer):
#     token_counts = []
#     for text in texts:
#         tokens = tokenizer.encode(text, truncation=False)  # No truncation to check full token count
#         token_counts.append(len(tokens))
#     return token_counts

# token_counts = count_tokens(data['Clean_Plot'].tolist(), tokenizer)
# over_512_count = sum([1 for count in token_counts if count > 512])

# print(f"Number of plots exceeding 512 tokens: {over_512_count}")

In [12]:
# # NEW VERSION OF ENCODINGS THAT MERGE CHUNKS

# tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

# def chunk_text(text, max_length=512):
#     words = text.split()
#     return [' '.join(words[i:i + max_length]) for i in range(0, len(words), max_length)]

# def tokenize_chunked_data(texts, labels, max_length=512):
#     all_chunks = []
#     all_labels = []
#     for i, text in enumerate(texts):
#         chunks = chunk_text(text, max_length)
#         all_chunks.extend(chunks)
#         if labels is not None:
#             all_labels.extend([labels[i]] * len(chunks))
#     encodings = tokenizer(all_chunks, padding=True, truncation=True, max_length=max_length, return_tensors='pt')
#     return encodings, all_labels

# train_encodings, train_labels = tokenize_chunked_data(X_train, y_train)
# test_encodings, test_labels = tokenize_chunked_data(X_test, y_test)

In [13]:
# Experimental version where I tried to actually implement the version that I had envisioned but this didn't go well at all -- Future Work

# def chunk_text(text, tokenizer, max_length=512, stride=0):
#     tokens = tokenizer.encode(text, add_special_tokens=False)
#     # Split tokens into chunks with optional stride
#     chunks = []
#     for i in range(0, len(tokens), max_length - stride):
#         chunk = tokens[i:i + max_length]
#         chunks.append(chunk)
#     return chunks


# def tokenize_chunked_data(texts, labels, tokenizer, max_length=512, stride=0):
#     all_input_ids = []
#     all_attention_masks = []
#     all_labels = []
#     for i, text in enumerate(texts):
#         chunks = chunk_text(text, tokenizer, max_length=max_length, stride=stride)
#         for chunk in chunks:
#             # Prepare inputs for the model
#             inputs = tokenizer.prepare_for_model(
#                 chunk,
#                 max_length=max_length,
#                 padding='max_length',
#                 truncation=True,
#                 return_tensors='pt'
#             )
#             all_input_ids.append(inputs['input_ids'])
#             all_attention_masks.append(inputs['attention_mask'])
#             if labels is not None:
#                 all_labels.append(labels[i])
#     # torch.cat concatenates tensors
#     all_input_ids = torch.cat(all_input_ids, dim=0)
#     all_attention_masks = torch.cat(all_attention_masks, dim=0)
#     return {'input_ids': all_input_ids, 'attention_mask': all_attention_masks}, all_labels


In [14]:
# print(len(train_encodings['input_ids']), len(train_labels))
# print(len(test_encodings['input_ids']), len(test_labels))

# assert len(train_encodings['input_ids']) == len(train_labels), "Mismatch between encodings and labels"
# assert len(test_encodings['input_ids']) == len(test_labels), "Mismatch between encodings and labels"

In [15]:
# input_ids = train_encodings['input_ids'][0].tolist()
# tokens = tokenizer.convert_ids_to_tokens(input_ids)
# print(tokens)

## Import Pre-Trained Model

In [None]:
from transformers import DistilBertForSequenceClassification

# Pre-trained DistilBERT with a classification head sized to the number of
# genres known to the label encoder, moved to the selected device.
model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=len(encoder.classes_),
)
model = model.to(device)

## Custom Metrics function

In [18]:
import numpy as np
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(eval_pred):
    """Compute accuracy and weighted F1 for one evaluation pass.

    Args:
        eval_pred: Pair ``(logits, labels)``.

    Returns:
        Dict with the keys 'accuracy' and 'f1'. The two values are also
        printed.
    """
    scores, gold = eval_pred
    # The predicted class is the index of the largest score on the last axis.
    predicted = np.argmax(scores, axis=-1)
    metrics = {
        'accuracy': accuracy_score(gold, predicted),
        'f1': f1_score(gold, predicted, average='weighted'),
    }
    print(f"Accuracy: {metrics['accuracy']}, F1-score: {metrics['f1']}")
    return metrics

## Training (Fine-Tuning)

In [None]:
from transformers import DataCollatorWithPadding, Trainer, TrainingArguments

# Batch collator built from the same tokenizer used for the encodings.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

training_args = TrainingArguments(
    # --- outputs and logging ---
    output_dir='./results',
    logging_dir='./logs',
    logging_steps=100,
    report_to="none",  # suppress annoying warning
    # --- optimisation ---
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=3,  # we tested with more than 3 epochs but it didn't improve the results
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # --- evaluation ---
    eval_strategy="epoch",
)

# Fine-tuning setup: pre-trained DistilBERT, the custom datasets and the
# custom metrics function.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

# Run the actual fine-tuning.
trainer.train()

## Saves the model

In [None]:
# Store the fine-tuned model and the tokenizer files in the same directory.
save_dir = './fineTunedDistilbertWithWeirdChunking'
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

## Evaluation Metrics (Accuracy, F1)

In [None]:
# Aggregate metrics on the held-out split.
eval_result = trainer.evaluate()

# Per-example predictions on the same split, reduced to class ids, next to
# the true class ids as a NumPy array.
predictions_output = trainer.predict(test_dataset)
predicted_labels = predictions_output.predictions.argmax(axis=-1)
true_labels = y_test.to_numpy()

print(f"Evaluation Results: {eval_result}")

## Confusion Matrix

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Store the results under their own names. Assigning them to
# `confusion_matrix` / `display` would shadow the imported scikit-learn
# function and IPython's built-in `display()` for the rest of the session.
cm = confusion_matrix(true_labels, predicted_labels)
cm_display = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=encoder.classes_)
cm_display.plot(cmap="Blues", values_format="d")
plt.title("Confusion Matrix")
plt.show()

## Manually Analyze Misclassifications

In [None]:
# Positions (within the held-out split) where the prediction is wrong.
misclassified_indices = np.where(predicted_labels != true_labels)[0]

# # displays first 10 misclassified examples to analyze in the console
# for idx in misclassified_indices[:10]:
#     print(f"True label: {encoder.inverse_transform([true_labels[idx]])[0]}, Predicted: {encoder.inverse_transform([predicted_labels[idx]])[0]}")
#     print(f"Text: {X_test.iloc[idx]}")
#     print("-" * 50)

# Decode all class ids back to genre names in one call each, instead of
# calling inverse_transform twice per misclassified example.
true_names = encoder.inverse_transform(true_labels[misclassified_indices])
predicted_names = encoder.inverse_transform(predicted_labels[misclassified_indices])

# saves mispredictions to a file to manually analyze them for the report
# An explicit UTF-8 encoding keeps plots with non-ASCII characters from
# failing on platforms whose default encoding is not UTF-8.
with open('./misclassified_plots.txt', 'w', encoding='utf-8') as f:
    f.write("Misclassified Plots:\n")
    f.write("=" * 80 + "\n")
    for idx, true_label, predicted_label in zip(misclassified_indices, true_names, predicted_names):
        f.write(f"True label: {true_label}, Predicted: {predicted_label}\n")
        f.write(f"Plot: {X_test.iloc[idx]}\n")
        f.write("-" * 80 + "\n")

print("Done Writing")

In [None]:
#print(misclassified_indices.shape, X_test.shape, 1 - len(misclassified_indices)/len(X_test))

# Testing Time (Run the Import cell at the top first)

This is essentially the same pipeline as above, except that the model is trained on the whole labelled training set instead of a train/validation split. The steps are repeated here so that the whole process is clear.

## Read and Preprocess the Training Dataset

In [10]:
# train_data = pd.read_csv('/kaggle/input/train-set/train.txt', sep='\t', header=None, names=['Title', 'Origin', 'Genre', 'Director', 'Plot'])

# def preprocess_text(text):
#     text = text.strip()
#     text = re.sub('\s+', ' ', text)
#     return text

# train_data['Clean_Plot'] = train_data['Plot'].apply(preprocess_text)

# Unlabelled test set: same layout as the training file, minus the genre column.
test_data = pd.read_csv(
    '/kaggle/input/test-set/test_no_labels.txt',
    sep='\t',
    header=None,
    names=['Title', 'Origin', 'Director', 'Plot'],
)

test_data.head()

In [None]:
# train_data.head()

# Apply the same whitespace clean-up as for the training plots, then tokenize.
# This rebinds `test_encodings` to the unlabelled test set.
test_data['Clean_Plot'] = test_data['Plot'].map(preprocess_text)
test_encodings = tokenize_data(test_data['Clean_Plot'])

## X_train and y_train on the whole plot

In [12]:
# `encoder` was already fitted on every genre when `GenreEncoded` was created,
# so it is reused as-is.
#
# Rebind the training inputs to the complete labelled dataset. Without this,
# X_train / y_train still hold the 80% split made for validation and the final
# model would never see the remaining 20% of the labelled plots.
X_train = data['Clean_Plot'].reset_index(drop=True)
y_train = data['GenreEncoded'].reset_index(drop=True)

## Tokenize the training data and import model

In [None]:
# device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# print(f"Using device: {device}")

In [None]:
from transformers import DistilBertForSequenceClassification

# NOTE(review): the training cell further down instantiates the model again
# right before fine-tuning, so this instance is replaced before it is used.
model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=len(encoder.classes_),
)
model = model.to(device)

tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')


def tokenize_data(texts, max_length=512):
    """Tokenize a collection of plots with the module-level `tokenizer`.

    Args:
        texts: Object exposing `.tolist()` (e.g. a pandas Series) that yields
            the plot strings.
        max_length: Maximum number of tokens kept per plot.

    Returns:
        The tokenizer output, requested as PyTorch tensors with padding and
        truncation enabled.
    """
    return tokenizer(
        texts.tolist(),
        max_length=max_length,
        truncation=True,
        padding=True,
        return_tensors='pt',
    )


train_encodings = tokenize_data(X_train)

In [None]:
# Sanity check: size of the training input that was just tokenized.
X_train.shape

## Encode the dataset

In [15]:
class MovieGenreDataset(torch.utils.data.Dataset):
    """Torch dataset pairing tokenizer encodings with encoded genre labels.

    Args:
        encodings: Mapping from field name (e.g. 'input_ids') to an indexable
            collection holding one row per example.
        labels: Integer class labels, one per example. May be a list, a NumPy
            array or a pandas Series (with any index).
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def _label_at(self, idx):
        """Return the label stored at *position* `idx`."""
        # `series[int]` is a label-based lookup, so it fails (or returns the
        # wrong row) whenever the Series index is not exactly 0..n-1.
        # `.iloc` is always positional.
        if hasattr(self.labels, 'iloc'):
            return self.labels.iloc[idx]
        return self.labels[idx]

    def __getitem__(self, idx):
        """Return the encoding fields of example `idx` plus its 'labels' tensor."""
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self._label_at(idx), dtype=torch.long)
        return item

    def __len__(self):
        """Number of examples, taken from the number of labels."""
        return len(self.labels)
    
def tokenize_data(texts, max_length=512):
    """Tokenize a collection of plots with the module-level `tokenizer`.

    Args:
        texts: Object exposing `.tolist()` (e.g. a pandas Series) that yields
            the plot strings.
        max_length: Maximum number of tokens kept per plot.

    Returns:
        The tokenizer output, requested as PyTorch tensors with padding and
        truncation enabled.
    """
    tokenizer_options = dict(
        padding=True,
        truncation=True,
        max_length=max_length,
        return_tensors='pt',
    )
    return tokenizer(texts.tolist(), **tokenizer_options)

# Training dataset for the final model: tokenized plots plus encoded genres.
train_dataset = MovieGenreDataset(encodings=train_encodings, labels=y_train)

## Training on the whole plot

In [None]:
from transformers import (
    DataCollatorWithPadding,
    DistilBertForSequenceClassification,
    Trainer,
    TrainingArguments,
)

# Start again from the pre-trained checkpoint so the final model is not
# initialised from any earlier fine-tuning run.
model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=len(encoder.classes_),
)
model = model.to(device)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

training_args = TrainingArguments(
    # --- outputs and logging ---
    output_dir='./model_testing',
    logging_dir='./logs',
    logging_steps=100,
    report_to="none",
    # --- optimisation ---
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=3,
    per_device_train_batch_size=16,
    # --- evaluation ---
    eval_strategy="no",  # don't forget to not evaluate
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
)

trainer.train()

## Import the testing data, preprocess and tokenize it

In [None]:
# Unlabelled test set: same layout as the training file, minus the genre column.
test_data = pd.read_csv(
    '/kaggle/input/test-set/test_no_labels.txt',
    sep='\t',
    header=None,
    names=['Title', 'Origin', 'Director', 'Plot'],
)

test_data.head()

In [29]:
# Apply the same whitespace clean-up as for the training plots, then tokenize.
test_data['Clean_Plot'] = test_data['Plot'].map(preprocess_text)
test_encodings = tokenize_data(test_data['Clean_Plot'])

## Custom Testing Dataset

## Load the Model and Tokenizer

In [31]:
class MovieGenreTestDataset(torch.utils.data.Dataset):
    """Label-free torch dataset over tokenizer encodings, used for inference.

    Args:
        encodings: Mapping from field name to an indexable collection holding
            one row per example; must contain the key 'input_ids'.
    """

    def __init__(self, encodings):
        self.encodings = encodings

    def __len__(self):
        """Number of examples, taken from the 'input_ids' field."""
        input_ids = self.encodings['input_ids']
        return len(input_ids)

    def __getitem__(self, idx):
        """Return every encoding field sliced at example `idx`."""
        return {field: rows[idx] for field, rows in self.encodings.items()}


# Inference dataset over the tokenized, unlabelled test plots.
test_dataset = MovieGenreTestDataset(encodings=test_encodings)

## Perform the predictions

In [None]:
# Put the model in evaluation mode, instead of training mode.
model.eval()

# Predict on the test plots and take the highest-scoring class per plot.
predictions_output = trainer.predict(test_dataset)
predicted_labels = predictions_output.predictions.argmax(axis=-1)

# Turn the class ids back into genre names.
predicted_genres = encoder.inverse_transform(predicted_labels)

## Writes them to a File

In [33]:
assert len(predicted_genres) == len(test_data), "Number of predictions does not match number of plots"

# One genre per line with NO newline after the last one, so the file no longer
# needs its trailing "\n" removed by hand.
with open('predicted_genres.txt', 'w', encoding='utf-8') as f:
    f.write("\n".join(str(genre) for genre in predicted_genres))

In [28]:
# import os
# print(os.listdir('./saved_model_0.699'))

In [29]:
# from transformers import DistilBertForSequenceClassification, DistilBertTokenizer

# # Load the saved model and tokenizer from the directory
# model = DistilBertForSequenceClassification.from_pretrained('./saved_model_0.699', from_tf=False)
# tokenizer = DistilBertTokenizer.from_pretrained('./saved_model_0.699', from_tf=False)

## this should be at the end