In [2]:
import numpy as np
import pandas as pd
import os, json, gc, re, random
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split

In [3]:
import matplotlib.pyplot as plt
%matplotlib inline
import plotly.express as px
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")

In [4]:
import logging
logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

In [6]:
movies_df = pd.read_csv('./data/wiki_movie_plots_deduped.csv')
movies_df.head()

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot
0,1901,Kansas Saloon Smashers,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Kansas_Saloon_Sm...,"A bartender is working at a saloon, serving dr..."
1,1901,Love by the Light of the Moon,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Love_by_the_Ligh...,"The moon, painted with a smiling face hangs ov..."
2,1901,The Martyred Presidents,American,Unknown,,unknown,https://en.wikipedia.org/wiki/The_Martyred_Pre...,"The film, just over a minute long, is composed..."
3,1901,"Terrible Teddy, the Grizzly King",American,Unknown,,unknown,"https://en.wikipedia.org/wiki/Terrible_Teddy,_...",Lasting just 61 seconds and consisting of two ...
4,1902,Jack and the Beanstalk,American,"George S. Fleming, Edwin S. Porter",,unknown,https://en.wikipedia.org/wiki/Jack_and_the_Bea...,The earliest known adaptation of the classic f...


# Data pre-processing
1. Select only american and british movies
2. Drop every column except `Title` and `Plot`
3. Rename into `input_text` and `output_text` respectivly

In [7]:
movies_df = movies_df[(movies_df["Origin/Ethnicity"]=="American") | (movies_df["Origin/Ethnicity"]=="British")]
movies_df = movies_df[["Plot", "Title"]]
movies_df.columns = ['input_text', 'target_text']
movies_df

Unnamed: 0,input_text,target_text
0,"A bartender is working at a saloon, serving dr...",Kansas Saloon Smashers
1,"The moon, painted with a smiling face hangs ov...",Love by the Light of the Moon
2,"The film, just over a minute long, is composed...",The Martyred Presidents
3,Lasting just 61 seconds and consisting of two ...,"Terrible Teddy, the Grizzly King"
4,The earliest known adaptation of the classic f...,Jack and the Beanstalk
...,...,...
21705,"In 1934, famous Belgian detective Hercule Poir...",Murder on the Orient Express
21706,"Paddington, having settled with the Brown fami...",Paddington 2
21707,‘Lady’ Sandra Abbott (Imelda Staunton) discove...,Finding Your Feet
21708,"In 1973, 16-year-old John Paul Getty III (Paul...",All the Money in the World


# Model training and evaluation

In [17]:
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler

# Importing the T5 modules from huggingface/transformers
from transformers import T5Tokenizer, T5ForConditionalGeneration

# WandB – Import the wandb library
import wandb

In [11]:
# # Setting up the device for GPU usage
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'
device

'cuda'

In [None]:
wandb.login()

In [12]:
class CustomDataset(Dataset):

    def __init__(self, dataframe, tokenizer, source_len, summ_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.source_len = source_len
        self.summ_len = summ_len
        self.text = self.data.text
        self.ctext = self.data.ctext

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        ctext = str(self.ctext[index])
        ctext = ' '.join(ctext.split())

        text = str(self.text[index])
        text = ' '.join(text.split())

        source = self.tokenizer.batch_encode_plus([ctext], max_length= self.source_len, pad_to_max_length=True,return_tensors='pt')
        target = self.tokenizer.batch_encode_plus([text], max_length= self.summ_len, pad_to_max_length=True,return_tensors='pt')

        source_ids = source['input_ids'].squeeze()
        source_mask = source['attention_mask'].squeeze()
        target_ids = target['input_ids'].squeeze()
        target_mask = target['attention_mask'].squeeze()

        return {
            'source_ids': source_ids.to(dtype=torch.long), 
            'source_mask': source_mask.to(dtype=torch.long), 
            'target_ids': target_ids.to(dtype=torch.long),
            'target_ids_y': target_ids.to(dtype=torch.long)
        }

In [13]:
def train(epoch, tokenizer, model, device, loader, optimizer):
    model.train()
    for _,data in enumerate(loader, 0):
        y = data['target_ids'].to(device, dtype = torch.long)
        y_ids = y[:, :-1].contiguous()
        lm_labels = y[:, 1:].clone().detach()
        lm_labels[y[:, 1:] == tokenizer.pad_token_id] = -100
        ids = data['source_ids'].to(device, dtype = torch.long)
        mask = data['source_mask'].to(device, dtype = torch.long)

        outputs = model(input_ids = ids, attention_mask = mask, 
                        decoder_input_ids=y_ids, labels=lm_labels)
        loss = outputs[0]
        
        if _%10 == 0:
            wandb.log({"Training Loss": loss.item()})

        if _%500==0:
            print(f'Epoch: {epoch}, Loss:  {loss.item()}')
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # xm.optimizer_step(optimizer)
        # xm.mark_step()

In [14]:
def validate(epoch, tokenizer, model, device, loader):
    model.eval()
    predictions = []
    actuals = []
    with torch.no_grad():
        for _, data in enumerate(loader, 0):
            y = data['target_ids'].to(device, dtype = torch.long)
            ids = data['source_ids'].to(device, dtype = torch.long)
            mask = data['source_mask'].to(device, dtype = torch.long)

            generated_ids = model.generate(
                input_ids = ids,
                attention_mask = mask, 
                max_length=150, 
                num_beams=2,
                repetition_penalty=2.5, 
                length_penalty=1.0, 
                early_stopping=True
                )
            preds = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=True) for g in generated_ids]
            target = [tokenizer.decode(t, skip_special_tokens=True, clean_up_tokenization_spaces=True)for t in y]
            if _%100==0:
                print(f'Completed {_}')

            predictions.extend(preds)
            actuals.extend(target)
    return predictions, actuals

In [None]:
# WandB – Initialize a new run
wandb.init(project="Movie Title Generation")
# WandB – Config is a variable that holds and saves hyperparameters and inputs
# Defining some key variables that will be used later on in the training  
config = wandb.config          # Initialize config
config.TRAIN_BATCH_SIZE = 2    # input batch size for training (default: 64)
config.VALID_BATCH_SIZE = 2    # input batch size for testing (default: 1000)
config.TRAIN_EPOCHS = 2        # number of epochs to train (default: 10)
config.VAL_EPOCHS = 1 
config.LEARNING_RATE = 1e-4    # learning rate (default: 0.01)
config.SEED = 42               # random seed (default: 42)
config.MAX_LEN = 512
config.SUMMARY_LEN = 150 
# Set random seeds and deterministic pytorch for reproducibility
torch.manual_seed(config.SEED) # pytorch random seed
np.random.seed(config.SEED) # numpy random seed
torch.backends.cudnn.deterministic = True
# tokenzier for encoding the text
tokenizer = T5Tokenizer.from_pretrained("t5-base")

# Importing and Pre-Processing the domain data
# Selecting the needed columns only. 
# Adding the summarzie text in front of the text. This is to format the dataset similar to how T5 model was trained for summarization task. 
df = pd.read_csv('./Datasets/wiki_movie_plots_deduped.csv')
df = df[["Plot", "Title"]]
df.columns = ['ctext', 'text']
df.ctext = 'summarize: ' + df.ctext
print(df.head())

    
# Creation of Dataset and Dataloader
# Defining the train size. So 80% of the data will be used for training and the rest will be used for validation. 
train_size = 0.8
train_dataset=df.sample(frac=train_size,random_state = config.SEED)
val_dataset=df.drop(train_dataset.index).reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)
print("FULL Dataset: {}".format(df.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(val_dataset.shape))
# Creating the Training and Validation dataset for further creation of Dataloader
training_set = CustomDataset(train_dataset, tokenizer, config.MAX_LEN, config.SUMMARY_LEN)
val_set = CustomDataset(val_dataset, tokenizer, config.MAX_LEN, config.SUMMARY_LEN)
# Defining the parameters for creation of dataloaders
train_params = {
    'batch_size': config.TRAIN_BATCH_SIZE,
    'shuffle': True,
    'num_workers': 0
    }
val_params = {
    'batch_size': config.VALID_BATCH_SIZE,
    'shuffle': False,
    'num_workers': 0
    }
# Creation of Dataloaders for testing and validation. This will be used down for training and validation stage for the model.
training_loader = DataLoader(training_set, **train_params)
val_loader = DataLoader(val_set, **val_params)


    
# Defining the model. We are using t5-base model and added a Language model layer on top for generation of Summary. 
# Further this model is sent to device (GPU/TPU) for using the hardware.
model = T5ForConditionalGeneration.from_pretrained("t5-base")
model = model.to(device)

# Defining the optimizer that will be used to tune the weights of the network in the training session. 
optimizer = torch.optim.Adam(params =  model.parameters(), lr=config.LEARNING_RATE)

# Log metrics with wandb
wandb.watch(model, log="all")
# Training loop
print('Initiating Fine-Tuning for the model on our dataset')

for epoch in range(config.TRAIN_EPOCHS):
    train(epoch, tokenizer, model, device, training_loader, optimizer)


# Validation loop and saving the resulting file with predictions and acutals in a dataframe.
# Saving the dataframe as predictions.csv
print('Now generating summaries on our fine tuned model for the validation dataset and saving it in a dataframe')
for epoch in range(config.VAL_EPOCHS):
    predictions, actuals = validate(epoch, tokenizer, model, device, val_loader)
    final_df = pd.DataFrame({'Generated Text':predictions,'Actual Text':actuals})
    final_df.to_csv('./data/predictions.csv')
    print('Output Files generated for review')

In [147]:
show_df = pd.DataFrame({'Plot': val_dataset.ctext, 
                        'Actual Title': actuals,
                        'Generated Title': predictions})
show_df['Plot'] = show_df['Plot'].apply(lambda plot: plot.split('summarize: ')[-1])
show_df['Generated Title'] = show_df['Generated Title'].apply(lambda title: title.capitalize())
show_df.sample(10)

Unnamed: 0,Plot,Actual Title,Generated Title
5956,"Kaakki Sattai has Murali (Kamal Haasan), a you...",Kaakki Sattai,Murali
4734,"In Hong Kong, Dennis Law, a property developer...",A Very Short Life,The interrogator
6032,Valliammai (Revathi) is a cheerful woman study...,Chinna Pulla,Valliammai
3891,"Jean Rice, a young London art teacher, travels...",The Entertainer,The sun
4058,Lucy (Collins) is working as a dancer in a sle...,I Don't Want to Be Born,Hercules
4957,"Shankarnath (Balraj Sahni), is an honest Gover...",Ghar Ghar Ki Kahani,Raja
1256,Mankind has achieved space flight capability a...,Conquest of Space,The wheel
3509,Joan (Judy Davis) is a young Australian commun...,Children of the Revolution,The straight and wide
1328,Sheriff Bull Harper (George Mathews) has captu...,The Last Wagon,The comanche
6335,The story is about of a loving pair Murali (Ja...,Mauali Krishna,Aur krishna


In [None]:
show_df.to_csv('./data/predictions.csv', index=None)

In [118]:
import re
from collections import Counter
from pprint import pprint

In [149]:
# Count words in plot
pattern = re.compile('[\w+]+')
show_df['Words in Plot'] = show_df['Plot'].apply(lambda plot: len(pattern.findall(plot)))

show_df.sample(5)

Unnamed: 0,Plot,Actual Title,Generated Title,Words in Plot
2143,The Tampico Stogies are a last-place baseball ...,Long Gone,The tampico stogies,437
3284,"NASA Space Shuttle Explorer, commanded by vete...",Gravity,The iss,684
5027,"Middle aged Nirmala Gupta (Dina Pathak), wife ...",Khubsoorat,Manju,303
2932,"Set in the country of South Africa, the story ...",Duma,Duma,722
1272,Frankie Machine (Frank Sinatra) is released fr...,The Man with the Golden Arm,Frankie machine,677


In [122]:
show_df.describe()

Unnamed: 0,Words in Plot
count,6977.0
mean,381.01247
std,338.173778
min,3.0
25%,123.0
50%,283.0
75%,586.0
max,6841.0


In [158]:
samples = show_df.loc[show_df['Words in Plot'] < 123].sample(3)
for _, info in samples.iterrows():
  pprint(f"Plot: {info['Plot']}", width=120)
  pprint(f"Actual Title: {info['Actual Title']}")
  pprint(f"Generated Title: {info['Generated Title']}")
  print()

('Plot: When a dysfunctional family gathers for Thanksgiving at their New England home, past demons reveal themselves '
 'as one son returns for the first time in three years.')
'Actual Title: The Myth of Fingerprints'
'Generated Title: The demons'

('Plot: Alvin Roberts (Lee Tracy) feuds with Bunny Harmon (Dick Powell), a singer. Roberts reports on society people '
 'who are expecting, i.e. going to have a child. One such report antagonizes a gangster in a delicate situation, who '
 'sends over a henchman to threaten him. Roberts manages to turn the tables on the gangster.')
'Actual Title: Blessed Event'
'Generated Title: Alvin roberts'

('Plot: In 1898 the US government decided to intervene on the side of the Cuban rebels in their struggle against '
 'Spanish rule. Assistant Navy Secretary Theodore Roosevelt decides to experience the war first hand by promoting and '
 'joining a volunteer cavalry regiment.\r\n'
 'The regiment, later known as the Rough Riders, brings together voluntee