In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

### Fetching data from the Netflix, Amazon Prime, Hulu and Disney+ movie and TV show datasets

In [None]:
# Read each platform's catalogue and stack them with a single concat call
# (Netflix, Hulu, Amazon Prime, Disney+ - same order as the files are listed).
# One concat over the list avoids re-copying the growing frame for every file.
title_files = [
    '/kaggle/input/netflix-shows/netflix_titles.csv',
    '/kaggle/input/hulu-movies-and-tv-shows/hulu_titles.csv',
    '/kaggle/input/amazon-prime-movies-and-tv-shows/amazon_prime_titles.csv',
    '/kaggle/input/disney-movies-and-tv-shows/disney_plus_titles.csv',
]
df = pd.concat([pd.read_csv(path) for path in title_files])
df.head()

In [None]:
# Give the stacked frame a clean 0..n-1 index. drop=True discards the old
# per-file row numbers instead of storing them as an extra column, and the
# reassignment keeps the cell safe to re-run.
df = df.reset_index(drop=True)

In [None]:
df

Removing rows with a null title or description

In [None]:
# Row positions where the description is missing, followed by the row
# positions where the title is missing.
missing_rows = [np.where(df[column].isna())[0] for column in ('description', 'title')]
drop_indices = [position for rows in missing_rows for position in rows]

In [None]:
drop_indices

In [None]:
# Remove every row whose description or title is missing. dropna works on the
# values themselves, so it does not depend on row positions happening to
# coincide with index labels (which DataFrame.drop would require).
df = df.dropna(subset=['description', 'title'])

In [None]:
# Renumber the remaining rows 0..n-1. drop=True keeps the old index from being
# inserted as yet another column, so the cell can be re-run without error.
df = df.reset_index(drop=True)

In [None]:
df

In [None]:
df[['title','description']].info()

In [None]:
# Persist only the two columns the model needs. index=False keeps the row
# index out of the file so it does not come back as an unnamed extra column
# when the CSV is loaded again.
df[['title','description']].to_csv('/kaggle/working/data.csv', index=False)

### Length of inputs (description) and outputs (titles) in the dataset

In [None]:
# Character counts of every description and title, computed once and reused.
description_lengths = [len(text) for text in df['description']]
title_lengths = [len(text) for text in df['title']]

max_length_description = max(description_lengths)
max_length_title = max(title_lengths)
avg_length_description = sum(description_lengths) / len(description_lengths)
avg_length_title = sum(title_lengths) / len(title_lengths)

for label, value in (
    ("Max Length of description in data ", max_length_description),
    ("Max Length of title in data ", max_length_title),
    ("Average Length of description in data ", avg_length_description),
    ("Average Length of title in data ", avg_length_title),
):
    print(label, value)

In [None]:
!pip install datasets transformers rouge-score nltk

## Creating Dataset

In [None]:
import transformers
from datasets import load_dataset, load_metric

In [None]:
# Load the title/description CSV written to /kaggle/working earlier in the notebook.
# The rows are accessed below as netflix["train"].
netflix=load_dataset("csv",data_files='/kaggle/working/data.csv')

In [None]:
netflix

In [None]:
# Fixed seed so the train/validation/test partition is identical on every run.
split_seed = 42

# Hold out 100 rows as the test set, then carve 2894 validation rows out of
# what remains; the rest is used for training.
train_test = netflix["train"].train_test_split(test_size=100, seed=split_seed)
train_validation = train_test["train"].train_test_split(test_size=2894, seed=split_seed)

netflix["train"] = train_validation["train"]
netflix["validation"] = train_validation["test"]
netflix["test"] = train_test["test"]

netflix

In [None]:
# Number of rows in each split, and each split's share of the total.
split_sizes = {split: len(netflix[split]) for split in ("train", "validation", "test")}
n_train = split_sizes["train"]
n_validation = split_sizes["validation"]
n_test = split_sizes["test"]
n_total = sum(split_sizes.values())

print(f"- Training set: {n_train*100/n_total:.2f}%")
print(f"- Validation set: {n_validation*100/n_total:.2f}%")
print(f"- Test set: {n_test*100/n_total:.2f}%")

## Data Pre-processing

In [None]:
import nltk
nltk.download('punkt')
import string
from transformers import AutoTokenizer

In [None]:
# Base checkpoint to fine-tune. `model_name` is also read by model_init()
# when the trainer builds the model.
model_name = "t5-base"
tokenizer = AutoTokenizer.from_pretrained(model_name,use_fast=True)

In [None]:
# Task prefix prepended to every cleaned description before tokenization.
prefix = "summarize: "

# Caps passed as `max_length` (with truncation) when tokenizing the
# descriptions and the titles respectively.
max_input_length = 500
max_target_length = 150

def clean(text):
    """Keep only the lines of ``text`` that end with a punctuation character.

    The text is stripped and split into sentences with NLTK; each sentence is
    split again on newlines. Pieces that are empty, or whose last character is
    not in ``string.punctuation``, are discarded. The surviving pieces are
    returned joined by newlines.
    """
    kept = []
    for sentence in nltk.sent_tokenize(text.strip()):
        for piece in sentence.split("\n"):
            if piece and piece[-1] in string.punctuation:
                kept.append(piece)
    return "\n".join(kept)

def preprocess(examples):
    """Tokenize a batch of examples for sequence-to-sequence training.

    Each description is cleaned, prefixed with ``prefix`` and tokenized with
    truncation at ``max_input_length``. The titles are tokenized inside the
    tokenizer's target context with truncation at ``max_target_length``, and
    their input ids are stored under ``"labels"``.

    Args:
        examples: batch with ``"description"`` and ``"title"`` columns.

    Returns:
        The tokenizer output for the descriptions with a ``"labels"`` entry added.
    """
    prompts = [prefix + clean(description) for description in examples["description"]]
    model_inputs = tokenizer(prompts, max_length=max_input_length, truncation=True)

    # Titles are encoded while the tokenizer is switched to target mode.
    with tokenizer.as_target_tokenizer():
        title_tokens = tokenizer(
            examples["title"], max_length=max_target_length, truncation=True
        )

    model_inputs["labels"] = title_tokens["input_ids"]
    return model_inputs

In [None]:
def within_length_limits(example):
    """True when the description has at most 500 characters and the title at most 150."""
    return len(example['description']) <= 500 and len(example['title']) <= 150

# Drop over-long rows, then tokenize every split in batches.
netflix_cleaned = netflix.filter(within_length_limits)
tokenized = netflix_cleaned.map(preprocess, batched=True)
tokenized

In [None]:
tokenized["train"][6]

## Fine-tuning T5 for Netflix title generation

In [None]:
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

In [None]:
# Start from an empty output directory. The directory name is assigned here
# because this cell runs before the training-arguments cell that also sets
# `model_dir`; on a fresh kernel the variable would otherwise not exist yet.
# ignore_errors=True makes a missing directory a no-op.
import shutil

model_dir = "t5-base-title-generation"
shutil.rmtree(model_dir, ignore_errors=True)

#### Initiating training arguments

In [None]:
# Training configuration for the fine-tuning run.
batch_size = 8
my_model_name = "t5-base-title-generation"
model_dir = f"{my_model_name}"
args = Seq2SeqTrainingArguments(
    # First positional argument: the run's output directory.
    model_dir,
    # Evaluate and log every 100 steps; save a checkpoint every 200 steps.
    evaluation_strategy="steps",
    eval_steps=100,
    logging_strategy="steps",
    logging_steps=100,
    save_strategy="steps",
    save_steps=200,
    learning_rate=4e-5,
    # Same batch size for training and evaluation.
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    # Keep at most three checkpoints on disk.
    save_total_limit=3,
    num_train_epochs=1,
    # Run generation during evaluation so compute_metrics receives generated token ids.
    predict_with_generate=True,
    # NOTE(review): mixed precision presumably requires a GPU runtime - confirm.
    fp16=True,
    # Reload the best checkpoint when training finishes, ranked by "rouge1"
    # (presumably one of the keys compute_metrics returns - confirm).
    load_best_model_at_end=True,
    metric_for_best_model="rouge1",
    report_to="tensorboard",
)

In [None]:
# Batch collator built around the tokenizer; passed to the Seq2SeqTrainer as `data_collator`.
data_collator = DataCollatorForSeq2Seq(tokenizer)

In [None]:
import numpy as np

# ROUGE scorer used by compute_metrics().
metric = load_metric("rouge")

def compute_metrics(eval_pred):
    """Score generated titles against the reference titles with ROUGE.

    Args:
        eval_pred: ``(predictions, labels)`` pair of token-id arrays. The
            predictions may also arrive wrapped in a tuple, in which case the
            first element is used.

    Returns:
        dict mapping every key returned by the ROUGE metric to its mid
        F-measure (scaled to 0-100), plus ``"gen_len"``, the mean number of
        non-padding tokens per prediction. All values are rounded to 4 places.
    """
    predictions, labels = eval_pred
    # Presumably some model outputs are tuples whose first item holds the
    # generated ids; unwrap so the array operations below apply.
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    pad_id = tokenizer.pad_token_id
    # -100 is a padding/ignore marker, not a vocabulary id, so it cannot be
    # decoded. Swap it for the pad token in BOTH arrays before decoding.
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Rouge expects a newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip()))
                     for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip()))
                      for label in decoded_labels]

    # Compute ROUGE scores
    result = metric.compute(predictions=decoded_preds, references=decoded_labels,
                            use_stemmer=True)

    # Extract ROUGE f1 scores
    result = {key: value.mid.fmeasure * 100 for key, value in result.items()}

    # Add mean generated length to metrics (padding, including former -100
    # entries, is not counted).
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

In [None]:
def model_init():
    """Return a model loaded with AutoModelForSeq2SeqLM from the `model_name` checkpoint.

    `model_name` is looked up each time the function is called, so the
    checkpoint that gets loaded is whatever that global holds at call time.
    """
    return AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Wire the model factory, training arguments, tokenized train/validation
# splits, collator, tokenizer and metric function into the trainer.
trainer = Seq2SeqTrainer(
    model_init=model_init,
    args=args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [None]:
%reload_ext tensorboard
# %tensorboard --logdir '{model_dir}'/runs

### Training

In [None]:
trainer.train()

In [None]:
trainer.save_model()

## Load and generate

In [None]:
# Reload the tokenizer and model from the fine-tuned model directory.
# NOTE(review): this rebinds `model_name`, which model_init() reads when it is
# called. Re-running the trainer cells after this cell would therefore start
# from this directory instead of "t5-base" - confirm that is intended.
model_name = "t5-base-title-generation"
model_dir = f"{model_name}"

tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForSeq2SeqLM.from_pretrained(model_dir)

# Same input cap as the one used in the preprocessing cell.
max_input_length = 500

In [None]:
text = netflix["test"][1]['description']
print(text)

In [None]:
# Generate a title for one held-out description.
text = netflix["test"][78]['description']
print(text)
inputs = ["summarize: " + text]

inputs = tokenizer(inputs, max_length=max_input_length, truncation=True, return_tensors="pt")
# No min_length here: requiring at least 30 generated tokens would force every
# title to be far longer than a real title, padding it out with filler text.
output = model.generate(**inputs, num_beams=8, do_sample=True, max_length=150)
decoded_output = tokenizer.batch_decode(output, skip_special_tokens=True)[0].strip()

# Keep only the first sentence. sent_tokenize returns an empty list for an
# empty string, so fall back to the decoded text instead of indexing into it.
sentences = nltk.sent_tokenize(decoded_output)
predicted_title = sentences[0] if sentences else decoded_output
print("Title: ")
print(predicted_title)