In [1]:
from transformers import pipeline, set_seed
from transformers import GPT2Tokenizer, GPT2Model

from src.subtitle.SubtitleToStorage import SubtitlesToStorage

## Subtitle Download

In [3]:
# Download the subtitles of a single YouTube video.
# The storage config requests the 'return_value' storage type; the result of
# save() is kept in `subtitles` (presumably the transcript text itself --
# TODO confirm against SubtitlesToStorage).
download_options = {
    'source': 'youtube',
    'video_id': 'A6F96xSoLPg',  # The world's only float-through McDonalds
    'storage_config': {'type': 'return_value'},
}
subtitle_downloader = SubtitlesToStorage(**download_options)

subtitles = subtitle_downloader.save()

## Summarization Test

In [4]:
# Summarization pipeline backed by the facebook/bart-large-cnn checkpoint.
summarization_checkpoint = 'facebook/bart-large-cnn'
summarizer = pipeline(task='summarization', model=summarization_checkpoint)

In [5]:
# Summarize the downloaded subtitles without sampling (do_sample=False),
# bounding the summary length with min_length=30 / max_length=200.
# truncation=True makes the pipeline cut inputs that are longer than the model
# accepts instead of failing on long transcripts.
summarized_subtitles = summarizer(
    subtitles,
    max_length=200,
    min_length=30,
    do_sample=False,
    truncation=True,
)
summarized_subtitles

[{'summary_text': 'The world\'s only "float-through McDonald\'s" is on a canal on the Elbe River in Germany. You have to order on the smartphone app. The McBoat has been running since 2015.'}]

## Text Generation

In [9]:
# Build the instruction prompt: a fixed instruction followed by the text of
# the first (and only) summary returned by the summarizer.
summary_text = summarized_subtitles[0]['summary_text']
prompt = f'Write a title for that text: {summary_text}'
prompt

'Write a title for that text: The world\'s only "float-through McDonald\'s" is on a canal on the Elbe River in Germany. You have to order on the smartphone app. The McBoat has been running since 2015.'

In [24]:
# GPT-2 text-generation pipeline. The seed is set once the pipeline has been
# built so that subsequent sampling is reproducible.
gpt2_checkpoint = 'gpt2'
generator = pipeline(task='text-generation', model=gpt2_checkpoint)

seed = 42
set_seed(seed)

In [None]:
# Let GPT-2 continue the prompt, asking for a single sequence with max_length=100.
generation_options = {'max_length': 100, 'num_return_sequences': 1}
generator(prompt, **generation_options)

In [3]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Load the tokenizer and the seq2seq model from the same T0 checkpoint.
t0_checkpoint = "bigscience/T0_3B"
tokenizer = AutoTokenizer.from_pretrained(t0_checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(t0_checkpoint)

KeyboardInterrupt: 

In [13]:
# Ask the T0 model for a title: encode the prompt as PyTorch tensors and
# generate an output sequence with max_length=50.
inputs = tokenizer.encode(prompt, return_tensors="pt")
outputs = model.generate(inputs, max_length=50)

# Let the tokenizer drop its special tokens instead of removing '<pad>' and
# '</s>' by hand; strip() removes the whitespace that surrounded them.
tokenizer.decode(outputs[0], skip_special_tokens=True).strip()

" The world's only floating McDonald's has opened in Germany."

## Automation

In [4]:
from transformers import pipeline, set_seed
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from src.subtitle.SubtitleToStorage import SubtitlesToStorage
import pandas as pd
from tqdm import tqdm

# Set up the models for the batch run: a BART summarization pipeline plus
# the T0 tokenizer/model pair used to generate titles.
summarization_checkpoint = 'facebook/bart-large-cnn'
t0_checkpoint = "bigscience/T0_3B"

summarizer = pipeline(task='summarization', model=summarization_checkpoint)

tokenizer = AutoTokenizer.from_pretrained(t0_checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(t0_checkpoint)

KeyboardInterrupt: 

In [None]:
# Batch run: generate one title per video listed in the input CSV and write the
# table, extended with the generated titles, to a new CSV.
SUBTITLE_CHAR_LIMIT = 3000  # only the first 3000 characters of each transcript are summarized
TITLE_PROMPT_PREFIX = 'Write a title for that text: '


def generate_title(video_id):
    """Download, summarize and title the subtitles of one YouTube video.

    Args:
        video_id: YouTube video id handed to SubtitlesToStorage.

    Returns:
        The text decoded from the T0 model output, with special tokens and
        surrounding whitespace removed.
    """
    subtitles = SubtitlesToStorage(
        source='youtube',
        video_id=video_id,
        storage_config={'type': 'return_value'},
    ).save()

    # The character cap keeps the summarizer input short; truncation=True is a
    # second guard in case the capped text is still too long for the model.
    summary_text = summarizer(
        subtitles[:SUBTITLE_CHAR_LIMIT],
        max_length=150,
        min_length=30,
        do_sample=False,
        truncation=True,
    )[0]['summary_text']

    input_ids = tokenizer.encode(TITLE_PROMPT_PREFIX + summary_text, return_tensors="pt")
    output_ids = model.generate(input_ids, max_length=50)

    # skip_special_tokens replaces the manual '<pad>' / '</s>' removal, which
    # left leading whitespace in the stored titles.
    return tokenizer.decode(output_ids[0], skip_special_tokens=True).strip()


videos = pd.read_csv('../data/videos/tom_scott_videos.csv')

generated_titles = []
for video_id in tqdm(videos['video_id'], total=len(videos)):
    title = generate_title(video_id)
    # tqdm.write prints the title without breaking the progress bar.
    tqdm.write(title)
    generated_titles.append(title)

videos['t0_3B_vanilla_title_summarized'] = generated_titles
videos.to_csv('../data/generated/tom_scott_videos_t0_3B_vanilla_summarized.csv')