In [1]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from src.subtitle.SubtitleToStorage import SubtitlesToStorage
import pandas as pd
from tqdm import tqdm

In [2]:
tokenizer = AutoTokenizer.from_pretrained('facebook/bart-large-cnn')
model = AutoModelForSeq2SeqLM.from_pretrained('facebook/bart-large-cnn')

In [6]:
videos = pd.read_csv('../data/videos/tom_scott_videos.csv')
videos

Unnamed: 0,title,video_id
0,Reopening an airport terminal is harder than y...,JA3RXeds0_g
1,14 science fiction stories in under 6 minutes,iQGl-ffVtaM
2,"After 140 years, this old technology still kee...",mzAfich6mow
3,"The Elie Chainwalk is safe, as long as you fol...",ELGWbY4xBDs
4,This town forgot to be a city,kBaLb1C4WAg
...,...,...
236,Why California's Musical Road Sounds Terrible,Ef93WmlEho0
237,The Reaction Ferries of Basel: What Have We Mi...,b6utGZQ9Sks
238,The Centuries-Old Debt That's Still Paying Int...,cfSIC8jwbQs
239,Rotary Jails and Accidental Amputations,-DGXHMOhXAw


In [9]:
summaries = list()
for _, row in tqdm(videos.iterrows(), total=videos.shape[0]):
    subtitle_downloader = SubtitlesToStorage(
        source='youtube',
        video_id=row['video_id'],
        storage_config={'type': 'return_value'}
    )

    subtitles = subtitle_downloader.save()
    batch = tokenizer(subtitles, truncation=True, max_length=1024, return_tensors='pt')
    generated_ids = model.generate(batch["input_ids"], min_length=256, max_length=512)
    summary = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
    summaries.append({'title': row['title'],  'summary': summary[0]})

100%|██████████| 241/241 [4:32:27<00:00, 67.83s/it]   


In [32]:
summaries[112]

{'title': 'How To Grow A Martian Salad On Earth',
 'summary': "CNN's John Defterios tries to grow a salad in Martian soil. Martian soil contains all of the nutrientsneeded for a plant to grow. The Martian surface is covered in soil with a key ingredient of basalt, a dark, iron-rich volcanic rock. The original Martian soil simulant was created from weathered volcanic ash from near Mauna Kea in Hawaii. But this stuff, MMS, uses basalt rock found in an ancient volcano in the western Mojave Desert, which has a very similar mineral makeup to what's found on Mars. It is missing an important component of Martian soil, though, which is toxic perchlorate compounds. These could be lethal to crops on Mars, and to the humans that eat those crops. But luckily for us there are a range of different methodsariefor getting rid of them. It's really just a waiting game now, even if these things grow, they wouldn't really be keeping me full. If we're going to stay on Mars for extended periods of time, we 

In [23]:
df = pd.DataFrame(summaries)
df.to_csv('title_summary.csv', index=True, index_label='id', header=True)

In [24]:
test = pd.read_csv('title_summary.csv')
test

Unnamed: 0,id,title,summary
0,0,Reopening an airport terminal is harder than y...,The South Terminal at London's Gatwick Airport...
1,1,14 science fiction stories in under 6 minutes,I used to make short science fictionvideos for...
2,2,"After 140 years, this old technology still kee...","140 years ago, the Callander and Oban railway ..."
3,3,"The Elie Chainwalk is safe, as long as you fol...","The Elie Chainwalk is 500 metres long, made up..."
4,4,This town forgot to be a city,Rochester had been a city since the 13th centu...
...,...,...,...
236,236,Why California's Musical Road Sounds Terrible,If you drive down a certain stretch of highway...
237,237,The Reaction Ferries of Basel: What Have We Mi...,"CNN's John Sutter travels to Basel, Switzerlan..."
238,238,The Centuries-Old Debt That's Still Paying Int...,"In 1648, the Water Board of Lekdijk Bovendams ..."
239,239,Rotary Jails and Accidental Amputations,In 1881 an architect called William Brown and ...
