# Download YouTube video transcripts

Let's define a `YouTubeWrapper` class containing some functions and [static methods](https://docs.python.org/pt-br/3/library/functions.html?highlight=staticmethod#staticmethod) (`@staticmethod`):

> * `get_transcripts`: download one or multiple transcripts from YouTube videos

> * `get_video_ids`: read file containing video IDs (one per line)

> * `get_subtitle`: convert transcript to subtitle (`.srt` format)

In [None]:
import os
from datetime import timedelta
from string import punctuation

import pandas as pd

from youtube_transcript_api import YouTubeTranscriptApi

class YouTubeWrapper():
    """
    Get transcripts using youtube_transcript_api.

    Transcripts are handled as pandas data frames that carry (at least) the
    columns ``text``, ``start`` and ``duration``, the latter two in seconds.
    """
    def get_transcripts(self, video_ids, languages=("pt", "pt-BR", "en", "en-US"), output_dir="transcripts"):
        """
        Download the transcripts of one or more videos and save them to disk.

        For every video that could be fetched, two files are written into
        ``output_dir``: ``transcript_<id>.csv`` and ``subtitle_<id>.srt``.

        Args:
            video_ids: An iterable of video IDs, or a single string holding
                comma-separated IDs.
            languages: Language codes handed to
                ``YouTubeTranscriptApi.get_transcript``.
            output_dir: Folder that receives the files; created if missing.

        Returns:
            A dict mapping each successfully fetched video ID to its
            transcript data frame. Videos that failed are reported on stdout
            and left out of the dict.
        """
        if isinstance(video_ids, str):
            # Tolerate "id1, id2" style input: trim blanks, drop empty items.
            video_ids = [part.strip() for part in video_ids.split(",") if part.strip()]

        os.makedirs(output_dir, exist_ok=True)

        transcripts = {}
        for video_id in video_ids:
            try:
                transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=list(languages))
            except Exception as exc:
                # Best effort: report the failing video and keep going.
                # (Exception, not a bare except, so Ctrl+C still interrupts.)
                print(f"Error for video: https://www.youtube.com/watch?v={video_id}. ({type(exc).__name__})")
                continue

            df = pd.DataFrame(transcript)
            df.to_csv(os.path.join(output_dir, f"transcript_{video_id}.csv"), index=False)

            # Explicit UTF-8 so accented text does not depend on the OS locale.
            srt_path = os.path.join(output_dir, f"subtitle_{video_id}.srt")
            with open(srt_path, "w", encoding="utf-8") as f:
                f.write(self.get_subtitle(df))

            transcripts[video_id] = df

        print(f"Got {len(transcripts)} subtitles.")
        return transcripts

    @staticmethod
    def get_video_ids(file_name, header=True):
        """
        Read video IDs from a text file, one per line.

        Only the text after the last ``/`` of each line is kept, so a line may
        hold a bare ID or a URL whose final path segment is the ID.
        Blank lines are ignored.

        Args:
            file_name: Path of the file to read.
            header: The first line is skipped unless this is exactly ``False``.

        Returns:
            The list of video IDs, in file order.
        """
        skip_first_line = header is not False

        video_ids = []
        with open(file_name, "r") as f:
            for line_number, line in enumerate(f):
                if line_number == 0 and skip_first_line:
                    continue
                video_id = line.split("/")[-1].strip()
                if video_id:
                    video_ids.append(video_id)

        print(f"Read {len(video_ids)} lines.")
        return video_ids

    @staticmethod
    def get_subtitle(df: pd.DataFrame) -> str:
        """
        Convert a transcript data frame to subtitle text in ``.srt`` format.

        Side effect: the columns ``end``, ``start_time`` and ``end_time`` are
        added to ``df`` itself (kept for backward compatibility, since callers
        receive the same frame back from ``get_transcripts``).

        Args:
            df: Transcript with the columns ``text``, ``start`` and
                ``duration``.

        Returns:
            The SRT document as a single string; empty for an empty frame.
        """
        def get_time(s: float) -> str:
            # Build HH:MM:SS,mmm from integer milliseconds instead of slicing
            # str(timedelta), which drops the fraction for whole seconds and
            # changes width once the hour count reaches two digits.
            total_ms = max(0, int(round(float(s) * 1000)))
            hours, rest = divmod(total_ms, 3_600_000)
            minutes, rest = divmod(rest, 60_000)
            secs, millis = divmod(rest, 1_000)
            return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}"

        if df.empty:
            return ""

        df["end"] = df["start"] + df["duration"]
        df["start_time"] = df["start"].apply(get_time)
        df["end_time"] = df["end"].apply(get_time)

        # Iterate positionally so the frame's index labels do not matter.
        rows = zip(df["start_time"], df["end_time"], df["text"])
        return "".join(
            f"{number}\n{start} --> {end}\n{text}\n\n"
            for number, (start, end, text) in enumerate(rows, start=1)
        )
    
# Wrapper instance shared by the cells below.
yt = YouTubeWrapper()

# Make sure the output folder for the CSV/SRT files exists.
os.makedirs("transcripts", exist_ok=True)

### Set video IDs

In [None]:
# IDs of the videos to process (the value of `v=` in a watch URL).
video_ids = [
    "dSu5sXmsur4",
    "BmX86O2ozdo",
    "Bpfw47x5a90",
]

##### Alternative: read video IDs from a file

In [None]:
# Load the IDs from a text file with one entry per line and no header row.
video_ids = yt.get_video_ids(file_name="video_ids.txt", header=False)

### Download video transcripts and save to files

In [None]:
# Add languages=[...] to the call to override the default language list.
transcripts = yt.get_transcripts(video_ids=video_ids)

### Extra stuff

#### List available transcript languages for a YouTube video

In [None]:
# Language codes to look for; the same list is used for every video.
languages = ["pt", "pt-BR", "en", "en-US"]

for video_id in video_ids:
    try:
        transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)
    except Exception as exc:
        # Listing failed altogether; report the reason and move on.
        print(f"[{video_id}]\n- No subtitles available ({type(exc).__name__}).\n")
        continue

    # Look up each kind on its own: a video that only has auto-generated
    # subtitles must not be reported as having none just because the
    # manual lookup raised.
    try:
        manual = transcript_list.find_manually_created_transcript(languages)
    except Exception:
        manual = None

    try:
        automatic = transcript_list.find_generated_transcript(languages)
    except Exception:
        automatic = None

    if manual is None and automatic is None:
        print(f"[{video_id}]\n- No subtitles available in the selected languages.\n")
    else:
        print(f"[{video_id}]\n- Subtitles added manually: {manual}\n- Subtitles added automatically: {automatic}\n")

#### Transform all transcripts to a single data frame

In [None]:
# Stack the per-video transcript frames into a single data frame.
df = pd.concat(transcripts.values())
df

#### Plot most frequent words

In [None]:
# Shortest word (in characters, after trimming punctuation) that is counted.
min_length = 4

(
    df["text"]
    .apply(
        # Lower-case, split on any whitespace (newlines included), trim
        # surrounding punctuation, THEN apply the length filter so that
        # "word," is measured as "word" and punctuation-only tokens drop out.
        lambda text: [
            word
            for word in (raw.strip(punctuation) for raw in text.lower().split())
            if len(word) >= min_length
        ]
    )
    .explode()
    .value_counts()
    .head(50)
    .sort_values(ascending=True)
    .plot(kind="barh", figsize=(12, 12), title=f"Most frequent terms (minimum length: {min_length})")
)