In [9]:
import nltk
import string
import os
import re
import webvtt
import numpy as np
import pandas as pd
from glob import glob
from sklearn.feature_extraction.text import TfidfVectorizer

In [10]:
# ref: https://shkspr.mobi/blog/2018/09/convert-webvtt-to-a-transcript-using-python/
# youtube-dl --write-auto-sub --sub-format vtt --skip-download https://www.youtube.com/user/yuttadhammo
def vtt_to_txt(file):
    """Flatten a WebVTT subtitle file into one space-separated transcript string.

    Every caption's text is stripped and split into its individual lines.
    A line that is identical to the line directly before it is dropped, which
    removes the repetition between consecutive captions. Each surviving line
    is prefixed with a single space, so a non-empty transcript starts with a
    space and a file without captions yields an empty string.
    """
    # All text lines of all captions, in file order.
    caption_lines = [
        text_line
        for caption in webvtt.read(file)
        for text_line in caption.text.strip().splitlines()
    ]

    # Keep a line only when it differs from its immediate predecessor.
    kept_lines = [
        current
        for position, current in enumerate(caption_lines)
        if position == 0 or current != caption_lines[position - 1]
    ]

    return "".join(" " + piece for piece in kept_lines)

In [11]:
# Directory that is searched for the .vtt subtitle files; the path is relative
# to the notebook's working directory.
SUBTITLE_DIR = '../subtitle-diff/' # directory with .vtt files

In [13]:
# Load one transcript per subtitle file found in SUBTITLE_DIR.
# glob() yields matches in whatever order the file system returns them, so the
# paths are sorted to keep the row order of the results reproducible.
ids = sorted(glob(os.path.join(SUBTITLE_DIR, '*.vtt')))  # path of every .vtt file
docs = [vtt_to_txt(path) for path in ids]                # transcripts, index-aligned with ids

In [14]:
# preprocessing
# Remove digit runs, <...> markup tags and [...] bracketed annotations from
# every transcript, after deleting any newline characters.
# The pattern is a raw string so that \[ and \] reach the regex engine as
# written instead of being parsed as (invalid) Python string escapes, which
# newer interpreters warn about.
NOISE_RE = re.compile(r'([0-9]+|<[^>]+>|</[^>]+>|\[[^\]]*\])')
docs = [NOISE_RE.sub('', d.replace('\n', '')) for d in docs]

In [18]:
# TF-IDF model over the preprocessed transcripts, using the built-in English
# stop-word list.
vectorizer = TfidfVectorizer(
    stop_words='english',
)

# Fit on all transcripts and keep the resulting document-term matrix; it is
# indexed by document position later on.
X = vectorizer.fit_transform(docs)

In [19]:
def get_top_n(csr_matrix, row, n = 10):
    """Return the indices and values of the ``n`` largest entries in one matrix row.

    Parameters
    ----------
    csr_matrix : sparse matrix
        Any matrix offering ``getrow(i)`` whose result supports ``toarray()``.
    row : int
        Index of the row to inspect.
    n : int, default 10
        Maximum number of entries to return.

    Returns
    -------
    tuple of (numpy.ndarray, numpy.ndarray)
        ``(top_indices, top_values)``, both ordered from the largest value to
        the smallest. The row is densified first, so zero entries are included
        when the row holds fewer than ``n`` non-zero values.
    """
    # https://stackoverflow.com/questions/31790819/scipy-sparse-csr-matrix-how-to-get-top-ten-values-and-indices
    dense_row = csr_matrix.getrow(row).toarray().ravel()

    # Sort once (ascending), reverse to descending, keep the first n positions,
    # then reuse those positions to look up the values.
    top_indices = dense_row.argsort()[::-1][:n]
    top_values = dense_row[top_indices]

    return top_indices, top_values

In [21]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2 in
# favour of get_feature_names_out(); use whichever this installation offers.
if hasattr(vectorizer, 'get_feature_names_out'):
    features = list(vectorizer.get_feature_names_out())
else:
    features = vectorizer.get_feature_names()

data = []

for i in range(len(docs)):
    # Ten highest-weighted TF-IDF terms of document i.
    top = [features[x] for x in get_top_n(X, i, 10)[0]]

    # Slice the bare file name rather than the full path, so the result does
    # not depend on the length of the directory prefix (a fixed offset of 12
    # only fits a 12-character directory).
    # The slices assume names shaped like '<title>-<11-char id>.en.vtt':
    # 7 trailing characters of suffix, 11 of video id, 1 separator.
    # NOTE(review): presumably youtube-dl's default naming — confirm.
    filename = os.path.basename(ids[i])
    videoId = filename[-18:-7]
    link = 'https://youtu.be/' + videoId
    title = filename[:-19]
    data.append({ 'videoId': videoId, 'title': title, 'link': link, 'keywords': ','.join(top) })

In [22]:
# Turn the list of per-video records into a table (one row per record).
df = pd.DataFrame(data)

In [23]:
# Write the keyword table to an Excel workbook in the current working directory.
# NOTE(review): pandas needs an Excel writer engine (e.g. openpyxl) installed
# for .xlsx output — confirm it is available in this environment.
df.to_excel('top_keywords_from_subtitles.xlsx')

In [24]:
# Write the same table to a JSON file in the current working directory; no
# `orient` is passed, so pandas' default layout for DataFrames is used.
df.to_json('top_keywords_from_subtitles.json')