# Whisper and Metaflow
This notebook demonstrates how to use Whisper to extract text from a YouTube video.
The content is based on this [blog post](https://outerbounds.com/blog/mlops-whisper-and-metaflow/).

# Example 1: Fly Me to the Moon

In [None]:
from youtube_utils import make_task
from nlp_utils import Mixin

In [None]:
# Video to transcribe and the model type to transcribe it with.
url = "https://www.youtube.com/watch?v=ZEcqHA7dbwM"  # paste any YouTube URL
model_type = "small"
# Build the transcription task from the URL and the chosen model type.
transcription_task = make_task(url, model_type)

In [None]:
%%time
# Mixin provides the transcribe_video helper used below.
nlp_tools = Mixin()
# NOTE(review): quiet=True presumably silences progress/log output — confirm in nlp_utils.
transcription = nlp_tools.transcribe_video(transcription_task, quiet=True)
# Bare last expression so the notebook displays the result.
transcription

# Example 2: Charlie Bit My Finger

In [None]:
# Second example: a different video, this time with the "tiny" model type.
url = "https://www.youtube.com/watch?v=0EqSXDwTq6U"
model_type = "tiny"
# Rebinds `transcription_task`, replacing the task built for Example 1.
transcription_task = make_task(url, model_type)

In [None]:
%%time
# A fresh Mixin instance is created here; `nlp_tools` is also reused by later cells.
nlp_tools = Mixin()
# Rebinds `transcription` with the result for the Example 2 task.
transcription = nlp_tools.transcribe_video(transcription_task, quiet=True)
# Bare last expression so the notebook displays the result.
transcription

# Example 3: Fireside Chat #1
* Video Time: 02:21:10

In [None]:
%%time
# The timing reported for this cell covers both task creation and transcription.
url = "https://www.youtube.com/watch?v=Dr6DsWa6Dhg"
model_type = "tiny"
transcription_task = make_task(url, model_type)
# Reuses the `nlp_tools` instance created in an earlier cell, so that cell must
# have been run first. Unlike the earlier examples, `quiet` is not passed here.
fs_chat_transcription = nlp_tools.transcribe_video(transcription_task)
# Bare last expression so the notebook displays the result.
fs_chat_transcription

# Running Flows

## Transcribe one Video

In [None]:
# Run the transcription flow script on a single video URL.
! python youtube_video_transcriber.py run --url 'https://www.youtube.com/watch?v=ZEcqHA7dbwM'

## Transcribe each Video in a Playlist

[This URL](https://www.youtube.com/playlist?list=PLUsOvkBBnJBc1fcDQEOPJ77pMcE4CnNxc) points to the playlist that accompanies Ville's [blog post on tagging](https://outerbounds.com/blog/five-ways-to-use-the-new-metaflow-tags/). The playlist consists of 5 videos:
* [Basic Tagging](https://www.youtube.com/watch?v=DEmKaTI3MG4&list=PLUsOvkBBnJBc1fcDQEOPJ77pMcE4CnNxc&index=1): 05:41
* [Programmatic Tagging](https://www.youtube.com/watch?v=25Hqp43J37I&list=PLUsOvkBBnJBc1fcDQEOPJ77pMcE4CnNxc&index=2): 04:52
* [Tags and Namespaces](https://www.youtube.com/watch?v=ifARsmiSNhE&list=PLUsOvkBBnJBc1fcDQEOPJ77pMcE4CnNxc&index=3): 10:34
* [Tags in CI/CD](https://www.youtube.com/watch?v=hIiDXPHqEFM&list=PLUsOvkBBnJBc1fcDQEOPJ77pMcE4CnNxc&index=4): 03:28
* [Tags and Continuous Training](https://www.youtube.com/watch?v=lZhwhuG0AN8&list=PLUsOvkBBnJBc1fcDQEOPJ77pMcE4CnNxc&index=5): 04:33

In [None]:
# Run the same flow script, this time passing a playlist URL to --url.
! python youtube_video_transcriber.py run \
    --url 'https://www.youtube.com/playlist?list=PLUsOvkBBnJBc1fcDQEOPJ77pMcE4CnNxc'

In [None]:
# analysis
from metaflow import Flow

# Look up the most recent successful run of the YouTubeVideoTranscription flow
# and display its `results` artifact.
run = Flow("YouTubeVideoTranscription").latest_successful_run
run.data.results

## Transcribe a List of Videos

In [None]:
# Run the flow with --urls pointing at a text file
# (presumably one video URL per line — confirm against the flow script).
! python youtube_video_transcriber.py run --urls 'science_video_urls.txt'

# Analysis

In [None]:
import datetime as dt

import humanize
from metaflow import Flow

# Most recent successful run of the transcription flow; the cells below read
# their artifacts from this `run` object.
run = Flow("YouTubeVideoTranscription").latest_successful_run

# Measure elapsed time from `finished_at` (when the run completed), not from
# `created_at` (when the run started), so that the "was completed ... ago"
# message is accurate even for long-running transcriptions.
msg = "Latest successful run was completed {}".format(
    humanize.naturaltime(dt.datetime.now() - run.finished_at)
)
print(msg)

In [None]:
# Display the `results` artifact of the run selected in the previous cell.
run.data.results

In [None]:
# Display the `documents` artifact of the same run; it is joined into one text below.
run.data.documents

In [None]:
# Merge every entry of the run's `documents` artifact into a single string:
# each entry is stripped of surrounding whitespace, then all entries are
# joined with a single space. The word cloud and embedding cells read `text`.
text = " ".join(map(str.strip, run.data.documents))

## Word Cloud

In [None]:
import matplotlib.pyplot as plt
from scipy.ndimage import gaussian_gradient_magnitude
from wordcloud import WordCloud, ImageColorGenerator

In [None]:
# Stopwords used by both the word cloud and the embedding preprocessing below.
# Relies on the `nlp_tools` instance created in the transcription examples.
stopwords = nlp_tools.aggregate_stopwords()

In [None]:
# Configure the cloud: at most 50 words, font size capped at 40, white
# background, stopwords excluded, and a fixed random_state for a stable layout.
cloud_builder = WordCloud(
    stopwords=stopwords,
    background_color="white",
    max_words=50,
    max_font_size=40,
    random_state=42,
)
# generate() returns the WordCloud instance itself, so `wordcloud` is the
# same object as `cloud_builder`, now populated from `text`.
wordcloud = cloud_builder.generate(text)

# Draw the cloud on an explicit Axes with the frame and ticks hidden.
fig, ax = plt.subplots()
ax.axis("off")
ax.imshow(wordcloud, interpolation="bilinear");

## Word Embeddings

In [None]:
import re
import string

# Deletes every ASCII punctuation character; built once instead of once per sentence.
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)
# One or more sentence-ending characters. "?" and "!" are included so that
# questions and exclamations are not merged into the following sentence.
_SENTENCE_END = re.compile(r"[.!?]+")


def get_sentences(document):
    """Split a document into sentences of lower-cased, punctuation-free words.

    The document is cut at every run of ".", "!" or "?". Within each piece,
    ASCII punctuation is removed, the text is split on whitespace, and each
    word is lower-cased. Pieces that contain no words (for example the empty
    remainder after a trailing full stop) are dropped.

    Note that a "." inside a token (such as a decimal number or abbreviation)
    is also treated as a sentence boundary.

    Args:
        document: The text to tokenise, as a single string.

    Returns:
        A list of sentences, each a non-empty list of lower-case word strings.
        An empty or punctuation-only document yields an empty list.
    """
    sentences = []
    for raw_sentence in _SENTENCE_END.split(document):
        tokens = [
            token.lower()
            for token in raw_sentence.translate(_PUNCTUATION_TABLE).split()
        ]
        if tokens:  # skip empty sentences rather than emitting []
            sentences.append(tokens)
    return sentences

In [None]:
# Tokenise `text` into sentences and remove stopwords from each one, keeping
# the sentence structure (one inner list per sentence) for Word2Vec.
stopwordless_document = [
    [word for word in sentence if word not in stopwords]
    for sentence in get_sentences(text)
]

In [None]:
from sklearn.decomposition import PCA
from matplotlib import pyplot
from gensim.models import Word2Vec

SEED = 33
MIN_COUNT = 5  # words occurring fewer times than this are left out of the vocabulary
PCA_COMPONENTS = 2
LABEL_OFFSET = 5e-4  # nudges each label slightly off its scatter marker
fs_chat_id = 1

# Train the embeddings. `seed` alone does not make training reproducible:
# gensim trains with several worker threads by default, and thread scheduling
# changes the update order. workers=1 removes that source of nondeterminism.
# (Across interpreter launches, PYTHONHASHSEED must also be fixed.)
model = Word2Vec(stopwordless_document, min_count=MIN_COUNT, seed=SEED, workers=1)
words = list(model.wv.index_to_key)  # vocabulary, in the model's index order
model.save("model.bin")  # save model
new_model = Word2Vec.load("model.bin")  # load model (round trip only; not used below)
X = model.wv[words]  # one embedding row per vocabulary word, same order as `words`

# Reduce the embeddings to 2-D for plotting.
pca = PCA(n_components=PCA_COMPONENTS, random_state=SEED)  # dim reduction
result = pca.fit_transform(X)

fig, ax = plt.subplots(1, 1, figsize=(7, 7))
ax.scatter(result[:, 0], result[:, 1])
# Row i of `result` corresponds to words[i], so each point gets its own word.
for i, word in enumerate(words):
    ax.annotate(
        word,
        xy=(result[i, 0] + LABEL_OFFSET, result[i, 1] + LABEL_OFFSET),
        rotation=0,
    )
ax.spines[["top", "right", "left", "bottom"]].set_visible(False)
ax.set_title("Fireside Chat {} Projected by Word2Vec".format(fs_chat_id), y=1.04)
pyplot.show()