# CloudScriber:
# A Seamless YouTube Audio Transcriber and WordCloud Generator
*v1.2*

Created by Ben Nachlas


To get started, click "Runtime" from the toolbar and select "Run All."

Begin by pasting a valid YouTube URL into the box below and hitting enter:

In [None]:
#prompts user to enter YouTube URL
#surrounding whitespace is stripped and a blank entry is re-prompted, so an accidental
#empty submission is caught here instead of failing later when the video is looked up
#note - the URL itself is still not validated. Make sure to enter a valid URL.
youtube_video_url = input("What YouTube video would you like to analyze? ").strip()
while not youtube_video_url:
    youtube_video_url = input("What YouTube video would you like to analyze? ").strip()


Install required packages

In [None]:
#can take a few moments, recommend commenting these out on subsequent runs after initialization
!pip install git+https://github.com/openai/whisper.git -q
!pip install pytube -q
!pip install wordcloud

Import required resources

In [None]:
import datetime
import whisper
import numpy as np 
import pandas as pd 
import urllib.request
import matplotlib.pyplot as plt 
import textwrap

from pytube import YouTube
from PIL import Image 
from wordcloud import WordCloud, STOPWORDS

Initiate the generator

In [None]:
#timestamp taken at the very start of the run
t_initial = datetime.datetime.now()
#un-comment to show when the generator started
#print(f"Generator started {t_initial}")

print("Beginning transcription...")

#load Whisper's 'base' audio-to-text transcription model
model = whisper.load_model(name='base')

##  any FP16 or FP32 messages that appear at this point can be disregarded; it will still run

Acquire video information from YouTube

In [None]:
# retrieves URL from user input, assigns video to variable
#youtube_video_url = "URL_GOES_HERE"
youtube_video = YouTube(youtube_video_url)


#select the playback stream desired for download and transcription
#takes the first audio-only stream in the list for fastest results
#NOTE(review): presumably this is the lowest-quality stream - confirm against pytube's ordering
streams = youtube_video.streams.filter(only_audio=True)
stream = streams.first()

#first() gives back None when the filtered list is empty; stop with a clear message
#here rather than failing on the download call below
if stream is None:
    raise RuntimeError(f"No audio-only stream is available for {youtube_video_url}")

#locally saves and renames download file
stream.download(filename='yt_downloaded.mp4')

Transcribe audio from YouTube video and store text

In [None]:
#save a timestamp before transcription
t_pre_transcription = datetime.datetime.now()
#print(f"Audio transcription started at {t_pre_transcription}")

#use Whisper to convert audio to text
#processor intensive -- comment out to save time when testing
output = model.transcribe("yt_downloaded.mp4")

#record timestamp after transcription is complete
t_post_transcription = datetime.datetime.now()
#print(f"Audio transcription finished at {t_post_transcription}")

#pull text string from Whisper model output
transcription = output['text']

#show success message
#an f-string is used so the message still prints if the title or author is not a
#plain string (string concatenation with + would raise a TypeError in that case)
print("\n\n")
print(f'The audio to text transcription of "{youtube_video.title}", which was uploaded by {youtube_video.author}, has completed. \n')
print("Creating wordcloud...")
print("\n\n")

# un-comment to view the entire transcription text
# print("")
# print(transcription)
# print("")

Retrieve and create image mask for custom wordcloud shape outline



In [None]:
#record timestamp at wordcloud initialization
t_pre_cloud = datetime.datetime.now()
#print(f"Wordcloud generation started at {t_pre_cloud}")

#downloads image file from url to be used as mask
#the url can be replaced to try a different mask shape; results with other images are not guaranteed
urllib.request.urlretrieve('http://clipart-library.com/img1/1082052.png', 'og_img_mask.png')

#the final result uses the inverted mask, not the original, so the original is not kept separately
#regular_img_mask = np.array(Image.open('og_img_mask.png'))

#opens the original image mask and converts it to a numpy array
# - the with-block closes the image file once the pixel data has been copied into the array
# - convert('RGB') turns palette, 1-bit, grayscale and alpha images into plain 8-bit RGB,
#   so the inversion below always works on real colour values in the 0-255 range
with Image.open('og_img_mask.png') as mask_file:
    mask_image = np.array(mask_file.convert('RGB'))

#invert mask colors for better results
inverted_mask_image = 255 - mask_image

# #un-comment to display the inverted image mask
# fig = plt.figure()
# fig.set_figwidth(6) # set width
# fig.set_figheight(4) # set height
# plt.imshow(inverted_mask_image, cmap=plt.cm.gray, interpolation='bilinear')
# plt.axis('off')
# plt.show()

Construct wordcloud from transcribed text



In [None]:
#words in this set are excluded from the wordcloud; start from the packaged STOPWORDS
stopwords = set(STOPWORDS)

#fine-tune the stopword list with custom entries (extend the tuple as needed)
stopwords.update(('word1', 'word2', 'etc'))

#parameters for the wordcloud object
#switch collocations to True to include pairs of words like 'red wine' that would otherwise be considered separate
cloud_settings = dict(
    background_color='white',
    max_words=5000,
    mask=inverted_mask_image,
    stopwords=stopwords,
    collocations=False,
)
wordcloud = WordCloud(**cloud_settings)

#build the wordcloud from the transcribed audio
wordcloud.generate(transcription)

#timestamp taken once the wordcloud has been generated
t_post_cloud = datetime.datetime.now()
#print(f"Wordcloud generation completed at {t_post_cloud}")

Display the resulting wordcloud and return operational details

In [None]:
#helper for the timing report below
def _whole_minutes_and_seconds(elapsed):
    """Split a timedelta into whole minutes and whole leftover seconds."""
    mins, secs = divmod(elapsed.total_seconds(), 60)
    return int(mins), int(secs)


#show the finished wordcloud on a 16 x 14 figure with the axes hidden
fig = plt.figure()
fig.set_figwidth(16)
fig.set_figheight(14)
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()
print("")

#operation details
print("Details: \n")

#information about the original video
print("Video title: " + str(youtube_video.title))
print("Uploaded by: " + str(youtube_video.author))
video_length = youtube_video.length
length_minutes, length_seconds = divmod(video_length, 60)
print(f"Original video length: {length_minutes} minutes and {length_seconds} seconds\n")

#time spent transcribing the audio
audio_transcription_time = t_post_transcription - t_pre_transcription
mins, secs = _whole_minutes_and_seconds(audio_transcription_time)
print(f"The audio to text transcription took {mins} minutes and {secs} seconds.")

#time spent generating the wordcloud
wordcloud_time = t_post_cloud - t_pre_cloud
mins, secs = _whole_minutes_and_seconds(wordcloud_time)
print(f"It took {mins} minutes and {secs} seconds to generate the wordcloud.")

#overall time from the initial timestamp to the finished wordcloud
total_time = t_post_cloud - t_initial
mins, secs = _whole_minutes_and_seconds(total_time)
print(f"The total overall working time was {mins} minutes and {secs} seconds. \n")
print("Here is the best attempt at a complete audio-to-text transcription from the provided video:\n")

#width (in characters) of the printed transcription
width = 150

#break the transcription into lines no longer than `width` and print them one by one
wrapped_lines = textwrap.wrap(transcription, width=width)
for wrapped_line in wrapped_lines:
    print(wrapped_line)
print("\n\n")

print("----Operation complete!----\n")
print("Thank you for using the CloudScriber YouTube Audio Transcriber and WordCloud Generator v1.2")