In [1]:
from dotenv import load_dotenv  # python-dotenv: reads a local .env file into the process environment
# Load environment variables before any cell reads them with os.getenv (e.g. DEVELOPER_KEY).
load_dotenv()

True

In [2]:
import whisper
import torch  # install steps: pytorch.org

from tqdm.auto import tqdm  # !pip install tqdm
from pathlib import Path

import os
import json

import time

In [3]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the Whisper "base" checkpoint and move it to the selected device.
model = whisper.load_model("base").to(device)

In [4]:
import os
import googleapiclient.discovery
import urllib.parse as p
import pandas as pd
from youtube_transcript_api import YouTubeTranscriptApi
import json
def obj_dict(obj):
    """Return the attribute dictionary (``__dict__``) of ``obj``.

    The very same dict object is returned, not a copy.
    """
    attributes = obj.__dict__
    return attributes

import subprocess
import requests
import re
import youtube_dl

In [5]:
from pytube import YouTube  # !pip install pytube
from pytube.exceptions import RegexMatchError
from tqdm.auto import tqdm  # !pip install tqdm


In [6]:
# Build the YouTube Data API v3 client used by every helper below.
# NOTE: the OAuthlib warning that follows refers to the
# os.environ["OAUTHLIB_INSECURE_TRANSPORT"] assignment further down.
# Disable OAuthlib's HTTPS verification when running locally.
# *DO NOT* leave this option enabled in production.

api_service_name = "youtube"
api_version = "v3"
# API key is read from the environment; os.getenv returns None when it is unset.
DEVELOPER_KEY = os.getenv('DEVELOPER_KEY')

os.environ["OAUTHLIB_INSECURE_TRANSPORT"] = "1"
youtube = googleapiclient.discovery.build(
    api_service_name, api_version, developerKey = DEVELOPER_KEY)

In [7]:
import firebase_admin
from firebase_admin import credentials
from firebase_admin import firestore

# Use a service account.
# The key file path is relative to the notebook's working directory.
# NOTE(review): keep this JSON key out of version control.
cred = credentials.Certificate('../analytics-c4f16-firebase-adminsdk-plgdk-2a3d65b3a8.json')

# Initialise the Firebase app with the service-account credentials.
app = firebase_admin.initialize_app(cred)

# Firestore client shared by the helpers below.
db = firestore.client(app)
# Write batch created here; it is not used in the cells visible in this notebook.
batch = db.batch()

In [8]:
def check_nested_dict_keys(dic, keys):
    """Walk ``dic`` along ``keys`` and return the value at the end of the path.

    Args:
        dic: Nested mapping to read from (here: a YouTube API item).
        keys: Sequence of keys describing the path to follow.

    Returns:
        The value found after following every key. When the path cannot be
        followed (a key is missing, or an intermediate value is not a
        mapping), the fallback ``dic['snippet']['thumbnails']['default']['url']``
        is returned instead.

    Raises:
        KeyError: If the path is broken and ``dic`` does not contain the
            fallback thumbnail URL either.
    """
    node = dic
    for key in keys:
        # Guard with hasattr: the path may run into a leaf value (e.g. a
        # string) before all keys are consumed, which has no ``.keys()``.
        if hasattr(node, "keys") and key in node.keys():
            node = node[key]
        else:
            return dic['snippet']['thumbnails']['default']['url']

    return node

In [9]:
class DatabaseHelper():
    """Thin wrapper around a Firestore client that caches channels and videos.

    All queries go through the client handed to the constructor, so the
    helper does not depend on a module-level ``db`` variable.
    """

    def __init__(self, db):
        # Firestore client used for every query issued by this helper.
        self.db = db
        # Channel document id -> channel dict; None until set_channels() runs.
        self.CHANNELS = None
        # Channel id -> list of that channel's video dicts (see set_videos).
        self.VIDEOS = {}

    def get_channel_db(self, db=None):
        """Return every document of the ``channels`` collection keyed by id.

        Args:
            db: Optional Firestore client; defaults to the helper's own client.

        Returns:
            dict mapping document id to the document's dict representation.
        """
        client = db if db is not None else self.db
        return {doc.id: doc.to_dict() for doc in client.collection('channels').stream()}

    def set_channels(self):
        """Load all channel documents and cache them on ``self.CHANNELS``."""
        self.CHANNELS = self.get_channel_db(self.db)

    def set_videos(self, _id, videos):
        """Cache ``videos`` under the channel id ``_id`` in ``self.VIDEOS``."""
        self.VIDEOS[_id] = videos

    def get_channel_videos(self, channelId):
        """Return the stored videos of a channel, newest first, without shorts.

        Args:
            channelId: Value matched against the ``channelID`` field of the
                ``Videos`` collection.

        Returns:
            List of video dicts ordered by ``publishedAt`` descending; videos
            whose title contains ``#shorts`` are left out.
        """
        query = (
            self.db.collection('Videos')
            .order_by('publishedAt', direction=firestore.Query.DESCENDING)
            .where("channelID", "==", "{}".format(channelId))
        )
        videos = (doc.to_dict() for doc in query.stream())
        return [vid for vid in videos if "#shorts" not in vid['title']]

    def get_channel_doc(self, video_url, allChannels):
        """Return the channel record for a video URL, creating it on request.

        Args:
            video_url: URL of any video of the channel.
            allChannels: Mapping of known channel records (e.g. ``self.CHANNELS``).

        Returns:
            The existing record when the channel is already known, the newly
            written record when the user confirms creation, or None when the
            user answers ``0``.
        """
        channel_id = video_url_to_channel_id(video_url)
        print(channel_id)

        for record in allChannels.values():
            if record['channelId'] == channel_id:
                print("Channel {} already exists.".format(record['title']))
                return record

        # The id is already resolved, so build the record from it directly
        # instead of resolving the URL a second time.
        channelToAdd = generate_channel_json(channel_id, source="id")

        answer = input(
            "Create channel {} for url ? (0 for no, any other key for yes)".format(channelToAdd['title'])
        )

        # input() returns a string, so compare against "0", not the int 0.
        if answer.strip() == "0":
            print("No channel for url")
            return None

        self.db.collection("channels").document("{}".format(channelToAdd['channelId'])).set(channelToAdd)
        print("Channel {} is created.".format(channelToAdd['title']))
        return channelToAdd

   


In [10]:
def generate_channel_json(sourceValue, source="url"):
    """Build the channel record stored in Firestore from the YouTube API.

    Args:
        sourceValue: A channel id (``source="id"``) or the URL of one of the
            channel's videos (``source="url"``).
        source: How to interpret ``sourceValue``; ``"url"`` or ``"id"``.

    Returns:
        dict with the keys ``channelId``, ``title``, ``uploads``,
        ``description``, ``thumbnail`` and an empty ``videos`` list.

    Raises:
        Exception: If ``source`` is neither ``"url"`` nor ``"id"``.
    """
    if source == "url":
        channel_id = video_url_to_channel_id(sourceValue)
    elif source == "id":
        channel_id = sourceValue
    else:
        raise Exception("{} not a valid source.".format(source))

    response = youtube.channels().list(
        part="snippet,contentDetails",
        id=channel_id,
    ).execute()

    # Only the first returned item is used.
    first_item = response['items'][0]
    snippet = first_item['snippet']

    return {
        'channelId': channel_id,
        'title': snippet['title'],
        'uploads': first_item['contentDetails']['relatedPlaylists']['uploads'],
        'description': snippet['description'],
        # Prefer the "high" thumbnail; the helper falls back to "default".
        'thumbnail': check_nested_dict_keys(first_item, ["snippet", "thumbnails", "high", "url"]),
        'videos': [],
    }

def _extract_video_id(video_url):
    """Return the video id embedded in a YouTube URL, or '' when none is found."""
    parsed = p.urlparse(video_url)

    # Standard watch URLs carry the id in the ``v`` query parameter, which is
    # not necessarily the first parameter.
    video_id = p.parse_qs(parsed.query).get("v", [""])[0]
    if video_id:
        return video_id

    segments = [segment for segment in parsed.path.split("/") if segment]
    # Short links: https://youtu.be/<id>
    if parsed.netloc.lower().endswith("youtu.be") and segments:
        return segments[0]
    # Path-style links: /shorts/<id>, /embed/<id>, /live/<id>, /v/<id>
    if len(segments) >= 2 and segments[0] in ("shorts", "embed", "live", "v"):
        return segments[1]
    return ""


def video_url_to_channel_id(video_url):
    """Resolve the id of the channel that owns the video behind ``video_url``.

    Args:
        video_url: A YouTube video URL (watch, youtu.be, shorts, embed, ...).

    Returns:
        The ``channelId`` reported by the YouTube Data API for that video.

    Raises:
        ValueError: If no video id can be extracted from the URL, or the API
            returns no item for the extracted id.
    """
    video_id = _extract_video_id(video_url)
    if not video_id:
        raise ValueError("Could not extract a video id from url: {}".format(video_url))

    response = youtube.videos().list(
        part="snippet,contentDetails",
        id=video_id,
    ).execute()

    items = response.get('items') or []
    if not items:
        raise ValueError("No video found for id: {}".format(video_id))
    return items[0]['snippet']['channelId']


def get_channel_doc(video_url, allChannels):
    """Return the channel record for a video URL, creating it on request.

    Args:
        video_url: URL of any video of the channel.
        allChannels: Mapping of already known channel records; each value
            must carry ``channelId`` and ``title``.

    Returns:
        The existing record when the channel is already known, the newly
        written record when the user confirms creation, or None when the
        user answers ``0``.
    """
    channel_id = video_url_to_channel_id(video_url)
    print(channel_id)

    for record in allChannels.values():
        if record['channelId'] == channel_id:
            print("Channel {} already exists.".format(record['title']))
            return record

    # The id is already resolved, so build the record from it directly
    # instead of resolving the URL a second time.
    channelToAdd = generate_channel_json(channel_id, source="id")

    answer = input(
        "Create channel {} for url ? (0 for no, any other key for yes)".format(channelToAdd['title'])
    )

    # input() returns a string, so compare against "0", not the int 0.
    if answer.strip() == "0":
        print("No channel for url")
        return None

    db.collection("channels").document("{}".format(channelToAdd['channelId'])).set(channelToAdd)
    print("Channel {} is created.".format(channelToAdd['title']))
    return channelToAdd

In [11]:
def get_uploads_id(youtube, channel_id):
    """Return the id of the "uploads" playlist of ``channel_id``.

    Args:
        youtube: YouTube Data API client exposing ``channels().list(...)``.
        channel_id: Id of the channel to look up.
    """
    response = youtube.channels().list(
        part="snippet,contentDetails",
        id=channel_id,
    ).execute()
    first_channel = response['items'][0]
    return first_channel['contentDetails']['relatedPlaylists']['uploads']

def get_uploaded_videos_response(youtube, channel_id, **args):
    """Fetch the playlist items of a channel's uploads playlist.

    Args:
        youtube: YouTube Data API client.
        channel_id: Id of the channel whose uploads are listed.
        **args: Optional settings:
            max_total: Maximum number of items returned by the paging path
                (a falsy value means 1000).
            oldest_date: ``YYYY-MM-DD`` string. When given, paging stops at
                the first page containing an item published on or before
                that date, and only strictly newer items are returned.

    Returns:
        List of playlist-item dicts as returned by the API. The list is empty
        when the playlist is empty or nothing is newer than ``oldest_date``.
    """
    playlist_id = get_uploads_id(youtube, channel_id)
    max_total = args.get("max_total") or 1000
    oldest_date = args.get("oldest_date") or False

    params = {
        "part": "snippet,contentDetails",
        "playlistId": playlist_id,
        "maxResults": 50
    }

    collected = []

    while True:
        response = youtube.playlistItems().list(**params).execute()
        # A page may come back without items (e.g. an empty playlist).
        items = response.get('items') or []

        if oldest_date and items:
            # Dates are compared as YYYY-MM-DD strings, which sort chronologically.
            # NOTE(review): this assumes the API lists uploads newest first — confirm.
            oldest_on_page = items[-1]['snippet']['publishedAt'].split("T")[0]
            if oldest_on_page <= oldest_date:
                newer_items = []
                for item in items:
                    if item['snippet']['publishedAt'].split("T")[0] <= oldest_date:
                        break
                    newer_items.append(item)

                videos_list = collected + newer_items
                print("New videos found: {}".format(len(videos_list)))
                # Only describe the range when there is at least one new video.
                if videos_list:
                    print("Newest video: {}, {}".format(videos_list[0]['snippet']['title'], videos_list[0]['snippet']['publishedAt']))
                    print("Oldest video: {}, {}".format(videos_list[-1]['snippet']['title'], videos_list[-1]['snippet']['publishedAt']))
                return videos_list

        collected.extend(items)

        if max_total and len(collected) >= max_total:
            return collected[:max_total]

        if 'nextPageToken' not in response:
            break
        params['pageToken'] = response['nextPageToken']

    print("Videos retreived.")
    return collected

def _iso8601_duration_to_seconds(duration):
    """Convert an ISO 8601 duration such as ``PT1H2M3S`` or ``P1DT2H`` to seconds."""
    match = re.fullmatch(
        r'P(?:(\d+)W)?(?:(\d+)D)?(?:T(?:(\d+)H)?(?:(\d+)M)?(?:(\d+)S)?)?',
        duration,
    )
    if match is None:
        raise ValueError("Unsupported duration format: {}".format(duration))

    # Components that are absent from the string count as zero.
    weeks, days, hours, minutes, seconds = (int(part) if part else 0 for part in match.groups())
    return ((weeks * 7 + days) * 24 + hours) * 3600 + minutes * 60 + seconds


def getYoutubeDuration(videoId):
    """Return the length of a video in seconds.

    The duration is read from ``contentDetails.duration`` of the YouTube
    Data API response for ``videoId``.

    Args:
        videoId: Id of the video to look up.

    Returns:
        int: Duration in whole seconds.

    Raises:
        ValueError: If the reported duration is not in the expected
            ``P[nW][nD][T[nH][nM][nS]]`` form.
    """
    responseVideoDetails = youtube.videos().list(part="contentDetails", id=videoId).execute()
    durationResponse = responseVideoDetails['items'][0]['contentDetails']['duration']
    return _iso8601_duration_to_seconds(durationResponse)

def video_response_list_to_list_of_dicts(response_list):
    """Convert playlist-item responses into the video dicts stored in Firestore.

    Items whose title contains ``#shorts`` are skipped. Each remaining item
    triggers one ``getYoutubeDuration`` lookup.

    Args:
        response_list: Playlist items as returned by the YouTube API.

    Returns:
        List of dicts with the keys ``id``, ``videoId``, ``title``,
        ``description``, ``thumbnail``, ``channelID``, ``publishedAt``
        (date part only), ``videoUrl``, ``duration`` and an empty ``text``.
    """
    records = []
    for entry in response_list:
        snippet = entry['snippet']
        if "#shorts" in snippet['title']:
            continue

        entry_id = entry['id']
        video_id = entry['contentDetails']['videoId']

        # Legacy transcript fields (assemblySentences, assemblyText,
        # availability, assemblyId, ytaTranscript, mainTranscript,
        # ytaAvailability) are intentionally not populated.
        records.append({
            'id': entry_id,
            'videoId': video_id,
            'title': snippet['title'],
            'description': snippet['description'],
            # Prefer the "maxres" thumbnail; the helper falls back to "default".
            'thumbnail': check_nested_dict_keys(entry, ["snippet", "thumbnails", "maxres", "url"]),
            'channelID': snippet['channelId'],
            'publishedAt': snippet['publishedAt'].split("T")[0],
            'videoUrl': "https://www.youtube.com/watch?v={}".format(video_id),
            'duration': getYoutubeDuration(video_id),
            'text': '',
        })

    return records

In [12]:
def pullChannel(url, DB, oldest_date=None):
    """Resolve a channel from a video URL and list its uploads.

    Args:
        url: URL of any video of the channel.
        DB: Helper exposing ``CHANNELS``, ``get_channel_videos`` and
            ``set_videos`` (e.g. ``DatabaseHelper``).
        oldest_date: Acts as a flag only. When truthy and the channel already
            has stored videos, only uploads newer than the newest stored
            video's ``publishedAt`` are requested; the value itself is never
            used as a date. With the default ``None`` the full upload list is
            fetched.

    Returns:
        Tuple ``(channel, videos)``: the channel record and the list of video
        dicts built from the API response. ``(None, [])`` when no channel
        record is available (the user declined to create one).
    """
    # Look up (or create) the channel record.
    channel = get_channel_doc(url, DB.CHANNELS)
    if channel is None:
        # Without a channel record there is nothing to pull.
        return None, []

    channel_id = channel['channelId']

    # Load the videos already stored for this channel and cache them.
    videos_collections = DB.get_channel_videos(channel_id)
    DB.set_videos(channel_id, videos_collections)

    # Stored videos come back newest first, so index 0 is the newest one.
    if videos_collections and oldest_date:
        since = videos_collections[0]['publishedAt']
    else:
        since = None

    new_video_responses = get_uploaded_videos_response(youtube, channel_id,
                                                       oldest_date=since)

    list_of_new_video_responses_as_dicts = video_response_list_to_list_of_dicts(new_video_responses)

    return channel, list_of_new_video_responses_as_dicts

#add videos to transcript execution

In [13]:
def getCost(videoList, rate_per_second=0.00025):
    """Return the total cost of processing ``videoList``.

    Args:
        videoList: Iterable of video dicts carrying a ``duration`` in seconds.
        rate_per_second: Cost charged per second of video; defaults to the
            rate previously hard-coded here (0.00025).

    Returns:
        Sum of ``duration * rate_per_second`` over all videos; 0 for an
        empty list.
    """
    cost = 0
    for vid in videoList:
        cost += vid['duration'] * rate_per_second
    return cost

In [32]:
def transcript_object(video, text, result, source):
    """Assemble the transcript document stored for one video.

    Args:
        video: Video dict; only ``videoId`` is read.
        text: Full transcript text.
        result: Transcription result; must contain a ``segments`` list whose
            entries carry ``start``, ``end``, ``text`` and ``id``.
        source: Label of the transcription backend.

    Returns:
        dict with ``videoId``, ``transcriptId``, the trimmed ``transcript``
        segments, the untouched ``fullTranscript``, ``text`` and ``source``.
    """
    kept_fields = ('start', 'end', 'text', 'id')
    video_id = video['videoId']

    # Keep only the fields needed downstream from every segment.
    trimmed_segments = []
    for segment in result['segments']:
        trimmed_segments.append({field: segment[field] for field in kept_fields})

    return {
        'videoId': video_id,
        'transcriptId': 'TRANSCRIPT_' + video_id,
        'transcript': trimmed_segments,
        'fullTranscript': result,
        'text': text,
        'source': source,
    }

In [15]:
def get_audio(url, _id):
    """Download the first audio-only stream of a video to ``audio_files/<_id>.mp3``.

    The downloaded file is only renamed, not transcoded, so the ``.mp3``
    extension does not guarantee MP3-encoded content.

    Args:
        url: URL of the video.
        _id: Video id used as the target file name.

    Returns:
        Path of the stored file: ``'audio_files/' + _id + '.mp3'``.

    Raises:
        RuntimeError: If the video exposes no audio-only stream.
    """
    yt = YouTube(url)
    stream = yt.streams.filter(only_audio=True).first()
    if stream is None:
        raise RuntimeError("No audio-only stream available for {}".format(url))

    out_file = stream.download(output_path="audio_files")
    new_file = 'audio_files/' + _id + '.mp3'
    # os.replace overwrites a leftover file from an earlier run on every platform.
    os.replace(out_file, new_file)
    return new_file

def transcribe(video):
    """Transcribe one video with Whisper, with a remote fallback on failure.

    The audio is downloaded to ``audio_files/<videoId>.mp3``, transcribed
    with the module-level ``model`` and the raw result is written to
    ``transcript_files/<videoId>.json``. The temporary audio file is removed
    whether or not transcription succeeds.

    Args:
        video: Video dict carrying ``videoId`` and ``videoUrl``.

    Returns:
        Tuple ``(result, metric, source)``:
            result: Whisper result dict (``{}`` when Whisper never ran).
            metric: ``{'id': videoId, 'time': elapsed_seconds}``.
            source: ``"whisper"`` on success, ``"pendng"`` when the fallback
                request was submitted, ``"fail"`` otherwise.
    """
    metric = {}
    result = {}
    _id = video['videoId']
    audio_path = 'audio_files/{}.mp3'.format(_id)
    start = time.time()

    try:
        get_audio(video['videoUrl'], _id)
        print('Downloaded {}'.format(video['videoId']))

        # transcribe to get speech-to-text data
        result = model.transcribe(audio_path)

        # Make sure the output folder exists so a finished transcription is
        # not thrown away by a failing open().
        os.makedirs('transcript_files', exist_ok=True)
        with open('transcript_files/{}.json'.format(_id), 'w', encoding='utf-8') as f:
            json.dump(result, f, ensure_ascii=False, indent=4)
        source = "whisper"
        print('Transcribed {}'.format(video['videoId']))

    except Exception as e:
        print("video error: {}".format(_id))
        # Show the cause instead of discarding it.
        print(repr(e))

        try:
            print('Could not get streaming data. Attempting assembly...')
            ydl_opts = {'format': 'bestaudio'}
            ydl = youtube_dl.YoutubeDL(ydl_opts)

            videoUrl = video['videoUrl']
            info = ydl.extract_info(videoUrl, download=False)
            audioUrl = info['formats'][0]['url']
            # NOTE(review): postTranscript is not defined in the cells visible
            # here; if it is missing the NameError lands in the handler below.
            postTranscript(audioUrl)
            # NOTE(review): spelled "pendng" as before; the caller in this
            # notebook only compares against "whisper".
            source = "pendng"
        except Exception as fallback_error:
            print('Could not get assembly data.')
            print(repr(fallback_error))
            source = "fail"
    finally:
        # Never leave the downloaded audio behind, even when transcription failed.
        if os.path.exists(audio_path):
            os.remove(audio_path)

    metric['id'] = _id
    metric['time'] = time.time() - start

    return result, metric, source


In [16]:
# get_audio("https://www.youtube.com/watch?v=7hrSj5qkHv4")

In [17]:
DB = DatabaseHelper(db)
DB.set_channels()

In [18]:
url= "https://www.youtube.com/watch?v=lBCOOTyU46M&t=512s"
channel, new_videos = pullChannel(url,DB)

UCamLstJyCa-t5gfZegxsFMw
Channel Colin and Samir already exists.
Videos retreived.


In [49]:
# Collect the video ids that already have a document in the Transcripts collection.
docs = DB.db.collection(u'Transcripts').stream()
transcripts_list = []
for doc in docs:
    # Skip documents without a videoId explicitly instead of hiding every
    # possible error behind a bare except.
    transcript_doc = doc.to_dict() or {}
    if 'videoId' in transcript_doc:
        transcripts_list.append(transcript_doc['videoId'])

In [55]:
# result, metric, source = transcribe(save)
# Hard-coded video ids excluded from selection, kept in three lists.
# NOTE(review): presumably ids transcribed in earlier sessions — confirm.
done_1 = ['gU24yrixXD8','irIN7SHvLDc','lBCOOTyU46M','iq71Cb2jEIE','iq71Cb2jEIE']
done_2 = ['1t5oYKEn-1E', 'IjoTYJNr8DA', 'xp2VGAjHZWY', '3QZkjsMfj3I', 'gX0mrw1uZcY', 'NphQGsm4rvk', 'OE4ti4alRN8', 'w4iUuroktxw', 'yNLqaQ6slkw', 'N5YW4JB07-8', 'gGBCbswZbnI', '0bC1ah_x8zo', 'hYaGD0V2OkE', 'wkDlfvTed1c', '7qoe2qhcZ-Y', '88067BiKU4Y', 'jzMsnNxzejI', 'pvtMJFPyiLM', '3vbZvRHpM8w', '7hrSj5qkHv4', 'VbNIh88Nq5k', 'XuVR_elE1Pw', 'r8bzWKBvZsE', '9CxaZWkwHzE', 'x5lBJE2Ok8E', 'z_czmz_bJqk', 't69cK3Ih_Og', 'BB2HTaXTy0k', 'HmrjOq8epsg', 'MLyEDp7e0Q8', '9cn_r1z6zjo', 'o8UBXsiiS24', 'SpdWaOngRWM', 'jz_qFyTrS8w', 'Dh909TbYn7Y', 'G5c6qof96DM', 'UZSwDZ72Lp8', 'dbOXYhjpXAc', 'YVuIm8OLz-8', 'r-Y1LRtsFaU', '_TAxmgPQtzc']
done_3 = ['nxyThD3-GTw','i05bI03nzv4','vwtRRdmZSYw', 'xIC1nS4DU6M', 'JoI4BRPd8us', 'PiGCHXt5eBs', '6OICqRSTRjY', '71Xz6bYoRGc', '7PIlp-FSN5I', 'vDGnnLLXGTo']
# NOTE(review): presumably an id that failed in an earlier run — confirm.
error = ['7zd6EA5GdEM']
# Everything to skip: the hard-coded ids plus the ids already present in Firestore.
done = done_1+done_2+done_3+error+transcripts_list
# Candidates: not in `done`, not marked 'error', and at most 45 minutes long
# (duration is in seconds).
l = [v for v in new_videos if ((v['videoId']not in done )and not (v['text'] == 'error') and (v['duration'] <= 45*60))]
# Total hours of the first ten candidates, halved. This is not the last
# expression of the cell, so its value is not displayed.
(getCost(l[:10])/0.00025/60/60)*0.5
# Three batches of up to five candidates each.
l1 = l[:5]
l2 = l[5:10]
l3 = l[10:15]

TypeError: 'bool' object is not callable

In [22]:
# Scratch check of the slice boundaries used to split a list into three batches of five.
t = list(range(15))
t[:5], t[5:10], t[10:15]

([0, 1, 2, 3, 4], [5, 6, 7, 8, 9], [10, 11, 12, 13, 14])

In [23]:
# Total duration of all candidates in hours (cost / rate per second / 3600), halved.
# NOTE(review): the 0.5 factor presumably estimates processing time at about
# half of real time — confirm.
(getCost(l)/0.00025/60/60)*0.5


1.3834722222222229

In [24]:
# save = new_videos[6]
# t = [v for v in new_videos if ((v['videoId']not in done )and (v['duration'] >0))]

# (getCost(t)/0.00025/60/60)*0.5
# result, metric, source = transcribe(save)
# done_1 = ['gU24yrixXD8','irIN7SHvLDc','lBCOOTyU46M','iq71Cb2jEIE','iq71Cb2jEIE']
# done_2 = ['1t5oYKEn-1E', 'IjoTYJNr8DA', 'xp2VGAjHZWY', '3QZkjsMfj3I', 'gX0mrw1uZcY', 'NphQGsm4rvk', 'OE4ti4alRN8', 'w4iUuroktxw', 'yNLqaQ6slkw', 'N5YW4JB07-8', 'gGBCbswZbnI', '0bC1ah_x8zo', 'hYaGD0V2OkE', 'wkDlfvTed1c', '7qoe2qhcZ-Y', '88067BiKU4Y', 'jzMsnNxzejI', 'pvtMJFPyiLM', '3vbZvRHpM8w', '7hrSj5qkHv4', 'VbNIh88Nq5k', 'XuVR_elE1Pw', 'r8bzWKBvZsE', '9CxaZWkwHzE', 'x5lBJE2Ok8E', 'z_czmz_bJqk', 't69cK3Ih_Og', 'BB2HTaXTy0k', 'HmrjOq8epsg', 'MLyEDp7e0Q8', '9cn_r1z6zjo', 'o8UBXsiiS24', 'SpdWaOngRWM', 'jz_qFyTrS8w', 'Dh909TbYn7Y', 'G5c6qof96DM', 'UZSwDZ72Lp8', 'dbOXYhjpXAc', 'YVuIm8OLz-8', 'r-Y1LRtsFaU', '_TAxmgPQtzc']
# error = ['7zd6EA5GdEM']
# done = done_1+done_2
# nums = range(len(new_videos))

In [25]:
# done_1 = ['gU24yrixXD8','irIN7SHvLDc','lBCOOTyU46M','iq71Cb2jEIE','iq71Cb2jEIE']
# done_2 = ['1t5oYKEn-1E', 'IjoTYJNr8DA', 'xp2VGAjHZWY', '3QZkjsMfj3I', 'gX0mrw1uZcY', 'NphQGsm4rvk', 'OE4ti4alRN8', 'w4iUuroktxw', 'yNLqaQ6slkw', 'N5YW4JB07-8', 'gGBCbswZbnI', '0bC1ah_x8zo', 'hYaGD0V2OkE', 'wkDlfvTed1c', '7qoe2qhcZ-Y', '88067BiKU4Y', 'jzMsnNxzejI', 'pvtMJFPyiLM', '3vbZvRHpM8w', '7hrSj5qkHv4', 'VbNIh88Nq5k', 'XuVR_elE1Pw', 'r8bzWKBvZsE', '9CxaZWkwHzE', 'x5lBJE2Ok8E', 'z_czmz_bJqk', 't69cK3Ih_Og', 'BB2HTaXTy0k', 'HmrjOq8epsg', 'MLyEDp7e0Q8', '9cn_r1z6zjo', 'o8UBXsiiS24', 'SpdWaOngRWM', 'jz_qFyTrS8w', 'Dh909TbYn7Y', 'G5c6qof96DM', 'UZSwDZ72Lp8', 'dbOXYhjpXAc', 'YVuIm8OLz-8', 'r-Y1LRtsFaU', '_TAxmgPQtzc']
# done_3 = ['vwtRRdmZSYw', 'xIC1nS4DU6M', 'JoI4BRPd8us', 'PiGCHXt5eBs', '6OICqRSTRjY', '71Xz6bYoRGc', '7PIlp-FSN5I', 'vDGnnLLXGTo']
# error = ['7zd6EA5GdEM']
# done = done_1+done_2+done_3+error
#     l = [v for v in l if v['videoId'] == "PDZ6sYal5A0"]

In [26]:
import ray

In [47]:
def TranscribeList(l):
    """Transcribe every video in ``l`` and persist the outcome to Firestore.

    A successful Whisper transcription stores the updated video document in
    ``Videos`` and the transcript in ``Transcripts``. Any other outcome marks
    the video's ``text`` as ``'error'`` and stores only the video document.
    A video id that was already transcribed successfully earlier in the same
    call is skipped.

    Args:
        l: List of video dicts (``videoId``, ``videoUrl``, ``duration``, ...).
            The dicts are updated in place (``text``).

    Returns:
        None.
    """
    # Ids transcribed successfully in this call; ids (not dicts) are stored so
    # the membership test below actually matches.
    done = set()
    i = 0
    for doc in l:
        video_id = doc['videoId']

        if video_id in done:
            print("already transcribed.")
            continue
        print(i)
        i = i + 1

        result, metric, source = transcribe(doc)

        if source == "whisper":
            doc['text'] = result['text']

            transcript = transcript_object(doc, result['text'], result, source)

            DB.db.collection("Videos").document("{}".format(video_id)).set(doc)
            DB.db.collection("Transcripts").document("{}".format(video_id)).set(transcript)

            done.add(video_id)

            # Processing time relative to video length; guard zero-length videos.
            ratio = metric['time'] / doc["duration"] if doc["duration"] else float("nan")
            print(metric['time'], " -- ", doc["duration"], " -- ", ratio)

        else:
            doc['text'] = 'error'
            DB.db.collection("Videos").document("{}".format(video_id)).set(doc)



In [53]:
# Inspect the text of the first candidate; TranscribeList sets it to 'error'
# when transcription did not succeed.
l[0]['text']

'error'

In [52]:
import warnings
# Suppress all Python warnings from here on.
warnings.filterwarnings("ignore")
# Run the transcription pipeline on the first candidate only.
TranscribeList([l[0]])

0
video error: cbBxEmGOfk4
Could not get streaming data. Attempting assembly...
[youtube] cbBxEmGOfk4: Downloading webpage
[youtube] cbBxEmGOfk4: Refetching age-gated info webpage


ERROR: Sign in to confirm your age
This video may be inappropriate for some users.


Could not get assembly data.


In [None]:
# import ray

# ray.shutdown()
# ray.init()

# # Start two tasks in the background.
# x_id = TranscribeList.remote(l1)
# y_id = TranscribeList.remote(l2)
# z_id = TranscribeList.remote(l3)


# # Block until the tasks are done and get the results.
# x, y, z = ray.get([x_id, y_id, z_id])

In [None]:
# from multiprocessing import Pool
# # import generate_transcripts_script as defs
# pool = Pool()

# result1 = pool.apply_async(TranscribeList, [l1,"A"])    # evaluate "solve1(A)" asynchronously
# result2 = pool.apply_async(TranscribeList, [l2,"B"])    # evaluate "solve2(B)" asynchronously
# result3 = pool.apply_async(TranscribeList, [l3,"C"])    # evaluate "solve2(B)" asynchronously

# answer1 = result1.get()
# answer2 = result2.get()
# answer3 = result3.get()