In [1]:
from youtubesearchpython import VideosSearch
import spacy
import youtube_dl
import webvtt
import html
import os

# Load the spaCy pipeline once at module level; sentencise() below calls it
# to split transcription text into sentences.
nlp = spacy.load('en_core_web_lg')

In [2]:
def to_vtt_filename(filename):
    """Return the English WebVTT subtitle filename that belongs to ``filename``.

    The final extension of ``filename`` (if there is one) is replaced by
    ``.en.vtt``. Only the last path component is inspected for an extension,
    so a name without an extension keeps its full stem and dots inside
    directory names are left untouched.

    Args:
        filename: Path of the media file, e.g. ``"Some title-abc123.mp4"``.

    Returns:
        The path with its extension swapped for ``.en.vtt``.
    """
    stem, _extension = os.path.splitext(filename)
    return stem + '.en.vtt'


def fetch_transcription_file(youtube_id):
    """Fetch the English subtitles of a YouTube video and store them locally.

    youtube_dl is asked for subtitles only (uploaded or automatic, English,
    VTT format) with the media download skipped. The subtitle file it writes
    is then moved to ``subtitles/<youtube_id>.vtt``.

    Args:
        youtube_id: The video id used in the ``watch?v=`` URL.

    Returns:
        The path of the stored subtitle file, ``"subtitles/<youtube_id>.vtt"``.

    Raises:
        FileNotFoundError: If the expected ``.en.vtt`` file does not exist
            after the youtube_dl run (presumably because the video has no
            English subtitles yet); raised by ``os.replace``.
    """
    ydl_opts = {
        'writesubtitles': True,
        'writeautomaticsub': True,
        'subtitleslangs': ['en'],
        'subtitlesformat': 'vtt',
        'skip_download': True,
        # Listed exactly once: a duplicate key in a dict literal silently
        # keeps only the last value, which was False.
        'writeinfojson': False,
        'quiet': True,
    }
    with youtube_dl.YoutubeDL(ydl_opts) as ydl:
        info = ydl.extract_info(f"https://www.youtube.com/watch?v={youtube_id}", download=True)
        fn = ydl.prepare_filename(info)

    # Make sure the target directory exists; otherwise os.replace would raise
    # FileNotFoundError for a reason unrelated to missing subtitles.
    os.makedirs("subtitles", exist_ok=True)
    vtt_file = f"subtitles/{youtube_id}.vtt"
    os.replace(to_vtt_filename(fn), vtt_file)
    return vtt_file

def to_transcription(vtt_file):
    """Turn a WebVTT subtitle file into one plain-text string.

    Every caption is HTML-unescaped and split into lines. Empty lines are
    dropped, and a line identical to the one kept immediately before it is
    skipped, so text repeated across consecutive captions appears only once.

    Args:
        vtt_file: Path of the ``.vtt`` file to read.

    Returns:
        The remaining lines joined with single spaces.
    """
    transcr_lines = []
    # Single streaming pass: avoids flattening with sum(list_of_lists, []),
    # which copies the accumulated list once per caption.
    for caption in webvtt.read(vtt_file):
        for raw_line in html.unescape(caption.text).strip().split('\n'):
            line = raw_line.strip()
            if not line:
                continue
            if transcr_lines and transcr_lines[-1] == line:
                continue  # consecutive duplicate
            transcr_lines.append(line)
    return ' '.join(transcr_lines)

def sentencise(text):
    """Split ``text`` into sentences with the module-level spaCy pipeline ``nlp``.

    A full stop is appended to each sentence unless it already ends in
    '.', '!' or '?', so already-punctuated input does not end up with
    doubled punctuation.

    Args:
        text: The text to segment.

    Returns:
        A list of sentence strings in document order.
    """
    sentences = []
    for sent in nlp(text).sents:
        sentence = sent.text
        if not sentence.endswith(('.', '!', '?')):
            sentence = f"{sentence}."
        sentences.append(sentence)
    return sentences

def to_days(wordy_time):
    """Convert a relative-age phrase such as ``"3 weeks ago"`` into days.

    The first whole-number token in the phrase is taken as the amount and the
    words after it are searched for a known unit, so any leading word
    (``"Streamed 2 days ago"``, ``"Premiered 5 hours ago"``) is ignored.
    Months count as 30 days and years as 365 days.

    Args:
        wordy_time: The phrase to convert.

    Returns:
        The age in days (an int for day-or-larger units, a float otherwise).

    Raises:
        TypeError: If ``wordy_time`` is not a string (e.g. ``None``).
        ValueError: If the phrase contains no whole number, or no known unit
            follows the number.
    """
    if not isinstance(wordy_time, str):
        # Callers rely on TypeError to detect entries without a publish time.
        raise TypeError(f"Expected a string, got {type(wordy_time).__name__}")

    tokens = wordy_time.split()
    for position, token in enumerate(tokens):
        if token.isdecimal():
            value = int(token)
            unit_text = ' '.join(tokens[position + 1:])
            break
    else:
        raise ValueError(f"No amount of time found in: {wordy_time}")

    cases = {
        'second': 1/(24*60*60),
        'minute': 1/(24*60),
        'hour': 1/24,
        'day': 1,
        'week': 7,
        'month': 30,
        'year': 365,
    }
    # Substring match so that plural forms ("days", "weeks") are covered too.
    for word, k in cases.items():
        if word in unit_text:
            return value * k
    raise ValueError(f"Unit of time not understood: {wordy_time}")

def get_latest_videos(channel, days=7):
    """Search for videos matching ``channel`` and keep the recent ones.

    Up to ``days * 3`` search results are requested. A result is kept when
    its ``publishedTime`` converts to at most ``days`` days; results for which
    the conversion raises ``TypeError`` are skipped.

    Returns:
        A list of dicts with the keys ``'id'``, ``'days_old'`` and
        ``'metadata'`` (the raw search result).
    """
    search = VideosSearch(channel, limit=days*3)
    recent = []
    for item in search.result()['result']:
        try:
            age = to_days(item['publishedTime'])
        except TypeError:
            # upcoming stream
            continue
        if age <= days:
            recent.append(dict(id=item['id'], days_old=age, metadata=item))
    return recent

def process_video(video_id, show=False):
    """Return the subtitle sentences of one video.

    A subtitle file already present at ``subtitles/<video_id>.vtt`` is reused;
    otherwise it is fetched with ``fetch_transcription_file``. The file is
    converted to plain text and split into sentences.

    Args:
        video_id: The YouTube video id.
        show: When true, also print the sentences separated by blank lines.

    Returns:
        The list of sentences.
    """
    local_path = f"subtitles/{video_id}.vtt"
    if not os.path.exists(local_path):
        vtt_file = fetch_transcription_file(video_id)
        print(f"Subtitles processed: {video_id}")
    else:
        vtt_file = local_path
        print(f"Subtitles found locally for video: {video_id}")

    sentences = sentencise(to_transcription(vtt_file))
    if show:
        print('\n\n'.join(sentences))
    return sentences

def process_channel(channel, days=7, show=False):
    """Collect subtitle sentences for the recent videos found for ``channel``.

    Each video returned by ``get_latest_videos`` is processed with
    ``process_video``. On success the video dict gains the keys
    ``'subtitles'`` and ``'channel'`` and is added to the result. A
    ``FileNotFoundError`` is reported and the video is left out.

    Args:
        channel: Search term identifying the channel.
        days: Maximum video age in days.
        show: When true, print the sentences and a separator line per video.

    Returns:
        The list of enriched video dicts.
    """
    separator = '\n' + "-" * 100 + '\n'
    print(f"Now processing channel: {channel}")
    if show:
        print(separator)

    processed = []
    for video in get_latest_videos(channel, days=days):
        try:
            subtitles = process_video(video['id'], show=show)
        except FileNotFoundError:
            print(f"Subtitles not (yet) available for video: {video['id']}")
        else:
            video['subtitles'] = subtitles
            video['channel'] = channel
            processed.append(video)
        if show:
            print(separator)
    return processed

In [3]:
def all_coin_mentions(coin, mentions):
    """Return every recorded sentence for ``coin`` across all channels.

    ``mentions[coin]`` maps a channel to its list of sentences; the lists are
    concatenated in the mapping's iteration order into one new list.
    """
    return [
        sentence
        for channel_sentences in mentions[coin].values()
        for sentence in channel_sentences
    ]

In [18]:
# Channel identifiers handed to process_channel one by one.
channels = [
    'MyFinancialFriend',
    'thekaleshow',
    'AlexBeckersChannel',
    'AltcoinDaily',
    'CryptosRUs',
    'OnlyTheSAVVY',
    'MoneyZG',
    'SheldonEvansx',
    'MMCryptoTube',
    'JasonPizzinoOfficial',
    'FUDTV',
    'BrianJungy',
    'CryptoBanterGroup',
    'JamieTree',
    'UCjemQfjaXAzA-95RKoy9n_g',
    'INEVITRADE',
    'CoinBureau',
    'TheCryptoLark',
]

# One entry per processed video from the last 7 days, across all channels.
data = []
for channel in channels:
    data.extend(process_channel(channel, days=7))

Now processing channel: MyFinancialFriend
Subtitles found locally for video: jWbUaR2VyDo
Subtitles found locally for video: s7FcrGEfOKY
Subtitles found locally for video: j-_c5je1whY
Subtitles found locally for video: voisKBy0DiA
Subtitles found locally for video: LHq_Cyuw3e0
Subtitles found locally for video: ivBZNF22XgQ
Subtitles found locally for video: rD1PvJLhm5g
Subtitles found locally for video: RK0MrcKCwCE
Subtitles found locally for video: cIjjA6B_Opw
Subtitles found locally for video: Mv5PGtt_pD4
Subtitles found locally for video: qy_hE0KhKe4
Subtitles found locally for video: f65dTEmPv70
Now processing channel: thekaleshow
Subtitles processed: 8mFfGRjDr7k
Subtitles found locally for video: WuTM4h_u58c
Subtitles processed: TxRAux9Otyo
Subtitles found locally for video: 4m6ndv8vRJI
Subtitles found locally for video: D5RvMGj4IeU
Subtitles found locally for video: 5ts98CchG20
Now processing channel: AlexBeckersChannel
Subtitles found locally for video: jLdF2qt_uWQ
Subtitles foun

In [19]:
import pandas as pd

# Coin names to look for; matched case-insensitively as substrings of a sentence.
coins = [
    'Bitcoin', 'Ethereum', 'Cardano', 'Solana', 'Polygon', 'Litecoin', 'Binance coin', 'Sandbox', 'Decentraland',
    'Dogecoin', 'Shiba Inu', 'Axis', 'Polkadot',
]

# mentions[coin][channel] -> list of lower-cased sentences in which the coin
# name has been replaced by its upper-cased, bracketed form.
mentions = {coin: {} for coin in coins}

for video in data:
    for sentence in video['subtitles']:
        lowered = sentence.lower()
        for coin in mentions:
            if coin.lower() in lowered:
                highlighted = lowered.replace(coin.lower(), f"[{coin.upper()}]")
                mentions[coin].setdefault(video['channel'], []).append(highlighted)

# Summary table: number of channels mentioning each coin and total mentions.
# The loop variable is deliberately NOT named `channels`: that would rebind the
# notebook's global list of channel names to a dict and break a re-run of the
# channel-processing cell.
mentions_data = []
for coin, coin_channels in mentions.items():
    mentions_data.append({'coin': coin, 'num_channels': len(coin_channels),
                          'total_mentions': len(all_coin_mentions(coin, mentions))})
pd.DataFrame(mentions_data)

Unnamed: 0,coin,num_channels,total_mentions
0,Bitcoin,16,1274
1,Ethereum,16,619
2,Cardano,15,224
3,Solana,15,196
4,Polygon,10,80
5,Litecoin,6,12
6,Binance coin,3,5
7,Sandbox,13,97
8,Decentraland,10,52
9,Dogecoin,5,12


In [20]:
# Print every recorded Bitcoin sentence from all channels, separated by blank lines.
print('\n\n'.join(all_coin_mentions('Bitcoin', mentions)))

heck [BITCOIN]s holding up really well compared to a lot of these other a lot of these other stocks.

i mean heck the nasdaq is down as much as [BITCOIN] in the last 24 hours.

so i wouldn't be surprised if [BITCOIN] fell another thousand dollars or something like that.

but i'm not banking on that either if we see [BITCOIN] fall down to fifty three thousand seven hundred fifty three thousand eight hundred.

i think that i will be picking up some more [BITCOIN].

[BITCOIN] goes up to eighty thousand dollars.

you might feel fomo that you didn't have it in [BITCOIN].

sam today we are looking at what just happened in the market because we saw huge sell-offs 24-hour sell-offs and [BITCOIN] or 20 20 sell-offs in [BITCOIN] more than that in a lot of other cryptocurrencies.

now we have [BITCOIN] that spike down on this chart.

el salvador bought 150 more [BITCOIN] hey they are looking long-term on this.

everything being adopted every single bank wants a piece of [BITCOIN] or crypto and th