In [119]:
from enum import Enum

import re
import pandas as pd
from tqdm import tqdm 
from pathlib import Path
from pytube import YouTube
from datetime import datetime
from urlextract import URLExtract

from googleapiclient.discovery import build

In [120]:
DEVELOPER_KEY = 'AIzaSyBqyzdb7oagxtoIQz08FimfidlbIi9awn0'
API_SERVICE_NAME = 'youtube'
API_VERSION = 'v3'

def with_youtube(func):
    def wrapper(*args, **kwargs):
        with build(API_SERVICE_NAME, API_VERSION, developerKey=DEVELOPER_KEY) as yt:
            return func(*args, **kwargs, yt=yt)
    return wrapper    

In [121]:
YT_ROOT_URL = "https://www.youtube.com/watch?v="

def get_description(id):
    video = YouTube(f"{YT_ROOT_URL}{id}")
    video.bypass_age_gate()
    return video.description

In [122]:
"""
https://stackoverflow.com/questions/17681670/extract-email-sub-strings-from-large-document
"""

valid_email_regex = re.compile(
    r"(?i)"  # Case-insensitive matching
    r"(?:[A-Z0-9!#$%&'*+/=?^_`{|}~-]+"  # Unquoted local part
    r"(?:\.[A-Z0-9!#$%&'*+/=?^_`{|}~-]+)*"  # Dot-separated atoms in local part
    r"|\"(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21\x23-\x5b\x5d-\x7f]"  # Quoted strings
    r"|\\[\x01-\x09\x0b\x0c\x0e-\x7f])*\")"  # Escaped characters in local part
    r"@"  # Separator
    r"[A-Z0-9](?:[A-Z0-9-]*[A-Z0-9])?"  # Domain name
    r"\.(?:[A-Z0-9](?:[A-Z0-9-]*[A-Z0-9])?)+"  # Top-level domain and subdomains
)

def isValid(email):
    """Check if the given email address is valid."""
    return True if re.fullmatch(valid_email_regex, email) else False

def get_emails(s: str):
    emails = re.findall(r'[\w.+-]+@[\w-]+\.[\w.-]+', s)
    return [(email, isValid(email)) for email in emails] 

In [123]:
"""
https://stackoverflow.com/questions/9760588/how-do-you-extract-a-url-from-a-string-using-python
"""
extractor = URLExtract()

class LinkType(Enum):
    EMAIL = 'email'
    INSTA = 'insta'
    OTHER = 'other'

def get_url_type(url: str):
    return LinkType.INSTA.value if 'instagram' in url else LinkType.OTHER.value

def get_urls(s: str):
    urls = extractor.find_urls(s)
    return [(url, get_url_type(url)) for url in urls if url.startswith("http")]

In [125]:
@with_youtube
def youtube_search(q, max_results=5, yt=None):
    
    res = yt.search().list(
        q=q,
        part='id,snippet',
        maxResults=max_results
    ).execute()

    videos = pd.DataFrame(columns=['title', 'published', 'id'])
    links = pd.DataFrame(columns=['title', 'id', 'link', 'type','valid'])

    for search_result in tqdm(res.get('items', [])):
        snippet = search_result['snippet']
        title = snippet['title']
        published = snippet['publishedAt']
        kind = search_result['id']['kind'].split('#')[-1]
        id = search_result['id'][{
            'video': 'videoId',
            'channel': 'channelId',
            'playlist': 'playlistId'
        }[kind]]

        if kind == 'video':
            videos.loc[len(videos)] = [title, published, id]
            
            desc = get_description(id)
            emails = get_emails(desc)
            urls = get_urls(desc)

            for email, valid in emails:
                links.loc[len(links)] = [title, id, email, LinkType.EMAIL.value, valid]
            
            for url, link_type in urls:
                links.loc[len(links)] = [title, id, url, link_type, True]

        elif kind == 'channel':
            pass

        elif kind == 'playlist':
            pass

    return videos, links


In [126]:
query = 'ninho type beat'
videos, links = youtube_search(query)
videos

100%|██████████| 5/5 [00:01<00:00,  4.41it/s]


Unnamed: 0,title,published,id
0,Ninho Type Beat | &quot;FACILE&quot; 🧨 Instru ...,2022-09-23T16:00:31Z,XcTtSn8hd40
1,[FREE] &quot;Mérite&quot; | SDM x Ninho Type B...,2023-05-28T19:00:10Z,T6CXEZR1N5g
2,Ninho Type Beat | &quot;PATRON&quot; ☎️ Instru...,2023-10-04T16:00:32Z,S6cVLLPHomc
3,(FREE) Ninho x Timal x Zkr Type Beat &quot;Arr...,2023-01-27T04:08:29Z,Xy1Z0eSUPaQ
4,[FREE] Ninho x Tiakola Type Beat - &quot;UN JO...,2024-02-11T17:00:07Z,X1CO_5XMvXQ


In [127]:
links

Unnamed: 0,title,id,link,type,valid
0,Ninho Type Beat | &quot;FACILE&quot; 🧨 Instru ...,XcTtSn8hd40,Kaneda.track@gmail.com,email,True
1,Ninho Type Beat | &quot;FACILE&quot; 🧨 Instru ...,XcTtSn8hd40,https://bsta.rs/a7e83deec,other,True
2,Ninho Type Beat | &quot;FACILE&quot; 🧨 Instru ...,XcTtSn8hd40,https://kanedabeatss.beatstars.com,other,True
3,Ninho Type Beat | &quot;FACILE&quot; 🧨 Instru ...,XcTtSn8hd40,https://www.instagram.com/kanedaaa_/?hl=fr,insta,True
4,[FREE] &quot;Mérite&quot; | SDM x Ninho Type B...,T6CXEZR1N5g,BrokenBeatsProd@gmail.com,email,True
5,[FREE] &quot;Mérite&quot; | SDM x Ninho Type B...,T6CXEZR1N5g,https://bsta.rs/371c0bced,other,True
6,[FREE] &quot;Mérite&quot; | SDM x Ninho Type B...,T6CXEZR1N5g,https://brokenbeatsprod.beatstars.com/,other,True
7,[FREE] &quot;Mérite&quot; | SDM x Ninho Type B...,T6CXEZR1N5g,https://www.instagram.com/brokenbeatsprod_,insta,True
8,[FREE] &quot;Mérite&quot; | SDM x Ninho Type B...,T6CXEZR1N5g,https://soundcloud.com/brokenbeatsprod,other,True
9,Ninho Type Beat | &quot;PATRON&quot; ☎️ Instru...,S6cVLLPHomc,Kaneda.track@gmail.com,email,True


In [128]:
links[links.type == LinkType.EMAIL.value]

Unnamed: 0,title,id,link,type,valid
0,Ninho Type Beat | &quot;FACILE&quot; 🧨 Instru ...,XcTtSn8hd40,Kaneda.track@gmail.com,email,True
4,[FREE] &quot;Mérite&quot; | SDM x Ninho Type B...,T6CXEZR1N5g,BrokenBeatsProd@gmail.com,email,True
9,Ninho Type Beat | &quot;PATRON&quot; ☎️ Instru...,S6cVLLPHomc,Kaneda.track@gmail.com,email,True
13,(FREE) Ninho x Timal x Zkr Type Beat &quot;Arr...,Xy1Z0eSUPaQ,prod.vlad.d@gmail.com,email,True
15,[FREE] Ninho x Tiakola Type Beat - &quot;UN JO...,X1CO_5XMvXQ,theswitsher71@gmail.com,email,True


In [129]:
out_dir = Path('.') / 'out'
out_dir.mkdir(exist_ok=True)

In [136]:
now = datetime.now()
now = now.strftime("%d-%m-%Y_%H-%M-%S")
folder = out_dir / f'{now}_{"-".join(query.split(" "))}'
folder.mkdir()

In [137]:
videos.to_csv(folder / 'videos.csv')
links.to_csv(folder / 'links.csv')

In [148]:
l = links.drop(['title'], axis=1).loc[links.type == LinkType.EMAIL.value]
table = pd.merge(videos, l, on='id')
print(table)
print('\n'.join(table['link']))

                                               title             published  \
0  Ninho Type Beat | &quot;FACILE&quot; 🧨 Instru ...  2022-09-23T16:00:31Z   
1  [FREE] &quot;Mérite&quot; | SDM x Ninho Type B...  2023-05-28T19:00:10Z   
2  Ninho Type Beat | &quot;PATRON&quot; ☎️ Instru...  2023-10-04T16:00:32Z   
3  (FREE) Ninho x Timal x Zkr Type Beat &quot;Arr...  2023-01-27T04:08:29Z   
4  [FREE] Ninho x Tiakola Type Beat - &quot;UN JO...  2024-02-11T17:00:07Z   

            id                       link   type  valid  
0  XcTtSn8hd40     Kaneda.track@gmail.com  email   True  
1  T6CXEZR1N5g  BrokenBeatsProd@gmail.com  email   True  
2  S6cVLLPHomc     Kaneda.track@gmail.com  email   True  
3  Xy1Z0eSUPaQ      prod.vlad.d@gmail.com  email   True  
4  X1CO_5XMvXQ    theswitsher71@gmail.com  email   True  
Kaneda.track@gmail.com
BrokenBeatsProd@gmail.com
Kaneda.track@gmail.com
prod.vlad.d@gmail.com
theswitsher71@gmail.com
