In [19]:
import time
from urllib.parse import quote_plus

import pandas as pd  # Import pandas for DataFrame functionality
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

# Driver service: ChromeDriverManager().install() supplies the value handed to Service
service = Service(ChromeDriverManager().install())

# Launch flags passed to Chrome
options = Options()
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')

# Start the browser session shared by the scraping function and the cleanup call below
driver = webdriver.Chrome(options=options, service=service)

def _parse_subscriber_count(label):
    """Convert a subscriber label such as ``'661K subscribers'`` to a number.

    Args:
        label: Text of the subscriber element, e.g. ``'2.01M subscribers'``.

    Returns:
        The count as a float, or ``None`` when the leading token is missing
        or not numeric (e.g. ``'No subscribers'``), so the caller can skip the
        entry instead of crashing.
    """
    multipliers = {'k': 1_000, 'm': 1_000_000, 'b': 1_000_000_000}

    parts = label.split()
    if not parts:
        return None

    # Drop thousands separators so "1,234" parses as 1234.
    token = parts[0].replace(',', '')
    if not token:
        return None

    # Only strip the last character when it really is a magnitude suffix;
    # a plain "532" must keep its final digit.
    scale = multipliers.get(token[-1].lower())
    if scale is None:
        scale = 1
    else:
        token = token[:-1]

    try:
        return float(token) * scale
    except ValueError:
        return None


def search_youtube_channels(query, min_subscribers=50000, scroll_count=10,
                            scroll_pause=3):
    """Search YouTube for channels and keep those above a subscriber threshold.

    Loads the YouTube results page for ``query`` in the module-level
    ``driver``, scrolls to trigger loading of further results, then parses the
    rendered HTML with BeautifulSoup.

    Args:
        query: Free-text search term; it is URL-encoded before use.
        min_subscribers: Channels are kept only when their subscriber count is
            strictly greater than this value (default 50000).
        scroll_count: Number of scroll-to-bottom steps performed to load more
            results (default 10).
        scroll_pause: Seconds to wait after each scroll (default 3).

    Returns:
        A pandas DataFrame with the columns ``Channel``, ``Subscribers`` and
        ``Link``; the columns are present even when no channel qualifies.
    """
    # NOTE(review): the `sp` token presumably restricts results to channels -
    # confirm; it is kept exactly as in the original request.
    base_url = (
        "https://www.youtube.com/results"
        f"?search_query={quote_plus(query)}&sp=EgIQAg%253D%253D"
    )
    driver.get(base_url)
    time.sleep(5)  # Let the page load

    # Scroll down the page to load more results
    for _ in range(scroll_count):
        driver.execute_script(
            "window.scrollTo(0, document.documentElement.scrollHeight);")
        time.sleep(scroll_pause)

    # Parse the page content using BeautifulSoup
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    channels = soup.find_all(
        'div', class_='style-scope ytd-channel-renderer', id='info')

    channel_data = []  # List to hold the data

    for channel in channels:
        metadata = channel.find('div', id="metadata")
        if metadata is None:
            # Entry without a metadata container: nothing to extract.
            continue

        # In the captured output the '#subscribers' element yields the
        # @handle and '#video-count' yields the "<n> subscribers" label,
        # hence the local names below.
        handle_element = metadata.find('yt-formatted-string', id='subscribers')
        subscriber_element = channel.find('span', id='video-count')
        if handle_element is None or subscriber_element is None:
            continue

        subscribers = subscriber_element.text.strip()
        sub_count = _parse_subscriber_count(subscribers)
        if sub_count is None or sub_count <= min_subscribers:
            continue

        # The enclosing 'a' tag carries the channel's relative URL.
        link_element = handle_element.find_parent('a')
        if link_element is None or 'href' not in link_element.attrs:
            continue

        channel_data.append({
            "Channel": handle_element.text.strip(),
            "Subscribers": subscribers,
            "Link": f"https://www.youtube.com{link_element['href']}"
        })

    # Fix the column set so an empty result still has the expected schema.
    return pd.DataFrame(channel_data, columns=["Channel", "Subscribers", "Link"])


# Run the search; quit the WebDriver in `finally` so the browser session is
# closed even when scraping raises part-way through.
try:
    df = search_youtube_channels("Australian channels")
finally:
    driver.quit()  # Close the WebDriver
df

Unnamed: 0,Channel,Subscribers,Link
0,@abcnewsaustralia,2.01M subscribers,https://www.youtube.com/@abcnewsaustralia
1,@cricketcomau,9.23M subscribers,https://www.youtube.com/@cricketcomau
2,@DiscoveryAustralia,661K subscribers,https://www.youtube.com/@DiscoveryAustralia
3,@GardeningAustralia,232K subscribers,https://www.youtube.com/@GardeningAustralia
4,@7news,1.68M subscribers,https://www.youtube.com/@7news
...,...,...,...
144,@CosmicKidsYoga,1.58M subscribers,https://www.youtube.com/@CosmicKidsYoga
145,@natachaoceane,1.68M subscribers,https://www.youtube.com/@natachaoceane
146,@MiddleEastEye,2.03M subscribers,https://www.youtube.com/@MiddleEastEye
147,@DAVINEJAYRUGRAT,513K subscribers,https://www.youtube.com/@DAVINEJAYRUGRAT


In [20]:
# Write the scraped channels to a CSV file; index=False leaves out the row index
df.to_csv(path_or_buf='youtube_channels.csv', index=False)
