In [None]:
!pip install pandas selenium webdriver_manager aiohttp

In [None]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from IPython.display import display, clear_output


def init_driver():
    """Create and return a Chrome WebDriver.

    The browser is started with the ``--headless``, ``--no-sandbox`` and
    ``--disable-dev-shm-usage`` command-line flags, added in that order.
    """
    opts = Options()
    for flag in ("--headless", "--no-sandbox", "--disable-dev-shm-usage"):
        opts.add_argument(flag)
    return webdriver.Chrome(options=opts)

def fetch_html(url):
    """Load ``url`` in the module-level ``driver`` and return the page source.

    Any exception raised while loading the page or reading its source is
    printed and swallowed; in that case ``None`` is returned.
    """
    try:
        driver.get(url)
        return driver.page_source
    except Exception as e:
        print(f"Error fetching {url}: {e}")
        return None


# Load the dataset that the fetch loop below fills in.
df = pd.read_parquet('torrchef-dataset.parquet')
# For a quick smoke test, limit the run to a few rows:
# df = df.head(20)

# Rows still to fetch: no stored HTML yet, but a listing URL is available.
missing_html = df['IndexerHTMLSource'].isna()
has_listing_url = df['IndexerTorrentListingUrl'].notna()
to_fetch = df[missing_html & has_listing_url]


def update_html(index, url):
    """Fetch ``url`` and return ``(index, html_source)``.

    ``index`` is passed through unchanged so the caller can match the result
    to its row; ``html_source`` is whatever ``fetch_html`` returns.
    """
    return index, fetch_html(url)

# Every task in this cell drives the same WebDriver session. A session can only
# show one page at a time and WebDriver is not thread-safe: with several
# workers, one thread's get() can land between another thread's get() and its
# page_source read, so HTML ends up stored against the wrong row. The pool is
# therefore limited to a single worker; real parallelism needs one driver per
# worker thread.
MAX_WORKERS = 1

driver = init_driver()

total_tasks = len(to_fetch)
completed_tasks = 0
start_time = time.time()

try:
    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        # Iterate the URL column directly; iterrows() would build a full Series
        # per row just to read one field.
        future_to_index = {
            executor.submit(update_html, index, url): index
            for index, url in to_fetch['IndexerTorrentListingUrl'].items()
        }

        for future in as_completed(future_to_index):
            index = future_to_index[future]
            try:
                index, html_source = future.result()
                df.at[index, 'IndexerHTMLSource'] = html_source
                completed_tasks += 1
                elapsed_time = time.time() - start_time
                rate_per_minute = (completed_tasks / elapsed_time) * 60
                clear_output(wait=True)
                # Report the configured worker count instead of reading the
                # executor's private _max_workers attribute.
                display_msg = f"Progress: {completed_tasks}/{total_tasks} sites completed - Fetch Rate: {rate_per_minute:.2f} sites/min - Max Workers: {MAX_WORKERS}"
                display(display_msg)
            except Exception as e:
                print(f"Error updating index {index}: {e}")

    df.to_parquet('withIndexerHtmlSource.parquet', index=False)
finally:
    # Always shut the browser down, even if the run fails or is interrupted.
    driver.quit()

In [None]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from IPython.display import display, clear_output

def init_driver():
    """Create and return a Chrome WebDriver.

    The browser is started with the ``--headless``, ``--no-sandbox`` and
    ``--disable-dev-shm-usage`` command-line flags, added in that order.
    """
    opts = Options()
    for flag in ("--headless", "--no-sandbox", "--disable-dev-shm-usage"):
        opts.add_argument(flag)
    return webdriver.Chrome(options=opts)

def fetch_html(driver, url):
    """Load ``url`` with ``driver`` and return the resulting page source.

    Args:
        driver: object exposing ``get(url)`` and a ``page_source`` attribute.
        url: address to load.

    Returns:
        The value of ``driver.page_source`` after the load, or ``None`` if
        loading the page or reading its source raised (the error is printed).
    """
    try:
        driver.get(url)
        return driver.page_source
    except Exception as e:
        print(f"Error fetching {url}: {e}")
        return None

def update_html(driver, index, url):
    """Fetch ``url`` with ``driver`` and return ``(index, html_source)``.

    ``index`` is passed through unchanged so the caller can match the result
    to its row; ``html_source`` is whatever ``fetch_html`` returns.
    """
    return index, fetch_html(driver, url)

# Every task in this cell is handed the same WebDriver session. A session can
# only show one page at a time and WebDriver is not thread-safe: with several
# workers, one thread's get() can land between another thread's get() and its
# page_source read, so HTML ends up stored against the wrong row. The pool is
# therefore limited to a single worker; real parallelism needs one driver per
# worker thread.
MAX_WORKERS = 1

df = pd.read_parquet('torrchef-dataset.parquet')

# Rows still to fetch: no stored HTML yet, but a listing URL is available.
to_fetch = df[df['IndexerHTMLSource'].isna() & df['IndexerTorrentListingUrl'].notna()]

driver = init_driver()

total_tasks = len(to_fetch)
completed_tasks = 0
start_time = time.time()

# Seed the message so the final print is meaningful even when nothing needs
# fetching (previously this displayed and later printed a bare "None").
display_msg = f"Progress: 0/{total_tasks} tasks completed"

try:
    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        # Iterate the URL column directly; iterrows() would build a full Series
        # per row just to read one field.
        futures = {
            executor.submit(update_html, driver, index, url): index
            for index, url in to_fetch['IndexerTorrentListingUrl'].items()
        }

        for future in as_completed(futures):
            index = futures[future]
            try:
                index, html_source = future.result()
                df.at[index, 'IndexerHTMLSource'] = html_source
                completed_tasks += 1
                elapsed_time = time.time() - start_time
                rate_per_minute = (completed_tasks / elapsed_time) * 60
                clear_output(wait=True)
                display_msg = f"Progress: {completed_tasks}/{total_tasks} tasks completed - Rate: {rate_per_minute:.2f} tasks/min"
                display(display_msg)
            except Exception as e:
                print(f"Error updating index {index}: {e}")

    clear_output(wait=True)
    print(display_msg)

    df.to_parquet('withIndexerHtmlSource.parquet', index=False)
finally:
    # Always shut the browser down, even if the run fails or is interrupted.
    driver.quit()


In [1]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from concurrent.futures import ThreadPoolExecutor, as_completed
from IPython.display import display, clear_output
import time
import threading

# Per-thread storage: get_driver() keeps each thread's WebDriver on this object
# under the attribute name "driver".
thread_local = threading.local()

def init_driver():
    """Create and return a Chrome WebDriver.

    The browser is started with the ``--headless``, ``--no-sandbox`` and
    ``--disable-dev-shm-usage`` command-line flags, added in that order.
    """
    opts = Options()
    for flag in ("--headless", "--no-sandbox", "--disable-dev-shm-usage"):
        opts.add_argument(flag)
    return webdriver.Chrome(options=opts)

def get_driver():
    """Return the calling thread's WebDriver, creating it on first use.

    The driver is cached as ``thread_local.driver``, so repeated calls from
    the same thread reuse one browser instance.
    """
    cached = getattr(thread_local, "driver", None)
    if cached is None:
        cached = init_driver()
        thread_local.driver = cached
    return cached

def fetch_html(url):
    """Load ``url`` in the calling thread's driver and return the page source.

    The driver comes from ``get_driver()``, which is called outside the
    ``try`` block, so a failure to obtain a driver propagates to the caller.
    Errors raised while loading the page or reading its source are printed
    and swallowed; ``None`` is returned in that case.
    """
    browser = get_driver()
    try:
        browser.get(url)
        return browser.page_source
    except Exception as e:
        print(f"Error fetching {url}: {e}")
        return None

def update_html(index, url):
    """Fetch ``url`` and return ``(index, html_source)``.

    ``index`` is passed through unchanged so the caller can match the result
    to its row; ``html_source`` is whatever ``fetch_html`` returns.
    """
    return index, fetch_html(url)

# Number of worker threads; each one starts its own Chrome via get_driver().
MAX_WORKERS = 36

df = pd.read_parquet('torrchef-dataset.parquet')

# Rows still to fetch: no stored HTML yet, but a listing URL is available.
to_fetch = df[df['IndexerHTMLSource'].isna() & df['IndexerTorrentListingUrl'].notna()]

total_tasks = len(to_fetch)
completed_tasks = 0
start_time = time.time()

# Seed the message so the final print is meaningful even when nothing needs
# fetching (previously this displayed and later printed a bare "None").
display_msg = f"Progress: 0/{total_tasks} tasks completed"

# The drivers live in thread-local storage, so the main thread cannot reach
# them through `thread_local` (the attribute only exists in the thread that set
# it). Each worker's driver is therefore also recorded here for shutdown.
worker_drivers = []
worker_drivers_lock = threading.Lock()


def _update_html_tracked(index, url):
    """Run update_html, first registering this thread's driver for cleanup."""
    if not getattr(thread_local, "driver_tracked", False):
        # A failure to start the browser propagates through future.result(),
        # exactly as it did when fetch_html called get_driver() itself.
        worker_driver = get_driver()
        with worker_drivers_lock:
            worker_drivers.append(worker_driver)
        thread_local.driver_tracked = True
    return update_html(index, url)


try:
    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        # Iterate the URL column directly; iterrows() would build a full Series
        # per row just to read one field.
        futures = {
            executor.submit(_update_html_tracked, index, url): index
            for index, url in to_fetch['IndexerTorrentListingUrl'].items()
        }

        for future in as_completed(futures):
            index = futures[future]
            try:
                index, html_source = future.result()
                df.at[index, 'IndexerHTMLSource'] = html_source
                completed_tasks += 1
                elapsed_time = time.time() - start_time
                rate_per_minute = (completed_tasks / elapsed_time) * 60
                clear_output(wait=True)
                display_msg = f"Progress: {completed_tasks}/{total_tasks} tasks completed - Rate: {rate_per_minute:.2f} tasks/min"
                display(display_msg)
            except Exception as e:
                print(f"Error updating index {index}: {e}")

    clear_output(wait=True)
    print(display_msg)

    df.to_parquet('withIndexerHtmlSource.parquet', index=False)
finally:
    # Quit every browser the workers started. The old loop looked up
    # thread_local.driver from the main thread, found nothing, and left all
    # Chrome processes running.
    for worker_driver in worker_drivers:
        try:
            worker_driver.quit()
        except Exception as e:
            print(f"Error quitting driver: {e}")
    worker_drivers.clear()


Progress: 614090/614090 tasks completed - Rate: 321.73 tasks/min
