In [None]:
import os
import re
import requests
from bs4 import BeautifulSoup
import time
from urllib.parse import unquote

In [None]:
# Script configuration (module-level values read by the functions below).
input_dir = 'Pendings'  # folder holding the URL list and the error log
output_dir = 'D:/OneDrive/Downloads'  # folder downloaded files are written to
txt_file = 'DOWNLOAD.txt'  # file inside input_dir, read one URL per line
error_log_file = 'DOWNLOAD_ERRORS.txt'  # file inside input_dir; failed URLs are appended
timeout_seconds = 30  # passed as the `timeout` argument of every session.get call

In [None]:
# Function to extract filename from Content-Disposition header
def get_filename_from_cd(content_disposition):
    """Return the file name advertised by a Content-Disposition header.

    Both the plain ``filename=`` parameter and the extended ``filename*=``
    parameter (optionally prefixed with ``UTF-8''``) are recognised. Servers
    frequently send both in the same header; in that case the ``filename*``
    value wins, otherwise the first ``filename`` value is used.

    Args:
        content_disposition: Raw header value, or ``None``/empty if absent.

    Returns:
        The percent-decoded, whitespace-stripped file name, or ``None`` when
        the header is missing or does not name a file.
    """
    if not content_disposition:
        return None

    # Group 1 records whether the parameter was the extended ``filename*``
    # form; group 2 is the (still percent-encoded) value.
    params = re.findall(
        r'filename(\*?)\s*=\s*["\']?(?:UTF-8\'\')?([^"\';]+)["\']?',
        content_disposition,
        flags=re.IGNORECASE,
    )
    if not params:
        return None

    extended = [value for star, value in params if star]
    chosen = extended[0] if extended else params[0][1]
    # An all-whitespace value carries no usable name; report it as missing.
    return unquote(chosen.strip()) or None

# Derive a file name from the last path segment of a URL
def extract_filename(url):
    """Return the percent-decoded final path segment of *url*.

    Everything from the first ``?`` of that segment onwards is discarded
    before decoding.
    """
    last_segment = url.rsplit('/', 1)[-1]
    bare_name, _, _ = last_segment.partition('?')
    return unquote(bare_name)

def _sanitize_filename(name, fallback='download'):
    """Reduce *name* to a single path component that is safe to create."""
    # Path separators, characters Windows forbids in file names and control
    # characters are replaced so the name can neither escape the output
    # directory nor make open() fail.
    cleaned = re.sub(r'[\\/:*?"<>|\x00-\x1f]', '_', name).strip(' .')
    return cleaned or fallback


# Modified download_file function to accept title_with_format
def download_file(url, session, output_dir, timeout, title_with_format):
    """Stream *url* into *output_dir* and return the file name used.

    The name comes from the response's Content-Disposition header when it
    provides one, otherwise from *title_with_format*. In both cases it is
    sanitised into a single safe path component before use.

    Args:
        url: Address to download.
        session: Object exposing ``get(url, stream=..., timeout=...)``
            (a ``requests.Session`` in this script).
        output_dir: Directory the file is written to.
        timeout: Timeout forwarded to ``session.get``.
        title_with_format: Fallback file name.

    Returns:
        The file name on success (including when the file already exists),
        or ``None`` if the request or the write failed.
    """
    part_path = None
    try:
        with session.get(url, stream=True, timeout=timeout) as r:
            r.raise_for_status()
            # Use title_with_format as filename if Content-Disposition header is not available
            filename = get_filename_from_cd(r.headers.get('content-disposition'))
            if not filename:
                filename = title_with_format
            filename = _sanitize_filename(filename)

            file_path = os.path.join(output_dir, filename)
            if os.path.exists(file_path):
                print(f"File already exists: {filename}")
                return filename

            # Write to a temporary name first so an interrupted transfer never
            # leaves a truncated file that a later run would treat as complete.
            part_path = file_path + '.part'
            with open(part_path, 'wb') as f:
                for chunk in r.iter_content(chunk_size=8192):
                    f.write(chunk)
            os.replace(part_path, file_path)
            part_path = None
            return filename
    except (requests.exceptions.RequestException, OSError) as e:
        # OSError covers local failures (bad path, full disk) so one bad file
        # does not abort the whole batch.
        print(f"Error downloading {url}: {e}")
        return None
    finally:
        if part_path is not None:
            try:
                os.remove(part_path)
            except OSError:
                # Best-effort cleanup: the partial file may never have been
                # created, or may already be gone.
                pass

# Collect the IPFS links found on a LibGen page (request bounded by a timeout)
def extract_ipfs_links(libgen_url, session, timeout):
    """Return every anchor ``href`` on the page that contains ``'ipfs'``.

    An empty list is returned when the response status is not 200 or when
    the request raises a ``requests`` exception (which is also printed).
    """
    try:
        page = session.get(libgen_url, timeout=timeout)
        if page.status_code != 200:
            return []

        parsed = BeautifulSoup(page.content, 'html.parser')
        found = []
        for anchor in parsed.find_all('a', href=True):
            target = anchor['href']
            if 'ipfs' in target:
                found.append(target)
        return found
    except requests.exceptions.RequestException as e:
        print(f"Error accessing {libgen_url}: {e}")
        return []

# Attempt each IPFS link in turn until one download succeeds
def try_download_from_ipfs(ipfs_links, session, timeout, title_with_format):
    """Return the file name of the first successful download, else ``None``.

    Links are tried in order through ``download_file`` (writing into the
    module-level ``output_dir``); later links are not requested once one
    succeeds.
    """
    # The generator is lazy, so download_file runs only until the first
    # truthy result is produced.
    attempts = (
        download_file(link, session, output_dir, timeout, title_with_format)
        for link in ipfs_links
    )
    return next((saved_name for saved_name in attempts if saved_name), None)

def extract_ipfs_links_and_title(libgen_url, session, timeout):
    """Fetch a LibGen page and return its IPFS links plus a fallback file name.

    Args:
        libgen_url: Address of the page to scrape.
        session: Object exposing ``get(url, timeout=...)``
            (a ``requests.Session`` in this script).
        timeout: Timeout forwarded to ``session.get``.

    Returns:
        A ``(ipfs_links, title_with_format)`` tuple. ``ipfs_links`` holds every
        anchor ``href`` containing ``'ipfs'``; ``title_with_format`` is
        ``"<title>.<format>"`` built from the first ``<h1>`` and the first
        ``<p>`` containing ``"Format:"``. When the page cannot be fetched the
        result is ``([], 'Unknown_Title.pdf')``.
    """
    try:
        response = session.get(libgen_url, timeout=timeout)
        if response.status_code != 200:
            # Report the failure instead of silently returning nothing.
            print(f"Unexpected status {response.status_code} for {libgen_url}")
            return [], 'Unknown_Title.pdf'

        soup = BeautifulSoup(response.content, 'html.parser')
        ipfs_links = [a['href'] for a in soup.find_all('a', href=True) if 'ipfs' in a['href']]

        # Extract title and format from the page (This part needs to be customized based on the actual page structure)
        title_tag = soup.find('h1')  # Example, adjust based on actual page structure
        # Collapse newlines/indentation inside the heading: the title becomes a
        # file name, and raw whitespace runs make it invalid or unwieldy.
        title = ' '.join(title_tag.text.split()) if title_tag else ''
        if not title:
            title = 'Unknown_Title'

        format_tag = soup.find(lambda tag: tag.name == "p" and "Format:" in tag.text)
        book_format = format_tag.text.split("Format:")[-1].strip() if format_tag else ''
        if not book_format:
            book_format = 'pdf'  # Default to 'pdf' if format is unknown or empty

        return ipfs_links, f"{title}.{book_format}"
    except requests.exceptions.RequestException as e:
        print(f"Error accessing {libgen_url}: {e}")
        return [], 'Unknown_Title.pdf'

def main():
    """Download every URL listed in the input file, logging the failures.

    Reads ``txt_file`` from ``input_dir`` (one URL per line), scrapes each
    page for IPFS links and tries them in order. URLs for which no download
    succeeds are appended to ``error_log_file`` in ``input_dir``. Two seconds
    are waited after each URL.
    """
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(output_dir, exist_ok=True)

    with open(os.path.join(input_dir, txt_file), 'r') as file:
        # Ignore blank lines and stray whitespace so they are not requested
        # (and then logged) as if they were URLs.
        urls = [line.strip() for line in file if line.strip()]

    with requests.Session() as session, \
            open(os.path.join(input_dir, error_log_file), 'a') as error_log:
        for url in urls:
            print(f"Starting download for {url}")
            ipfs_links, title_with_format = extract_ipfs_links_and_title(url, session, timeout_seconds)
            filename = try_download_from_ipfs(ipfs_links, session, timeout_seconds, title_with_format)
            if not filename:
                print(f"Failed to download, logging: {url}")
                error_log.write(url + '\n')
                # Flush immediately so the entry survives a later crash.
                error_log.flush()
            time.sleep(2)

In [None]:
# Run the downloader over every URL listed in the input file.
main()