# In [None]:
import os
import requests
from zipfile import ZipFile
from io import BytesIO
from bs4 import BeautifulSoup
from urllib.parse import urljoin

# Base URL of the 3GPP archive listing page; each subfolder linked from this
# page is visited by process_subfolders().
base_url = "https://www.3gpp.org/ftp/Specs/archive/23_series"

# Function to download a zip archive and extract its contents
def download_and_extract_word_docs(zip_url, output_folder, timeout=30):
    """Download the zip archive at *zip_url* and unpack it into *output_folder*.

    Every member of the archive is extracted; no filtering by file type is
    applied, so non-Word files contained in the archive are written as well.
    The whole archive is held in memory while it is being unpacked.

    Args:
        zip_url: URL of the zip archive to download.
        output_folder: Directory the archive members are extracted into.
        timeout: Seconds to wait for the server (connect / between bytes)
            before giving up. Prevents a stalled connection from hanging
            the caller forever.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On any other network failure, including
            a timeout.
        zipfile.BadZipFile: If the downloaded payload is not a zip archive.
    """
    response = requests.get(zip_url, timeout=timeout)
    # Surface the real HTTP failure instead of a confusing BadZipFile raised
    # while trying to parse an HTML error page as an archive.
    response.raise_for_status()

    with ZipFile(BytesIO(response.content)) as zip_file:
        zip_file.extractall(output_folder)

# Function to process subfolders and extract Word documents
def process_subfolders(base_url, output_folder, timeout=30):
    """Visit every subfolder linked from the listing page at *base_url*.

    The page is scanned for table rows (``<tr>``); for each row that has at
    least two ``<td>`` cells and a link in the second cell, the link target
    is handed to ``process_zip_folders`` together with *output_folder*.

    Args:
        base_url: URL of the listing page containing the subfolder links.
        output_folder: Directory passed through to ``process_zip_folders``.
        timeout: Seconds to wait for the listing page request made by this
            function before giving up.

    Raises:
        requests.RequestException: If the listing page cannot be fetched.
        Exception: Anything raised by ``process_zip_folders`` propagates.
    """
    response = requests.get(base_url, timeout=timeout)
    soup = BeautifulSoup(response.text, 'html.parser')
    # Resolve relative links against the URL actually served (after any
    # redirect, e.g. to a trailing-slash form), the way a browser would.
    page_url = response.url

    for row in soup.find_all('tr'):
        cells = row.find_all('td')
        if len(cells) < 2:
            continue

        link = cells[1].a
        # A second cell without an anchor (or an anchor without href) is not
        # a folder entry; skip it instead of failing with AttributeError.
        if link is None or 'href' not in link.attrs:
            continue

        subfolder_url = urljoin(page_url, link['href'])
        print(f"Processing subfolder: {subfolder_url}")
        process_zip_folders(subfolder_url, output_folder)

# Function to process zip folders and extract Word documents
def process_zip_folders(url, output_folder, timeout=30):
    """Download and unpack one archive linked from the listing page at *url*.

    Table rows are visited from last to first. For each row with a link in
    its second cell, the link target is passed to
    ``download_and_extract_word_docs``. The function returns after the first
    link that is processed without error; if a link fails, the error is
    printed and the previous row is tried instead.
    (Presumably the last row is the most recent archive - verify against the
    server's listing order.)

    Args:
        url: URL of the listing page to scan.
        output_folder: Directory passed to ``download_and_extract_word_docs``.
        timeout: Seconds to wait for the listing page request before giving
            up.

    Raises:
        requests.RequestException: If the listing page cannot be fetched.
    """
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.text, 'html.parser')
    # Resolve relative links against the URL actually served (after any
    # redirect), the way a browser would.
    page_url = response.url

    # Check if there's a parent folder link.
    # NOTE(review): when the page contains a link whose text is exactly
    # 'Parent Directory', the parent page is processed INSTEAD of this one
    # and this page's rows are never examined - confirm this is intended.
    parent_folder_link = soup.find('a', string='Parent Directory')
    if parent_folder_link:
        parent_folder_url = urljoin(page_url, parent_folder_link['href'])
        process_zip_folders(parent_folder_url, output_folder, timeout=timeout)
        return

    # Iterate through rows in reverse order
    for row in reversed(soup.find_all('tr')):
        cells = row.find_all('td')
        if len(cells) < 2:
            continue

        link = cells[1].a
        # A second cell without an anchor (or an anchor without href) is not
        # a file entry; skip it instead of failing with AttributeError, which
        # would escape this function and abort the whole crawl.
        if link is None or 'href' not in link.attrs:
            continue

        zip_url = urljoin(page_url, link['href'])
        print(f"Downloading and extracting from {zip_url}...")
        try:
            download_and_extract_word_docs(zip_url, output_folder)
        except Exception as e:
            # Best effort: report the failure and fall back to the row above.
            print(f"Error processing {zip_url}: {e}")
        else:
            print("Extraction complete.")
            return  # Exit after the first archive that was processed

# Destination directory that receives everything unpacked from the archives
output_folder = r"D:\Work\IIT Bhilai\Internship\Specifications\series_23\docs"

# Ensure the destination exists before the crawl starts (no error if it
# is already there)
os.makedirs(output_folder, exist_ok=True)

# Run the crawl; any exception that escapes is printed instead of raised
try:
    process_subfolders(base_url, output_folder)
except Exception as err:
    print(f"Error occurred: {err}")
