<a href="https://colab.research.google.com/github/petermesy/Machine-Learning-Projects/blob/main/Peter_extract.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [8]:
pip install pdfplumber pytesseract pdf2image langdetect duckduckgo-search



In [9]:
import os
import requests
import pdfplumber
from langdetect import detect
from bs4 import BeautifulSoup
from duckduckgo_search import DDGS
from urllib.parse import urlparse
from pdf2image import convert_from_path
import pytesseract
import json

In [11]:
# Directory that holds everything this notebook writes: the downloaded PDFs,
# the saved list of PDF links, and the extracted .txt files.
DOWNLOAD_FOLDER = "Ethio_laws_pdf"
# exist_ok=True keeps this cell safe to re-run when the folder already exists.
os.makedirs(DOWNLOAD_FOLDER, exist_ok=True)

In [12]:
def search_pdf_urls(query, max_results=1000):
    """Search DuckDuckGo for PDF documents and return their URLs.

    Args:
        query: Free-text search query. The ``filetype:pdf`` operator is
            appended automatically unless the query already contains it.
        max_results: Upper bound on the number of search results requested.

    Returns:
        list[str]: Unique result URLs whose path ends in ``.pdf``
        (case-insensitive), in the order the search returned them.
    """
    # Avoid sending "filetype:pdf filetype:pdf" when the caller already
    # included the operator in the query.
    if "filetype:pdf" not in query.lower():
        query = query + " filetype:pdf"

    pdf_urls = []
    seen = set()
    with DDGS() as ddgs:
        for result in ddgs.text(query, max_results=max_results) or []:
            url = result.get("href") or result.get("url")
            if not url or url in seen:
                continue
            # Test the URL *path* so links carrying a query string or fragment
            # (".../law.pdf?download=1") are still recognised as PDFs.
            if urlparse(url).path.lower().endswith(".pdf"):
                seen.add(url)
                pdf_urls.append(url)
    return pdf_urls

In [13]:
def download_pdf(url, folder, timeout=30):
    """Download a single PDF into ``folder``.

    The response is accepted as a PDF when the server returns HTTP 200 and
    either declares an ``application/pdf`` content type or the body carries the
    ``%PDF-`` signature near its start (many servers deliver PDFs as
    ``application/octet-stream``).

    Args:
        url: Address of the document to fetch.
        folder: Existing directory the file is written to.
        timeout: Seconds to wait for the server before giving up. A finite
            value prevents one stalled host from blocking the whole run.

    Returns:
        The path of the saved file, or ``None`` when the URL was skipped or
        the download failed. Failures are reported with ``print`` and never
        raised, so a batch of downloads keeps going.
    """
    try:
        response = requests.get(url, timeout=timeout)
        content_type = response.headers.get('Content-Type', '').lower()
        body = response.content
        # The PDF signature may be preceded by a few junk bytes, so look at
        # the leading kilobyte rather than only the very first bytes.
        looks_like_pdf = 'application/pdf' in content_type or b'%PDF-' in body[:1024]
        if response.status_code != 200 or not looks_like_pdf:
            print(f"Skipped (not a valid PDF): {url}")
            return None

        # A URL path ending in "/" has no basename; fall back to a fixed name.
        filename = os.path.basename(urlparse(url).path) or "download.pdf"
        # Downstream processing selects files by their .pdf extension.
        if not filename.lower().endswith(".pdf"):
            filename += ".pdf"
        filepath = os.path.join(folder, filename)
        with open(filepath, 'wb') as f:
            f.write(body)
        print(f"Downloaded: {filename}")
        return filepath
    except Exception as e:
        # Best-effort: network and file-system errors are logged, not raised.
        print(f"Failed to download {url}: {e}")
        return None

In [14]:
# Unicode blocks that hold Ethiopic (Ge'ez) script characters.
_ETHIOPIC_RANGES = (
    (0x1200, 0x139F),  # Ethiopic + Ethiopic Supplement
    (0x2D80, 0x2DDF),  # Ethiopic Extended
    (0xAB00, 0xAB2F),  # Ethiopic Extended-A
)
# Share of letters that must be Ethiopic for the text to be labelled Amharic.
_ETHIOPIC_THRESHOLD = 0.5


def _ethiopic_ratio(text):
    """Return the fraction of alphabetic characters in ``text`` that are Ethiopic."""
    letters = [ch for ch in text if ch.isalpha()]
    if not letters:
        return 0.0
    ethiopic = sum(
        1 for ch in letters
        if any(low <= ord(ch) <= high for low, high in _ETHIOPIC_RANGES)
    )
    return ethiopic / len(letters)


def extract_text_and_detect_language(pdf_path):
    """Extract the text of a PDF and guess its language.

    The embedded text layer is read with pdfplumber first. If that yields
    nothing (e.g. a scanned document), every page is rendered to an image and
    passed through Tesseract OCR with the English and Amharic models.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        tuple: ``(text, language)``. ``language`` is ``"am"`` when most letters
        are in Ethiopic script, otherwise the code returned by ``langdetect``,
        or ``"unknown"`` when detection fails. ``(None, None)`` is returned
        when no text could be extracted at all.

    Note:
        ``"am"`` is a script-based label: other languages written in Ethiopic
        script (such as Tigrinya) cannot be told apart by this check.
    """
    try:
        # Try extracting text using pdfplumber; pages are joined with a
        # newline so words at page boundaries do not run together.
        with pdfplumber.open(pdf_path) as pdf:
            text = "\n".join(page.extract_text() or "" for page in pdf.pages)

        # If no text is extracted, use OCR
        if not text.strip():
            print(f"No text found in {pdf_path}. Using OCR...")
            text = "\n".join(
                pytesseract.image_to_string(image, lang="eng+amh")
                for image in convert_from_path(pdf_path)
            )
    except Exception as e:
        print(f"Failed to extract text from {pdf_path}: {e}")
        return None, None

    if not text.strip():
        print(f"Failed to extract text from {pdf_path}: no text found")
        return None, None

    # langdetect ships no Amharic profile, so Ethiopic-script text is
    # recognised by its Unicode range before langdetect is consulted.
    if _ethiopic_ratio(text) >= _ETHIOPIC_THRESHOLD:
        return text, "am"

    try:
        language = detect(text)
    except Exception as e:
        # Keep the extracted text even when its language cannot be determined.
        print(f"Could not detect language for {pdf_path}: {e}")
        language = "unknown"
    return text, language

In [None]:
if __name__ == "__main__":
    # Amharic search terms targeting Ethiopian federal laws and regulations.
    # search_pdf_urls() adds the "filetype:pdf" operator itself, so it is not
    # repeated in the query text.
    search_query = "የፍትህ አዲስ ደንብ አዲስ ሕግ የፌዴራል ሕጎች"
    pdf_links = search_pdf_urls(search_query, max_results=1000)

    # Save PDF links to a JSON file (ensure_ascii=False keeps non-ASCII URLs readable)
    pdf_links_file = os.path.join(DOWNLOAD_FOLDER, "pdf_links.json")
    with open(pdf_links_file, "w", encoding="utf-8") as json_file:
        json.dump(pdf_links, json_file, ensure_ascii=False, indent=4)
    print(f"PDF links saved to: {pdf_links_file}")

    # Download PDFs
    for link in pdf_links:
        download_pdf(link, DOWNLOAD_FOLDER)

    # Extract text and detect language for each PDF in the folder. The listing
    # is sorted so repeated runs process files in the same order, and the
    # extension test is case-insensitive so "*.PDF" files are not skipped.
    for filename in sorted(os.listdir(DOWNLOAD_FOLDER)):
        if not filename.lower().endswith(".pdf"):
            continue
        filepath = os.path.join(DOWNLOAD_FOLDER, filename)
        text, language = extract_text_and_detect_language(filepath)
        if text and language:
            print(f"Extracted Text from {filename}:\n{text[:100]}...")  # Print first 100 characters of extracted text
            output_filename = os.path.splitext(filename)[0] + ".txt"  # Replace .pdf with .txt
            output_filepath = os.path.join(DOWNLOAD_FOLDER, output_filename)
            with open(output_filepath, "w", encoding="utf-8") as text_file:
                text_file.write(text)
            print(f"Extracted text saved to: {output_filepath}")
            print(f"Detected Language: {language}")

PDF links saved to: Ethio_laws_pdf/pdf_links.json
Downloaded: ethiopias-transitional-justice-policy-.pdf
Downloaded: 7802d845-49a8-4249-a58a-eb8c41b06646.pdf
Skipped (not a valid PDF): https://www.lawethiopia.com/images/addis+ababa/Regulation+Number+125-2022+Addis+Ababa+prosecutors+regulation.pdf
Skipped (not a valid PDF): https://habeshadvocates.com/Pdf-Files/304253.pdf
Downloaded: ETH101059.pdf
Downloaded: transitional-justice-draft-stamped.pdf
