In [10]:
import requests
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
import json
import re

def setup_driver():
    """Build and return a headless Chrome WebDriver.

    The chromedriver binary is resolved through ``ChromeDriverManager`` on
    every call. The caller owns the returned driver and is responsible for
    calling ``quit()`` on it.
    """
    opts = Options()
    # Flags: no visible window, no sandbox, and no reliance on /dev/shm.
    for flag in ("--headless", '--no-sandbox', '--disable-dev-shm-usage'):
        opts.add_argument(flag)
    chrome_service = Service(ChromeDriverManager().install())
    return webdriver.Chrome(service=chrome_service, options=opts)



def fetch_texts(url):
    """Scrape the title, Mongolian text and English text of one law page.

    The page is downloaded once with ``requests`` and parsed with
    BeautifulSoup; every piece of text is taken from that static HTML. A
    headless Chrome session is opened only to check whether the rendered page
    contains the ``a[href="#active-tab-11"]`` link; the English block is read
    only when that link shows up.
    NOTE(review): presumably that link is the English-translation tab - confirm.

    Args:
        url: Address of the law detail page.

    Returns:
        dict with keys ``'url'``, ``'title'``, ``'mon'`` and ``'eng'``.
        ``'title'`` is ``"Title not found"`` when no matching ``<p>`` exists,
        and ``'eng'`` is ``"not."`` when the tab link does not appear within
        10 seconds. If any other error interrupts processing it is printed and
        the fields collected so far (possibly empty strings) are returned.
    """
    # Local import keeps this function self-contained within the notebook cell.
    from selenium.common.exceptions import TimeoutException

    driver = setup_driver()
    texts_mon = ""
    texts_eng = ""
    title = ""

    try:
        # A timeout stops one unresponsive page from blocking a worker thread
        # forever; HTTP error statuses are reported instead of being parsed.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # The title is the first <p> whose inline style sets padding-left.
        title_tag = soup.find('p', style=lambda value: value and 'padding-left:' in value)
        title = title_tag.get_text(strip=True) if title_tag else "Title not found"

        # Fetch Mongolian text, dropping the "Хэвлэх" button label.
        element_mon = soup.find('div', class_='maincontenter w-100 pull-left')
        if element_mon:
            texts_mon = element_mon.get_text(strip=True).replace("Хэвлэх", "")

        # Navigate to the page with Selenium to check for the active tab.
        driver.get(url)
        try:
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, 'a[href="#active-tab-11"]'))
            )
        except TimeoutException:
            # Only a wait timeout means "tab absent"; any other failure falls
            # through to the outer handler so it is reported, not hidden.
            texts_eng = "not."
        else:
            # Fetch English text from the already-parsed static HTML.
            element_eng = soup.find('div', class_='w-100 pull-left --nomigration')
            if element_eng:
                texts_eng = element_eng.get_text(strip=True)

    except Exception as e:
        print(f"Error processing {url}: {str(e)}")
    finally:
        driver.quit()

    return {'url': url, 'title': title, 'mon': texts_mon, 'eng': texts_eng}

# List of URLs to process: `all_links` is not defined in this cell. It must
# already exist in the kernel (other cells of this notebook assign it).
output_path = 'output3.jsonl'

# Use ThreadPoolExecutor to handle multiple URLs in parallel
with ThreadPoolExecutor(max_workers=5) as executor:
    results = list(executor.map(fetch_texts, all_links))

# Write results to JSONL file (one JSON object per line, URL omitted)
with open(output_path, 'w', encoding='utf-8') as f:
    for result in results:
        json_line = json.dumps({"title": result['title'], "mon": result['mon'], "eng": result['eng']}, ensure_ascii=False)
        f.write(json_line + '\n')

# Report the file that was actually written (the old message named output.jsonl).
print(f"Processing complete. Data written to {output_path}.")


Processing complete. Data written to output.jsonl.


# TODO: the title still needs to be extracted correctly

In [1]:
import time
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup

# Setup Chrome with Selenium using Service
service = Service(executable_path=ChromeDriverManager().install())
driver = webdriver.Chrome(service=service)

num_pages = 1
base_url = 'https://legalinfo.mn/mn/law?page=law&cate=27&active=1&sort=title&page='

# Accumulated over all listing pages; links[i] belongs to titles[i].
all_links = []
all_titles = []

try:
    for page_num in range(1, num_pages+1):
        url = base_url + str(page_num)
        driver.get(url)

        try:
            # Wait for the 'shine-huuli-main' div to be loaded
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, "div.shine-huuli-main"))
            )
            time.sleep(1)
            # Extract information
            html_source = driver.page_source
            soup = BeautifulSoup(html_source, 'html.parser')

            # Find the container with the specified class
            container = soup.find("div", {"class": "shine-huuli-main"})
            if container:
                # Extract links and titles that have the specific class "act-name"
                acts = container.select('a.act-name')
                links = [a['href'] for a in acts]
                titles = [a.text.strip() for a in acts]
                all_links.extend(links)
                all_titles.extend(titles)
                print(f"Links and Titles from Page {page_num}: {list(zip(links, titles))}")
            else:
                print("No container with the specified class found.")

        finally:
            # Log the end of the page and pause before the next request.
            print("End of Page", page_num)
            time.sleep(1)
finally:
    # Close the browser even if a page fails (e.g. the wait times out);
    # previously quit() was only reached when every page succeeded.
    driver.quit()

print("All Links:", all_links)
print("All Titles:", all_titles)


Links and Titles from Page 1: [('https://legalinfo.mn/mn/detail?lawId=1', 'АВЛИГЫН ЭСРЭГ ХУУЛИЙН ЗАРИМ ЗААЛТЫГ ДАГАЖ МӨРДӨХ ЖУРМЫН ТУХАЙ'), ('https://legalinfo.mn/mn/detail?lawId=8928', 'АВЛИГЫН ЭСРЭГ ХУУЛЬ'), ('https://legalinfo.mn/mn/detail?lawId=28', 'АВТОБЕНЗИН, ДИЗЕЛИЙН ТҮЛШНИЙ АЛБАН ТАТВАРЫН ТУХАЙ'), ('https://legalinfo.mn/mn/detail?lawId=12656', 'АВТО ЗАМЫН ТУХАЙ /Шинэчилсэн найруулга/'), ('https://legalinfo.mn/mn/detail?lawId=29', 'АВТОТЭЭВРИЙН ТУХАЙ ХУУЛЬ'), ('https://legalinfo.mn/mn/detail?lawId=30', 'АГААРЫН БОХИРДЛЫН ТӨЛБӨРИЙН ТУХАЙ'), ('https://legalinfo.mn/mn/detail?lawId=31', 'АГААРЫН  ЗАЙГ НИСЭХЭД АШИГЛАХ ТУХАЙ'), ('https://legalinfo.mn/mn/detail?lawId=9384', 'АГААРЫН ТУХАЙ ХУУЛИЙГ ХҮЧИНГҮЙ  БОЛСОНД ТООЦОХ ТУХАЙ'), ('https://legalinfo.mn/mn/detail?lawId=8669', 'АГААРЫН ТУХАЙ /Шинэчилсэн найруулга/'), ('https://legalinfo.mn/mn/detail?lawId=9395', 'АГНУУРЫН НӨӨЦ АШИГЛАСНЫ ТӨЛБӨР,  АН АМЬТАН АГНАХ, БАРИХ ЗӨВШӨӨРЛИЙН  ХУРААМЖИЙН ТУХАЙ ХУУЛИЙГ ХҮЧИНГҮЙ  БОЛСОНД ТООЦОХ ТУХАЙ'

In [2]:
# Hand-picked subset of law pages: each detail URL is built from its lawId.
all_links = [
    f'https://legalinfo.mn/mn/detail?lawId={law_id}'
    for law_id in (1, 8928, 28)
]
# Titles in the same order as all_links.
all_titles = [
    'АВЛИГЫН ЭСРЭГ ХУУЛИЙН ЗАРИМ ЗААЛТЫГ ДАГАЖ МӨРДӨХ ЖУРМЫН ТУХАЙ',
    'АВЛИГЫН ЭСРЭГ ХУУЛЬ',
    'АВТОБЕНЗИН, ДИЗЕЛИЙН ТҮЛШНИЙ АЛБАН ТАТВАРЫН ТУХАЙ',
]

In [5]:
import requests
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
import json
import re

def setup_driver():
    """Return a new headless Chrome WebDriver instance.

    Each call resolves the chromedriver binary via ``ChromeDriverManager``
    and starts a fresh browser; the caller must ``quit()`` the driver.
    """
    headless_options = Options()
    # Run without a window, without the sandbox, and without using /dev/shm.
    headless_options.add_argument("--headless")
    headless_options.add_argument('--no-sandbox')
    headless_options.add_argument('--disable-dev-shm-usage')
    driver_path = ChromeDriverManager().install()
    return webdriver.Chrome(service=Service(driver_path), options=headless_options)

def fetch_texts(url, title):
    """Scrape the Mongolian and English text of one law page.

    The page is downloaded once with ``requests`` and parsed with
    BeautifulSoup; both text blocks are taken from that static HTML. A
    headless Chrome session is opened only to check whether the rendered page
    contains the ``a[href="#active-tab-11"]`` link; the English block is read
    only when that link shows up.
    NOTE(review): presumably that link is the English-translation tab - confirm.

    Args:
        url: Address of the law detail page.
        title: Title to attach to the result; it is passed through unchanged.

    Returns:
        dict with keys ``'title'``, ``'mon'`` and ``'eng'``. ``'eng'`` is
        ``"not."`` when the tab link does not appear within 10 seconds. If any
        other error interrupts processing it is printed and the text collected
        so far (possibly empty strings) is returned.
    """
    # Local import keeps this function self-contained within the notebook cell.
    from selenium.common.exceptions import TimeoutException

    driver = setup_driver()
    texts_mon = ""
    texts_eng = ""

    try:
        # A timeout stops one unresponsive page from blocking a worker thread
        # forever; HTTP error statuses are reported instead of being parsed.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Fetch Mongolian text, dropping the "Хэвлэх" button label.
        element_mon = soup.find('div', class_='maincontenter w-100 pull-left')
        if element_mon:
            texts_mon = element_mon.get_text(strip=True).replace("Хэвлэх", "")

        # Navigate to the page with Selenium to check for the active tab.
        driver.get(url)
        try:
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, 'a[href="#active-tab-11"]'))
            )
        except TimeoutException:
            # Only a wait timeout means "tab absent"; any other failure falls
            # through to the outer handler so it is reported, not hidden.
            texts_eng = "not."
        else:
            element_eng = soup.find('div', class_='w-100 pull-left --nomigration')
            if element_eng:
                texts_eng = element_eng.get_text(strip=True)

    except Exception as e:
        print(f"Error processing {url}: {str(e)}")
    finally:
        driver.quit()

    return {'title': title, 'mon': texts_mon, 'eng': texts_eng}

all_links = ['https://legalinfo.mn/mn/detail?lawId=1', 'https://legalinfo.mn/mn/detail?lawId=8928', 'https://legalinfo.mn/mn/detail?lawId=28']
all_titles= ['АВЛИГЫН ЭСРЭГ ХУУЛИЙН ЗАРИМ ЗААЛТЫГ ДАГАЖ МӨРДӨХ ЖУРМЫН ТУХАЙ', 'АВЛИГЫН ЭСРЭГ ХУУЛЬ', 'АВТОБЕНЗИН, ДИЗЕЛИЙН ТҮЛШНИЙ АЛБАН ТАТВАРЫН ТУХАЙ']
output_path = 'output5.jsonl'

# executor.map stops at the shorter input, so a length mismatch would
# silently drop pages; fail loudly instead.
if len(all_links) != len(all_titles):
    raise ValueError(
        f"all_links has {len(all_links)} items but all_titles has {len(all_titles)}"
    )

# Scrape the pages in parallel; results come back in input order.
with ThreadPoolExecutor(max_workers=5) as executor:
    results = list(executor.map(fetch_texts, all_links, all_titles))

# Write one JSON object per line.
with open(output_path, 'w', encoding='utf-8') as f:
    for result in results:
        json_line = json.dumps(result, ensure_ascii=False)
        f.write(json_line + '\n')

# Report the file that was actually written (the old message named output3.jsonl).
print(f"Processing complete. Data written to {output_path}.")


Processing complete. Data written to output3.jsonl.
