In [197]:
import os
import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait, Select
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
from selenium.common.exceptions import NoSuchElementException

import time
from typing import List, Tuple
from bs4 import BeautifulSoup
import json
import re
from datetime import datetime
from pathlib import Path

In [218]:
def search_for_keyword(driver, keyword):
    try:
        search_box = WebDriverWait(driver, 20).until(
        EC.presence_of_element_located(
            (By.ID, "broi_form:_idJsp61"))
    )

        print("element exists")
    except NoSuchElementException:
        print("element not found")
    

    search_box.clear()
    search_box.send_keys(keyword)
    search_box.send_keys(Keys.RETURN)
    #print(f"Current URL: {driver.current_url}")

In [219]:
def format_date(date_string: str) -> str:
    """
    Formats the date string to YYYY-MM-DD format.
    You may need to adjust this function based on the actual date format on the website.
    """
    # This is a placeholder implementation. Adjust as needed.
    from datetime import datetime
    date_obj = datetime.strptime(date_string, "%d.%m.%Y")
    return date_obj.strftime("%Y-%m-%d")

In [220]:
def save_metadata(keyword: str, metadata: dict):
    metadata_folder = os.path.join(f'data/raw/parliament', keyword.replace(':', '').replace(' ', '_'), 'metadata')
    os.makedirs(metadata_folder, exist_ok=True)
    print(metadata_folder)
    metadata_file_name = os.path.join(metadata_folder, f"metadata_{metadata['notified_date']}-{metadata['name']}.json")
    with open(metadata_file_name, 'w', encoding='utf-8') as metadata_file:
        json.dump(metadata, metadata_file, ensure_ascii=False, indent=4)

In [221]:
def save_summary(keyword: str, url: str, date: str, name: str, description: str):
        text_folder = os.path.join(f'data/raw/parliament', keyword.replace(':', '').replace(' ', '_'), 'text')
        os.makedirs(text_folder, exist_ok=True)
        summary_file_name = os.path.join(text_folder, f"{date}-{name}.txt")
        with open(summary_file_name, 'w', encoding='utf-8') as summary_file:
            summary_file.write(f"Title: {name}\n")
            summary_file.write(f"Distribution date: {date}\n")
            summary_file.write(f"Keywords: {keyword}\n")
            summary_file.write(f"Summary: {description}\n")

In [237]:
# Give the keyword and download dir for pdf files
mykeyword = "Competition Law"

download_dir = "C:\\Users\\10138283\\OneDrive - NTT DATA Business Solutions AG\\Desktop\\myprojects\\ds-sisecam-webscrapping\\data\\raw\\parliament\\"+mykeyword.replace(' ', '_')+"\\pdf"
chrome_options = webdriver.ChromeOptions()
prefs = {'download.default_directory' : download_dir}
chrome_options.add_experimental_option('prefs', prefs)
driver = webdriver.Chrome(options=chrome_options)
base_url = "https://dv.parliament.bg/DVWeb/broeveList.faces"
driver.get(base_url)

def get_urls(driver, keyword: str, limited_page: int):
    
    time.sleep(2)
    search_for_keyword(driver, keyword)
    time.sleep(2)   
    file_names = []

    for page in range(2, limited_page + 1):
        try:
            no=0 # For accessing related item with its id, no incrementing +1 after every iteration

            tbody = driver.find_element(By.ID, "broi_form:dataTable1:tbody_element")
            td_elements = tbody.find_elements(By.CLASS_NAME, "td_tabResult0")

            print("Page has:" +str(len(td_elements))+ " item")

            # Extract and print dates
            for i in range(len(td_elements)):
                url = ""
                try:
                    # Extract text content
                    tbody = driver.find_element(By.ID, "broi_form:dataTable1:tbody_element")
                    # Find all td elements with class "td_tabResult0"
                    td_elements = tbody.find_elements(By.CLASS_NAME, "td_tabResult0")

                    text = td_elements[i].text
                    splitted_text = text.split(",")
                    name = splitted_text[0].strip().replace('/', '_').replace(':', '').replace(' ', '_').replace('\n', '_')
                    print("Document Name: "+name)

                    # Extract Date
                    date = re.findall(r'\b\d{1,2}\.\d{1,2}\.\d{4}\b', splitted_text[1])[0]
                    formatted_date = format_date(date)

                    file_names.append(str(formatted_date)+"-"+name+".pdf")
                    # Download button clik
                    link = tbody.find_element(By.ID, f"broi_form:dataTable1:{no}:_idJsp109")
                    link.click()
                    time.sleep(2)

                
                    if driver.find_element(By.ID, "broi_form:end_fixed_div").is_displayed():
                        print("tablo gözüktü")
                        table = driver.find_element(By.CLASS_NAME, "border2.white")
                        links = table.find_elements(By.TAG_NAME, "a")
                        first_link = links[0]
                        url = first_link.get_attribute('href')
                        first_link.click()
                        time.sleep(2)

                        close_button = table.find_element(By.XPATH, "/html/body/div/form/div[2]/div/table/tbody/tr[1]/td[2]/img")
                        close_button.click()
                        print(f"Link text: {links[0].text}, Href: {url}")

                    else:
                        url = link.get_attribute('href')
                        print(f"Link text: {link.text}, Href: {url}")   

                except NoSuchElementException:
                    print("Table not found on the page.")
                no +=1
                print(no)
                if url == "":
                    url = "https://dv.parliament.bg/DVWeb/broeveList.faces#"
                
                metadata_dict = {
                "name": name,
                "notified_date": formatted_date,
                "notified_country": None,
                "URL": url,
                "keyword": mykeyword
                }
                save_metadata(keyword=mykeyword, metadata=metadata_dict)
                save_summary(keyword=mykeyword, url=url, date=formatted_date, name=name, description="This is a pdf file.")
                time.sleep(2)
            # Changing Page Part
            page_element = driver.find_element(By.ID, "broi_form:selectPageTop")
            select = Select(page_element)
            select.select_by_value(str(page))
            
            print(f"Scraping page {page}")
            time.sleep(4)
        
        except NoSuchElementException:
            print("No more pages to scrape")
            break
    downloaded_file = [os.path.join(download_dir, f) for f in os.listdir(download_dir)]
    # Dosyaları oluşturulma zamanına göre sıralıyoruz
    downloaded_file.sort(key=os.path.getctime)
    time.sleep(2)
    for old_file, new_name in zip(downloaded_file, file_names):
        # Yeni dosya yolunu oluştur
        new_file_path = os.path.join(download_dir, new_name)
        
        # Eski dosyanın ismini yenisiyle değiştir
        os.rename(old_file, new_file_path)

        print(f"Renamed: {old_file} to {new_file_path}")
    driver.quit()

get_urls(driver, mykeyword, 5)

element exists
Page has:1 item
Document Name: Брой_71
Table not found on the page.
1
data/raw/parliament\Competition_Law\metadata
No more pages to scrape
Renamed: C:\Users\10138283\OneDrive - NTT DATA Business Solutions AG\Desktop\myprojects\ds-sisecam-webscrapping\data\raw\parliament\Competition_Law\pdf\2018-08-28-Брой_71.pdf to C:\Users\10138283\OneDrive - NTT DATA Business Solutions AG\Desktop\myprojects\ds-sisecam-webscrapping\data\raw\parliament\Competition_Law\pdf\2018-08-28-Брой_71.pdf
