In [20]:
import requests
import fitz  # PyMuPDF
import csv
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
import re

# Path to your local chromedriver
chromedriver_path = "C:\\chromedriver-win64\\chromedriver.exe"

# Set up Selenium WebDriver.
# A fresh Options() already launches a visible (non-headless) browser, so no
# headless flag is set here. The old `options.headless` setter was deprecated
# in Selenium 4.8 and removed in later releases; to run without a window, use
# options.add_argument("--headless=new") instead.
options = Options()
service = Service(chromedriver_path)
driver = webdriver.Chrome(service=service, options=options)

# Open the report index page and read the first Daily Price Report PDF link.
# The browser is only needed to obtain the link, so it is shut down in
# `finally` - this also closes it when the page fails to load or no matching
# link is found, instead of leaving a stray Chrome window behind.
url = "https://www.cbsl.gov.lk/en/statistics/economic-indicators/price-report"
try:
    driver.get(url)

    # Find the PDF link (first anchor whose href mentions both 'price_report' and '.pdf')
    first_pdf_link = driver.find_element(
        By.XPATH, "//a[contains(@href, 'price_report') and contains(@href, '.pdf')]"
    ).get_attribute("href")
finally:
    driver.quit()

print(f"The first Daily Price Report PDF link is: {first_pdf_link}")

# Download PDF
# NOTE(review): verify=False disables TLS certificate verification. It is kept
# from the original code - confirm whether the site still needs it.
# The timeout stops the cell from hanging forever on a stalled connection.
pdf_response = requests.get(first_pdf_link, verify=False, timeout=30)
# Fail here on an HTTP error instead of saving an error page as the "PDF".
pdf_response.raise_for_status()
pdf_path = "daily_price_report.pdf"
with open(pdf_path, 'wb') as f:
    f.write(pdf_response.content)

# Extract second page text (page numbers are 0-based, so index 1 is page two).
doc = fitz.open(pdf_path)
try:
    if len(doc) < 2:
        raise ValueError(
            f"{pdf_path} has {len(doc)} page(s); expected the price table on page 2"
        )
    page = doc.load_page(1)
    page_text = page.get_text("text")
finally:
    # Release the PDF handle even if the page could not be read.
    doc.close()

# Break the page text into stripped lines, dropping the blank ones
lines = []
for raw_line in page_text.split("\n"):
    stripped = raw_line.strip()
    if stripped:
        lines.append(stripped)

# The date is assumed to look like "17 April 2025": one or two digits, a single
# whitespace character, a run of word characters, a single whitespace character,
# then four digits
date_pattern = r"\d{1,2}\s\w+\s\d{4}"

# The first match anywhere in the page text is taken as the report date
date_match = re.compile(date_pattern).search(page_text)

# Fall back to "Unknown" when nothing on the page matches the pattern
if date_match:
    date = date_match.group(0)
else:
    date = "Unknown"

# Write the header plus one CSV row per recognised price entry
csv_filename = "price_report.csv"
with open(csv_filename, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(["Item", "Unit", "Yesterday Price", "Today Price", "Date"])

    # An entry is read from four consecutive lines: item, unit, then two price
    # lines. A window is accepted only when its second line contains "Rs./".
    i = 0
    last_start = len(lines) - 4  # last index at which a full 4-line window fits
    while i <= last_start:
        item, unit, yesterday_text, today_text = lines[i:i + 4]

        if "Rs./" not in unit:
            # Not an entry start: slide the window down by one line
            i += 1
            continue

        # Keep only the first whitespace-separated token of each price line
        yesterday_tokens = yesterday_text.split()
        today_tokens = today_text.split()
        yesterday_price = yesterday_tokens[0] if yesterday_tokens else "n.a."
        today_price = today_tokens[0] if today_tokens else "n.a."

        writer.writerow([item, unit, yesterday_price, today_price, date])
        i += 4  # jump past the four lines just consumed

print(f"\n✅ CSV with item, unit, yesterday price, today price, and date saved as: {csv_filename}")


The first Daily Price Report PDF link is: https://www.cbsl.gov.lk/sites/default/files/cbslweb_documents/statistics/pricerpt/price_report_20250417_e.pdf





✅ CSV with item, unit, yesterday price, today price, and date saved as: price_report.csv


In [19]:
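# NOTE: Earlier draft of the cell above, kept fully commented out, so nothing in
# this cell runs. Compared with the live cell it has no date extraction, writes
# "first_four_columns.csv", and labels the two price columns
# "Wholesale Price" / "Retail Price" with no Date column.
# import requests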
# import fitz  # PyMuPDF
# import csv
# from selenium import webdriver
# from selenium.webdriver.chrome.service import Service
# from selenium.webdriver.common.by import By
# from selenium.webdriver.chrome.options import Options

# # Path to your local chromedriver
# chromedriver_path = "C:\\chromedriver-win64\\chromedriver.exe"

# # Set up Selenium WebDriver
# options = Options()
# options.headless = False
# service = Service(chromedriver_path)
# driver = webdriver.Chrome(service=service, options=options)

# # Open the URL
# url = "https://www.cbsl.gov.lk/en/statistics/economic-indicators/price-report"
# driver.get(url)

# # Find the PDF link
# first_pdf_link = driver.find_element(
#     By.XPATH, "//a[contains(@href, 'price_report') and contains(@href, '.pdf')]"
# ).get_attribute("href")

# print(f"The first Daily Price Report PDF link is: {first_pdf_link}")

# # Download PDF
# pdf_response = requests.get(first_pdf_link, verify=False)
# pdf_path = "daily_price_report.pdf"
# with open(pdf_path, 'wb') as f:
#     f.write(pdf_response.content)

# # Extract second page text
# doc = fitz.open(pdf_path)
# page = doc.load_page(1)
# page_text = page.get_text("text")

# # Close browser and doc
# driver.quit()
# doc.close()

# # Process the text
# lines = [line.strip() for line in page_text.split("\n") if line.strip()]

# # CSV output
# csv_filename = "first_four_columns.csv"
# with open(csv_filename, mode='w', newline='', encoding='utf-8') as file:
#     writer = csv.writer(file)
#     writer.writerow(["Item", "Unit", "Wholesale Price", "Retail Price"])

#     i = 0
#     while i < len(lines) - 3:
#         item = lines[i]
#         unit_line = lines[i + 1]
#         wholesale_price_line = lines[i + 2]
#         retail_price_line = lines[i + 3]

#         if "Rs./" in unit_line:
#             unit = unit_line
#             wholesale_price = wholesale_price_line.split()[0] if wholesale_price_line.split() else "n.a."
#             retail_price = retail_price_line.split()[0] if retail_price_line.split() else "n.a."
#             writer.writerow([item, unit, wholesale_price, retail_price])
#             i += 4  # Move to next block
#         else:
#             i += 1  # Skip if format doesn't match

# print(f"\n✅ CSV with item, unit, wholesale price, and retail price saved as: {csv_filename}")


The first Daily Price Report PDF link is: https://www.cbsl.gov.lk/sites/default/files/cbslweb_documents/statistics/pricerpt/price_report_20250417_e.pdf





✅ CSV with item, unit, yesterday price, today price, and date saved as: price_report_with_date.csv
