In [5]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import requests
import urllib3
import time
import fitz  # PyMuPDF
import pandas as pd
import os
import re

from pdf2image import convert_from_path
import pytesseract

# 🔧 Path to Tesseract
# The Windows default is kept, but it can be overridden with the TESSERACT_CMD
# environment variable so the notebook runs on other machines without edits.
pytesseract.pytesseract.tesseract_cmd = os.environ.get(
    "TESSERACT_CMD", r'C:\Program Files\Tesseract-OCR\tesseract.exe'
)

# 🔧 Disable SSL warnings (this silences urllib3's InsecureRequestWarning only;
# it does not make unverified HTTPS requests any safer)
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# WebDriver config
# Same idea as above: CHROMEDRIVER_PATH overrides the hard-coded default.
chrome_driver_path = os.environ.get(
    "CHROMEDRIVER_PATH", "C:\\chromedriver-win64\\chromedriver.exe"
)
options = webdriver.ChromeOptions()
# The `options.headless` property was deprecated and later removed from
# Selenium 4; on current releases assigning to it silently does nothing.
# Headless mode is therefore requested through a Chrome argument. The default
# (CBSL_HEADLESS unset) is a visible browser window, as before.
RUN_HEADLESS = os.environ.get("CBSL_HEADLESS", "0") == "1"
if RUN_HEADLESS:
    options.add_argument("--headless=new")
driver = webdriver.Chrome(service=Service(chrome_driver_path), options=options)

# Visit CBSL page
url = "https://www.cbsl.gov.lk/en/measures-of-consumer-price-inflation"

pdf_url = None
try:
    driver.get(url)
    print("Waiting for the page to load...")
    time.sleep(5)

    # Get inflation report links (every anchor whose text contains "Inflation")
    inflation_links = WebDriverWait(driver, 10).until(
        EC.presence_of_all_elements_located((By.PARTIAL_LINK_TEXT, "Inflation"))
    )

    # Use the first link on the page (assumed to be the latest report)
    first_link = inflation_links[0]
    print(f"\n📄 Processing report: {first_link.text.strip()}")
    first_link.click()
    # Give the browser time to leave the listing page before searching for PDF
    # anchors, otherwise the wait below could match links on the old page.
    time.sleep(3)

    # Find PDF link
    pdf_elements = WebDriverWait(driver, 10).until(
        EC.presence_of_all_elements_located((By.XPATH, "//a[contains(@href, '.pdf')]"))
    )

    # Keep the first PDF whose URL mentions both "inflation" and "ccpi".
    for el in pdf_elements:
        href = el.get_attribute("href") or ""
        href_lower = href.lower()
        if "inflation" in href_lower and "ccpi" in href_lower:
            pdf_url = href
            break
except Exception:
    # Do not leave an orphaned Chrome process behind when Selenium fails
    # (e.g. a wait timeout); close the browser, then surface the error.
    driver.quit()
    raise

if not pdf_url:
    print("❌ No PDF link found.")
    driver.quit()
    # `exit()` is the interactive helper and, inside a notebook, asks the
    # kernel to shut down. SystemExit only stops the current cell/script.
    raise SystemExit(1)

print(f"📎 PDF URL: {pdf_url}")

# Download PDF
# NOTE(review): verify=False disables TLS certificate validation for this
# request. It is kept as in the original; confirm whether it is still required.
try:
    # A timeout prevents the cell from hanging forever on a stalled connection.
    pdf_response = requests.get(pdf_url, verify=False, timeout=60)
except requests.RequestException:
    driver.quit()
    raise

local_pdf_path = "temp_report.pdf"
if pdf_response.status_code != 200:
    print("❌ Failed to download PDF.")
    driver.quit()
    # SystemExit stops the cell/script without asking a notebook kernel to
    # shut down, which is what the interactive `exit()` helper does.
    raise SystemExit(1)

with open(local_pdf_path, 'wb') as f:
    f.write(pdf_response.content)
print(f"✅ PDF downloaded: {local_pdf_path}")

try:
    # Read page 2 text (index 1); documents shorter than two pages yield "".
    with fitz.open(local_pdf_path) as doc:
        page_text = doc[1].get_text() if len(doc) >= 2 else ""

    # Fall back to OCR of page 2 when the PDF has no embedded text layer.
    if not page_text.strip():
        print("⚠️ No extractable text. Using OCR...")
        images = convert_from_path(local_pdf_path, first_page=2, last_page=2)
        if images:
            page_text = pytesseract.image_to_string(images[0])
except Exception:
    # Close the browser before propagating extraction/OCR failures.
    driver.quit()
    raise
finally:
    # Always remove the temporary download, even when extraction fails.
    os.remove(local_pdf_path)

# Process lines to extract Year, Month, CCPI
def _extract_ccpi_rows(text_lines, month_names):
    """Collect ``[year, month, ccpi]`` string triples from stripped text lines.

    Parameters
    ----------
    text_lines : list of str
        Non-empty, stripped lines in reading order.
    month_names : list of str
        Month names to look for; the first entry that occurs anywhere in a
        line (substring match, list order) is taken as that line's month.

    Returns
    -------
    list of list of str
        One ``[year, month, value]`` entry per month that has a value.

    Notes
    -----
    A line that starts with ``20`` followed by two digits updates the current
    year. Such a line is still scanned for a month and value, so a row like
    ``"2024 March 196.7"`` is not lost. The value is a three-digit number with
    one decimal, found either right after the month name on the same line or
    at the start of the following line. Months seen before any year are
    ignored.
    """
    rows = []
    year = ""
    idx = 0
    while idx < len(text_lines):
        line = text_lines[idx]
        idx += 1  # idx now points at the line *after* the current one

        # Detect Year
        year_match = re.match(r'20\d{2}', line)
        if year_match:
            year = year_match.group()

        # Detect Month + CCPI pattern
        month = next((m for m in month_names if m in line), None)
        if not (year and month):
            continue

        # Preferred form: the value follows the month on the same line.
        same_line = re.search(rf"{month}\s+(\d{{3}}\.\d)", line)
        if same_line:
            rows.append([year, month, same_line.group(1)])
            continue

        # Otherwise the value may sit alone at the start of the next line.
        if idx < len(text_lines):
            next_line = re.match(r"(\d{3}\.\d)", text_lines[idx])
            if next_line:
                rows.append([year, month, next_line.group(1)])
                idx += 1  # the value line has been consumed

    return rows


lines = [line.strip() for line in page_text.split('\n') if line.strip()]
months = [
    "January", "February", "March", "April", "May", "June",
    "July", "August", "September", "October", "November", "December"
]
data = _extract_ccpi_rows(lines, months)

# Filter from January 2024 onward
# Rows are compared as (year, month-index) tuples, so changing START_MONTH
# actually takes effect. The previous month check compared against index 0
# and could never be true.
START_YEAR = 2024
START_MONTH = "January"
start_key = (START_YEAR, months.index(START_MONTH))
filtered_data = [
    row for row in data
    if (int(row[0]), months.index(row[1])) >= start_key
]

# Save to CSV with Date column
output_csv = "cbsl_ccpi.csv"
try:
    if filtered_data:
        df = pd.DataFrame(filtered_data, columns=["Year", "Month", "CCPI"])

        # Create Date column as "YYYY/MM/01"; month numbers are derived from
        # the position of each name in `months`.
        month_map = {name: f"{number:02d}" for number, name in enumerate(months, start=1)}
        df["Date"] = df["Year"] + "/" + df["Month"].map(month_map) + "/01"

        df.to_csv(output_csv, index=False)
        print("\n✅ Extracted Data:")
        print(df.to_string(index=False))
        # Report the file that was actually written (the message previously
        # named a different file, cbsl_ccpi_cleaned.csv).
        print(f"\n✅ Saved to {output_csv}")
    else:
        print("❌ No valid CCPI data found.")
finally:
    # Close the browser even if building or writing the CSV fails.
    driver.quit()


Waiting for the page to load...

📄 Processing report: Inflation in March 2025 - CCPI
📎 PDF URL: https://www.cbsl.gov.lk/sites/default/files/cbslweb_documents/press/pr/press_20250328_inflation_in_march_2025_ccpi_e.pdf
✅ PDF downloaded: temp_report.pdf

✅ Extracted Data:
Year     Month  CCPI       Date
2024     March 196.7 2024/03/01
2024     April 195.2 2024/04/01
2024       May 194.1 2024/05/01
2024      June 195.6 2024/06/01
2024      July 194.7 2024/07/01
2024    August 191.1 2024/08/01
2024 September 190.9 2024/09/01
2024   October 189.9 2024/10/01
2024  November 189.4 2024/11/01
2024  December 191.7 2024/12/01
2025   January 192.6 2025/01/01
2025  February 192.2 2025/02/01
2025     March 191.6 2025/03/01

✅ Saved to cbsl_ccpi_cleaned.csv


In [2]:
# from selenium import webdriver
# from selenium.webdriver.chrome.service import Service
# from selenium.webdriver.common.by import By
# from selenium.webdriver.support.ui import WebDriverWait
# from selenium.webdriver.support import expected_conditions as EC
# import requests
# import urllib3
# import time
# import fitz  # PyMuPDF
# import pandas as pd
# import os
# import re

# # 🔧 Disable SSL warnings
# urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# # ChromeDriver path
# chrome_driver_path = "C:\\chromedriver-win64\\chromedriver.exe"

# # Set up WebDriver
# options = webdriver.ChromeOptions()
# options.headless = False  # Set True to run in background
# driver = webdriver.Chrome(service=Service(chrome_driver_path), options=options)

# # Go to CBSL inflation page
# url = "https://www.cbsl.gov.lk/en/measures-of-consumer-price-inflation"
# driver.get(url)
# print("Waiting for the page to load...")
# time.sleep(5)

# # Extract inflation-related links
# print("Extracting links to monthly inflation reports...")
# inflation_links = WebDriverWait(driver, 10).until(
#     EC.presence_of_all_elements_located((By.PARTIAL_LINK_TEXT, "Inflation"))
# )

# # Store extracted data
# data = []

# # Number of reports to extract
# N = 5
# for i in range(min(N, len(inflation_links))):
#     try:
#         # Re-fetch links to avoid stale element
#         inflation_links = WebDriverWait(driver, 10).until(
#             EC.presence_of_all_elements_located((By.PARTIAL_LINK_TEXT, "Inflation"))
#         )
#         link = inflation_links[i]
#         report_title = link.text.strip()
#         print(f"\n📄 Processing report: {report_title}")

#         link.click()
#         time.sleep(3)

#         # Get the correct PDF link
#         pdf_elements = WebDriverWait(driver, 10).until(
#             EC.presence_of_all_elements_located((By.XPATH, "//a[contains(@href, '.pdf')]"))
#         )

#         pdf_url = None
#         for el in pdf_elements:
#             href = el.get_attribute("href")
#             if "inflation" in href.lower() and "ccpi" in href.lower():
#                 pdf_url = href
#                 break

#         if not pdf_url:
#             print("❌ No valid inflation PDF found, skipping.")
#             driver.back()
#             time.sleep(2)
#             continue

#         print(f"📎 PDF URL: {pdf_url}")

#         # Download the PDF
#         pdf_response = requests.get(pdf_url, verify=False)
#         if pdf_response.status_code == 200:
#             local_pdf_path = f"temp_report_{i+1}.pdf"
#             with open(local_pdf_path, 'wb') as f:
#                 f.write(pdf_response.content)
#             print(f"✅ PDF downloaded: {local_pdf_path}")

#             # Extract PDF text
#             doc = fitz.open(local_pdf_path)
#             full_text = ""
#             for page in doc:
#                 full_text += page.get_text()
#             doc.close()
#             os.remove(local_pdf_path)

#             # Extract information
#             extracted_date = report_title
#             index_value = ""

#             # Search for "Index Value" followed by a number like 192.2
#             match = re.search(r'Index Value[^0-9]*([\d]+\.\d+)', full_text)
#             if match:
#                 index_value = match.group(1)
#             else:
#                 # fallback: look for something like 'CCPI (2021=100) recorded 192.2 points'
#                 match_alt = re.search(r'CCPI\s*\(2021=100\)[^\d]*([\d]+\.\d+)', full_text)
#                 if match_alt:
#                     index_value = match_alt.group(1)

#             print(f"📌 Extracted Index Value: {index_value}")
#             data.append([extracted_date, index_value])
#         else:
#             print("❌ Failed to download PDF.")

#         # Back to main page
#         driver.back()
#         time.sleep(2)

#     except Exception as e:
#         print(f"⚠️ Error: {e}")
#         continue

# # Save to CSV
# df = pd.DataFrame(data, columns=["Report Title", "Index Value"])
# csv_filename = "cbsl_index_values.csv"
# df.to_csv(csv_filename, index=False)
# print(f"\n✅ Data saved to: {csv_filename}")

# # Quit browser
# driver.quit()


In [3]:
# from selenium import webdriver
# from selenium.webdriver.chrome.service import Service
# from selenium.webdriver.common.by import By
# from selenium.webdriver.support.ui import WebDriverWait
# from selenium.webdriver.support import expected_conditions as EC
# import requests
# import urllib3
# import time
# import fitz  # PyMuPDF
# import pandas as pd
# import os
# import re

# # 🔧 Disable SSL warnings
# urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# # ChromeDriver path
# chrome_driver_path = "C:\\chromedriver-win64\\chromedriver.exe"

# # Set up WebDriver
# options = webdriver.ChromeOptions()
# options.headless = False  # Set True to run in background
# driver = webdriver.Chrome(service=Service(chrome_driver_path), options=options)

# # Target URL
# base_url = "https://www.cbsl.gov.lk/en/measures-of-consumer-price-inflation"

# # Store extracted data
# data = []

# # Loop control
# processed_titles = set()
# index = 0

# while True:
#     driver.get(base_url)
#     print("\n🔄 Navigated to CBSL inflation page...")
#     time.sleep(5)

#     # Extract updated inflation links
#     inflation_links = WebDriverWait(driver, 10).until(
#         EC.presence_of_all_elements_located((By.PARTIAL_LINK_TEXT, "Inflation"))
#     )

#     # Filter out already processed and < 2024 links
#     links_filtered = []
#     for link in inflation_links:
#         title = link.text.strip()
#         if title not in processed_titles and re.search(r"20(2[4-9]|[3-9][0-9])", title):  # year >= 2024
#             links_filtered.append(link)

#     # Check if done
#     if index >= len(links_filtered):
#         print("\n✅ All 2024+ inflation reports processed.")
#         break

#     try:
#         link = links_filtered[index]
#         report_title = link.text.strip()
#         processed_titles.add(report_title)

#         print(f"\n📄 Processing report: {report_title}")
#         link.click()
#         time.sleep(3)

#         # Get the correct PDF link
#         pdf_elements = WebDriverWait(driver, 10).until(
#             EC.presence_of_all_elements_located((By.XPATH, "//a[contains(@href, '.pdf')]"))
#         )

#         pdf_url = None
#         for el in pdf_elements:
#             href = el.get_attribute("href")
#             if "inflation" in href.lower() and "ccpi" in href.lower():
#                 pdf_url = href
#                 break

#         if not pdf_url:
#             print("❌ No valid inflation PDF found, skipping.")
#             index += 1
#             continue

#         print(f"📎 PDF URL: {pdf_url}")

#         # Download the PDF
#         pdf_response = requests.get(pdf_url, verify=False)
#         if pdf_response.status_code == 200:
#             local_pdf_path = f"temp_report_{index+1}.pdf"
#             with open(local_pdf_path, 'wb') as f:
#                 f.write(pdf_response.content)
#             print(f"✅ PDF downloaded: {local_pdf_path}")

#             # Extract PDF text
#             doc = fitz.open(local_pdf_path)
#             full_text = ""
#             for page in doc:
#                 full_text += page.get_text()
#             doc.close()
#             os.remove(local_pdf_path)

#             # Extract information
#             index_value = ""

#             match = re.search(r'Index Value[^0-9]*([\d]+\.\d+)', full_text)
#             if match:
#                 index_value = match.group(1)
#             else:
#                 match_alt = re.search(r'CCPI\s*\(2021=100\)[^\d]*([\d]+\.\d+)', full_text)
#                 if match_alt:
#                     index_value = match_alt.group(1)

#             print(f"📌 Extracted Index Value: {index_value}")
#             data.append([report_title, index_value])
#         else:
#             print("❌ Failed to download PDF.")

#         index += 1

#     except Exception as e:
#         print(f"⚠️ Error: {e}")
#         index += 1
#         continue

# # Save to CSV
# df = pd.DataFrame(data, columns=["Report Title", "Index Value"])
# csv_filename = "cbsl_index_values.csv"
# df.to_csv(csv_filename, index=False)
# print(f"\n✅ Data saved to: {csv_filename}")

# driver.quit()


In [4]:
# from selenium import webdriver
# from selenium.webdriver.chrome.service import Service
# from selenium.webdriver.common.by import By
# from selenium.webdriver.support.ui import WebDriverWait
# from selenium.webdriver.support import expected_conditions as EC
# import requests
# import urllib3
# import time
# import fitz  # PyMuPDF
# import pandas as pd
# import os
# import re

# from pdf2image import convert_from_path
# import pytesseract

# # 🔧 Path to Tesseract
# pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

# # 🔧 Disable SSL warnings
# urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# # WebDriver config
# chrome_driver_path = "C:\\chromedriver-win64\\chromedriver.exe"
# options = webdriver.ChromeOptions()
# options.headless = False
# driver = webdriver.Chrome(service=Service(chrome_driver_path), options=options)

# # Visit CBSL page
# url = "https://www.cbsl.gov.lk/en/measures-of-consumer-price-inflation"
# driver.get(url)
# print("Waiting for the page to load...")
# time.sleep(5)

# # Get inflation report links
# inflation_links = WebDriverWait(driver, 10).until(
#     EC.presence_of_all_elements_located((By.PARTIAL_LINK_TEXT, "Inflation"))
# )

# # Use the latest report
# first_link = inflation_links[0]
# print(f"\n📄 Processing report: {first_link.text.strip()}")
# first_link.click()
# time.sleep(3)

# # Find PDF link
# pdf_elements = WebDriverWait(driver, 10).until(
#     EC.presence_of_all_elements_located((By.XPATH, "//a[contains(@href, '.pdf')]"))
# )

# pdf_url = None
# for el in pdf_elements:
#     href = el.get_attribute("href")
#     if "inflation" in href.lower() and "ccpi" in href.lower():
#         pdf_url = href
#         break

# if not pdf_url:
#     print("❌ No PDF link found.")
#     driver.quit()
#     exit()

# print(f"📎 PDF URL: {pdf_url}")

# # Download PDF
# pdf_response = requests.get(pdf_url, verify=False)
# local_pdf_path = "temp_report.pdf"
# if pdf_response.status_code == 200:
#     with open(local_pdf_path, 'wb') as f:
#         f.write(pdf_response.content)
#     print(f"✅ PDF downloaded: {local_pdf_path}")
# else:
#     print("❌ Failed to download PDF.")
#     driver.quit()
#     exit()

# # Read page 2 text
# doc = fitz.open(local_pdf_path)
# page_text = doc[1].get_text() if len(doc) >= 2 else ""
# doc.close()

# if not page_text.strip():
#     print("⚠️ No extractable text. Using OCR...")
#     images = convert_from_path(local_pdf_path, first_page=2, last_page=2)
#     if images:
#         page_text = pytesseract.image_to_string(images[0])

# os.remove(local_pdf_path)

# # print("\n📝 Raw Extracted Text:\n", page_text)

# # Process lines to extract Year, Month, CCPI
# lines = [line.strip() for line in page_text.split('\n') if line.strip()]
# current_year = ""
# data = []
# months = [
#     "January", "February", "March", "April", "May", "June",
#     "July", "August", "September", "October", "November", "December"
# ]

# i = 0
# while i < len(lines):
#     line = lines[i]
#     # Detect Year
#     year_match = re.match(r'20\d{2}', line)
#     if year_match:
#         current_year = year_match.group()
#         i += 1
#         continue

#     # Detect Month + CCPI pattern
#     month_match = next((m for m in months if m in line), None)
#     if current_year and month_match:
#         # Try to find CCPI as a float after month
#         ccpi_match = re.search(rf"{month_match}\s+(\d{{3}}\.\d)", line)
#         if not ccpi_match and (i + 1 < len(lines)):
#             # Try in the next line
#             ccpi_match = re.match(r"(\d{3}\.\d)", lines[i + 1])
#             if ccpi_match:
#                 ccpi_value = ccpi_match.group(1)
#                 data.append([current_year, month_match, ccpi_value])
#                 i += 2
#                 continue
#         elif ccpi_match:
#             ccpi_value = ccpi_match.group(1)
#             data.append([current_year, month_match, ccpi_value])
#             i += 1
#             continue

#     i += 1

# # Save to CSV
# if data:
#     df = pd.DataFrame(data, columns=["Year", "Month", "CCPI"])
#     df.to_csv("cbsl_ccpi_cleaned.csv", index=False)
#     print("\n✅ Extracted Data:")
#     print(df.to_string(index=False))
#     print("\n✅ Saved to cbsl_ccpi.csv")
# else:
#     print("❌ No valid CCPI data found.")

# driver.quit()
