In [16]:
import platform

if platform.system() == "Windows":
    from subprocess import CREATE_NO_WINDOW
else:
    CREATE_NO_WINDOW = None 
import shutil
import os 
import glob
import time
import json 
import chromedriver_autoinstaller_fix
from datetime import datetime
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) 


In [17]:
# Resolve the folders used by the scraper
user_home = os.path.expanduser("~")

# The folder layout differs between the Windows (OneDrive) setup and other systems
if platform.system() == "Windows":
    _profile_dir = os.environ['USERPROFILE']
    _onedrive_dir = os.path.join(_profile_dir, "OneDrive - ING")
    project_path = os.path.join(_profile_dir, "ING", "NFR", "fraud")
    download_path = os.path.join(_onedrive_dir, "Downloads")
    credentials_path = os.path.join(_onedrive_dir, "Documents", "Python Projects", "Credentials")
else:
    project_path = os.path.join(user_home, "Documents", "NFR", "fraud")
    download_path = os.path.join(user_home, "Downloads")
    credentials_path = os.path.join(project_path, "Credentials")

# On every platform the chromedriver folder sits directly inside the project folder
chromedriver_dir = os.path.join(project_path, "Chromedriver")

# Install or locate a chromedriver binary inside chromedriver_dir
active_chromedriver_path = chromedriver_autoinstaller_fix.install(path=chromedriver_dir)


In [18]:
# Announce the start of the run: a 100-character rule followed by the current timestamp
print('-' * 100)
print('# Scraper_ORX Start Time & Date: ' + datetime.now().strftime("%d/%m/%Y %H:%M:%S"))

----------------------------------------------------------------------------------------------------
# Scraper_ORX Start Time & Date: 19/09/2025 11:23:26


In [19]:
# Functions
################################################################################################################################################################
def date_diff_months(start_date, end_date):
    """Return the number of calendar months from start_date's month to end_date's month.

    Only the year and month fields are compared; the day and time parts are
    ignored. The result is negative when end_date lies in an earlier month
    than start_date.
    """
    years_apart = end_date.year - start_date.year
    months_apart = end_date.month - start_date.month
    return 12 * years_apart + months_apart

def clean_archive(archive_path, file_name, data_retention_period):
    """
    Remove outdated copies of a file from an archive folder.

        archive_path (str)          = Path-location of archive folder
        file_name (str)             = Text an archive entry's name must contain to be considered
        data_retention_period (int) = Number of months the file is kept for, older files are removed

    Age is measured in whole calendar months between an entry's last-modified
    timestamp and the current date (see date_diff_months).
    """
    # First pass: collect every matching entry whose age exceeds the retention period
    expired_paths = []
    for entry in os.listdir(archive_path):
        if file_name not in entry:
            continue
        entry_path = os.path.join(archive_path, entry)
        modified_at = datetime.fromtimestamp(os.path.getmtime(entry_path))
        if date_diff_months(modified_at, datetime.now()) > data_retention_period:
            expired_paths.append(entry_path)

    # Second pass: delete only after the whole folder has been inspected
    for expired_path in expired_paths:
        os.remove(expired_path)
    return
    
def archive_files(folder_path, file_name):
    """Move a file into the 'Archive' sub-folder of the supplied folder.

    The archived copy is renamed to '<YYYYMMDD>_<file_name>', where the date
    prefix is taken from the file's last-modified timestamp. Nothing is moved
    when folder_path holds no entry named exactly file_name.

    Args:
        folder_path (str): Folder that contains the file to archive. It is
            created (together with 'Archive') when it does not exist yet.
        file_name (str): Exact name of the entry to archive.
    """
    archive_path = os.path.join(folder_path, 'Archive')
    # exist_ok avoids the check-then-create race of a separate os.path.exists() test
    os.makedirs(archive_path, exist_ok=True)

    # Matching against os.listdir keeps the comparison an exact, case-sensitive
    # name match regardless of the file system's own case rules.
    for entry in os.listdir(folder_path):
        if entry != file_name:
            continue
        source_path = os.path.join(folder_path, entry)
        date_prefix = datetime.fromtimestamp(os.path.getmtime(source_path)).strftime("%Y%m%d_")
        shutil.move(source_path, os.path.join(archive_path, date_prefix + entry))
    return

def download_wait(directory, timeout=60, nfiles=None, poll_interval=1):
    """
    Wait for Chrome downloads in a folder to finish, with a specified timeout.

    The wait has two phases:
      1. Up to ``timeout / 2`` seconds for a ``.crdownload`` file to appear
         (the download has started). If none is ever seen, for example
         because the download finished between two polls, this phase simply
         runs out and the function carries on.
      2. Up to ``timeout`` seconds until no ``.crdownload`` file is left and,
         when ``nfiles`` is given, the folder holds exactly that many entries.

    Args
    ----
    directory : str
        The path to the folder where the files will be downloaded.
    timeout : int or float
        How many seconds to wait until timing out.
    nfiles : int, defaults to None
        If provided, also wait for the expected number of files.
    poll_interval : int or float, defaults to 1
        Seconds to sleep between two looks at the folder. Must be positive.

    Returns
    -------
    bool
        True when phase 2 found the folder settled, False when phase 2 timed
        out (including ``timeout <= 0``, where no check is made).

    Raises
    ------
    ValueError
        If ``poll_interval`` is not greater than zero.
    """
    if poll_interval <= 0:
        raise ValueError("poll_interval must be greater than 0")

    def _download_in_progress(names):
        # Only '.crdownload' entries are treated as unfinished downloads
        return any(name.endswith('.crdownload') for name in names)

    # Phase 1: wait till download started
    waited = 0
    while waited < timeout / 2:
        if _download_in_progress(os.listdir(directory)):
            break  # started - no need to sleep any further
        time.sleep(poll_interval)
        waited += poll_interval

    # Phase 2: wait till download finished
    waited = 0
    while waited < timeout:
        names = os.listdir(directory)
        count_ok = not nfiles or len(names) == nfiles
        if count_ok and not _download_in_progress(names):
            return True
        time.sleep(poll_interval)
        waited += poll_interval
    return False

In [31]:
def refresh_orx(driver, project_path, credentials_path, download_path):
    """Refresh the ORX source file inside the project folder.

    Steps:
      1. Archive the current 'ORX export.csv' and purge outdated archive copies.
      2. Open the ORX news search page and accept the cookie banner if shown.
      3. Log in through the login form when it is present on the page.
      4. Click the CSV export link and wait for the download.
      5. Move the newest '*export.csv' from the download folder to
         '<project_path>/Data Exports/ORX export.csv'.

    Debug artefacts (page dumps, folder listings) are written to the current
    working directory.

    Args:
        driver: Selenium WebDriver used to drive the browser.
        project_path (str): Project folder containing 'Data Exports'.
        credentials_path (str): Folder containing 'ORX_credentials.json'
            (keys 'user' or 'username', and 'password').
        download_path (str): Folder the browser downloads into.

    Returns:
        None. Returns early, after printing a message, when the export link
        cannot be clicked, when waiting for the download raises, or when no
        '*export.csv' file is found in the download folder.

    Raises:
        Exception: Errors from the login form or from the post-login redirect
            wait are printed and re-raised. Errors from archiving or from
            reading the credentials file propagate unchanged.
    """
    # Imported here so the function does not depend on a later notebook cell
    # having executed its own `EC` import first.
    from selenium.webdriver.support import expected_conditions as EC

    # Archive & Remove old files
    archive_files(
        folder_path=os.path.join(project_path, 'Data Exports'),
        file_name='ORX export.csv'
    )
    clean_archive(
        archive_path=os.path.join(project_path, 'Data Exports', 'Archive'),
        file_name='ORX export.csv',
        data_retention_period=2
    )

    # Get Credentials for ORX site
    with open(os.path.join(credentials_path, "ORX_credentials.json"), "r") as file:
        creds = json.load(file)

    # Open ORX Site directly at export page
    print('- Opening ORX URL')
    driver.get("https://news.orx.org/search/news")
    time.sleep(2)

    # Try to accept cookies if banner is present (best effort: any failure
    # is treated as "no banner")
    print('- Checking for cookie banner')
    try:
        cookie_button = WebDriverWait(driver, 5).until(
            EC.element_to_be_clickable((By.XPATH, "//button[contains(text(), 'Accept')]"))
        )
        cookie_button.click()
        print('- Cookie button clicked')
    except Exception:
        print('- No cookie banner found, continuing...')

    print(f'- Current URL after cookies: {driver.current_url}')

    # Login is considered necessary when the e-mail field of the login form
    # is present on the current page
    if driver.find_elements(By.ID, "1-email"):
        print("- Login required, proceeding with login flow")

        # Fill and submit login form directly
        try:
            print('- Login page loaded, skipping login button click')

            user_box = WebDriverWait(driver, 15).until(
                EC.visibility_of_element_located((By.ID, '1-email'))
            )
            # The credentials file may store the login under either key
            user_box.send_keys(creds.get('user') or creds.get('username'))

            passw_box = driver.find_element(By.ID, '1-password')
            passw_box.send_keys(creds['password'])

            submit_button = driver.find_element(By.XPATH, "//button[@class='auth0-lock-submit']")
            submit_button.click()
            print('- Credentials submitted')
        except Exception as e:
            print('[Login Form Error]', e)
            raise

        # Wait for redirect to ORX content
        try:
            WebDriverWait(driver, 30).until(
                lambda d: "news.orx.org" in d.current_url
            )
            time.sleep(3)
            print(f"- Redirected to ORX news page: {driver.current_url}")
        except Exception as e:
            print('[Redirect Error]', e)
            # Keep the page that was showing when the redirect wait failed
            with open("debug_login_page.html", "w", encoding="utf-8") as f:
                f.write(driver.page_source)
            raise
    else:
        print("- Already logged in, skipping login flow")

    # Dump post-login page for inspection
    with open("debug_post_login.html", "w", encoding="utf-8") as f:
        f.write(driver.page_source)

    # Try to click export link safely
    print('- Trying to click export link...')
    try:
        export_link = WebDriverWait(driver, 30).until(
            EC.presence_of_element_located((By.XPATH, "//a[contains(@href, '/news/csv')]"))
        )
        # Scroll to the link and click it through JavaScript instead of a
        # native click
        driver.execute_script("arguments[0].scrollIntoView(true);", export_link)
        time.sleep(1)
        driver.execute_script("arguments[0].click();", export_link)
        print('- Export link clicked successfully')
    except Exception:
        print('- Export link not found or not clickable')
        with open("debug_export_click_error.html", "w", encoding="utf-8") as f:
            f.write(driver.page_source)
        return

    # Wait for download to complete.
    # NOTE: the return value of download_wait is not inspected here; the
    # except branch is reached only when download_wait raises.
    print('- Waiting for download to complete...')
    try:
        download_wait(download_path, 60)  # Wait up to 60 seconds
    except Exception:
        print('- Download did not complete in time.')
        with open("debug_download_folder.txt", "w") as f:
            f.write("\n".join(glob.glob(os.path.join(download_path, '*'))))
        return

    # Move downloaded file to project folder
    print('- Copying and moving file')
    list_of_files = glob.glob(os.path.join(download_path, '*export.csv'))
    if not list_of_files:
        print('- No CSV file found in download folder.')
        # Record what the folder actually contains; list_of_files is empty
        # here, so writing it would always produce an empty debug file.
        with open("debug_download_folder.txt", "w") as f:
            f.write("\n".join(glob.glob(os.path.join(download_path, '*'))))
        return

    # Newest matching file by os.path.getctime.
    # NOTE(review): this can pick up an older '*export.csv' left in the
    # download folder by a previous run - confirm that is acceptable.
    latest_file = max(list_of_files, key=os.path.getctime)
    shutil.move(latest_file, os.path.join(project_path, "Data Exports", "ORX export.csv"))
    print('- File moved successfully')



In [32]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from datetime import datetime
from selenium.webdriver.support import expected_conditions as EC

# Resolve the folders used by the scraper
user_home = os.path.expanduser("~")

# The folder layout differs between the Windows (OneDrive) setup and other systems
if platform.system() == "Windows":
    _profile_dir = os.environ['USERPROFILE']
    _onedrive_dir = os.path.join(_profile_dir, "OneDrive - ING")
    project_path = os.path.join(_profile_dir, "ING", "NFR", "fraud")
    download_path = os.path.join(_onedrive_dir, "Downloads")
    credentials_path = os.path.join(_onedrive_dir, "Documents", "Python Projects", "Credentials")
else:
    project_path = os.path.join(user_home, "Documents", "NFR", "fraud")
    download_path = os.path.join(user_home, "Downloads")
    credentials_path = os.path.join(project_path, "Credentials")

# On every platform the chromedriver folder sits directly inside the project folder
chromedriver_dir = os.path.join(project_path, "Chromedriver")
    
print('- Opening Chromedriver')

# Configure Chrome options
chrome_options = Options()
chrome_options.add_argument("--headless")
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")
chrome_options.add_argument("--disable-search-engine-choice-screen")

# add_experimental_option keeps a single value per option name, so every
# "prefs" entry has to live in ONE dict. This cell used to register "prefs"
# twice; the second call replaced the first, so the first dict never took
# effect. That dict's extra keys ("profile.default_content_settings.popups"
# and a mis-keyed "directory_upgrade") were therefore never applied - add
# them here if they are wanted.
prefs = {
    "download.default_directory": download_path,
    "download.prompt_for_download": False,
    "download.directory_upgrade": True,
    "safebrowsing.enabled": True
}
chrome_options.add_experimental_option("prefs", prefs)


# Use webdriver-manager to auto-install the correct ChromeDriver
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)

# Run ORX Scraper
print('# Scraper_ORX Start Time & Date: ' + datetime.now().strftime("%d/%m/%Y %H:%M:%S"))
print(100 * '-')

try:
    # Kept inside the try so that a failure while resizing the window still
    # reaches driver.quit() below instead of leaving the browser running.
    driver.maximize_window()
    refresh_orx(driver, project_path, credentials_path, download_path)
except Exception as error:
    # Failures are reported, not re-raised, so the end banner below is always printed
    print("- Scraper failed please check:")
    print(error)
finally:
    print("- Closing Chromedriver")
    try:
        driver.quit()
    except Exception as quit_error:
        print("- Error while closing Chromedriver:", quit_error)
        driver = None

    print('# Scraper_ORX End Time & Date: ' + datetime.now().strftime("%d/%m/%Y %H:%M:%S"))
    print(100 * '-')



- Opening Chromedriver
# Scraper_ORX Start Time & Date: 19/09/2025 14:09:48
----------------------------------------------------------------------------------------------------
- Opening ORX URL
- Checking for cookie banner
- No cookie banner found, continuing...
- Current URL after cookies: https://orx.eu.auth0.com/login?state=hKFo2SBFM3RWUW4xU3hOUmlSZUQ5d3ZYUXZoWkVMRVowY0ZCOaFupWxvZ2luo3RpZNkgLWVjUzJyTVVlT0NFRnhyeVh5SjVHM3N1dS05Z3FqdmujY2lk2SB3dXBORFRaM21vM3hpc0wwckRhb1JMSXRJVnNCODl6NQ&client=wupNDTZ3mo3xisL0rDaoRLItIVsB89z5&protocol=samlp&SAMLRequest=fVNdj5swEHzPr4h4DzhwuYCVRMqFfiClJEroqbqXyjXLnSWwqdc0XH99DeSatGrjF4v1zHh2vCyQVWVN1415kQf43gCa0diutiol0v5w6TRaUsVQIJWsAqSG0%2BP605b6LqG1VkZxVTp%2F0W6zGCJoI5QcaEm8dHbpu%2B3uQ5J%2BLULiz0KfRLP7YD67m4fBLCRzDtOC%2BxFh4X1A8sAns4H6CBqtztKxss5oUENsIJFomDS2btUmJJpMo2zqUxLRu%2BhpoMa2WSGZ6ekvxtRIPU%2Fp1oXGZTYQ4nJVeX0G3qmp0zh7CioVtAK3RMdMHbaJSR7xIYx%2Bns3sz2E8CJkL%2BXw7g28DCOnHLNtP9rtjNois37LZKIlNBfoI%2Bofg8PmwvdiUcEK386r0c2%2FRYxydVS%2Bw6L5pn4Je7Q5f