<a href="https://colab.research.google.com/github/michaelwnau/consequential-products/blob/main/verify_links.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the Selenium Python package, refresh the apt package index, and
# install the Chromium driver package.
!pip install selenium
!apt-get update
!apt install chromium-chromedriver
# Copy the chromedriver binary into /usr/bin (presumably so it is found on PATH).
!cp /usr/lib/chromium-browser/chromedriver /usr/bin

In [None]:
# Mount Google Drive at /content/drive/ so files under it can be accessed by path.
from google.colab import drive
drive.mount('/content/drive/')

In [None]:
import os

import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.chrome.options import Options

# Load the link-audit CSV from the mounted Drive folder into a DataFrame.
df = pd.read_csv('/content/drive/MyDrive/directus-automations/directus-links-audit.csv')

In [None]:
# Build the Selenium WebDriver used for taking page screenshots
def setup_driver():
    """
    Create a Chrome WebDriver configured with a fixed set of command-line flags.

    Returns:
        A ``webdriver.Chrome`` instance started with the headless, no-GPU,
        no-sandbox and no-/dev/shm-usage flags.
    """
    launch_flags = (
        "--headless",
        "--disable-gpu",
        "--no-sandbox",
        "--disable-dev-shm-usage",
    )

    opts = Options()
    for flag in launch_flags:
        opts.add_argument(flag)

    return webdriver.Chrome(options=opts)

In [None]:
# Function to verify links and check for authentication requirements
def verify_links_with_auth(
    df,
    column_name,
    driver,
    thumbnail_dir='/content/drive/MyDrive/directus-automations/thumbnails',
    timeout=10,
):
    """
    Verify the links in one column of a DataFrame and screenshot each reachable page.

    For every row the URL is fetched with ``requests``. A 200 response is marked
    'OK', scanned for a login form (or the word 'login' in the URL), and then
    loaded in the Selenium driver to save a PNG thumbnail. Any other outcome is
    recorded as an error and gets no thumbnail.

    A failed screenshot does NOT change the link's 'Status' or
    'Authentication Required' values; it only leaves 'Thumbnail Path' empty.

    Args:
        df (pd.DataFrame): The DataFrame containing the links. It is modified
            in place and also returned.
        column_name (str): The name of the column containing the links.
        driver: Selenium WebDriver instance used to load pages and take screenshots.
        thumbnail_dir (str): Directory the thumbnails are written to. It is
            created if it does not exist.
        timeout (int | float): Timeout in seconds passed to ``requests.get``.

    Returns:
        pd.DataFrame: The same DataFrame with the columns 'Status',
        'Authentication Required' and 'Thumbnail Path' added. 'Thumbnail Path'
        is None for rows where no screenshot was saved.
    """

    # Initialize new columns
    df['Status'] = ''
    df['Authentication Required'] = ''
    os.makedirs(thumbnail_dir, exist_ok=True)

    thumbnails = []

    # row_number is the 1-based position of the row. For the default integer
    # index produced by read_csv this equals ``index + 1``, and unlike
    # ``index + 1`` it also works when the index is not numeric.
    for row_number, (index, row) in enumerate(df.iterrows(), start=1):
        url = row[column_name]

        # Step 1: HTTP check. Only failures of the request itself count as a
        # connection failure.
        try:
            response = requests.get(url, timeout=timeout)
        except (requests.RequestException, ValueError):
            df.at[index, 'Status'] = 'Error: Connection failed'
            df.at[index, 'Authentication Required'] = 'N'
            thumbnails.append(None)
            continue

        if response.status_code != 200:
            df.at[index, 'Status'] = f'Error: {response.status_code}'
            df.at[index, 'Authentication Required'] = 'N'
            thumbnails.append(None)
            continue

        df.at[index, 'Status'] = 'OK'

        # Step 2: heuristic authentication check on the returned HTML and the URL.
        soup = BeautifulSoup(response.content, 'html.parser')
        has_login_form = soup.find('form', {'name': 'login'}) is not None
        needs_login = has_login_form or 'login' in str(url).lower()
        df.at[index, 'Authentication Required'] = 'Y' if needs_login else 'N'

        # Step 3: thumbnail capture. Kept in its own try block so that a
        # browser problem cannot overwrite the status determined above.
        thumbnail_path = os.path.join(thumbnail_dir, f"thumbnail_{row_number}.png")
        try:
            driver.get(url)
            # save_screenshot reports an I/O failure by returning False.
            saved = driver.save_screenshot(thumbnail_path)
        except WebDriverException:
            saved = False
        thumbnails.append(thumbnail_path if saved else None)

    df['Thumbnail Path'] = thumbnails
    return df

# Main execution: load the link CSV, verify every link, and write the results
# to an Excel workbook on Google Drive.
if __name__ == '__main__':
    # File path to the CSV file in Google Drive
    file_path = '/content/drive/MyDrive/directus-automations/directus-links-audit.csv'

    # Load the CSV file
    df = pd.read_csv(file_path)

    # Print column names to verify the correct column is selected
    print("Column names in the CSV file:", df.columns)

    # Set up Selenium WebDriver
    driver = setup_driver()

    try:
        # Verify the links in the 'url' column and capture thumbnails
        df = verify_links_with_auth(df, 'url', driver)
    finally:
        # Always shut the browser down, even if verification raised, so no
        # headless Chrome process is left running. The driver is not needed
        # for writing the spreadsheet below.
        driver.quit()

    # Save the updated DataFrame back to Google Drive
    output_path = '/content/drive/MyDrive/Directus_Links_Audit_Results.xlsx'
    with pd.ExcelWriter(output_path) as writer:
        # Write main DataFrame to the first sheet
        df.to_excel(writer, sheet_name='Verification Results', index=False)

        # Add thumbnail paths to a new sheet; 'Row Number' is the 1-based
        # form of the DataFrame index.
        thumbnails_df = pd.DataFrame({
            'Row Number': df.index + 1,
            'URL': df['url'],
            'Thumbnail Path': df['Thumbnail Path'],
        })
        thumbnails_df.to_excel(writer, sheet_name='Thumbnails', index=False)

    # Print confirmation and display a sample of the updated DataFrame
    print("Verification complete. Updated spreadsheet saved to Google Drive.")
    print(df.head())