In [1]:
# =============================
# STEP 1 - Install Required Packages
# =============================

!apt-get update -qq
!apt-get install -qq -y wget unzip curl xvfb
!npm install -g lighthouse
!pip install -q selenium==4.15.2 chromedriver-autoinstaller pandas requests beautifulsoup4

# =============================
# STEP 2 - Setup Chrome & Chromedriver
# =============================
# Installs the Chrome browser from Google's .deb package, obtains a
# chromedriver binary, and prints the versions of the tools used below.

# Install Chrome
# The `|| apt-get -fy install` part only runs when dpkg exits with an error;
# presumably this is to pull in dependencies the .deb is missing.
!wget -q https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb
!dpkg -i google-chrome-stable_current_amd64.deb || apt-get -fy install -qq

# Install Chromedriver
# The value returned by install() is kept in `chromedriver_path`, which
# init_driver() later passes to selenium's Service().
import chromedriver_autoinstaller
chromedriver_path = chromedriver_autoinstaller.install()

# Verify installations
# Each command prints a version string, or a shell error if the tool is
# not on PATH.
!google-chrome --version
!chromedriver --version
!lighthouse --version

# =============================
# STEP 3 - Import Libraries
# =============================

import time
import pandas as pd
import json
import requests
import urllib.parse
import subprocess
import os
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from google.colab import files
from tqdm.notebook import tqdm

# =============================
# STEP 4 - Core Metric Collection Functions
# =============================

def init_driver():
    """Create a headless Chrome WebDriver with a 60-second page-load timeout.

    Chrome is launched through the chromedriver binary referenced by the
    module-level ``chromedriver_path``.

    Returns:
        The configured ``webdriver.Chrome`` instance. The caller is
        responsible for calling ``quit()`` on it.
    """
    launch_flags = (
        "--headless=new",
        "--no-sandbox",
        "--disable-dev-shm-usage",
        "--disable-gpu",
        "--window-size=1920,1080",
    )

    opts = Options()
    for flag in launch_flags:
        opts.add_argument(flag)

    browser = webdriver.Chrome(service=Service(chromedriver_path), options=opts)
    browser.set_page_load_timeout(60)
    return browser

def get_selenium_metrics(url):
    """Load ``url`` in a fresh browser and read timing and resource metrics.

    A new driver is created via ``init_driver()`` and always quit before
    returning. After the page is loaded the function waits five seconds,
    then reads values from ``performance.timing``, counts ``<a>`` elements,
    and aggregates the entries returned by
    ``performance.getEntriesByType('resource')``.

    Args:
        url: Address passed to ``driver.get``.

    Returns:
        dict with the keys ``Response_time_ms``, ``Load_time_ms``,
        ``DOM_Content_Loaded_Time_ms``, ``First_byte_TTFB_ms``,
        ``Total_links``, ``No_of_requests``, ``Byte_in_bytes`` and
        ``Page_size_MB``. If an exception occurs, the error is printed and
        whatever was stored before the failure (possibly nothing) is
        returned.
    """
    # Column name -> JavaScript snippet evaluated in the page, in this order.
    timing_scripts = {
        'Response_time_ms':
            "return performance.timing.responseEnd - performance.timing.fetchStart",
        'Load_time_ms':
            "return performance.timing.loadEventEnd - performance.timing.navigationStart",
        'DOM_Content_Loaded_Time_ms':
            "return performance.timing.domContentLoadedEventEnd - performance.timing.navigationStart",
        'First_byte_TTFB_ms':
            "return performance.timing.responseStart - performance.timing.requestStart",
    }

    browser = init_driver()
    collected = {}

    try:
        browser.get(url)
        time.sleep(5)  # Allow page to stabilize

        # Build the whole group first so a failure part-way through stores
        # none of the navigation values.
        navigation = {
            column: browser.execute_script(snippet)
            for column, snippet in timing_scripts.items()
        }
        navigation['Total_links'] = len(browser.find_elements(By.TAG_NAME, "a"))
        collected.update(navigation)

        # NOTE(review): only 'resource' entries are aggregated here; confirm
        # whether the main document is meant to be part of these totals.
        entries = browser.execute_script("return window.performance.getEntriesByType('resource')")
        request_count = len(entries)
        transferred = sum(entry['transferSize'] for entry in entries if 'transferSize' in entry)
        encoded = sum(entry['encodedBodySize'] for entry in entries if 'encodedBodySize' in entry)
        collected.update({
            'No_of_requests': request_count,
            'Byte_in_bytes': transferred,
            'Page_size_MB': round(encoded / (1024 * 1024), 3),
        })

    except Exception as err:
        print(f"Error collecting Selenium metrics: {err}")
    finally:
        browser.quit()

    return collected

def run_lighthouse(url):
    """Run the Lighthouse CLI against ``url`` and extract selected metrics.

    Lighthouse is invoked as a subprocess (120-second timeout) and writes a
    JSON report to ``lh_results.json`` in the current working directory,
    which is then parsed.

    Args:
        url: Page to audit. It is passed to Lighthouse as a single argument
            without going through a shell, so characters such as ``&`` or
            ``;`` in the URL are not interpreted.

    Returns:
        dict with the ten Lighthouse-derived columns. A value is ``None``
        when the report lacks that audit (or has a null performance score).
        An empty dict is returned, and the error printed, when Lighthouse
        cannot be run or its report cannot be read.
    """
    import shutil  # local import: only needed to locate the CLI executable

    output_file = "lh_results.json"
    try:
        # Remove any report left by a previous URL so it can never be read
        # back as the result for this one.
        if os.path.exists(output_file):
            os.remove(output_file)

        # Resolve the executable explicitly (e.g. lighthouse.cmd on Windows)
        # because no shell is involved to do the lookup.
        lighthouse_bin = shutil.which("lighthouse") or "lighthouse"
        cmd = [
            lighthouse_bin,
            url,
            "--output=json",
            f"--output-path={output_file}",
            "--quiet",
            "--chrome-flags=--headless --no-sandbox",
        ]
        subprocess.run(cmd, check=True, timeout=120)

        with open(output_file, encoding="utf-8") as f:
            lh_data = json.load(f)

        audits = lh_data.get('audits') or {}
        categories = lh_data.get('categories') or {}

        def numeric(audit_id):
            """Return the audit's numericValue, or None if it is absent."""
            return (audits.get(audit_id) or {}).get('numericValue')

        score = (categories.get('performance') or {}).get('score')

        # NOTE(review): 'JavaScript_Execution_Time_ms' is read from the
        # 'mainthread-work-breakdown' audit and 'Main_Thread_Work_CPU_ms'
        # from 'total-blocking-time'. Confirm these are the intended audits
        # ('bootup-time' is Lighthouse's JavaScript execution time audit).
        return {
            'Largest_contentful_paint_LCP_ms': numeric('largest-contentful-paint'),
            'Cumulative_Layout_Shift_CLS': numeric('cumulative-layout-shift'),
            'First_Contentful_Paint_FCP_ms': numeric('first-contentful-paint'),
            'Time_to_interactive_TTI_ms': numeric('interactive'),
            'Speed_Index_ms': numeric('speed-index'),
            'Interaction_to_Next_Paint_INP_ms': numeric('experimental-interaction-to-next-paint'),
            'Design_optimization_score': round(score * 100, 1) if score is not None else None,
            'JavaScript_Execution_Time_ms': numeric('mainthread-work-breakdown'),
            'Main_Thread_Work_CPU_ms': numeric('total-blocking-time'),
            'CSS_Blocking_Time_ms': numeric('render-blocking-resources'),
        }
    except Exception as e:
        print(f"Error running Lighthouse: {e}")
        return {}

def get_broken_links(url):
    """Count links on the page at ``url`` that appear to be broken.

    The page is fetched with ``requests`` and every ``<a href>`` is
    collected; only the first 100 hrefs are considered. Each href is resolved
    against ``url``. Hrefs that do not resolve to an http(s) address
    (``mailto:``, ``tel:``, ``javascript:`` ...) are skipped rather than
    counted. A link counts as broken when the request raises or the final
    status code is 400 or above. A 405/501 answer to HEAD is retried with GET
    because it only means the server does not support HEAD.

    Args:
        url: Address of the page whose links are checked.

    Returns:
        Number of broken links among those checked, or ``None`` if the page
        itself could not be fetched or parsed.
    """
    headers = {'User-Agent': 'Mozilla/5.0'}

    try:
        response = requests.get(url, timeout=10, headers=headers)
        soup = BeautifulSoup(response.text, 'html.parser')
        links = [a['href'] for a in soup.find_all('a', href=True)]
    except Exception as e:
        # `except Exception` (not a bare except) so Ctrl-C / kernel interrupt
        # still stops the notebook.
        print(f"Error fetching links from {url}: {e}")
        return None

    broken = 0
    for link in links[:100]:  # Check first 100 links
        try:
            target = urllib.parse.urljoin(url, link.strip())
            if urllib.parse.urlsplit(target).scheme not in ('http', 'https'):
                continue  # not something an HTTP request can verify

            resp = requests.head(target, timeout=5, allow_redirects=True, headers=headers)
            if resp.status_code in (405, 501):
                # stream=True avoids downloading the body; only the status matters.
                resp = requests.get(target, timeout=5, allow_redirects=True,
                                    headers=headers, stream=True)
                resp.close()
            if resp.status_code >= 400:
                broken += 1
        except Exception:
            broken += 1

    return broken

# =============================
# STEP 5 - Main Processing Function
# =============================

def collect_all_metrics(url):
    """Gather every metric for one URL into a single flat record.

    Runs, in order, ``get_selenium_metrics``, ``run_lighthouse`` and
    ``get_broken_links`` and merges their results. Two alias columns are
    then derived: ``Start_render_time_ms`` copies the FCP value and
    ``Document_complete_time_ms`` copies the load time.

    Args:
        url: Page to measure.

    Returns:
        dict that always contains ``'url'``. If any step raises, the
        exception text is stored under ``'error'`` and the values gathered
        up to that point are kept.
    """
    print(f"\nProcessing: {url}")

    record = {'url': url}

    try:
        # Sources that return a dict of columns, merged in this order.
        for collector in (get_selenium_metrics, run_lighthouse):
            record.update(collector(url))

        record['Broken_link_count'] = get_broken_links(url)

        # Alias columns derived from values collected above.
        record['Start_render_time_ms'] = record.get('First_Contentful_Paint_FCP_ms')
        record['Document_complete_time_ms'] = record.get('Load_time_ms')

    except Exception as exc:
        print(f"Error processing {url}: {exc}")
        record['error'] = str(exc)

    return record

# =============================
# STEP 6 - Process URLs and Save Results
# =============================

# Driver script: upload a CSV with a 'url' column, collect metrics for every
# URL in it, write the results to OUTPUT_CSV and offer that file for download.
print("📂 Upload your CSV file with URLs:")
uploaded = files.upload()

if not uploaded:
    raise ValueError("No file uploaded")

# Only the first uploaded file is used.
INPUT_CSV = list(uploaded.keys())[0]
OUTPUT_CSV = "website_metrics_complete.csv"

# Read URLs
df = pd.read_csv(INPUT_CSV)
if 'url' not in df.columns:
    raise ValueError(
        f"Input CSV must contain a 'url' column; found columns: {list(df.columns)}"
    )

# Empty cells are read as NaN (a float), which would crash the
# str.startswith check below; drop them along with blank strings.
urls = [u for u in df['url'].dropna().astype(str).str.strip() if u]

# Collect metrics
results = []
for url in tqdm(urls):
    # Case-insensitive scheme check so "HTTPS://..." is not prefixed again.
    if not url.lower().startswith(('http://', 'https://')):
        url = f'https://{url}'
    results.append(collect_all_metrics(url))

# Save to CSV
result_df = pd.DataFrame(results)
result_df.to_csv(OUTPUT_CSV, index=False)
print(f"✅ Metrics saved to {OUTPUT_CSV}")
files.download(OUTPUT_CSV)

# Display sample results
print("\nSample Results:")
display(result_df.head())

'apt-get' is not recognized as an internal or external command,
operable program or batch file.
'apt-get' is not recognized as an internal or external command,
operable program or batch file.



added 198 packages in 32s

16 packages are looking for funding
  run `npm fund` for details



[notice] A new release of pip is available: 24.2 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip
'wget' is not recognized as an internal or external command,
operable program or batch file.
'dpkg' is not recognized as an internal or external command,
operable program or batch file.
'apt-get' is not recognized as an internal or external command,
operable program or batch file.
'google-chrome' is not recognized as an internal or external command,
operable program or batch file.


ChromeDriver 139.0.7258.138 (884e54ea8d42947ed636779015c5b4815e069838-refs/branch-heads/7258@{#2631})
12.8.1


ModuleNotFoundError: No module named 'google'