In [None]:
import threading
import csv
import logging
import concurrent.futures
from selenium import webdriver
from concurrent.futures import ThreadPoolExecutor, as_completed
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException, ElementClickInterceptedException, NoSuchFrameException
from urllib.parse import urlparse

logging.basicConfig(level=logging.INFO)

def validity_check(url):
    try:
        output = urlparse(url)
        return all([output.scheme, output.netloc])
    except ValueError:
        return False

def configure_browser_options(headless=True):
    options = Options()
    if headless:
        options.add_argument("--headless")
    options.add_argument("--disable-gpu")
    options.add_argument("--disable-extensions")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-images")
    options.page_load_strategy = 'eager'
    return options

def handle_cookies(driver):
    cookie_buttons = [
        "Accept all & visit the site", "Accept", "Agree", "Close", "Ignore",
        "Agree & Continue", "Agree & Close", "AGREE & CLOSE", "CONSENT",
        "Consent", "ACCEPT","AGREE","Accept all cookies","Accept All Cookies","Allow","ALLOW","OK","Ok","I accept","I Accept","I ACCEPT",
        "Accept Cookies","ACCEPT COOKIES","Allow all cookies"
    ]
    for button_text in cookie_buttons:
        try:
            accept_button = WebDriverWait(driver, 5).until(
                EC.element_to_be_clickable((By.XPATH, f"//button[contains(translate(text(),'ACCEPT','accept'),'{button_text.lower()}')] | //a[contains(translate(text(),'ACCEPT','accept'),'{button_text.lower()}')]"))
            )
            accept_button.click()
            break
        except (NoSuchElementException, TimeoutException, ElementClickInterceptedException):
            pass

def find_login_button(driver):
    login_section = ["Login", "Log in", "Sign in", "Sign In", "Log In", "LOGON", "SIGN IN", "LOGIN", "LOG IN"]
    for login_pat in login_section:
        try:
            login_button = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.PARTIAL_LINK_TEXT, login_pat))
            )
            actions = ActionChains(driver)
            actions.move_to_element(login_button).perform()
            try:
                login_button.click()
            except ElementClickInterceptedException:
                driver.execute_script("arguments[0].click();", login_button)
            return True
        except TimeoutException:
            continue
    return False

def switch_to_iframe(driver):
    try:
        WebDriverWait(driver, 10).until(EC.frame_to_be_available_and_switch_to_it((By.TAG_NAME, "iframe")))
    except TimeoutException:
        pass

def check_google_oidc(driver, website_url):
    google_oidc_patterns = [
        "Sign in with Google", "Continue with Google", "Login with Google",
        "Sign in using Google", "Log In with Google", "Login using Google",
        "LOG IN using Google", "LOG IN with Google", "LOG IN WITH GOOGLE"
    ]

    oidc_method = "N/A"
    redirect_or_element_found = False
    text_pattern_found = False

    try:
        WebDriverWait(driver, 10).until(
            lambda x: driver.current_url != website_url or driver.find_element(By.XPATH, "//a[contains(@href,'google')] | //button[contains(@href,'google')] | //div[contains(@href,'google')]")
        )
        redirect_or_element_found = True
    except TimeoutException:
        pass

    if any(pattern in driver.page_source for pattern in google_oidc_patterns):
        text_pattern_found = True

    if redirect_or_element_found:
        if len(driver.window_handles) > 1:
            oidc_method = "Pop-up"
        elif driver.current_url != website_url:
            oidc_method = "Redirect URL"
        else:
            oidc_method = "Direct Link/Button"
    elif text_pattern_found:
        oidc_method = "Text Pattern"

    has_google_oidc = oidc_method != "N/A"
    return has_google_oidc, oidc_method

def check_oidc(website_url, headless=True):
    try:
        driver = webdriver.Chrome(options=configure_browser_options(headless))

        if not validity_check(website_url):
            logging.error(f"Invalid URL: {website_url}")
            return False, "N/A"

        driver.get(website_url)
        handle_cookies(driver)
        if not find_login_button(driver):
            logging.warning("Login button not found")
        switch_to_iframe(driver)

        has_google_oidc, method = check_google_oidc(driver, website_url)

        driver.quit()
        return has_google_oidc, method

    except Exception as e:
        logging.error(f"Error analyzing {website_url}: {e}")
        return False, "Error"

def check_oidc_thread(website, results, lock):
    uses_google_oidc, method = check_oidc(website)
    with lock:
        results.append((website, uses_google_oidc, method))

def main():
    websites = [
        'https://www.shopify.com/','https://www.w3schools.com/','https://medium.com/','https://www.researchgate.net/',
    'https://vimeo.com/','https://pixabay.com/re','https://www.chess.com/','https://indianexpress.com/',
    'https://www.fandom.com/','https://www.aliexpress.com/','https://www.imdb.com/','https://www.doordash.com/',
    'https://www.studocu.com/en-gb','https://shopee.ph/','https://www.veed.io/','https://www.coursehero.com/',
    'https://www.marketwatch.com/','https://cloudinary.com/','https://www.crazygames.com/','https://groww.in/',
    'https://www.mobile.de/','https://www.vitalsource.com/','https://www.gitbook.com/','https://disqus.com/',
    'https://www.mongodb.com/','https://apkpure.com/','https://brainly.com/','https://www.duplichecker.com/',
    'https://www.bhphotovideo.com/','https://www.blizzard.com/en-gb/','https://www.fotor.com/','https://www.abc.net.au/',
    'https://www.repubblica.it/','https://www.pdf2go.com/','https://www.drive2.ru/','https://www.cbssports.com/',
    'https://www.turnitin.com/','https://www.mirror.co.uk/','https://tvtropes.org/','https://www.skyscanner.net/',
    'https://www.hindustantimes.com/','https://www.imdb.com/' #... other websites
    ]

    max_threads = 6  # Maximum number of concurrent threads
    results = []
    lock = threading.Lock()

    for website in websites:
        thread = threading.Thread(target=check_oidc_thread, args=(website, results, lock))
        threads.append(thread)
        thread.start()

    for thread in threads:
        thread.join()

    for website, uses_google_oidc, method in results:
        print(f"{website} uses Google OIDC: {uses_google_oidc}, Method: {method}")

if __name__ == "__main__":
    main()




In [1]:
import csv
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException, ElementClickInterceptedException
from urllib.parse import urlparse
from concurrent.futures import ThreadPoolExecutor, as_completed

def validity_check(url):
    try:
        output = urlparse(url)
        return all([output.scheme, output.netloc])
    except ValueError:
        return False
    
def configure_browser():
    chrome_options = Options()
    chrome_options.add_argument("--headless")  # Run in headless mode
    chrome_options.add_argument("--disable-gpu")
    chrome_options.add_argument("--window-size=1920x1080")
    return webdriver.Chrome(options=chrome_options)

def check_oidc(website_url):
    driver = webdriver.Chrome()

    if not validity_check(website_url):
        print(f"Invalid URL: {website_url}")
        return website_url, False, "N/A"

    driver.get(website_url)

    # Accept cookies
    cookie_buttons = [
        "Accept all & visit the site", "Accept", "Agree", "Close", "Ignore", 
        "Agree & Continue", "Agree & Close", "AGREE & CLOSE", "CONSENT", 
        "Consent", "ACCEPT","AGREE","Accept all cookies","Accept All Cookies","Allow","ALLOW","OK","Ok","I accept","I Accept","I ACCEPT"
    ]
    for button_text in cookie_buttons:
        try:
            accept_button = WebDriverWait(driver, 5).until(
                EC.element_to_be_clickable((By.XPATH, f"//button[contains(translate(text(),'ACCEPT','accept'),'{button_text.lower()}')] | //a[contains(translate(text(),'ACCEPT','accept'),'{button_text.lower()}')]"))
            )
            accept_button.click()
            break
        except (NoSuchElementException, TimeoutException, ElementClickInterceptedException):
            pass

    # Look for login section
    login_section = ["Login", "Log in", "Sign in", "Sign In", "Log In", "LOGON", "SIGN IN", "LOGIN", "LOG IN","Login/Register"]
    for login_pat in login_section:
        try:
            login_button = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.PARTIAL_LINK_TEXT, login_pat))
            )
            actions = ActionChains(driver)
            actions.move_to_element(login_button).perform()
            try:
                login_button.click()
            except ElementClickInterceptedException:
                driver.execute_script("arguments[0].click();", login_button)
            break
        except TimeoutException:
            pass

    # Google OIDC patterns and checks
    google_oidc_patterns = [
        "Sign in with Google", "Continue with Google","Sign In with Google", "Login with Google", 
        "Sign in using Google", "Log In with Google", "Login using Google",
        "LOG IN using Google", "LOG IN with Google", "LOG IN WITH GOOGLE"
    ]

    oidc_method = "N/A"
    redirect_or_element_found = False
    text_pattern_found = False

    try:
        # Wait for redirection or specific elements
        WebDriverWait(driver, 10).until(
            lambda x: driver.current_url != website_url or driver.find_element(By.XPATH, "//a[contains(@href,'google')] | //button[contains(@href,'google')] | //div[contains(@href,'google')]")
        )
        redirect_or_element_found = True
    except TimeoutException:
        pass

    # Check for text patterns
    if any(pattern in driver.page_source for pattern in google_oidc_patterns):
        text_pattern_found = True

    if redirect_or_element_found:
        if len(driver.window_handles) > 1:
            oidc_method = "Pop-up"
        elif driver.current_url != website_url:
            oidc_method = "Redirect URL"
        else:
            oidc_method = "Direct Link/Button"
    elif text_pattern_found:
        oidc_method = "Text Pattern"

    has_google_oidc = oidc_method != "N/A"
    driver.quit()
    return website_url, has_google_oidc, oidc_method

def process_websites(websites, max_threads=5):
        results = []
    with ThreadPoolExecutor(max_workers=max_threads) as executor:
        futures = {executor.submit(check_oidc, website): website for website in websites}

        for future in as_completed(futures):
            result = future.result()
            results.append(result)
    return results

def main():
    websites_to_test = [
        'https://www.shopify.com/','https://www.w3schools.com/','https://medium.com/','https://www.researchgate.net/',
    'https://vimeo.com/','https://pixabay.com/re','https://www.chess.com/','https://indianexpress.com/',
    'https://www.fandom.com/','https://www.aliexpress.com/','https://www.imdb.com/','https://www.doordash.com/',
    'https://www.studocu.com/en-gb','https://shopee.ph/','https://www.veed.io/','https://www.coursehero.com/',
    'https://www.marketwatch.com/','https://cloudinary.com/','https://www.crazygames.com/','https://groww.in/',
    'https://www.mobile.de/','https://www.vitalsource.com/','https://www.gitbook.com/','https://disqus.com/',
    'https://www.mongodb.com/','https://apkpure.com/','https://brainly.com/','https://www.duplichecker.com/',
    'https://www.bhphotovideo.com/','https://www.blizzard.com/en-gb/','https://www.fotor.com/','https://www.abc.net.au/',
    'https://www.repubblica.it/','https://www.pdf2go.com/','https://www.drive2.ru/','https://www.cbssports.com/',
    'https://www.turnitin.com/','https://www.mirror.co.uk/','https://tvtropes.org/','https://www.skyscanner.net/',
    'https://www.hindustantimes.com/','https://www.imdb.com/'
    ]

    results = process_websites(websites_to_test)

    # Printing results
    for website, uses_google_oidc, method in results:
        print(f"{website} uses Google OIDC: {uses_google_oidc}, Method: {method}")

    # CSV output (if needed)
    # [Your CSV writing code]

if __name__ == "__main__":
    main()


https://www.shopify.com/ uses Google OIDC: True, Method: Redirect URL
https://vimeo.com/ uses Google OIDC: True, Method: Redirect URL
https://www.w3schools.com/ uses Google OIDC: True, Method: Redirect URL
https://www.researchgate.net/ uses Google OIDC: True, Method: Redirect URL
https://medium.com/ uses Google OIDC: True, Method: Direct Link/Button
https://www.aliexpress.com/ uses Google OIDC: True, Method: Redirect URL
https://www.chess.com/ uses Google OIDC: True, Method: Redirect URL
https://pixabay.com/re uses Google OIDC: False, Method: N/A
https://indianexpress.com/ uses Google OIDC: True, Method: Pop-up
https://www.fandom.com/ uses Google OIDC: True, Method: Redirect URL
https://www.veed.io/ uses Google OIDC: True, Method: Redirect URL
https://shopee.ph/ uses Google OIDC: True, Method: Redirect URL
https://www.imdb.com/ uses Google OIDC: False, Method: N/A
https://www.doordash.com/ uses Google OIDC: False, Method: N/A
https://www.studocu.com/en-gb uses Google OIDC: False, Metho

# FROM CSV

In [14]:
import csv
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException, ElementClickInterceptedException
from urllib.parse import urlparse
from concurrent.futures import ThreadPoolExecutor, as_completed

def validity_check(url):
    try:
        output = urlparse(url)
        return all([output.scheme, output.netloc])
    except ValueError:
        return False
    
def configure_browser():
    chrome_options = Options()
    chrome_options.add_argument("--headless")  # Run in headless mode
    chrome_options.add_argument("--disable-gpu")
    chrome_options.add_argument("--window-size=1920x1080")
    return webdriver.Chrome(options=chrome_options)

def check_oidc(website_url):
    driver = webdriver.Chrome()

    if not validity_check(website_url):
        print(f"Invalid URL: {website_url}")
        return website_url, False, "N/A"

    driver.get(website_url)

    # Accept cookies
    cookie_buttons = [
        "Accept all & visit the site", "Accept", "Agree", "Close", "Ignore", 
        "Agree & Continue", "Agree & Close", "AGREE & CLOSE", "CONSENT", 
        "Consent", "ACCEPT","AGREE","Accept all cookies","Accept All Cookies","Allow","ALLOW","OK","Ok","I accept","I Accept","I ACCEPT"
    ]
    for button_text in cookie_buttons:
        try:
            accept_button = WebDriverWait(driver, 5).until(
                EC.element_to_be_clickable((By.XPATH, f"//button[contains(translate(text(),'ACCEPT','accept'),'{button_text.lower()}')] | //a[contains(translate(text(),'ACCEPT','accept'),'{button_text.lower()}')]"))
            )
            accept_button.click()
            break
        except (NoSuchElementException, TimeoutException, ElementClickInterceptedException):
            pass

    # Look for login section
    login_section = ["Login", "Log in", "Sign in", "Sign In", "Log In", "LOGON", "SIGN IN", "LOGIN", "LOG IN","Login/Register"]
    for login_pat in login_section:
        try:
            login_button = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.PARTIAL_LINK_TEXT, login_pat))
            )
            actions = ActionChains(driver)
            actions.move_to_element(login_button).perform()
            try:
                login_button.click()
            except ElementClickInterceptedException:
                driver.execute_script("arguments[0].click();", login_button)
            break
        except TimeoutException:
            pass

    # Google OIDC patterns and checks
    google_oidc_patterns = [
        "Sign in with Google", "Continue with Google","Sign In with Google", "Login with Google", 
        "Sign in using Google", "Log In with Google", "Login using Google",
        "LOG IN using Google", "LOG IN with Google", "LOG IN WITH GOOGLE"
    ]

    oidc_method = "N/A"
    redirect_or_element_found = False
    text_pattern_found = False

    try:
        # Wait for redirection or specific elements
        WebDriverWait(driver, 10).until(
            lambda x: driver.current_url != website_url or driver.find_element(By.XPATH, "//a[contains(@href,'google')] | //button[contains(@href,'google')] | //div[contains(@href,'google')]")
        )
        redirect_or_element_found = True
    except TimeoutException:
        pass

    # Check for text patterns
    if any(pattern in driver.page_source for pattern in google_oidc_patterns):
        text_pattern_found = True

    if redirect_or_element_found:
        if len(driver.window_handles) > 1:
            oidc_method = "Pop-up"
        elif driver.current_url != website_url:
            oidc_method = "Redirect URL"
        else:
            oidc_method = "Direct Link/Button"
    elif text_pattern_found:
        oidc_method = "Text Pattern"

    has_google_oidc = oidc_method != "N/A"
    driver.quit()
    return website_url, has_google_oidc, oidc_method

def process_websites(websites, max_threads=5):
    results = []
    with ThreadPoolExecutor(max_workers=max_threads) as executor:
        futures = {executor.submit(check_oidc, website): website for website in websites}

        for future in as_completed(futures):
            result = future.result()
            results.append(result)
    return results

def read_websites_from_csv(input_csv):
    websites = []
    with open(input_csv, 'r', newline='', encoding='utf-8') as file:
        csv_reader = csv.reader(file)
        for row in csv_reader:
            websites.append(row[0])  # Assuming URLs are in the first column
    return websites

def write_results_to_csv(results, output_csv):
    with open(output_csv, 'w', newline='', encoding='utf-8') as file:
        csv_writer = csv.writer(file)
        csv_writer.writerow(['Website', 'Google OIDC Present', 'Method'])  # Header
        for website, uses_google_oidc, method in results:
            csv_writer.writerow([website, uses_google_oidc, method])

def main():
    input_csv = '3600-5000_Nano.csv'  # Replace with your input CSV file name
    output_csv = '3600-5000_Nano_results.csv'  # Replace with your desired output CSV file name

    websites_to_test = read_websites_from_csv(input_csv)
    results = process_websites(websites_to_test, max_threads=10)
    write_results_to_csv(results, output_csv)

if __name__ == "__main__":
    main()

FileNotFoundError: [Errno 2] No such file or directory: '3600-5000_Nano.csv'