In [None]:
import csv
import logging
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException, ElementClickInterceptedException, NoSuchFrameException
from urllib.parse import urlparse
import concurrent.futures

# Configure the root logger at INFO level so the info/warning/error messages
# logged by the functions in this file are actually emitted.
logging.basicConfig(level=logging.INFO)

def validity_check(url):
    """Report whether *url* carries both a scheme and a network location.

    Returns ``True`` only when ``urlparse`` yields a non-empty scheme and a
    non-empty netloc (for example ``https://example.com``). Returns ``False``
    otherwise, including when ``urlparse`` rejects the input with
    ``ValueError``.
    """
    try:
        parsed = urlparse(url)
    except ValueError:
        return False
    return bool(parsed.scheme and parsed.netloc)

def configure_browser_options(headless=True):
    """Build the Chrome options shared by every browser session.

    Args:
        headless: When true (the default) Chrome is started with
            ``--headless`` so no window is shown.

    Returns:
        An ``Options`` instance with GPU, extensions, the sandbox and image
        loading switched off, and the ``eager`` page load strategy selected.
    """
    options = Options()
    if headless:
        options.add_argument("--headless")

    chrome_flags = (
        "--disable-gpu",
        "--disable-extensions",
        "--no-sandbox",
        # "--disable-images" is not a switch Chrome recognises, so images were
        # still being downloaded. This Blink setting really disables them.
        "--blink-settings=imagesEnabled=false",
    )
    for flag in chrome_flags:
        options.add_argument(flag)

    # 'eager' hands control back once the DOM is ready instead of waiting for
    # every sub-resource, which is all the text/link checks here need.
    options.page_load_strategy = 'eager'
    return options

def handle_cookies(driver):
    """Dismiss a cookie-consent banner if one can be found (best effort).

    Each label is tried in priority order. For every label the driver waits
    up to 5 seconds for a clickable ``<button>`` or ``<a>`` whose text
    contains that label, compared case-insensitively. The first match is
    clicked and the search stops. Lookup and click failures are ignored so a
    missing or obstructed banner never aborts the caller.

    Args:
        driver: Selenium WebDriver positioned on the page to clean up.
    """
    # Labels are stored lowercase and unique: the comparison below is
    # case-insensitive, so case variants would only add redundant 5 s waits.
    cookie_labels = (
        "accept all & visit the site", "accept", "agree", "close", "ignore",
        "agree & continue", "agree & close", "consent", "accept all cookies",
        "allow", "ok", "i accept", "accept cookies", "allow all cookies",
    )

    # XPath 1.0 has no lower-case(); translate() must map the whole alphabet.
    # The previous mapping ('ACCEPT' -> 'accept') only folded five letters, so
    # labels such as "ignore", "ok" or "consent" could never match text that
    # was not already lowercase. Using '.' instead of text() also covers
    # buttons whose caption lives in a nested element, e.g. <button><span>.
    upper = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    lower = "abcdefghijklmnopqrstuvwxyz"
    folded_text = f"translate(normalize-space(.),'{upper}','{lower}')"

    for label in cookie_labels:
        xpath = (
            f"//button[contains({folded_text},'{label}')]"
            f" | //a[contains({folded_text},'{label}')]"
        )
        try:
            accept_button = WebDriverWait(driver, 5).until(
                EC.element_to_be_clickable((By.XPATH, xpath))
            )
            accept_button.click()
            break
        except (NoSuchElementException, TimeoutException, ElementClickInterceptedException):
            # Deliberately best effort: try the next label.
            pass

def _click_with_js_fallback(driver, element):
    """Click *element*; if another element intercepts the click, click via JavaScript."""
    try:
        element.click()
    except ElementClickInterceptedException:
        driver.execute_script("arguments[0].click();", element)


def find_login_button(driver):
    """Find a login link by its visible text and click it.

    The candidate captions are tried one after another. For each one the
    driver waits up to 10 seconds for a link whose text contains the caption,
    moves the pointer onto it and clicks it.

    Args:
        driver: Selenium WebDriver positioned on the page to search.

    Returns:
        ``True`` as soon as one link has been clicked, ``False`` when every
        caption timed out.
    """
    login_captions = (
        "Login", "Log in", "Sign in", "Sign In", "Log In",
        "LOGON", "SIGN IN", "LOGIN", "LOG IN", "Login/Register",
    )
    for caption in login_captions:
        locator = (By.PARTIAL_LINK_TEXT, caption)
        # The whole interaction stays inside the try block so that a
        # TimeoutException from any step moves on to the next caption.
        try:
            link = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located(locator)
            )
            ActionChains(driver).move_to_element(link).perform()
            _click_with_js_fallback(driver, link)
            return True
        except TimeoutException:
            continue
    return False

def switch_to_iframe(driver):
    """Switch the driver's context into an ``<iframe>`` if one becomes available.

    Waits up to 10 seconds for an element with tag name ``iframe``. On success
    the driver is left inside that frame and an info message is logged. On
    timeout a warning is logged and the context is left unchanged. Nothing is
    returned either way.
    """
    iframe_locator = (By.TAG_NAME, "iframe")
    frame_ready = EC.frame_to_be_available_and_switch_to_it(iframe_locator)
    try:
        WebDriverWait(driver, 10).until(frame_ready)
    except TimeoutException:
        logging.warning("No iframe found")
        return
    logging.info("Switched to iframe")

def check_google_oidc(driver, website_url):
    """Decide whether the page currently loaded offers Google sign-in, and how.

    Args:
        driver: Selenium WebDriver positioned on the page (or frame) to inspect.
        website_url: The URL that was originally requested. It is compared
            with the driver's current URL to tell a redirect from an in-page
            link.

    Returns:
        A ``(has_google_oidc, oidc_method)`` tuple. ``oidc_method`` is one of
        ``"Pop-up"``, ``"Redirect URL"``, ``"Direct Link/Button"``,
        ``"Text Pattern"`` or ``"N/A"``; ``has_google_oidc`` is ``False``
        exactly when the method is ``"N/A"``.
    """
    # Lowercase and unique: matching below is case-insensitive, which covers
    # every capitalisation variant instead of a hand-picked few.
    google_oidc_patterns = (
        "sign in with google", "continue with google", "login with google",
        "sign in using google", "log in with google", "login using google",
        "log in using google",
    )
    google_element_xpath = (
        "//a[contains(@href,'google')] | //button[contains(@href,'google')] "
        "| //div[contains(@href,'google')]"
    )

    def google_evidence(drv):
        # A URL change alone is NOT evidence of Google sign-in: clicking the
        # login link nearly always navigates somewhere. Previously any
        # navigation was reported as "Redirect URL" / True. Require either the
        # Google accounts host in the URL or a Google-related element.
        if "accounts.google." in drv.current_url:
            return True
        # find_elements returns an empty (falsy) list when nothing matches.
        return drv.find_elements(By.XPATH, google_element_xpath)

    try:
        WebDriverWait(driver, 10).until(google_evidence)
        google_element_found = True
    except TimeoutException:
        google_element_found = False

    page_text = driver.page_source.lower()
    text_pattern_found = any(pattern in page_text for pattern in google_oidc_patterns)

    if google_element_found:
        if len(driver.window_handles) > 1:
            oidc_method = "Pop-up"
        elif driver.current_url != website_url:
            oidc_method = "Redirect URL"
        else:
            oidc_method = "Direct Link/Button"
    elif text_pattern_found:
        oidc_method = "Text Pattern"
    else:
        oidc_method = "N/A"

    has_google_oidc = oidc_method != "N/A"
    return has_google_oidc, oidc_method

def check_oidc(website_url, headless=True):
    """Open *website_url* in Chrome and report whether it offers Google sign-in.

    The page is loaded, a cookie banner is dismissed if found, a login link is
    clicked if found, the driver switches into an iframe if one is available,
    and the resulting page is inspected for Google sign-in.

    Args:
        website_url: Absolute URL of the site to analyse.
        headless: Passed to ``configure_browser_options``; run Chrome without
            a window when true (the default).

    Returns:
        ``(has_google_oidc, method)``. An invalid URL yields
        ``(False, "N/A")`` and any exception during the analysis yields
        ``(False, "Error")``; neither case raises.
    """
    try:
        # Validate before launching a browser so an invalid URL costs nothing
        # and cannot leave a Chrome process behind.
        if not validity_check(website_url):
            logging.error("Invalid URL: %s", website_url)
            return False, "N/A"

        driver = webdriver.Chrome(options=configure_browser_options(headless))
        try:
            driver.get(website_url)
            handle_cookies(driver)
            if not find_login_button(driver):
                logging.warning("Login button not found")
            switch_to_iframe(driver)

            has_google_oidc, method = check_google_oidc(driver, website_url)

            #logging.info(f"Google OIDC check completed for {website_url}: {has_google_oidc}, Method: {method}")

            return has_google_oidc, method
        finally:
            # Always release the browser, including when a step above raised;
            # previously the driver was only quit on the success path.
            driver.quit()

    except Exception as e:
        logging.error("Error analyzing %s: %s", website_url, e)
        return False, "Error"

def check_oidc_concurrent(website_url):
    """Run ``check_oidc`` for *website_url* and prepend the URL to its result.

    Returns a tuple whose first item is the URL, followed by the items
    returned by ``check_oidc``, so a worker's result identifies its site.
    """
    outcome = check_oidc(website_url)
    return (website_url,) + tuple(outcome)

def main(websites, num_threads=10):
    """Check every site in *websites* concurrently and print one line per site.

    Args:
        websites: Iterable of URLs to analyse.
        num_threads: Maximum number of worker threads. Each worker runs
            ``check_oidc_concurrent`` for one URL at a time. Defaults to 10,
            the value that used to be hard-coded.

    Results are printed in completion order, not input order. An exception
    raised by a worker is reported for its URL and does not stop the others.
    """
    with concurrent.futures.ThreadPoolExecutor(max_workers=num_threads) as executor:
        future_to_url = {executor.submit(check_oidc_concurrent, url): url for url in websites}
        for future in concurrent.futures.as_completed(future_to_url):
            url = future_to_url[future]
            try:
                website, uses_google_oidc, method = future.result()
                print(f"{website} uses Google OIDC: {uses_google_oidc}, Method: {method}")
            except Exception as exc:
                print(f"{url} generated an exception: {exc}")

if __name__ == "__main__":
    # Sites to analyse when the file is run as a script. Each URL appears
    # once: 'https://www.imdb.com/' used to be listed twice, so it was
    # checked and reported twice.
    websites = [
        'https://www.shopify.com/', 'https://www.w3schools.com/', 'https://medium.com/', 'https://www.researchgate.net/',
        # NOTE(review): the '/re' path below looks truncated; confirm the intended URL.
        'https://vimeo.com/', 'https://pixabay.com/re', 'https://www.chess.com/', 'https://indianexpress.com/',
        'https://www.fandom.com/', 'https://www.aliexpress.com/', 'https://www.imdb.com/', 'https://www.doordash.com/',
        'https://www.studocu.com/en-gb', 'https://shopee.ph/', 'https://www.veed.io/', 'https://www.coursehero.com/',
        'https://www.marketwatch.com/', 'https://cloudinary.com/', 'https://www.crazygames.com/', 'https://groww.in/',
        'https://www.mobile.de/', 'https://www.vitalsource.com/', 'https://www.gitbook.com/', 'https://disqus.com/',
        'https://www.mongodb.com/', 'https://apkpure.com/', 'https://brainly.com/', 'https://www.duplichecker.com/',
        'https://www.bhphotovideo.com/', 'https://www.blizzard.com/en-gb/', 'https://www.fotor.com/', 'https://www.abc.net.au/',
        'https://www.repubblica.it/', 'https://www.pdf2go.com/', 'https://www.drive2.ru/', 'https://www.cbssports.com/',
        'https://www.turnitin.com/', 'https://www.mirror.co.uk/', 'https://tvtropes.org/', 'https://www.skyscanner.net/',
        'https://www.hindustantimes.com/',
    ]
    main(websites)

# Read URLs from CSV file
#websites = []
#with open('websites.csv', 'r') as file:
#    reader = csv.reader(file)
#    for row in reader:
#        websites.extend(row)

# Perform checks and save results to CSV
#outputs = {}
#for website in websites:
#    uses_google_oidc, method = check_oidc(website)
#    outputs[website] = (uses_google_oidc, method)

#with open('output.csv', 'w', newline='') as csvfile:
#    writer = csv.writer(csvfile)
#    writer.writerow(['Website', 'Google OIDC Present', 'Method'])
#    for website, (oidc_present, method) in outputs.items():
#        writer.writerow([website, oidc_present, method])
        
#websites = [
#'https://www.shopify.com/','https://www.w3schools.com/','https://medium.com/','https://www.researchgate.net/',
#    'https://vimeo.com/','https://pixabay.com/re','https://www.chess.com/','https://indianexpress.com/',
#    'https://www.fandom.com/','https://www.aliexpress.com/','https://www.imdb.com/','https://www.doordash.com/',
#    'https://www.studocu.com/en-gb','https://shopee.ph/','https://www.veed.io/','https://www.coursehero.com/',
#    'https://www.marketwatch.com/','https://cloudinary.com/','https://www.crazygames.com/','https://groww.in/',
#    'https://www.mobile.de/','https://www.vitalsource.com/','https://www.gitbook.com/','https://disqus.com/',
#    'https://www.mongodb.com/','https://apkpure.com/','https://brainly.com/','https://www.duplichecker.com/',
#    'https://www.bhphotovideo.com/','https://www.blizzard.com/en-gb/','https://www.fotor.com/','https://www.abc.net.au/',
#    'https://www.repubblica.it/','https://www.pdf2go.com/','https://www.drive2.ru/','https://www.cbssports.com/',
#    'https://www.turnitin.com/','https://www.mirror.co.uk/','https://tvtropes.org/','https://www.skyscanner.net/',
#    'https://www.hindustantimes.com/','https://www.imdb.com/'
#]

# Sequential variant of the check: one site at a time, results printed in
# list order. Guarded because `websites` is only bound when the file runs as
# a script; unguarded, importing this module raised NameError here.
if __name__ == "__main__":
    for website in websites:
        uses_google_oidc, method = check_oidc(website)
        print(f"{website} uses Google OIDC: {uses_google_oidc}, Method: {method}")

INFO:root:Switched to iframe


https://www.shopify.com/ uses Google OIDC: False, Method: N/A


INFO:root:Switched to iframe


https://www.w3schools.com/ uses Google OIDC: True, Method: Redirect URL




https://medium.com/ uses Google OIDC: True, Method: Direct Link/Button


INFO:root:Switched to iframe


https://www.researchgate.net/ uses Google OIDC: True, Method: Redirect URL


INFO:root:Switched to iframe


https://vimeo.com/ uses Google OIDC: True, Method: Redirect URL


INFO:root:Switched to iframe


https://pixabay.com/re uses Google OIDC: False, Method: N/A


INFO:root:Switched to iframe


https://www.chess.com/ uses Google OIDC: True, Method: Redirect URL


INFO:root:Switched to iframe


https://indianexpress.com/ uses Google OIDC: False, Method: N/A


INFO:root:Switched to iframe


https://www.fandom.com/ uses Google OIDC: False, Method: N/A




https://www.aliexpress.com/ uses Google OIDC: True, Method: Redirect URL




https://www.imdb.com/ uses Google OIDC: False, Method: N/A


INFO:root:Switched to iframe


https://www.doordash.com/ uses Google OIDC: False, Method: N/A


ERROR:root:Error analyzing https://www.studocu.com/en-gb: Message: invalid argument: missing 'ELEMENT'
  (Session info: headless chrome=119.0.6045.124)
Stacktrace:
	GetHandleVerifier [0x00007FF65ED882B2+55298]
	(No symbol) [0x00007FF65ECF5E02]
	(No symbol) [0x00007FF65EBB05AB]
	(No symbol) [0x00007FF65EC2C079]
	(No symbol) [0x00007FF65EC120AA]
	(No symbol) [0x00007FF65EC2AAA4]
	(No symbol) [0x00007FF65EC11E83]
	(No symbol) [0x00007FF65EBE670A]
	(No symbol) [0x00007FF65EBE7964]
	GetHandleVerifier [0x00007FF65F100AAB+3694587]
	GetHandleVerifier [0x00007FF65F15728E+4048862]
	GetHandleVerifier [0x00007FF65F14F173+4015811]
	GetHandleVerifier [0x00007FF65EE247D6+695590]
	(No symbol) [0x00007FF65ED00CE8]
	(No symbol) [0x00007FF65ECFCF34]
	(No symbol) [0x00007FF65ECFD062]
	(No symbol) [0x00007FF65ECED3A3]
	BaseThreadInitThunk [0x00007FFCF87D257D+29]
	RtlUserThreadStart [0x00007FFCF916AA58+40]



https://www.studocu.com/en-gb uses Google OIDC: False, Method: Error


INFO:root:Switched to iframe


https://shopee.ph/ uses Google OIDC: True, Method: Redirect URL


INFO:root:Switched to iframe


https://www.veed.io/ uses Google OIDC: False, Method: N/A


INFO:root:Switched to iframe


https://www.coursehero.com/ uses Google OIDC: False, Method: N/A


INFO:root:Switched to iframe


https://www.marketwatch.com/ uses Google OIDC: False, Method: N/A


INFO:root:Switched to iframe


https://cloudinary.com/ uses Google OIDC: True, Method: Redirect URL


INFO:root:Switched to iframe


https://www.crazygames.com/ uses Google OIDC: False, Method: N/A


INFO:root:Switched to iframe


https://groww.in/ uses Google OIDC: False, Method: N/A




https://www.mobile.de/ uses Google OIDC: False, Method: N/A


INFO:root:Switched to iframe


https://www.vitalsource.com/ uses Google OIDC: False, Method: N/A
