**Import Government Data**

In [50]:
import pandas as pd
# Load the current .gov domain list from the cisagov/dotgov-data repository
# and derive a plain-HTTP base URL for each row from its 'Domain name' value.
gov_websites = pd.read_csv(
    "https://raw.githubusercontent.com/cisagov/dotgov-data/main/current-full.csv"
).assign(URL=lambda domains: 'http://' + domains['Domain name'])


**Define Function for Fetching robots.txt**

In [None]:
import urllib.robotparser
import urllib.request
import socket
from http.client import RemoteDisconnected

def check_robots_txt(url):
    rp = urllib.robotparser.RobotFileParser()
    robots_txt_url = f"{url}/robots.txt"

    try:
        # Set the timeout for the request
        with urllib.request.urlopen(robots_txt_url, timeout=10) as response:
            if response.status != 200:
                return (False, [])

        rp.set_url(robots_txt_url)
        rp.read()

        user_agents = set()

        for entry in rp.entries:
            for useragent in entry.useragents:
                user_agents.add(useragent)

        return (True, list(user_agents))

    except (urllib.error.URLError, socket.timeout, RemoteDisconnected) as e:
        print(f"Error accessing {robots_txt_url}: {e}")
        return (False, [])



**Multithreaded Implementation for Speed**

In [None]:

from concurrent.futures import ThreadPoolExecutor, as_completed

def process_urls_multithreaded(df, max_workers=20):
    """Run ``check_robots_txt`` on every ``df['URL']`` using a thread pool.

    Two columns are written onto ``df`` in place, ``HasRobotsTxt`` and
    ``UserAgents``, each row holding the matching element of the tuple
    returned for that row's URL. A URL whose check raises is reported and
    recorded as ``(False, [])`` so one failure cannot leave a hole in the
    results.

    Args:
        df: DataFrame with a ``URL`` column. Any index is accepted.
        max_workers: Upper bound on the number of worker threads.

    Returns:
        None; ``df`` is modified in place.
    """
    urls = df['URL'].tolist()
    results = [None] * len(urls)

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Remember each future's row position so results keep input order
        # even though futures complete in arbitrary order.
        position_of = {
            executor.submit(check_robots_txt, url): i
            for i, url in enumerate(urls)
        }

        for future in as_completed(position_of):
            i = position_of[future]
            try:
                # result() re-raises anything the worker raised; without this
                # call a failed task would pass unnoticed.
                results[i] = future.result()
            except Exception as e:
                print(f"Exception occurred: {e}")
                results[i] = (False, [])

    # Reuse df's own index so the column assignment below lines up row for
    # row instead of aligning against a fresh 0..n-1 index.
    results_df = pd.DataFrame(
        results, columns=['HasRobotsTxt', 'UserAgents'], index=df.index
    )
    for column in results_df.columns:
        df[column] = results_df[column]

# Check robots.txt for every URL in gov_websites using the thread-pool helper.
# The call returns None; its results are written onto gov_websites as the
# 'HasRobotsTxt' and 'UserAgents' columns.
process_urls_multithreaded(gov_websites)