In [2]:
import requests
import re
import urllib.request
from bs4 import BeautifulSoup
from collections import deque

from html.parser import HTMLParser
from urllib.parse import urlparse
import os

# Regex pattern to match a URL
HTTP_URL_PATTERN = r'^http[s]*://.+'

domain = "openai.com" # <- put your domain to be crawled
full_url = "https://openai.com/" # <- put your domain to be crawled with https or http

# Create a class to parse the HTML and get the hyperlinks
class HyperlinkParser(HTMLParser):
    def __init__(self):
        super().__init__()
        # Create a list to store the hyperlinks
        self.hyperlinks = []

    # Override the HTMLParser's handle_starttag method to get the hyperlinks
    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)

        # If the tag is an anchor tag and it has an href attribute, add the href attribute to the list of hyperlinks
        if tag == "a" and "href" in attrs:
            self.hyperlinks.append(attrs["href"])

In [3]:
# Function to get the hyperlinks from a URL
def get_hyperlinks(url):
    
    # Try to open the URL and read the HTML
    try:
        # Open the URL and read the HTML
        with urllib.request.urlopen(url) as response:

            # If the response is not HTML, return an empty list
            if not response.info().get('Content-Type').startswith("text/html"):
                return []
            
            # Decode the HTML
            html = response.read().decode('utf-8')
    except Exception as e:
        print(e)
        return []

    # Create the HTML Parser and then Parse the HTML to get hyperlinks
    parser = HyperlinkParser()
    parser.feed(html)

    return parser.hyperlinks

In [4]:
# Function to get the hyperlinks from a URL that are within the same domain
def get_domain_hyperlinks(local_domain, url):
    clean_links = []
    for link in set(get_hyperlinks(url)):
        clean_link = None

        # If the link is a URL, check if it is within the same domain
        if re.search(HTTP_URL_PATTERN, link):
            # Parse the URL and check if the domain is the same
            url_obj = urlparse(link)
            if url_obj.netloc == local_domain:
                clean_link = link

        # If the link is not a URL, check if it is a relative link
        else:
            if link.startswith("/"):
                link = link[1:]
            elif link.startswith("#") or link.startswith("mailto:"):
                continue
            clean_link = "https://" + local_domain + "/" + link

        if clean_link is not None:
            if clean_link.endswith("/"):
                clean_link = clean_link[:-1]
            clean_links.append(clean_link)

    # Return the list of hyperlinks that are within the same domain
    return list(set(clean_links))

In [5]:
def crawl(url):
    # Parse the URL and get the domain
    local_domain = urlparse(url).netloc

    # Create a queue to store the URLs to crawl
    queue = deque([url])
    print(queue)

    # Create a set to store the URLs that have already been seen (no duplicates)
    seen = set([url])
    print(seen)

    # Create a directory to store the text files
    if not os.path.exists("text/"):
            os.mkdir("text/")

    if not os.path.exists("text/"+local_domain+"/"):
            os.mkdir("text/" + local_domain + "/")

    # Create a directory to store the csv files
    if not os.path.exists("processed"):
            os.mkdir("processed")

    # While the queue is not empty, continue crawling
    while queue:

        # Get the next URL from the queue
        url = queue.pop()
        print(url) # for debugging and to see the progress

        # Save text from the url to a <url>.txt file
        try:
            with open('text/'+local_domain+'/'+url[8:].replace("/", "_") + ".txt", "w", encoding="UTF-8") as f:

                # Get the text from the URL using BeautifulSoup
                soup = BeautifulSoup(requests.get(url).text, "html.parser")

                # Get the text but remove the tags
                text = soup.get_text()

                # If the crawler gets to a page that requires JavaScript, it will stop the crawl
                if ("You need to enable JavaScript to run this app." in text):
                    print("Unable to parse page " + url + " due to JavaScript being required")

                # Otherwise, write the text to the file in the text directory
                f.write(text)

            # Get the hyperlinks from the URL and add them to the queue
            print(local_domain,url)
            for link in get_domain_hyperlinks(local_domain, url):
                if link not in seen:
                    queue.append(link)
                    seen.add(link)
        except:
            continue

crawl(full_url)

deque(['https://openai.com/'])
{'https://openai.com/'}
https://openai.com/
openai.com https://openai.com/
https://openai.com/research/whisper
openai.com https://openai.com/research/whisper
https://openai.com/research?contentTypes=publication
https://openai.com/research?topics=transformers
https://openai.com/research?topics=speech-recognition
https://openai.com/research?contentTypes=milestone
https://openai.com/research?models=whisper
https://openai.com/research?topics=open-source
https://openai.com/research?contentTypes=release
https://openai.com/research/whisper#content
openai.com https://openai.com/research/whisper#content
https://openai.com/product/gpt-4
openai.com https://openai.com/product/gpt-4
https://openai.com/contributions/gpt-4
openai.com https://openai.com/contributions/gpt-4
https://openai.com/contributions/gpt-4#content
openai.com https://openai.com/contributions/gpt-4#content
https://openai.com/waitlist/gpt-4-api
openai.com https://openai.com/waitlist/gpt-4-api
https://o

https://openai.com/research/openai-five#thegames
openai.com https://openai.com/research/openai-five#thegames
https://openai.com/research?authors=michael-petrov
https://openai.com/research?authors=christopher-berner
https://openai.com/research?authors=przemyslaw-debiak
https://openai.com/research?topics=software-engineering
https://openai.com/research?authors=susan-zhang
https://openai.com/research?authors=jack-clark
https://openai.com/research/learning-a-hierarchy
openai.com https://openai.com/research/learning-a-hierarchy
https://openai.com/research/learning-a-hierarchy#content
openai.com https://openai.com/research/learning-a-hierarchy#content
https://openai.com/research?authors=kevin-frans
https://openai.com/research?authors=jonathan-ho
https://openai.com/research?authors=pieter-abbeel
https://openai.com/research?authors=peter-chen
https://openai.com/research?topics=self-play
https://openai.com/research?topics=dota-2
https://openai.com/research/openai-baselines-ppo
openai.com https:

https://openai.com/blog/welcome-pieter-and-shivon#IlyaSutskever
openai.com https://openai.com/blog/welcome-pieter-and-shivon#IlyaSutskever
https://openai.com/research/openai-gym-beta#content
openai.com https://openai.com/research/openai-gym-beta#content
https://openai.com/research/generative-models#content
openai.com https://openai.com/research/generative-models#content
https://openai.com/research?authors=durk-kingma
https://openai.com/blog/generative-models/#going-forward
openai.com https://openai.com/blog/generative-models/#going-forward
https://openai.com/blog?authors=sam-altman
https://openai.com/research?contentTypes=conclusion
https://openai.com/research?authors=rachel-fong
https://openai.com/research?authors=jerry-tworek
https://openai.com/research?topics=transfer-learning
https://openai.com/blog/solving-rubiks-cube/#perturbations
openai.com https://openai.com/blog/solving-rubiks-cube/#perturbations
https://openai.com/research?authors=arthur-petron
https://openai.com/research?au

openai.com https://openai.com/blog/solving-rubiks-cube/#challenges
https://openai.com/research?topics=robotics
https://openai.com/research/solving-rubiks-cube#content
openai.com https://openai.com/research/solving-rubiks-cube#content
https://openai.com/research?authors=nikolas-tezak
https://openai.com/research?authors=alex-paino
https://openai.com/research?authors=matthias-plappert
https://openai.com/research?topics=sim-to-real
https://openai.com/research?models=dactyl
https://openai.com/research?authors=lilian-weng
https://openai.com/research?authors=ilge-akkaya
https://openai.com/research?authors=glenn-powell
https://openai.com/research?authors=lei-zhang
https://openai.com/research?authors=raphael-ribas
https://openai.com/research?authors=qiming-yuan
https://openai.com/research?authors=peter-welinder
https://openai.com/commitment-to-dei
openai.com https://openai.com/commitment-to-dei
https://openai.com/commitment-to-dei#content
openai.com https://openai.com/commitment-to-dei#content


https://openai.com/blog/forecasting-misuse
openai.com https://openai.com/blog/forecasting-misuse
https://openai.com/research/forecasting-misuse#content
openai.com https://openai.com/research/forecasting-misuse#content
https://openai.com/blog?authors=lama-ahmad
https://openai.com/research/cooperation-on-safety
openai.com https://openai.com/research/cooperation-on-safety
https://openai.com/research/cooperation-on-safety#content
openai.com https://openai.com/research/cooperation-on-safety#content
https://openai.com/research/dall-e-2-pre-training-mitigations
openai.com https://openai.com/research/dall-e-2-pre-training-mitigations
https://openai.com/research?models=dall-e-2
https://openai.com/dall-e-2
openai.com https://openai.com/dall-e-2
https://openai.com/blog/dall-e-introducing-outpainting
openai.com https://openai.com/blog/dall-e-introducing-outpainting
https://openai.com/blog/dall-e-introducing-outpainting#content
openai.com https://openai.com/blog/dall-e-introducing-outpainting#conte

openai.com https://openai.com/blog/openai-scholars#content
https://openai.com/blog?authors=larissa-schiavo
https://openai.com/blog/openai-scholars#GregBrockman
openai.com https://openai.com/blog/openai-scholars#GregBrockman
https://openai.com/blog/openai-fellows-interns-2019
openai.com https://openai.com/blog/openai-fellows-interns-2019
https://openai.com/blog/openai-fellows-interns-2019#content
openai.com https://openai.com/blog/openai-fellows-interns-2019#content
https://openai.com/blog/openai-fellows-interns-2019#LarissaSchiavo
openai.com https://openai.com/blog/openai-fellows-interns-2019#LarissaSchiavo
https://openai.com/blog?authors=ashley-pilipiszyn
https://openai.com/blog/openai-fellows-interns-2019#AshleyPilipiszyn
openai.com https://openai.com/blog/openai-fellows-interns-2019#AshleyPilipiszyn
https://openai.com/blog/openai-lp#OpenAI
openai.com https://openai.com/blog/openai-lp#OpenAI
https://openai.com/blog/openai-lp#GregBrockman
openai.com https://openai.com/blog/openai-lp#G

https://openai.com/blog?authors=joanne-jang
https://openai.com/blog?authors=boris-power
https://openai.com/blog/introducing-text-and-code-embeddings#JoanneJang
openai.com https://openai.com/blog/introducing-text-and-code-embeddings#JoanneJang
https://openai.com/blog?authors=lilian-weng
https://openai.com/blog/introducing-text-and-code-embeddings#content
openai.com https://openai.com/blog/introducing-text-and-code-embeddings#content
https://openai.com/blog?authors=arvind-neelakantan
https://openai.com/careers/research-engineer#content
openai.com https://openai.com/careers/research-engineer#content
https://openai.com/blog/grade-school-math
openai.com https://openai.com/blog/grade-school-math
https://openai.com/blog/grade-school-math/#samples
openai.com https://openai.com/blog/grade-school-math/#samples
https://openai.com/research?authors=karl-cobbe
https://openai.com/research?authors=vineet-kosaraju
https://openai.com/research/solving-math-word-problems#content
openai.com https://openai.

openai.com https://openai.com/careers/software-engineering-manager-accelerator-optimization
https://openai.com/careers/software-engineering-manager-accelerator-optimization#content
openai.com https://openai.com/careers/software-engineering-manager-accelerator-optimization#content
https://openai.com/careers/tech-lead-manager-identity-platform
openai.com https://openai.com/careers/tech-lead-manager-identity-platform
https://openai.com/careers/tech-lead-manager-identity-platform#content
openai.com https://openai.com/careers/tech-lead-manager-identity-platform#content
https://openai.com/careers/research-scientist-machine-learning
openai.com https://openai.com/careers/research-scientist-machine-learning
https://openai.com/careers/research-scientist-machine-learning#content
openai.com https://openai.com/careers/research-scientist-machine-learning#content
https://openai.com/careers/revenue-accounting-manager
openai.com https://openai.com/careers/revenue-accounting-manager
https://openai.com/c

openai.com https://openai.com/blog/governance-of-superintelligence
https://openai.com/blog/governance-of-superintelligence#GregBrockman
openai.com https://openai.com/blog/governance-of-superintelligence#GregBrockman
https://openai.com/blog/governance-of-superintelligence#IlyaSutskever
openai.com https://openai.com/blog/governance-of-superintelligence#IlyaSutskever
https://openai.com/blog/governance-of-superintelligence#SamAltman
openai.com https://openai.com/blog/governance-of-superintelligence#SamAltman
https://openai.com/blog/governance-of-superintelligence#content
openai.com https://openai.com/blog/governance-of-superintelligence#content
https://openai.com/blog?topics=responsible-ai
https://openai.com/blog/new-ways-to-manage-your-data-in-chatgpt
openai.com https://openai.com/blog/new-ways-to-manage-your-data-in-chatgpt
https://openai.com/policies/api-data-usage-policies
openai.com https://openai.com/policies/api-data-usage-policies
https://openai.com/policies/api-data-usage-policies

https://openai.com/blog/dall-e-2-update#content
openai.com https://openai.com/blog/dall-e-2-update#content
https://openai.com/blog/dall-e-now-available-in-beta
openai.com https://openai.com/blog/dall-e-now-available-in-beta
https://openai.com/blog/dall-e-now-available-in-beta#content
openai.com https://openai.com/blog/dall-e-now-available-in-beta#content
https://openai.com/blog/dall-e-now-available-in-beta#OpenAI
openai.com https://openai.com/blog/dall-e-now-available-in-beta#OpenAI
https://openai.com/blog/dall-e-now-available-without-waitlist#content
openai.com https://openai.com/blog/dall-e-now-available-without-waitlist#content
https://openai.com/blog/dall-e-now-available-without-waitlist#OpenAI
openai.com https://openai.com/blog/dall-e-now-available-without-waitlist#OpenAI
https://openai.com/research/gpt-4
openai.com https://openai.com/research/gpt-4
https://openai.com/research/gpt-4#content
openai.com https://openai.com/research/gpt-4#content
https://openai.com/research?models=gpt

https://openai.com/customer-stories/yabble
openai.com https://openai.com/customer-stories/yabble
https://openai.com/customer-stories/yabble#content
openai.com https://openai.com/customer-stories/yabble#content
https://openai.com/customer-stories/inworld-ai
openai.com https://openai.com/customer-stories/inworld-ai
https://openai.com/customer-stories/inworld-ai#content
openai.com https://openai.com/customer-stories/inworld-ai#content
https://openai.com/customer-stories/waymark
openai.com https://openai.com/customer-stories/waymark
https://openai.com/customer-stories/waymark#content
openai.com https://openai.com/customer-stories/waymark#content
https://openai.com/customer-stories#content
openai.com https://openai.com/customer-stories#content
https://openai.com/research/forecasting-misuse
openai.com https://openai.com/research/forecasting-misuse
https://openai.com/product/dall-e-2
openai.com https://openai.com/product/dall-e-2
https://openai.com/blog/gpt-3-edit-insert
openai.com https://op

In [6]:
def remove_newlines(serie):
    serie = serie.str.replace('\n', ' ')
    serie = serie.str.replace('\\n', ' ')
    serie = serie.str.replace('  ', ' ')
    serie = serie.str.replace('  ', ' ')
    return serie

In [7]:
import pandas as pd

# Create a list to store the text files
texts=[]

# Get all the text files in the text directory
for file in os.listdir("text/" + domain + "/"):

    # Open the file and read the text
    with open("text/" + domain + "/" + file, "r", encoding="UTF-8") as f:
        text = f.read()

        # Omit the first 11 lines and the last 4 lines, then replace -, _, and #update with spaces.
        texts.append((file[11:-4].replace('-',' ').replace('_', ' ').replace('#update',''), text))

# Create a dataframe from the list of texts
df = pd.DataFrame(texts, columns = ['fname', 'text'])

# Set the text column to be the raw text with the newlines removed
df['text'] = df.fname + ". " + remove_newlines(df.text)
df.to_csv('processed/scraped.csv')
df.head()

  serie = serie.str.replace('\\n', ' ')


Unnamed: 0,fname,text
0,,. OpenAI CloseSearch Submit Skip to main co...
1,#content,#content. OpenAI CloseSearch Submit Skip to...
2,,. OpenAI CloseSearch Submit Skip to main co...
3,about#content,about#content. About CloseSearch Submit Ski...
4,about,about. About CloseSearch Submit Skip to mai...


In [8]:
df.sample(100)

Unnamed: 0,fname,text
515,research jukebox#content,research jukebox#content. Jukebox CloseSear...
62,blog dall e now available in beta#content,blog dall e now available in beta#content. DA...
502,research image gpt#conclusion,research image gpt#conclusion. Image GPT Cl...
517,research language model safety and misuse#content,research language model safety and misuse#cont...
81,blog discovering the minutiae of backend syste...,blog discovering the minutiae of backend syste...
...,...,...
306,careers manager scientist deployment,careers manager scientist deployment. Manager...
390,customer stories khan academy,customer stories khan academy. Khan Academy ...
31,blog chatgpt plugins,blog chatgpt plugins. ChatGPT plugins Close...
545,research openai baselines dqn#content,research openai baselines dqn#content. OpenAI...


In [16]:
df.fname[75]

'blog democratic inputs to ai#LamaAhmad'

In [17]:
df.text[75]

'blog democratic inputs to ai#LamaAhmad.  Democratic Inputs to AI   CloseSearch Submit Skip to main contentSite NavigationResearchOverviewIndexProductOverviewChatGPTGPT-4DALLÂ·E 2Customer storiesSafety standardsPricingDevelopersOverviewDocumentationAPI referenceExamplesSafetyCompanyAboutBlogCareersCharterSecuritySearch Navigation quick links Log inSign upMenu Mobile Navigation CloseSite NavigationResearchProductDevelopersSafetyCompany Quick Links Log inSign upSearch Submit Democratic Inputs to AIOur nonprofit organization, OpenAI, Inc., is launching a program to award ten $100,000 grants to fund experiments in setting up a democratic process for deciding what rules AI systems should follow, within the bounds defined by the law.Illustration: Justin Jay WangMay 25, 2023AuthorsWojciech ZarembaArka DharLama AhmadTyna EloundouShibani SanturkarSandhini AgarwalJade LeungAnnouncements,Â\xa0Responsible AIAI will have significant, far-reaching economic and societal impacts. Technology shapes the