In [None]:
pip install transformers

# Extracting Data From A Given URL

As most people will have experienced when developing a use case, or even when practising coding, one of the most difficult stages of the task is getting enough data to train your models or to check whether your potential solution is robust.

### Webscraping

Web scraping is the process of extracting data from websites. It has become an important tool for businesses and researchers to gather data on products, services, and trends, among other things. In recent years, there has been a growing interest in using web scraping to generate text embeddings, which are numerical representations of words or sentences that capture the meaning of the text. These embeddings can be used for a variety of natural language processing (NLP) tasks such as sentiment analysis, language translation, and text classification. Web scraping techniques can be used to collect large amounts of text data from websites, which can then be processed and used to generate high-quality text embeddings.


For the purpose of this training, we will be scraping data from the Accenture careers website https://www.accenture.com/us-en/careers. This is publicly available information, so we can send it to OpenAI's API without the privacy concerns that come with sending customer data.

### Step 1: Loading the relevant libraries

To get started running this code, we first need to load all of the relevant packages we need

In [None]:
import os
import re
import traceback
import urllib.request
from collections import deque
from html.parser import HTMLParser
from urllib.parse import urljoin, urlparse

import openai
import pandas as pd
import pdfplumber
import requests
from bs4 import BeautifulSoup

In [None]:
# Prefer the OPENAI_API_KEY environment variable so a real key never has to be
# typed into (and saved with) the notebook; the placeholder is only a fallback.
openai.api_key = os.environ.get("OPENAI_API_KEY", 'INSERT_YOUR_YOUR_API_KEY_HERE')

### Step 2: Outline the relevant functions to be executed within the script

In [None]:
# Matches absolute http:// or https:// URLs. "s?" allows at most one "s";
# a character class with "*" would also accept "httpss://", "httpsss://", ...
HTTP_URL_PATTERN = r'^https?://.+'

class HyperlinkParser(HTMLParser):
    """HTML parser that collects the ``href`` value of every ``<a>`` start tag."""

    def __init__(self):
        super().__init__()
        # href values in the order they were encountered (duplicates kept).
        self.hyperlinks = []

    def handle_starttag(self, tag, attrs):
        """Record the href of an anchor tag.

        Args:
            tag: Lower-cased tag name supplied by ``HTMLParser``.
            attrs: List of ``(name, value)`` pairs for the tag's attributes.
        """
        if tag != "a":
            return
        href = dict(attrs).get("href")
        # A valueless attribute (``<a href>``) is reported with value None and
        # ``href=""`` carries no target; neither is a link that can be followed,
        # and a None entry would break string handling in consumers of the list.
        if href:
            self.hyperlinks.append(href)

def get_hyperlinks(url):
    """Fetch ``url`` and return the raw href values of its anchor tags.

    Args:
        url: Address to open with ``urllib.request.urlopen``.

    Returns:
        The hrefs collected by ``HyperlinkParser``, or an empty list when the
        response is not ``text/html`` or when fetching/decoding fails (the
        error is printed rather than raised).
    """
    try:
        with urllib.request.urlopen(url) as response:
            # The header may be absent; treat that as "not HTML" explicitly
            # instead of relying on an AttributeError being swallowed below.
            content_type = response.headers.get('Content-Type') or ""
            if not content_type.lower().startswith("text/html"):
                return []
            # Honour the charset declared by the server and never lose a whole
            # page because of a few undecodable bytes.
            charset = response.headers.get_content_charset() or 'utf-8'
            html = response.read().decode(charset, errors="replace")
    except Exception as e:
        print(e)
        return []

    parser = HyperlinkParser()
    parser.feed(html)
    return parser.hyperlinks

def get_domain_hyperlinks(domain, url):
    """Return the unique links on the page at ``url`` that point at ``domain``.

    Relative links are resolved against ``url`` (so ``x.html``, ``../x`` and
    protocol-relative ``//host/x`` are handled), fragment-only, ``mailto:``,
    ``javascript:`` and other non-HTTP links are dropped, as is any link
    containing ``eDelivery``. One trailing ``/`` is stripped from each result.

    Args:
        domain: Network location (``netloc``) a link must have to be kept.
        url: Page whose links are extracted via ``get_hyperlinks``.

    Returns:
        A list of absolute URLs without duplicates, in no particular order.
    """
    clean_links = set()
    for link in set(get_hyperlinks(url)):
        # Anchors without a usable href (None or "") have nothing to follow.
        if not link:
            continue
        if link.startswith("#") or 'eDelivery' in link:
            continue

        if re.search(HTTP_URL_PATTERN, link):
            candidate = link
        else:
            # Resolve the reference relative to the page it was found on rather
            # than gluing it onto the domain root.
            candidate = urljoin(url, link)

        parsed = urlparse(candidate)
        # Drops mailto:, javascript:, tel:, ... as well as off-domain links.
        if parsed.scheme not in ("http", "https") or parsed.netloc != domain:
            continue

        if candidate.endswith("/"):
            candidate = candidate[:-1]
        clean_links.add(candidate)

    return list(clean_links)

def crawl_with_root(root_url, limit_domain, output_dir="accenture_text", timeout=30):
    """Crawl pages reachable from ``root_url`` and save their text to disk.

    Every visited URL that starts with ``limit_domain`` is printed and its text
    is written to ``<output_dir>/<url path with "/" replaced by "_">.txt``
    unless that file already exists. PDFs (URLs ending in ``.pdf``) are
    downloaded to ``temp.pdf`` and read with pdfplumber; everything else is
    parsed with BeautifulSoup. URLs containing ``eDelivery`` are skipped.
    Errors for a single URL are printed with a traceback and the crawl
    continues.

    Args:
        root_url: Starting page; its netloc defines the domain to stay on.
        limit_domain: URL prefix a page must start with to be visited.
        output_dir: Directory for the extracted text files (created if needed).
        timeout: Seconds to wait for each ``requests.get`` before giving up.
    """
    domain = urlparse(root_url).netloc
    queue = deque([root_url])
    seen = {root_url}

    os.makedirs(output_dir, exist_ok=True)

    while queue:
        # pop() takes from the right end, so the most recently found link is
        # visited first.
        url = queue.pop()
        if not url.startswith(limit_domain):
            continue
        print(url)

        if 'eDelivery' in url:
            continue

        path = urlparse(url).path
        fname = os.path.join(output_dir, path.replace("/", "_") + ".txt")

        try:
            if not os.path.exists(fname):
                # A timeout keeps one stalled server from hanging the crawl, and
                # raise_for_status stops error pages being saved as content.
                response = requests.get(url, timeout=timeout)
                response.raise_for_status()
                if url.endswith(".pdf"):
                    with open("temp.pdf", "wb") as f:
                        f.write(response.content)
                    with pdfplumber.open("temp.pdf") as pdf:
                        # Guard against extract_text() yielding None for a page
                        # with no extractable text, which would break joining.
                        text = "".join(page.extract_text() or "" for page in pdf.pages)
                else:
                    text = BeautifulSoup(response.text, "html.parser").get_text()
                # Explicit UTF-8 so pages with non-ASCII text are written the
                # same way on every platform.
                with open(fname, "w", encoding="utf-8") as f:
                    f.write(text)

            for link in get_domain_hyperlinks(domain, url):
                if link not in seen:
                    queue.append(link)
                    seen.add(link)
        except Exception as e:
            print(e)
            traceback.print_exc()

def remove_newlines(serie):
    """Flatten a string Series to single-line text.

    Real newline characters and literal backslash-n sequences are each replaced
    by a space, then every run of two or more spaces is collapsed to one.

    Args:
        serie: pandas Series of strings.

    Returns:
        A new Series with the replacements applied.
    """
    # regex is passed explicitly because the default of Series.str.replace
    # differs between pandas versions, which would change what '\\n' matches.
    serie = serie.str.replace('\n', ' ', regex=False)
    serie = serie.str.replace('\\n', ' ', regex=False)
    # One regex pass collapses runs of any length; repeated literal
    # double-space replacement leaves longer runs partly intact.
    serie = serie.str.replace(r' {2,}', ' ', regex=True)
    return serie

Now, for a given URL, we can extract all of its text and all of the associated hyperlinks.

We first pass the URL through the `crawl_with_root` function, which saves the text of each visited page as a file in a folder named "accenture_text". We then read the text data from the files in that folder, process it, and store it in a pandas DataFrame named 'df'.

In [None]:
# The careers landing page is both the starting point of the crawl and the
# URL prefix every visited page must share.
CAREERS_URL = "https://www.accenture.com/us-en/careers"
crawl_with_root(root_url=CAREERS_URL, limit_domain=CAREERS_URL)

In [None]:
texts = []
# Sort the listing: os.listdir returns entries in arbitrary order, which would
# make the row order of the DataFrame/CSV differ between runs.
for file in sorted(os.listdir("accenture_text")):
    # Only scraped ".txt" files belong here; other entries (for example a
    # checkpoint directory) cannot be read as text and would have their names
    # mangled by the [:-4] slice below.
    if not file.endswith(".txt"):
        continue
    # Decode as UTF-8 and replace undecodable bytes so a file in another
    # encoding cannot abort the whole load.
    with open(os.path.join("accenture_text", file), "r", encoding="utf-8", errors="replace") as f:
        text = f.read()
    # Turn the file name back into a readable title: drop ".txt" and use spaces.
    texts.append((file[:-4].replace('-', ' ').replace('_', ' ').replace('#update', ''), text))
df = pd.DataFrame(texts, columns=['fname', 'text'])

# Prefix each document with its title, then flatten it to a single line.
df['text'] = df.fname + ". " + remove_newlines(df.text)
df.to_csv('accenture_careers_scraper.csv')
df.head()