In [276]:
import re
import urllib.request
from bs4 import BeautifulSoup
from collections import deque
from html.parser import HTMLParser
from urllib.parse import urlparse
import os
import openai
from dotenv import load_dotenv
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import UnstructuredFileLoader, UnstructuredHTMLLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
import shutil


In [128]:

# Load secrets to access API
load_dotenv()

# Read the token once and fail with an explicit message when it is missing:
# assigning None into os.environ would otherwise raise an opaque TypeError.
openai_api_token = os.environ.get('OPENAI_API_TOKEN')
if not openai_api_token:
    raise RuntimeError(
        "OPENAI_API_TOKEN is not set; define it in the environment or in the .env file."
    )

# The openai client reads `api_key`; OPENAI_API_KEY is exported as well for
# libraries that look the key up in the environment.
openai.api_key = openai_api_token
os.environ['OPENAI_API_KEY'] = openai_api_token

In [129]:
# Regex pattern to match an absolute http:// or https:// URL.
# "s?" (zero or one "s") replaces "[s]*", which also accepted "httpss://", "httpsss://", ...
HTTP_URL_PATTERN = r'^https?://.+'

# Define root domain to crawl
domain = "tcw.de"
full_url = "https://tcw.de/"


In [130]:

# Create a class to parse the HTML and get the hyperlinks
class HyperlinkParser(HTMLParser):
    """HTML parser that collects the ``href`` value of every anchor tag it is fed.

    After calling ``feed(html)``, the collected values are available in
    ``self.hyperlinks`` in document order (duplicates are kept).
    """

    def __init__(self):
        super().__init__()
        # Create a list to store the hyperlinks
        self.hyperlinks = []

    # Override the HTMLParser's handle_starttag method to get the hyperlinks
    def handle_starttag(self, tag, attrs):
        """Record the href of an ``<a>`` start tag, if it carries a value."""
        if tag != "a":
            return

        href = dict(attrs).get("href")
        # HTMLParser reports a valueless attribute (``<a href>``) as None;
        # storing that would break callers that treat every entry as a string.
        if href is not None:
            self.hyperlinks.append(href)


In [131]:

# Function to get the hyperlinks from a URL
def get_hyperlinks(url):
    """Fetch ``url`` and return the raw ``href`` values of its anchor tags.

    Returns an empty list when the URL cannot be opened or read (the error is
    printed), or when the response is not served as ``text/html``.
    """
    # Try to open the URL and read the HTML
    try:
        with urllib.request.urlopen(url) as response:
            headers = response.info()

            # If the response is not HTML, return an empty list.
            # A missing Content-Type header is treated as "not HTML".
            content_type = headers.get('Content-Type') or ""
            if not content_type.lower().startswith("text/html"):
                return []

            raw = response.read()
            # Prefer the charset declared in the Content-Type header
            charset = headers.get_content_charset() or "utf-8"
    except Exception as e:
        # Best effort: an unreachable page must not stop the caller
        print(e)
        return []

    # Decode leniently: a stray undecodable byte should not discard every
    # link on the page, and an unknown charset name falls back to UTF-8.
    try:
        html = raw.decode(charset, errors="replace")
    except LookupError:
        html = raw.decode("utf-8", errors="replace")

    # Create the HTML Parser and then Parse the HTML to get hyperlinks
    parser = HyperlinkParser()
    parser.feed(html)

    return parser.hyperlinks


In [132]:


# Function to get the hyperlinks from a URL that are within the same domain
def get_domain_hyperlinks(local_domain, url):
    """Return the de-duplicated links found on ``url`` that point into ``local_domain``.

    Absolute http(s) links and protocol-relative (``//host/...``) links are kept
    only when their host equals ``local_domain``. Other links without a scheme are
    treated as paths below ``https://<local_domain>/``. Pure fragments and links
    with any other scheme (``mailto:``, ``tel:``, ``javascript:``, ...) are skipped.
    Fragments and one trailing slash are removed so that variants of the same
    page collapse into a single entry.
    """
    clean_links = set()
    for link in set(get_hyperlinks(url)):
        # An href can be missing its value (None) or be blank
        if not link or not link.strip():
            continue
        link = link.strip()
        clean_link = None

        # If the link is a URL, check if it is within the same domain
        if re.search(HTTP_URL_PATTERN, link):
            if urlparse(link).netloc == local_domain:
                clean_link = link

        # Protocol-relative link: it names a host, so it is not a local path
        elif link.startswith("//"):
            if urlparse(link).netloc == local_domain:
                clean_link = "https:" + link

        # In-page anchors and non-http schemes are not crawlable pages
        elif link.startswith("#") or urlparse(link).scheme:
            continue

        # Otherwise treat the link as a path below the domain root
        else:
            if link.startswith("/"):
                link = link[1:]
            clean_link = "https://" + local_domain + "/" + link

        if clean_link is None:
            continue

        # "page" and "page#section" are the same document
        clean_link = clean_link.split("#", 1)[0]
        if clean_link.endswith("/"):
            clean_link = clean_link[:-1]
        clean_links.add(clean_link)

    # Return the list of hyperlinks that are within the same domain
    return list(clean_links)


In [133]:

def is_blacklisted(url):
    """Tell whether ``url`` belongs to a site section excluded from crawling.

    The check is a plain substring match: the function returns True as soon as
    any excluded entry occurs anywhere inside ``url``, and False otherwise.
    """
    excluded_sections = (
        "https://tcw.de/uploads",
        "https://tcw.de/referenzen",
        "https://tcw.de/fachliteratur",
        "https://tcw.de/publikationen",
        "https://tcw.de/news",
        "https://tcw.de/referrals",
        "https://tcw.de/impressum",
    )
    return any(section in url for section in excluded_sections)


In [191]:
from urllib.request import Request, urlopen
def crawl(url):
    """Crawl every reachable, non-blacklisted page of the domain of ``url``.

    Each visited page is downloaded to
    ``scraper/html/<domain>/<url without scheme, "/" replaced by "_">.html``.
    Pages that cannot be downloaded are reported and skipped, so a single
    failing URL does not abort the crawl.
    """
    # Parse the URL and get the domain
    local_domain = urlparse(url).netloc

    # Pending URLs. pop() below takes the most recently added URL first (LIFO).
    queue = deque([url])

    # Create a set to store the URLs that have already been seen (no duplicates)
    seen = {url}

    # Create the directory for the raw html files (including missing parents)
    output_dir = "scraper/html/" + local_domain + "/"
    os.makedirs(output_dir, exist_ok=True)

    # While the queue is not empty, continue crawling
    while queue:
        # Get the next URL from the queue
        url = queue.pop()
        print(url)  # for debugging and to see the progress

        # Define destination: drop the scheme whatever its length
        # (a fixed url[8:] is only right for "https://") and flatten the path.
        file_name = url.split("://", 1)[-1].replace("/", "_")

        # Request content and save in distinct .html file
        try:
            urllib.request.urlretrieve(url, output_dir + file_name + ".html")
        except Exception as e:
            # Broken link, HTTP error, unusable file name, ...: report and move on
            print(e)
            continue

        # Get the hyperlinks from the URL and add them to the queue
        for link in get_domain_hyperlinks(local_domain, url):
            if link not in seen and not is_blacklisted(link):
                queue.append(link)
                seen.add(link)


In [241]:
def add_source_url(elements):
    """Attach a ``source_url`` metadata entry to every element and return ``elements``.

    The URL is rebuilt from the second "/"-separated component of
    ``metadata["source"]``: underscores become slashes, a trailing ".html"
    is dropped and "https://" is prepended. Elements are modified in place.
    """
    for item in elements:
        stored_name = item.metadata["source"].split("/")[1]
        url_path = stored_name.replace("_", "/").removesuffix(".html")
        item.metadata["source_url"] = "https://" + url_path
    return elements

In [285]:
# Strip leftover HTML tags from each element's page_content (the elements are modified in place)
def remove_html_tags(elements):
    """Delete tag-like ``<...>`` spans from each element's ``page_content``.

    Elements are updated in place; the same ``elements`` object is returned.
    """
    tag_pattern = re.compile('<[^<]+?>')
    for item in elements:
        item.page_content = tag_pattern.sub('', item.page_content)
    return elements

In [359]:
#from unstructured.partition.text_type import sentence_count
def retrieve_relevant_content():
    """Extract the relevant text elements from the crawled HTML files.

    For every file in ``scraper/html/<domain>/`` the ``content_frame_out`` div
    is written to a temporary ``tmp/`` folder, loaded element-wise with
    ``UnstructuredFileLoader``, annotated with its source URL and stripped of
    leftover tags. Only elements whose category is "NarrativeText" or
    "ListItem" are kept.

    Returns:
        A list with one entry per page; each entry is the non-empty list of
        kept elements of that page. Pages without any kept element are omitted.
    """
    relevant_categories = ("NarrativeText", "ListItem")
    html_dir = "scraper/html/" + domain + "/"
    tmp_dir = "tmp/"
    relevant_content = []

    # Temporary folder for the extracted content; removed again in `finally`
    os.makedirs(tmp_dir, exist_ok=True)
    try:
        for file in os.listdir(html_dir):
            # errors="replace": a page that is not valid UTF-8 must not abort the run
            with open(html_dir + file, "r", encoding="UTF-8", errors="replace") as source:
                soup = BeautifulSoup(source, "html.parser")

            content = soup.find("div", class_="content_frame_out")
            # Pages without the content container have nothing to extract
            # (str(None) would otherwise be stored as the text "None").
            if content is None:
                continue

            with open(tmp_dir + file, "w", encoding="UTF-8") as target:
                target.write(str(content))

        # Load every extracted file as a list of elements
        for file in os.listdir(tmp_dir):
            loader = UnstructuredFileLoader(tmp_dir + file, strategy="hi_res", mode="elements")
            document = loader.load()
            document = add_source_url(document)
            # not all tags are removed by the unstructured library, so we need to remove them manually
            document = remove_html_tags(document)

            # Build a new list instead of removing while iterating, which skips
            # the element following each removed one.
            document = [
                element for element in document
                if element.metadata.get("category") in relevant_categories
            ]
            if document:
                relevant_content.append(document)
    finally:
        # delete the tmp folder, also when loading failed half-way
        shutil.rmtree(tmp_dir, ignore_errors=True)

    return relevant_content


In [385]:
def create_vector_db(content_documents):
    """Create the persistent Chroma vector database from the relevant content.

    Args:
        content_documents: iterable of document lists (one list per page), as
            returned by ``retrieve_relevant_content``.

    Returns:
        The created ``Chroma`` store (collection "tcw_chroma_collection",
        stored under "chroma_db"), so callers can query it directly.
    """
    # Define TextSplitter
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)

    # Split every page and flatten the chunks into a single list
    texts_only = [
        chunk
        for doc in content_documents
        for chunk in text_splitter.split_documents(doc)
    ]

    # Define Embedding
    embeddings = OpenAIEmbeddings()
    db = Chroma.from_documents(documents=texts_only,
                               embedding=embeddings,
                               collection_name="tcw_chroma_collection",
                               persist_directory="chroma_db")

    # Flush to disk explicitly instead of relying on the object being
    # destroyed, which may not happen in a long-lived notebook kernel.
    # Guarded because not every wrapper version exposes persist().
    persist = getattr(db, "persist", None)
    if callable(persist):
        persist()

    return db


In [None]:

def webscraper():
    """Run the whole pipeline: crawl the site, extract the content, build the vector store.

    Progress is reported on stdout after each of the three stages.
    """
    crawl(full_url)
    print("Crawling successful!")

    documents = retrieve_relevant_content()
    print("Relevent content retrieved!")

    create_vector_db(documents)
    print("ChromaDB successfully created!")



## ChromaDB Testing

In [404]:
from chromadb.config import Settings
import chromadb

# Open the store persisted under "chroma_db" and list the collections it contains
chroma_settings = Settings(
    chroma_db_impl="duckdb+parquet",
    # Optional, defaults to .chromadb/ in the current directory
    persist_directory="chroma_db",
)
client = chromadb.Client(chroma_settings)
client.list_collections()

Using embedded DuckDB with persistence: data will be stored in: chroma_db
No embedding_function provided, using default embedding function: SentenceTransformerEmbeddingFunction


[Collection(name=tcw_chroma_collection)]

In [407]:
# chromadb invokes `embedding_function(texts)` directly, so pass the bound
# `embed_documents` method (list of texts -> list of vectors) rather than the
# OpenAIEmbeddings object itself, which is not callable.
# NOTE(review): applies to the duckdb+parquet era chromadb API used here — confirm on upgrade.
collection = client.get_collection("tcw_chroma_collection",
                                   embedding_function=OpenAIEmbeddings().embed_documents)

In [406]:
# Sanity check: show a small sample of the stored records
# (the output below lists their 'ids' and 'embeddings').
collection.peek()

{'ids': ['6ba40f30-deba-11ed-8f61-22254a378369',
  '6ba41cbe-deba-11ed-8f61-22254a378369',
  '6ba40184-deba-11ed-8f61-22254a378369',
  '6ba4018e-deba-11ed-8f61-22254a378369',
  '6ba3f392-deba-11ed-8f61-22254a378369',
  '6ba3f3a6-deba-11ed-8f61-22254a378369',
  '6ba3f3ba-deba-11ed-8f61-22254a378369',
  '6ba3f3ce-deba-11ed-8f61-22254a378369',
  '6ba3f3d8-deba-11ed-8f61-22254a378369',
  '6ba3f3ec-deba-11ed-8f61-22254a378369'],
 'embeddings': [[-0.0052505399871669795,
   -0.04260141313925152,
   0.007911505944478944,
   -0.01336973039580664,
   -0.020976198401117613,
   0.014524979214925316,
   -0.016588849602015215,
   0.027933651000734857,
   -0.0008688702183835487,
   -0.011403212086740163,
   -0.0012120374924938972,
   0.029153800894761626,
   -0.0174715124164038,
   -0.012019777420641123,
   0.009942926259776073,
   0.005065570480128947,
   0.03133449638318279,
   -0.0012323192530256561,
   0.009384772545218186,
   -0.009845573250430097,
   -0.01680951530561236,
   -0.0050331194770136