In [1]:
import re
import urllib.request
from bs4 import BeautifulSoup
from collections import deque
from html.parser import HTMLParser
from urllib.parse import urlparse
import os
from dotenv import load_dotenv
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import UnstructuredFileLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
import shutil
import requests
from langchain.vectorstores import Pinecone

import pinecone

  from tqdm.autonotebook import tqdm


In [2]:
# Load secrets to access the APIs: load_dotenv() makes the values of a .env file
# available via os.environ (PINECONE_API_KEY is read from there further down).
load_dotenv()
# Disabled: manual mapping of OPENAI_API_TOKEN onto the OpenAI client settings.
#os.environ['OPENAI_API_KEY'] = os.environ.get('OPENAI_API_TOKEN')
#openai.api_key = os.environ.get('OPENAI_API_TOKEN')

# Constants
# NOTE(review): EMBEDDING_MODEL is not referenced by any cell visible in this notebook;
# OpenAIEmbeddings() is always constructed without arguments -- confirm whether it should be passed.
EMBEDDING_MODEL = "text-embedding-ada-002"

In [3]:
# Regex pattern used to decide whether a link is an absolute HTTP(S) URL.
# NOTE(review): `[s]*` accepts zero or more "s" characters, so "httpss://..." also
# matches; `https?` would be the stricter form.
HTTP_URL_PATTERN = r'^http[s]*://.+'

# Root domain to crawl. `domain` is also used by retrieve_relevant_content() to locate
# the stored HTML files; `full_url` is the start URL for crawl() (that call is currently
# commented out in webscraper()).
domain = "tcw.de"
full_url = "https://tcw.de/"


In [4]:

# Create a class to parse the HTML and get the hyperlinks
class HyperlinkParser(HTMLParser):
    """Collect the ``href`` value of every anchor tag fed to the parser.

    After calling ``feed(html)``, ``hyperlinks`` holds the raw ``href`` strings
    in document order (duplicates included, no normalisation applied).
    """

    def __init__(self):
        super().__init__()
        # Raw href values of all <a> tags seen so far
        self.hyperlinks = []

    def handle_starttag(self, tag, attrs):
        """Record the href of an anchor tag; every other tag is ignored.

        Args:
            tag: Lower-cased tag name supplied by HTMLParser.
            attrs: List of ``(name, value)`` pairs for the tag's attributes.
        """
        if tag != "a":
            return

        href = dict(attrs).get("href")
        # HTMLParser reports a value-less attribute (``<a href>``) as None.
        # Storing None would break callers that treat every entry as a string.
        if href is not None:
            self.hyperlinks.append(href)


In [5]:

# Function to get the hyperlinks from a URL
def get_hyperlinks(url):
    """Download ``url`` and return the raw href values of all anchor tags.

    Args:
        url: Address of the page to fetch.

    Returns:
        A list of href strings as found in the page. An empty list is returned
        when the response is not ``text/html`` or when fetching/decoding fails
        (the error is printed, not raised).
    """
    try:
        with urllib.request.urlopen(url) as response:
            headers = response.info()

            # Only HTML documents can contain hyperlinks worth following
            if headers.get_content_type() != "text/html":
                return []

            # Honour the charset declared by the server and fall back to UTF-8.
            # Undecodable bytes are replaced so one bad character does not
            # discard every link on the page.
            charset = headers.get_content_charset() or "utf-8"
            html = response.read().decode(charset, errors="replace")
    except Exception as e:
        print(e)
        return []

    # Parse the HTML and collect the hyperlinks
    parser = HyperlinkParser()
    parser.feed(html)

    return parser.hyperlinks


In [6]:
# Function to get the hyperlinks from a URL that are within the same domain
def get_domain_hyperlinks(local_domain, url):
    """Return the links found on ``url`` that point into ``local_domain``.

    Absolute links are kept only if their host equals ``local_domain``.
    Relative links are attached to ``https://<local_domain>/``. Fragment,
    ``mailto:``, ``tel:`` and ``javascript:`` links are dropped. A single
    trailing slash is removed from every result.

    Args:
        local_domain: Host name the crawl is restricted to (e.g. "tcw.de").
        url: Page whose hyperlinks are inspected.

    Returns:
        A list of unique absolute URLs (order is not defined).
    """
    # Pseudo-links that never lead to a crawlable page
    skipped_prefixes = ("#", "mailto:", "tel:", "javascript:")

    clean_links = set()
    for link in set(get_hyperlinks(url)):
        # A value-less href attribute is reported as None by the HTML parser
        if not isinstance(link, str):
            continue

        # Protocol-relative links ("//host/path") name their own host; give them
        # a scheme so they go through the same-domain check below instead of
        # being glued onto the local domain.
        if link.startswith("//"):
            link = "https:" + link

        if re.search(HTTP_URL_PATTERN, link):
            # Absolute URL: keep it only if it stays on the crawled domain
            if urlparse(link).netloc != local_domain:
                continue
            clean_link = link
        else:
            if link.startswith(skipped_prefixes):
                continue
            # Relative link: drop one leading slash and attach it to the domain root
            clean_link = "https://" + local_domain + "/" + link.removeprefix("/")

        if clean_link.endswith("/"):
            clean_link = clean_link[:-1]
        clean_links.add(clean_link)

    # Return the list of hyperlinks that are within the same domain
    return list(clean_links)


In [7]:
# For testing and to avoid crawling all pages
def is_blacklisted(url):
    """Tell whether ``url`` contains one of the excluded site sections.

    Args:
        url: URL to check.

    Returns:
        True if any excluded entry occurs as a substring of ``url``, else False.
    """
    excluded_sections = (
        "https://tcw.de/uploads",
        "https://tcw.de/fachliteratur",
        "https://tcw.de/publikationen",
        "https://tcw.de/impressum",
        "https://tcw.de/news",
    )
    return any(section in url for section in excluded_sections)


In [8]:
def crawl(url, timeout=30):
    """Crawl all non-blacklisted pages of the domain of ``url`` and store their HTML.

    Every HTML page is written to ``scraper/data/<domain>/<name>.html``, where
    ``<name>`` is the URL without its first 8 characters (the ``https://``
    prefix) and with "/" replaced by "_".

    Args:
        url: Start URL; its host defines the domain the crawl is restricted to.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        None. Pages that cannot be fetched or written are reported via print
        and skipped.
    """
    # Parse the URL and get the domain
    local_domain = urlparse(url).netloc

    # URLs still to visit, and every URL ever queued (prevents duplicates)
    queue = deque([url])
    seen = {url}

    # Create the directory for the raw HTML files (parents included)
    os.makedirs("scraper/data/" + local_domain + "/", exist_ok=True)

    while queue:
        # pop() takes from the right end, so the most recently found URL is visited next
        url = queue.pop()
        print(f"{url} ({len(queue)})") # for debugging and to see the progress

        # Define destination
        file_name = local_domain+'/'+url[8:].replace("/", "_")

        try:
            resp = requests.get(url, timeout=timeout)
        except requests.RequestException as e:
            # One unreachable page must not abort the whole crawl
            print(e)
            continue

        # A response without a Content-Type header is treated as non-HTML
        content_type = resp.headers.get('Content-Type') or ''
        if content_type.startswith('text/html'):
            try:
                # Write UTF-8 explicitly: the files are read back as UTF-8 later on
                with open('scraper/data/' + file_name + '.html', 'w', encoding='utf-8') as f:
                    f.write(resp.text)
            except Exception as e:
                print(e)
                continue

        # Get the hyperlinks from the URL and add them to the queue
        for link in get_domain_hyperlinks(local_domain, url):
            if link not in seen and not is_blacklisted(link):
                queue.append(link)
                seen.add(link)


In [9]:
# Rewrites the "source" metadata of each element from its temp-file path to the page URL
def add_source_url(elements):
    """Replace ``metadata["source"]`` of every element with the original page URL.

    The URL is rebuilt from the second "/"-separated segment of the stored
    path: underscores become slashes, a trailing ".html" is dropped and
    "https://" is prepended.

    Args:
        elements: Iterable of objects exposing a ``metadata`` dict with a "source" entry.

    Returns:
        The same ``elements`` object, modified in place.
    """
    for item in elements:
        stored_name = item.metadata["source"].split("/")[1]
        url_path = stored_name.replace("_", "/").removesuffix(".html")
        item.metadata["source"] = "https://" + url_path
    return elements

In [10]:
# Strips leftover HTML tags and comment fragments from the page_content of each element
def remove_html_tags(elements):
    """Remove remaining HTML markup from ``page_content`` of every element.

    Complete tags are replaced by a space. Complete comments, an opening
    ``<!--`` with everything after it on the line, and everything up to a
    closing ``-->`` on a line are removed. The result is stripped.

    Args:
        elements: Iterable of objects with a string ``page_content`` attribute.

    Returns:
        The same ``elements`` object, modified in place.
    """
    for element in elements:
        text = element.page_content
        # complete tags such as <p> or </div>
        text = re.sub(r'<[^<]+?>', ' ', text)
        # complete comments on one line
        text = re.sub(r'<!--.*?-->', '', text)
        # beginning of an HTML comment whose end is missing
        text = re.sub(r'<!--.*', '', text)
        # end of an HTML comment whose beginning is missing; "-->" is required
        # here so that a plain ">" in the text does not truncate the line
        text = re.sub(r'.*-->', '', text)
        element.page_content = text.strip()
    return elements

In [11]:
def remove_duplicates(elements):
    """Drop elements whose ``page_content`` was already seen earlier in the sequence.

    Args:
        elements: Iterable of objects with a hashable ``page_content`` attribute.

    Returns:
        A new list holding the first element for each distinct content, in
        original order.
    """
    # dicts keep insertion order and setdefault never overwrites,
    # so the first element per content wins
    first_by_content = {}
    for item in elements:
        first_by_content.setdefault(item.page_content, item)
    return list(first_by_content.values())

In [12]:
# Open the stored HTML files, cut out the main content container with BS4 and let
# unstructured split that fragment into elements (title, text, list item, ...).
def retrieve_relevant_content():
    """Extract the text elements of all crawled pages of ``domain``.

    For every file in ``scraper/data/<domain>/`` the ``div`` with class
    ``content_frame_out`` is written to ``tmp/<file>``. Each temp file is then
    loaded with ``UnstructuredFileLoader`` in element mode, its source metadata
    is rewritten to the page URL and leftover markup is stripped.

    Returns:
        A list of elements whose category is "NarrativeText" or "ListItem",
        with duplicates (same ``page_content``) removed across all pages.
    """
    wanted_categories = ("NarrativeText", "ListItem")
    source_dir = "scraper/data/" + domain + "/"
    seen = set()
    relevant_content = []

    # tmp/ holds the extracted fragments; it is removed again even if loading fails
    os.makedirs("tmp/", exist_ok=True)
    try:
        for file in os.listdir(source_dir):
            with open(source_dir + file, "r", encoding="UTF-8") as html_file:
                soup = BeautifulSoup(html_file, "html.parser")

            content_div = soup.find("div", class_="content_frame_out")
            if content_div is None:
                # Without this guard str(None) would be stored as the literal text "None"
                continue

            with open("tmp/" + file, "w", encoding="UTF-8") as tmp_file:
                tmp_file.write(str(content_div))

        for file in os.listdir("tmp/"):
            loader = UnstructuredFileLoader("tmp/" + file, strategy="hi_res", mode="elements")
            document = loader.load()
            document = add_source_url(document)
            # not all tags are removed by the unstructured library, so we need to remove them manually
            document = remove_html_tags(document)

            # keep only running text and list items, and each text only once
            for element in document:
                if element.page_content not in seen and element.metadata["category"] in wanted_categories:
                    seen.add(element.page_content)
                    relevant_content.append(element)
    finally:
        # delete the tmp folder
        shutil.rmtree("tmp/")
    return relevant_content


In [13]:
# Extract the de-duplicated NarrativeText/ListItem elements from the stored HTML files.
final_content = retrieve_relevant_content()

In [66]:
# Flatten each document element into a dict: {page_content, source_url, filetype, category}
def flatten_document(document):
    """Convert elements into plain dicts with their text and selected metadata.

    Args:
        document: Iterable of objects with ``page_content`` and a ``metadata``
            dict containing "source", "filetype" and "category".

    Returns:
        A list with one dict per element, in input order.
    """
    return [
        {
            "page_content": item.page_content,
            "source_url": item.metadata["source"],
            "filetype": item.metadata["filetype"],
            "category": item.metadata["category"],
        }
        for item in document
    ]

# Plain-dict records (text plus source_url, filetype, category) for every element in final_content.
data = flatten_document(final_content)

In [55]:
# Initialise the Pinecone client; the API key is read from the PINECONE_API_KEY environment variable.
pinecone.init(environment="us-west1-gcp-free", api_key=os.environ.get("PINECONE_API_KEY"))

In [56]:
# Open the index and show its statistics (the last expression is the cell output).
index = pinecone.Index("tcw-website-embeddings")
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

In [43]:
from tqdm.auto import tqdm
from uuid import uuid4

In [72]:
def create_vector_db(content_documents):
    """Embed ``content_documents`` and upsert them into the Pinecone index.

    Each element's text is split into chunks of at most 1000 characters. Every
    chunk is embedded with ``OpenAIEmbeddings`` and stored under a random UUID
    together with its metadata: chunk number, chunk text, source_url, filetype
    and category. The index "tcw-website-embeddings" is created if it does not
    exist yet.

    Args:
        content_documents: Iterable of elements with ``page_content`` and a
            ``metadata`` dict containing "source", "filetype" and "category".

    Returns:
        None. Prints "Database created" when all chunks have been upserted.
    """
    embeddings = OpenAIEmbeddings()

    # Define TextSplitter
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)

    index_name = "tcw-website-embeddings"
    if index_name not in pinecone.list_indexes():
        pinecone.create_index(
            name=index_name,
            metric='cosine',
            dimension=1536 # 1536 dim of text-embedding-ada-002
        )
    index = pinecone.Index(index_name)

    # Number of chunks collected before one embed + upsert round trip
    batch_limit = 100

    texts = []
    metadatas = []

    def _flush():
        """Embed the pending chunks, upsert them and reset the batch."""
        ids = [str(uuid4()) for _ in texts]
        embeds = embeddings.embed_documents(texts)
        # Materialise the tuples before the batch lists are cleared
        index.upsert(vectors=list(zip(ids, embeds, metadatas)))
        texts.clear()
        metadatas.clear()

    # Work on the documents passed in, not on a module-level variable, so the
    # function uploads exactly what the caller hands over.
    for element in tqdm(content_documents):
        # first get metadata fields for this record
        metadata = {
            'source_url': str(element.metadata["source"]),
            'filetype': element.metadata["filetype"],
            'category': element.metadata["category"]
        }
        # now we create chunks from the record text
        record_texts = text_splitter.split_text(element.page_content)
        # create individual metadata dicts for each chunk
        record_metadatas = [{
            "chunk": j, "text": text, **metadata
        } for j, text in enumerate(record_texts)]
        # append these to current batches
        texts.extend(record_texts)
        metadatas.extend(record_metadatas)
        # if we have reached the batch_limit we can add texts
        if len(texts) >= batch_limit:
            _flush()

    # upload whatever is left over from the last, incomplete batch
    if texts:
        _flush()

    print("Database created")

In [73]:
# Embed the extracted content and upload it to the Pinecone index.
create_vector_db(final_content)

  0%|          | 0/653 [00:00<?, ?it/s]

Database created


In [74]:
from langchain.vectorstores import Pinecone

# Name of the metadata field that holds the chunk text (create_vector_db stores it as "text").
text_field = "text"

# switch back to normal index for langchain
index = pinecone.Index("tcw-website-embeddings")
emeddings = OpenAIEmbeddings()

# Wrap the index in LangChain's Pinecone vector store; queries are embedded via embed_query.
vectorstore = Pinecone(
    index, emeddings.embed_query, text_field
)

In [75]:
# Example query against the Pinecone-backed vector store.
query = "Wer ist Prof. Dr. Wildemann?"

vectorstore.similarity_search(
    query,  # our search query
    k=3  # return 3 most relevant docs
)

[Document(page_content='Prof. Wildemann', metadata={'category': 'ListItem', 'chunk': 0.0, 'filetype': 'text/html', 'source_url': 'https://tcw.de/sitemap'}),
 Document(page_content='In 40 Büchern und über 700 Aufsätzen, die in engem Kontakt mit der Praxis entstanden sind, hat er neue Wege für die wirtschaftliche Gestaltung von Unternehmen mit Zukunft aufgezeigt. Durch die Kombination von Forschungsinstitut und Management Consulting schafft er es immer wieder, die gewonnen Ergebnisse aus Forschung und unternehmerischer Praxis zu integrieren. Für führende Industrieunternehmen ist Professor Wildemann als Berater, Aufsichts- und Beiratsmitglied tätig. Ihm wurden die Staatsmedaille des Freistaates Bayern, das Bundesverdienstkreuz 1. Klasse der Bundesrepublik Deutschland und die Ehrendoktorwürden der Universitäten Klagenfurt, Passau und Cottbus verliehen. Seit 2004 ist er in die Logistik Hall of Fame aufgenommen worden. 2006 erhielt Professor Wildemann vom Bayerischen Ministerpräsidenten den 

In [None]:
# TODO: rename function to main and run as one piece
def webscraper():
     """Run the pipeline: extract the relevant content, then fill the vector index.

     The crawl step is commented out, so the HTML files must already exist
     under scraper/data/ when this function is called.
     """
     #crawl(full_url)
     #print("Crawling successful!")
     relevant_content = retrieve_relevant_content()
     print("Relevant content retrieved!")
     create_vector_db(relevant_content)
     # create_vector_db() writes to a Pinecone index, so report that instead of ChromaDB
     print("Pinecone index successfully created!")

In [None]:
# NOTE(review): `relevant_content` is only assigned inside webscraper() in the visible cells,
# so on a fresh kernel this call raises NameError -- confirm whether `final_content` was meant.
# Each upload uses fresh random ids, so repeating it adds duplicate vectors to the index.
create_vector_db(relevant_content) 


# Testing

In [None]:
# Open a persisted Chroma collection for comparison tests.
# NOTE(review): no visible cell creates "chroma_db_single_mode"; it must already exist on disk,
# and later cells read from "chroma_db" instead -- confirm which directory is intended.
embeddings = OpenAIEmbeddings()
db = Chroma(persist_directory="chroma_db_single_mode", embedding_function=embeddings, collection_name="tcw_chroma_collection")

In [None]:
# Similarity search with scores on the Chroma store (same question as the Pinecone test above).
db.similarity_search_with_score("Wer ist Prof. Dr. Wildemann?")

In [None]:
# Second test question for the Chroma store.
db.similarity_search_with_score("Was ist die 5-S-Methode?")

In [None]:
# Same question as above, using max-marginal-relevance search instead of plain similarity.
db.max_marginal_relevance_search("Wer ist Prof. Dr. Wildemann?")

In [None]:
import pandas as pd

# Load Chroma's persisted embeddings file for inspection.
# NOTE(review): this rebinds the name `data`, which earlier held the list returned by
# flatten_document(); a distinct name would avoid confusing the two.
data = pd.read_parquet('chroma_db/chroma-embeddings.parquet')

In [None]:
# Load the SQL cell magic, set its display options, then connect to an in-memory DuckDB database.
%load_ext sql

%config SqlMagic.autopandas = True
%config SqlMagic.feedback = False
%config SqlMagic.displaycon = False
%sql duckdb:///:memory:


In [None]:
%%sql
describe table 'chroma_db/chroma-embeddings.parquet';

In [None]:
%%sql
create table embeddings as
select * 
from 'chroma_db/chroma-embeddings.parquet';

In [None]:
%%sql
select document from embeddings where metadata LIKE '%5-s-konzept-als%' LIMIT 100;


## ChromaDB Testing

In [None]:
from chromadb.config import Settings
import chromadb

# Create a Chroma client on the persisted "chroma_db" directory (duckdb+parquet backend).
client = chromadb.Client(Settings(
    chroma_db_impl="duckdb+parquet",
    persist_directory="chroma_db",   
))

In [None]:
# Show which collections exist in the persisted store.
client.list_collections()

In [None]:
# Fetch the collection by name.
# NOTE(review): confirm that chromadb accepts a LangChain OpenAIEmbeddings object as embedding_function.
collection = client.get_collection("tcw_chroma_collection", embedding_function=OpenAIEmbeddings())

In [None]:
# Display a sample of the collection's entries.
collection.peek()

In [None]:
# Fetch all entries whose source_url metadata equals the given news page.
collection.get(
    where={"source_url": "https://tcw.de/news/komplexitaet-in-der-beschaffung-abbauen-180"}
)

In [None]:
{"source": "tmp/tcw.de_news_komplexitaet-in-der-beschaffung-abbauen-180.html", "filename": "tmp/tcw.de_news_komplexitaet-in-der-beschaffung-abbauen-180.html", "category": "NarrativeText", "source_url": "https://tcw.de/news/komplexitaet-in-der-beschaffung-abbauen-180"}