
# Career Counseling Chatbot (LLM + RAG)

This notebook demonstrates how to create a retrieval-augmented chatbot using data from berufsberatung.ch, OpenAI embeddings, ChromaDB, and LangChain.


## Libraries and settings

In [None]:
# Ignore warnings
import warnings
warnings.filterwarnings("ignore")

# Libraries
import io
import os
import re
import json
import queue
import PyPDF2
import textwrap
import keyboard
import threading
import pandas as pd
from pathlib import Path
from dotenv import load_dotenv
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA
from langchain.prompts import (
            ChatPromptTemplate,
            SystemMessagePromptTemplate,
            HumanMessagePromptTemplate)

import openai
from pydub import AudioSegment
from pydub.playback import _play_with_simpleaudio

# Load the .env file (presumably provides the OpenAI API key — confirm)
load_dotenv()

# Project root directory.
# It can be overridden through the CAREER_CHATBOT_PATH environment variable
# (which may also be set in the .env file loaded above); when the variable is
# not set, the original SageMaker location is used, so existing setups keep
# working unchanged.
path = os.environ.get('CAREER_CHATBOT_PATH',
                      '/home/ec2-user/SageMaker/career_counseling_chatbot')


# Show working directory
print("Current working directory:", os.getcwd())


## Load occupation data

In [None]:
# Load occupation data
with open(path + '/data/processed/berufsberatung_occupations_de.json',
          'r',
          encoding='utf-8') as f:
    data = json.load(f)

# Prepare documents for embedding.
# documents[k] and metadatas[k] always describe the same occupation: both are
# appended together, and only after everything for an item was built.
documents = []
metadatas = []

# Track any issues for reporting
issues = []

for i, item in enumerate(data):
    try:
        # Keep only well-formed (dict) entries of the 'Related' list. A missing,
        # null or non-list 'Related' value counts as "no related occupations".
        related_raw = item.get('Related')
        related_entries = [
            related
            for related in (related_raw if isinstance(related_raw, list) else [])
            if isinstance(related, dict)
        ]

        # Names of the related occupations (entries without a name are skipped)
        related_occupations = [
            related['name'] for related in related_entries if 'name' in related
        ]

        # Format related occupations section
        if related_occupations:
            related_section = "Verwandte Berufe: " + ", ".join(related_occupations)
        else:
            related_section = "Verwandte Berufe: Keine verwandten Berufe angegeben"

        # Safely extract fields with defaults for missing values
        # Create a comprehensive document from all relevant fields
        content = f"""
        Name: {item.get('Name', 'NA')}
        Beschreibung: {item.get('Description', 'NA')}
        Bildungstypen: {item.get('Bildungstypen', 'NA')}
        Berufsfelder: {item.get('Berufsfelder', 'NA')}
        Tätigkeiten: {item.get('Tätigkeiten', 'NA')}
        Ausbildung: {item.get('Ausbildung', 'NA')}
        Voraussetzungen: {item.get('Voraussetzungen', 'NA')}
        Weiterbildung: {item.get('Weiterbildung', 'NA')}
        Berufsverhältnisse: {item.get('Berufsverhältnisse', 'NA')}
        {related_section}
        """

        # Create metadata with safe extraction
        metadata = {
            'id': item.get('ID', f"unknown_{i}"),
            'name': item.get('Name', 'Unknown Occupation'),
            'url': item.get('URL', 'NA'),
            'related_ids': [related.get('id', 'NA') for related in related_entries]
        }

        # Append document and metadata as a pair, as the very last step, so an
        # error anywhere above can never leave the two lists misaligned
        documents.append(content)
        metadatas.append(metadata)

    except Exception as e:
        error_msg = f"Error processing item {i}: {str(e)}"
        print(f"WARNING: {error_msg}")
        issues.append(error_msg)
        # Continue with next item rather than failing the entire process

print(f"Loaded {len(documents)} occupation documents.")
if issues:
    print(f"Encountered {len(issues)} issues during processing.")

# Example: Display the first document
# if documents:
#    print("\nExample of prepared document:")
#    print(documents[0][:500] + "...")

# Store documents and metadata for later use
occupation_data = {
    'documents': documents,
    'metadatas': metadatas,
    'processing_issues': issues
}

# Save the prepared data
with open(path + '/data/processed/occupation_data_prepared.json', 'w',
          encoding='utf-8') as f:
    json.dump(occupation_data, f, ensure_ascii=False, indent=2)

print("Data preparation complete and saved to occupation_data_prepared.json")

## Create embeddings

In [None]:
# Directory that holds the persisted Chroma vectorstore
chroma_dir = path + "/data/processed/chroma_db"


def _to_chroma_value(value):
    """Convert one metadata value into a scalar the vectorstore can store.

    Chroma metadata values must be str, int, float or bool. Lists/tuples
    (e.g. 'related_ids') are flattened into a comma-separated string, None
    becomes 'NA', and anything else is converted with str().
    """
    if isinstance(value, (str, int, float, bool)):
        return value
    if isinstance(value, (list, tuple)):
        return ", ".join(str(entry) for entry in value)
    return 'NA' if value is None else str(value)


# Check whether embeddings are already created (directory exists and is not
# empty); creating them again would repeat all embedding requests
if os.path.exists(chroma_dir) and os.listdir(chroma_dir):
    print("Embeddings already exist. Skipping embedding creation.")

else:

    try:

        # Create embeddings
        embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

        # Attach the prepared metadata (id, name, url, related_ids) to every
        # document so that retrieved source documents can be identified.
        # The metadata is only used when it lines up one-to-one with the
        # documents; otherwise the texts are stored without metadata.
        if len(metadatas) == len(documents):
            chroma_metadatas = [
                {key: _to_chroma_value(value) for key, value in metadata.items()}
                for metadata in metadatas
            ]
        else:
            chroma_metadatas = None
            print("WARNING: documents and metadatas differ in length; "
                  "storing embeddings without metadata.")

        # Store embeddings in Chroma vectorstore
        vectorstore = Chroma.from_texts(documents,
                                        embeddings,
                                        metadatas=chroma_metadatas,
                                        persist_directory=chroma_dir)

        # Persist vectorstore
        vectorstore.persist()

        # Print success message
        print("Embeddings created and stored successfully.")

    except Exception as e:
        # Best effort: report the problem instead of aborting the notebook.
        # Note that later cells then open an empty vectorstore.
        print(f"Error creating and storing embeddings: {str(e)}")

## Load embeddings from vectorstore

In [None]:
# Embedding function, using the same model name as for creating the embeddings
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

# Re-open the vectorstore from the directory it was persisted to
vectorstore = Chroma(
    embedding_function=embeddings,
    persist_directory=path + "/data/processed/chroma_db",
)


## Initialize LLM

In [None]:
# Chat model that generates the answers (temperature 0)
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)

# System message template: instructions for the AI; the documents found by the
# retriever are inserted at {context}
system_template = """
Du bist ein hilfreicher Berufsberater der Nutzern bei der Karriereplanung hilft.
Antworte präzise, klar und freundlich basierend auf folgenden Kontext:

{context}

Wenn in den Kontext-Informationen keine passende Antwort gefunden wird,
antworte ehrlich, dass du die Information nicht hast.
"""

# Human message template: the user's question, passed through unchanged
human_template = "{question}"

# Build the two message prompts and join them into one chat prompt
system_prompt = SystemMessagePromptTemplate.from_template(system_template)
human_prompt = HumanMessagePromptTemplate.from_template(human_template)
chat_prompt = ChatPromptTemplate.from_messages([system_prompt, human_prompt])

# Retriever over the vectorstore, configured with k=3
retriever = vectorstore.as_retriever(search_kwargs={"k": 3})

# RetrievalQA chain ("stuff" type) using the custom prompt; it also returns
# the source documents that were retrieved
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    chain_type_kwargs={"prompt": chat_prompt},
    return_source_documents=True,
)

print("Chatbot is ready for use.")


## Functions to query the chatbot and get response

In [None]:
# Function to format the chatbot response
def format_text(text: str, width: int = 80) -> str:
    """Wrap every line of ``text`` to at most ``width`` characters.

    Existing line breaks are kept. A line that is too long is broken at the
    last space found within its first ``width`` characters; if there is no
    such space, it is cut hard after ``width`` characters. Whitespace at the
    start of each continuation is removed.

    Args:
        text: The text to wrap.
        width: Maximum line length; must be at least 1.

    Returns:
        The wrapped text, with lines joined by a newline character.

    Raises:
        ValueError: If ``width`` is smaller than 1. Without this check the
            wrapping loop below could never terminate for such a width.
    """
    if width < 1:
        raise ValueError("width must be a positive integer")

    lines = []
    for line in text.split('\n'):
        while len(line) > width:
            # Prefer breaking at the last space inside the allowed width
            split_index = line.rfind(' ', 0, width)
            if split_index == -1:
                # No space available: cut the word at the width limit
                split_index = width
            lines.append(line[:split_index])
            # Drop the whitespace the break happened at
            line = line[split_index:].lstrip()
        lines.append(line)

    return '\n'.join(lines)

# Function to query the chatbot
def ask_chatbot(question, width=80):
    """Send ``question`` through the QA chain and return the answer.

    Returns a tuple ``(answer, source_documents)``: the chain's 'result'
    text wrapped to ``width`` characters with ``format_text``, and the chain's
    'source_documents' entry as it was returned.
    """
    chain_output = qa_chain(question)

    # Pull both parts out of the chain output
    raw_answer, sources = chain_output['result'], chain_output['source_documents']

    return format_text(raw_answer, width), sources


# Function to warm up the LLM and reduce latency
def warm_up_chatbot():
    """Send one minimal request through the QA chain to warm up the LLM.

    Errors are reported on stdout instead of being raised, so a failed
    warm-up does not abort the notebook run.
    """
    try:
        # Minimal request to warm up the LLM; the answer itself is not needed
        qa_chain("Wie geht es dir?")
        print("LLM warmed up successfully.")
    except Exception as e:
        print(f"Error warming up LLM: {str(e)}")


# Run the warm-up once through the helper above, so that its error handling
# is actually used and IPython's `_` (last output) is not overwritten
warm_up_chatbot()


## Function for audio response

In [None]:
from IPython.display import Audio, display, clear_output
from openai import OpenAI
import ipywidgets as widgets

def openai_text_to_audio_with_widget(text, voice='onyx', speed=1.0,
                                     model="tts-1",
                                     audio_file="../../data/processed/temp_speech.mp3"):
    """Convert ``text`` to speech with the OpenAI API and return an audio player.

    The generated audio is written to ``audio_file`` (an existing file is
    overwritten) and wrapped in an ``IPython.display.Audio`` object with
    autoplay enabled.

    Args:
        text: The text to be spoken.
        voice: Voice name passed to the text-to-speech request.
        speed: Playback speed passed to the text-to-speech request.
        model: Text-to-speech model name.
        audio_file: Path of the audio file to write. A relative path is
            resolved against the current working directory.

    Returns:
        The ``Audio`` object for the written file.
    """
    client = OpenAI()

    # Create TTS request
    response = client.audio.speech.create(
        model=model,
        voice=voice,
        input=text,
        speed=speed
    )

    # Write the audio to the file chunk by chunk
    with open(audio_file, "wb") as f:
        for chunk in response.iter_bytes():
            f.write(chunk)

    # Audio player for the saved file; starts playing when displayed
    return Audio(audio_file, autoplay=True)


## Ask your Chatbot for Career Advice ...

### Occupation-based ...

In [None]:
%%time

# Remove the previous output of this cell
clear_output(wait=True)

# Question about one specific occupation
occupation_question = """Was macht ein Architekt?
                                       Welche Fähigkeiten sind wichtig?
                                       Welche Karrierepfade gibt es?
                                       Welche verwandten Berufe gibt es?"""

# Ask the chatbot
response, source_docs = ask_chatbot(occupation_question)

# Print the answer
print(f"Chatbot: {response}", "\n")

# Play response as audio
# openai_text_to_audio_with_widget(response, voice='onyx', speed=1.2)

# Print source documents in a readable format
# for i, doc in enumerate(source_docs):
#     print(f"Document {i+1}:")
#     print(f"Source: {doc.metadata.get('name', 'Unknown')}")
#     print(f"Content (excerpt): {doc.page_content[:200]}...")
#     print("-" * 50)

### Skill-based ...

In [None]:
%%time

# Remove the previous output of this cell
clear_output(wait=True)

# Question listing the user's skills and interests
skill_question = """Welche Berufe gibt es für meine Fähigkeiten?:
                                       - bin gerne draussen
                                       - kann gut mit Tieren umgehen
                                       - kann gut mit Maschinen umgehen, z.B. Traktor fahren
                                       - habe erste Erfahrungen mit Gemüseanbau
                                       - habe Erfahrungen mit Holzarbeiten"""

# Ask the chatbot
response, source_docs = ask_chatbot(skill_question)

# Print the answer
print(f"Chatbot: {response}", "\n")

# Play response as audio
# openai_text_to_audio_with_widget(response, voice='onyx', speed=1.2)

# Print source documents in a readable format
# for i, doc in enumerate(source_docs):
#     print(f"Document {i+1}:")
#     print(f"Source: {doc.metadata.get('name', 'Unknown')}")
#     print(f"Content (excerpt): {doc.page_content[:200]}...")
#     print("-" * 50)


### CV-based ...

In [None]:
%%time

# Clear output
clear_output(wait=True)

# Path of the CV PDF file
pdf_file_path = path + "/data/raw/cv_hanna_krause.pdf"

# Start with an empty CV text, so that a failed read can neither leave
# `cv_text` undefined (fresh kernel) nor keep a value from an earlier run
cv_text = ""

# Read PDF file and extract text
try:
    # The with-statement closes the file; no explicit close() is needed
    with open(pdf_file_path, "rb") as file:
        reader = PyPDF2.PdfReader(file)
        # A page without extractable text contributes an empty string
        cv_text = "".join(page.extract_text() or "" for page in reader.pages)

    # Format text
    cv_text = textwrap.fill(cv_text.strip(), width=80)
    print("CV Text:")
    print(cv_text)

except FileNotFoundError:
    print(f"File not found: {pdf_file_path}")
except Exception as e:
    print(f"An error occurred while reading the PDF: {e}")

# Your question (contains the extracted CV text)
cv_question = f"""Welche Vorschläge für meine weitere Kariere 
                                        auf der Basis der Infos aus meinem CV hast Du? 
                                        Du findest meinen CV hier: {cv_text}"""

if cv_text:
    # Ask the chatbot only when CV text is actually available
    response, source_docs = ask_chatbot(cv_question)

    # Show response
    print("\n")
    print(f"Chatbot: {response}", "\n")
else:
    print("No CV text available - skipping the chatbot request.")

# Play response as audio
# openai_text_to_audio_with_widget(response, voice='onyx', speed=1.2)

# Print source documents in a readable format
# for i, doc in enumerate(source_docs):
#     print(f"Document {i+1}:")
#     print(f"Source: {doc.metadata.get('name', 'Unknown')}")
#     print(f"Content (excerpt): {doc.page_content[:200]}...")
#     print("-" * 50)

# Print prompt used
# print("\nPrompt used:")
# print(qa_chain.prompt.messages[0].content)
