# Lyabs journey


This project aims to provide hands-on practice with Retrieval-Augmented Generation (RAG) using LLMs and LangChain.

It brings together a developer's Curriculum Vitae, personal projects with descriptions, and client projects with relevant details. The data is stored in a JSON file containing paths to source files, which may be in Markdown or PDF format. All data is transformed into a vector database for use with LangChain in RAG workflows.

There are three main outcomes:

- The first is a chat assistant where users can ask questions about the developer or their projects.
- The second is a fun chatbot that playfully mocks the developer's experience and projects, using humor, emojis, and punchlines.
- The third (and perhaps most practical) is a tool to generate professional proposal texts based on client requirements.

##### Read the content of the data

In [None]:
import json
from tqdm import tqdm

# Load the index of sources; each entry is later read through its
# "name", "path", "type" and "metadata" keys.
with open('data/data.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Print every entry name (tqdm only adds a progress bar around the loop)
for item in tqdm(data, desc="Processing items"):
    print(item["name"])

# Report how many entries the index contains
print(f"Total items: {len(data)}")

In [None]:
# Quick sanity checks on the loaded index
from collections import defaultdict

# Entries whose metadata purpose is exactly "hackathon"
hackathon_data = [entry for entry in data if entry["metadata"].get("purpose") == "hackathon"]
print(f"Number of hackathon items: {len(hackathon_data)}")

# Entries that list "AI" among their technologies
ai_technology_data = [
    entry for entry in data
    if "AI" in entry["metadata"].get("technologies", [])
]
print(f"Number of items with AI technology: {len(ai_technology_data)}")

# List the hackathon entries by name
print("Hackathon items:")
for item in hackathon_data:
    print(f"- {item['name']}")

# Bucket the hackathon entries by the "name" stored in their metadata
hackathon_by_name = defaultdict(list)
for item in hackathon_data:
    bucket_key = item["metadata"]["name"]
    hackathon_by_name[bucket_key].append(item)

print("Hackathon items grouped by name:")
for name, items in hackathon_by_name.items():
    print(f"{name}: {len(items)} items")

##### Do necessary imports for RAG

In [None]:
# imports

import os
import glob
from dotenv import load_dotenv
import gradio as gr

In [None]:
%pip install pypdf

In [None]:
# imports for langchain, plotly and Chroma

from langchain.document_loaders import TextLoader, PyPDFLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.schema import Document
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_chroma import Chroma
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
import numpy as np
import plotly.graph_objects as go
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from langchain.prompts import PromptTemplate
from langchain_core.callbacks import StdOutCallbackHandler

In [None]:
# Shared configuration: OpenAI model identifiers, the Chroma persistence
# directory and the developer name interpolated into every prompt.
MODEL = "gpt-4o-mini"
MODEL_4O = "gpt-4o"
MODEL_5 = "gpt-5-2025-08-07"
MODEL_5_MINI = "gpt-5-mini-2025-08-07"
db_name = "my_projects_vector_db"
# Was the mojibake "LoÃ¯c" (UTF-8 bytes decoded as Latin-1); the prompts
# would otherwise show the garbled name to the LLM and to users.
dev_name = "Loïc"

In [None]:
# Load environment variables (load_dotenv reads them from a .env file if present)
load_dotenv()
# May be None when the variable is not set; it is passed to ChatOpenAI later
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

##### Now it's time to load the data

In [None]:
# Load the file referenced by one index item (markdown or pdf) and tag every
# resulting document with that item's metadata.
def load_documents(item) -> "list[Document]":
    """Load the source file described by *item* and attach its metadata.

    Args:
        item: Mapping with a "type" key ("markdown" or "pdf"), a "path" key
            pointing at the file, and an optional "metadata" mapping.

    Returns:
        The documents produced by the loader, each with the item's metadata
        merged into ``doc.metadata``.

    Raises:
        ValueError: If the type is not "markdown"/"pdf" or the path is missing.
    """
    file_type = item.get("type")
    # Read the path from the item itself instead of relying on a global
    # variable that happens to be set by the calling loop.
    source_path = item.get("path")

    if file_type not in ("markdown", "pdf"):
        raise ValueError(f"Unsupported file type {file_type!r} for: {source_path}")
    if not source_path:
        raise ValueError(f"Missing path for item of type {file_type!r}")

    if file_type == "markdown":
        loader = TextLoader(source_path)
    else:
        loader = PyPDFLoader(source_path)
    documents = loader.load()

    # Work on a copy so the caller's item (and the global index) is not mutated
    metadata = dict(item.get("metadata") or {})

    # "technologies" is stored as a list; flatten it to a comma-separated string
    # (presumably because the vector store only accepts scalar metadata — confirm)
    technologies = metadata.get("technologies")
    if isinstance(technologies, list):
        metadata["technologies"] = ", ".join(str(tech) for tech in technologies)

    for doc in documents:
        doc.metadata.update(metadata)

    return documents

# Load every index entry that declares both a path and a type; failures are
# reported and skipped so that one bad file does not abort the whole run.
documents = []
for item in tqdm(data, desc="Loading documents", unit="item"):
    # `path` keeps its name because it is a notebook-level variable that other
    # code may read; the type is stored as `file_type` so that the builtin
    # `type` is not overwritten for the rest of the notebook.
    path = item.get("path")
    file_type = item.get("type")
    if not (path and file_type):
        # .get avoids a KeyError when the entry has no name either
        print(f"No valid path or type for item: {item.get('name', '<unnamed>')}")
        continue
    try:
        documents.extend(load_documents(item))
    except Exception as e:  # best effort: report and continue with the next item
        print(f"Error loading documents from {path}: {e}")

print(f"Total documents loaded: {len(documents)}")

In [None]:
# Split the documents into smaller chunks:
# chunk_size=1000 with chunk_overlap=200, so neighbouring chunks share text
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
# `docs` (the chunks) is what gets embedded and stored in the vector database
docs = text_splitter.split_documents(documents)
print(f"Number of document chunks: {len(docs)}")

##### Let's do the embedding and store the vectors in a ChromaDB database

In [None]:
# Embedding function used both to (re)open the store and to index the chunks
embeddings = OpenAIEmbeddings()

# If the persistence directory already exists, drop its collection so the
# database is rebuilt from scratch. Note that this calls delete_collection();
# the directory itself is not removed by this code.
if os.path.exists(db_name):
    Chroma(persist_directory=db_name, embedding_function=embeddings).delete_collection()
    print(f"Deleted existing database folder: {db_name}")

# Embed the chunks and create the ChromaDB database under `db_name`
vectordb = Chroma.from_documents(documents=docs, embedding=embeddings, persist_directory=db_name)
# _collection is a private attribute of the Chroma wrapper, used here only for the count
print(f"Created ChromaDB with {vectordb._collection.count()} documents.")

##### Visualize the database

In [None]:
# Access the underlying collection (private attribute of the Chroma wrapper)
collection = vectordb._collection

# Pull embeddings, metadata and raw text for every stored chunk
result = collection.get(include=["embeddings", "metadatas", "documents"])
vectors = np.array(result["embeddings"])
metadatas = result["metadatas"]
# NOTE: this rebinds `documents`, which earlier held the loaded Document
# objects, to the stored text strings; the plotting cells slice these strings.
documents = result["documents"]
# One purpose label per chunk, "unknown" when the metadata has none
purpose = [meta.get("purpose", "unknown") for meta in metadatas]
# Plot colour per purpose value
colors = {'hackathon': 'red', 'personal': 'blue', 'community': 'green', 'unknown': 'gray', 'client': 'orange', 'event': 'purple', 'learning': 'cyan', 'codecanyon': 'brown'}

In [None]:
# Reduce dimensions with t-SNE.
# scikit-learn requires perplexity < n_samples, so the default of 30 is clamped
# for small corpora; with more than 30 chunks the value is unchanged.
tsne = TSNE(
    n_components=2,
    random_state=42,
    perplexity=min(30.0, max(1.0, len(vectors) - 1.0)),
)
reduced_vectors = tsne.fit_transform(vectors)

# Create the 2D scatter plot: one trace per purpose.
# Sorting gives a stable legend order (set iteration order is arbitrary).
fig = go.Figure()
for p in sorted(set(purpose), key=str):
    idx = [i for i, purp in enumerate(purpose) if purp == p]
    fig.add_trace(go.Scatter(
        x=reduced_vectors[idx, 0],
        y=reduced_vectors[idx, 1],
        mode='markers',
        marker=dict(color=colors.get(p, 'black'), size=5),  # black for unmapped purposes
        name=p,
        # Hover text: first 30 characters of the chunk plus its purpose
        text=[f"Doc: {documents[i][:30]}...<br>Purpose: {purpose[i]}" for i in idx],
        hoverinfo='text'
    ))

fig.update_layout(title='2D Document Embeddings Visualization')
fig.show()

In [None]:
# Let's try 3D visualization.
# scikit-learn requires perplexity < n_samples, so the default of 30 is clamped
# for small corpora; with more than 30 chunks the value is unchanged.
tsne_3d = TSNE(
    n_components=3,
    random_state=42,
    perplexity=min(30.0, max(1.0, len(vectors) - 1.0)),
)
reduced_vectors_3d = tsne_3d.fit_transform(vectors)

# One 3D trace per purpose, sorted for a stable legend order
fig_3d = go.Figure()
for p in sorted(set(purpose), key=str):
    idx = [i for i, purp in enumerate(purpose) if purp == p]
    fig_3d.add_trace(go.Scatter3d(
        x=reduced_vectors_3d[idx, 0],
        y=reduced_vectors_3d[idx, 1],
        z=reduced_vectors_3d[idx, 2],
        mode='markers',
        marker=dict(color=colors.get(p, 'black'), size=5),  # black for unmapped purposes
        name=p,
        # Hover text: first 30 characters of the chunk plus its purpose
        text=[f"Doc: {documents[i][:30]}...<br>Purpose: {purpose[i]}" for i in idx],
        hoverinfo='text',
    ))

# Dark theme for the 3D scene
fig_3d.update_layout(
    title='3D Document Embeddings Visualization',
    scene=dict(
        xaxis=dict(title='x', backgroundcolor='#1e1e1e', gridcolor='#444', zerolinecolor='#666'),
        yaxis=dict(title='y', backgroundcolor='#1e1e1e', gridcolor='#444', zerolinecolor='#666'),
        zaxis=dict(title='z', backgroundcolor='#1e1e1e', gridcolor='#444', zerolinecolor='#666'),
    ),
    paper_bgcolor='#1e1e1e',  # softer main background
    plot_bgcolor='#1e1e1e',
    font=dict(color='white'),
    width=900,
    height=700,
    margin=dict(r=20, b=10, l=10, t=40),
)
fig_3d.show()

In [None]:
# Same 2D projection, this time coloured by category
# (two values are expected: "About Me" and "Project")
category = [entry.get("category", "unknown") for entry in metadatas]
colors_category = {
    'About Me': 'green',
    'Project': 'magenta',
    'unknown': 'gray',
}

fig_category = go.Figure()
for c in set(category):
    # Positions of the chunks that belong to this category
    idx = [pos for pos, label in enumerate(category) if label == c]
    fig_category.add_trace(
        go.Scatter(
            x=reduced_vectors[idx, 0],
            y=reduced_vectors[idx, 1],
            mode='markers',
            marker={'color': colors_category.get(c, 'black'), 'size': 5},
            name=c,
            text=[f"Doc: {documents[pos][:30]}...<br>Category: {category[pos]}" for pos in idx],
            hoverinfo='text',
        )
    )

fig_category.update_layout(title='2D Document Embeddings by Category')
fig_category.show()

In [None]:
# 3D projection coloured by category
fig_category_3d = go.Figure()
for c in set(category):
    # Positions of the chunks that belong to this category
    idx = [pos for pos, label in enumerate(category) if label == c]
    fig_category_3d.add_trace(
        go.Scatter3d(
            x=reduced_vectors_3d[idx, 0],
            y=reduced_vectors_3d[idx, 1],
            z=reduced_vectors_3d[idx, 2],
            mode='markers',
            marker={'color': colors_category.get(c, 'black'), 'size': 5},
            name=c,
            text=[f"Doc: {documents[pos][:30]}...<br>Category: {category[pos]}" for pos in idx],
            hoverinfo='text',
        )
    )

# Dark theme shared by the three axes
fig_category_3d.update_layout(
    title='3D Document Embeddings by Category',
    scene={
        'xaxis': {'title': 'x', 'backgroundcolor': '#1e1e1e', 'gridcolor': '#444', 'zerolinecolor': '#666'},
        'yaxis': {'title': 'y', 'backgroundcolor': '#1e1e1e', 'gridcolor': '#444', 'zerolinecolor': '#666'},
        'zaxis': {'title': 'z', 'backgroundcolor': '#1e1e1e', 'gridcolor': '#444', 'zerolinecolor': '#666'},
    },
    paper_bgcolor='#1e1e1e',  # softer main background
    plot_bgcolor='#1e1e1e',
    font={'color': 'white'},
    width=900,
    height=700,
    margin={'r': 20, 'b': 10, 'l': 10, 't': 40},
)
fig_category_3d.show()

In [None]:
# Visualization by technologies (in metadata they are stored as a comma-separated string)

# Parse each chunk's technologies into a set of exact names. Matching on the
# raw string would be a substring test ("Java" would also match "JavaScript"),
# and chunks without technologies would never be drawn.
tech_sets = []
for meta in metadatas:
    parsed = {tech.strip() for tech in meta.get("technologies", "").split(",") if tech.strip()}
    tech_sets.append(parsed or {"unknown"})

# Flat list of all technology mentions (kept for inspection)
technologies = [tech for parsed in tech_sets for tech in sorted(parsed)]

# Get unique technologies (sorted so colours and legend order are stable) and assign colors
unique_technologies = sorted(set(technologies))
color_palette = plt.get_cmap('tab20', len(unique_technologies))
tech_colors = {
    tech: 'rgb({}, {}, {})'.format(*(int(channel * 255) for channel in color_palette(i)[:3]))
    for i, tech in enumerate(unique_technologies)
}

fig_tech = go.Figure()
for tech in unique_technologies:
    # A chunk appears in every trace whose technology it lists
    idx = [i for i, parsed in enumerate(tech_sets) if tech in parsed]
    if idx:
        fig_tech.add_trace(go.Scatter(
            x=reduced_vectors[idx, 0],
            y=reduced_vectors[idx, 1],
            mode='markers',
            marker=dict(color=tech_colors.get(tech, 'black'), size=5),
            name=tech,
            text=[f"Doc: {documents[i][:30]}...<br>Technologies: {metadatas[i].get('technologies', '')}" for i in idx],
            hoverinfo='text'
        ))

fig_tech.update_layout(title='2D Document Embeddings by Technologies')
fig_tech.show()

In [None]:
# The 3D version
fig_tech_3d = go.Figure()
for tech in unique_technologies:
    # Exact-name match against the parsed comma-separated list. A plain
    # substring test would put "Java" chunks under "JavaScript" too, and would
    # never match chunks without technologies (shown here as "unknown").
    idx = [
        i for i, meta in enumerate(metadatas)
        if tech in (
            {name.strip() for name in meta.get("technologies", "").split(",") if name.strip()}
            or {"unknown"}
        )
    ]
    if idx:
        fig_tech_3d.add_trace(go.Scatter3d(
            x=reduced_vectors_3d[idx, 0],
            y=reduced_vectors_3d[idx, 1],
            z=reduced_vectors_3d[idx, 2],
            mode='markers',
            marker=dict(color=tech_colors.get(tech, 'black'), size=5),
            name=tech,
            text=[f"Doc: {documents[i][:30]}...<br>Technologies: {metadatas[i].get('technologies', '')}" for i in idx],
            hoverinfo='text',
        ))

# Dark theme for the 3D scene
fig_tech_3d.update_layout(
    title='3D Document Embeddings by Technologies',
    scene=dict(
        xaxis=dict(title='x', backgroundcolor='#1e1e1e', gridcolor='#444', zerolinecolor='#666'),
        yaxis=dict(title='y', backgroundcolor='#1e1e1e', gridcolor='#444', zerolinecolor='#666'),
        zaxis=dict(title='z', backgroundcolor='#1e1e1e', gridcolor='#444', zerolinecolor='#666'),
    ),
    paper_bgcolor='#1e1e1e',  # softer main background
    plot_bgcolor='#1e1e1e',
    font=dict(color='white'),
    width=900,
    height=700,
    margin=dict(r=20, b=10, l=10, t=40),
)
fig_tech_3d.show()

##### Time to create the RAG chain with LangChain

In [None]:
# Chat model used to answer questions
llm = ChatOpenAI(model_name=MODEL, temperature=0.7, openai_api_key=OPENAI_API_KEY)

# Conversation memory; output_key='answer' selects which chain output is stored,
# since the chain below also returns source documents
memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True, output_key='answer')

# Retriever over the vector database with default search settings
retriever = vectordb.as_retriever()

# Combine LLM, retriever and memory into a conversational RAG chain
conversation_chain = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=retriever,
    memory=memory,
    return_source_documents=True
)

In [None]:
# Try a simple query against the chain
query = "Is there any project where I used AI?"
result = conversation_chain.invoke({"question": query})
print(result['answer'])
# List the retrieved sources: metadata name and category of each document
for doc in result['source_documents']:
    print(f"- {doc.metadata.get('name', 'unknown')} ({doc.metadata.get('category', 'Unknown')})")

In [None]:
# Simple chat interface with Gradio
def chat_with_rag(question, chat_history):
    """Answer *question* through the current conversation chain.

    ``chat_history`` is supplied by Gradio but is not used here; only the
    question is forwarded to the chain.
    """
    return conversation_chain.invoke({"question": question})['answer']


iface = gr.ChatInterface(fn=chat_with_rag, type="messages")
iface.launch(inbrowser=True)

### First project: a chat assistant about me and my projects

This is a simple chat assistant where users can ask questions about me and my projects. The answers are based on the data stored in the vector database.

In [None]:
# Prompt for the "LyabsInfo" assistant. The template is an f-string: {dev_name}
# is filled in now, while the doubled braces leave {question} and {context}
# as placeholders for PromptTemplate.
prompt = PromptTemplate(
    input_variables=["context", "question", ],
    template=f"""You are an assistant called "LyabsInfo", specialized in answering questions about {dev_name}'s projects and background.
The one speaking to you is {dev_name}'s clients or potential clients. You have to look into his projects and background to answer their questions.

When you don't know the answer, say so and suggest other questions about {dev_name}'s projects and background.

The question to answer: {{question}}
Here is the context (translate it to the question language) you can use to answer the question: {{context}}

Answer in english or French depending on the language of the question. Do not  use 2 languages in the same answer.
"""
)

# Fresh memory so this assistant does not inherit the previous chain's history
memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True, output_key='answer')

# Retriever returning the 10 best chunks; the category filter is currently
# disabled (kept below as a commented-out alternative)
retriever = vectordb.as_retriever(
    # search_kwargs={"filter": {"category": "Project"}, "k": 10}
    search_kwargs={"k": 10}
)

# Rebuild the chain with the custom prompt used to combine the retrieved documents
conversation_chain = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=retriever,
    memory=memory,
    return_source_documents=True,
    combine_docs_chain_kwargs={"prompt": prompt},
    #callbacks=[StdOutCallbackHandler()],
)

In [None]:
# Simple question to test the new prompt (asked from a client's point of view)
query = "I need a project on Firebase can he works on it?"
result = conversation_chain.invoke({"question": query})
print(result['answer'])

In [None]:
# Create a chat with Gradio
def chat_with_rag(question, chat_history):
    """Answer *question* through the current conversation chain.

    ``chat_history`` is supplied by Gradio but is not used here; only the
    question is forwarded to the chain.
    """
    return conversation_chain.invoke({"question": question})['answer']


iface = gr.ChatInterface(fn=chat_with_rag, type="messages")
iface.launch(inbrowser=True)

### Second project: A troll chatbot for fun 🤪
An assistant that chats with me and responds with punchlines that mock my background and projects, just for fun.

In [None]:
# Prompt for the troll bot. The template is an f-string: {dev_name} is filled
# in now, while the doubled braces leave {question} and {context} as
# placeholders for PromptTemplate.
prompt = PromptTemplate(
    input_variables=["context", "question", ],
    template=f"""You are a troll chat bot just for fun. You are discussing with {dev_name}, the one that created you. based on
 the question: {{question}} and the context: {{context}}
Troll him with no mercy, be funny and creative. Use a lot of sarcasm. Make fun of his projects and background. Use a lot of humor with emojis.

Give a short answer.

"""
)

# Fresh memory so the troll bot does not inherit the previous chain's history
memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True, output_key='answer')

# Retriever returning the 10 best chunks
retriever = vectordb.as_retriever(
    search_kwargs={"k": 10}
)

# Rebuild the chain with the troll prompt (reuses the `llm` defined earlier)
conversation_chain = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=retriever,
    memory=memory,
    return_source_documents=True,
    combine_docs_chain_kwargs={"prompt": prompt},
    #callbacks=[StdOutCallbackHandler()],
)

In [None]:
# Simple question to test the new prompt (in French, asking for a French reply)
query = "Mon CV est extraordinaire n'est-ce pas? Reponds en francais stp."
result = conversation_chain.invoke({"question": query})
print(result['answer'])

In [None]:
# Create a chat with Gradio
def chat_with_rag(question, chat_history):
    """Answer *question* through the current conversation chain.

    ``chat_history`` is supplied by Gradio but is not used here; only the
    question is forwarded to the chain.
    """
    result = conversation_chain.invoke({"question": question})
    return result['answer']


iface = gr.ChatInterface(
    chat_with_rag,
    type="messages",
    # The emoji was mojibake ("ðŸ¤ª"); restored so the UI title renders correctly
    title="Troll Chatbot 🤪"
)
iface.launch(inbrowser=True)

### Third project: A bid generator from an offer
Given the text of an offer, the LLM generates a professional bid proposal grounded in my projects and background.

In [None]:
# Bid-proposal prompt. Unlike the previous cells, `prompt` is a plain string
# here; it is wrapped in a PromptTemplate when the chain is built below.
# {dev_name} is filled in now, while the doubled braces leave {question}
# (the offer) and {context} as template placeholders.
prompt = f"""
You are a professional bid proposal generator. Based on the offer text, generate a bid proposal text.
See what the offer is about and generate a professional bid as you are applying for a job as me {dev_name}, a freelance developer.

Here is the offer text: {{question}}

My background/projects: {{context}}

The bid should be in the same language as the offer text.

Do not mention information that is not in the context or the offer text. For example if the offer text is about a Flutter mobile app, do not talk about web development.
Do not estimate any price nor time or talk about features that are not in the offer text. You can mention examples of similar projects I did in the past if relevant.
Do not place headers or footers, just the bid text.
"""

# Switch to the GPT-5 mini model for bid generation (no temperature is set here)
llm = ChatOpenAI(model_name=MODEL_5_MINI, openai_api_key=OPENAI_API_KEY)

# Fresh memory for the bid chain
memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True, output_key='answer')

# Retriever returning the 10 best chunks
retriever = vectordb.as_retriever(
    search_kwargs={"k": 10}
)

# Build the chain; the offer text is passed in as the "question"
conversation_chain = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=retriever,
    memory=memory,
    return_source_documents=True,
    combine_docs_chain_kwargs={"prompt": PromptTemplate(input_variables=["context", "question"], template=prompt)},
    #callbacks=[StdOutCallbackHandler()],
)

In [None]:
# Sample offer text to test the bid prompt (a Flutter app reskinning job)
offer_text = """We are seeking a skilled developer to reskin a Flutter template app for both iOS and Android platforms that we have recently purchased. The ideal candidate will not only change the design elements but also ensure the app is set up correctly for deployment. Your expertise in Flutter and mobile app development will be crucial for this project. Please provide examples of previous app reskinning work in your application"""
# The offer is sent through the chain's "question" input
result = conversation_chain.invoke({"question": offer_text})
print(result['answer'])

In [None]:
# UI with Gradio
def generate_bid(offer):
    """Return the bid proposal generated for the given offer text."""
    # NOTE(review): this reuses the notebook-level `conversation_chain`; if that
    # chain carries memory, earlier offers may influence later bids — confirm
    # whether that is intended for this form-style UI.
    result = conversation_chain.invoke({"question": offer})
    return result['answer']


iface = gr.Interface(
    fn=generate_bid,
    inputs=gr.Textbox(
        lines=10,
        placeholder="Paste the offer text here...",
        label="Offer Text",
    ),
    outputs=gr.Textbox(
        lines=20,
        placeholder="The generated bid proposal will appear here...",
        label="Generated Bid Proposal",
    ),
    # The emoji was mojibake ("ðŸ¤–"); restored so the UI title renders correctly
    title="Bid Proposal Generator 🤖",
)
iface.launch(inbrowser=False)

#### Bid generator with option to choose the model

In [None]:
def init_rag_chain(model_name):
    """Build a new bid-generation chain that uses the given chat model.

    Each call creates its own LLM client, conversation memory and retriever
    (top 10 chunks), and wraps the notebook-level `prompt` string in a
    PromptTemplate.
    """
    return ConversationalRetrievalChain.from_llm(
        llm=ChatOpenAI(model_name=model_name, openai_api_key=OPENAI_API_KEY),
        memory=ConversationBufferMemory(
            memory_key="chat_history",
            return_messages=True,
            output_key='answer',
        ),
        retriever=vectordb.as_retriever(search_kwargs={"k": 10}),
        return_source_documents=True,
        combine_docs_chain_kwargs={
            "prompt": PromptTemplate(
                input_variables=["context", "question"],
                template=prompt,
            ),
        },
    )

def generate_bid(offer, model):
    """Generate a bid for *offer* with a chain freshly built for *model*."""
    chain = init_rag_chain(model)
    return chain.invoke({"question": offer})['answer']

# Bid generator UI: offer text plus a model picker, wired to generate_bid(offer, model)
iface = gr.Interface(
    fn=generate_bid,
    inputs=[
        gr.Textbox(
            lines=10,
            placeholder="Paste the offer text here...",
            label="Offer Text",
        ),
        gr.Dropdown(
            # (label shown in the UI, model identifier passed to generate_bid)
            choices=[
                ("GPT-4o", MODEL_4O),
                ("GPT-4o Mini", MODEL),
                ("GPT-5", MODEL_5),
                ("GPT-5 Mini", MODEL_5_MINI),
            ],
            value=MODEL_5_MINI,
            label="Select Model",
        )
    ],
    outputs=gr.Textbox(
        lines=20,
        placeholder="The generated bid proposal will appear here...",
        label="Generated Bid Proposal",
        show_copy_button=True,
    ),
    # The emoji was mojibake ("ðŸ¤–"); restored so the UI title renders correctly
    title="Bid Proposal Generator 🤖",
)
iface.launch(inbrowser=False)