# Chapter content links analysis

In [182]:
# LIBRARIES IMPORT
import sys
import numpy as np

import re
import umap
import plotly as py
from openai import OpenAI
from dotenv import load_dotenv
import plotly.graph_objs as go
from sklearn.manifold import TSNE

In [183]:
# Modules import
sys.path.append('../') # Add parent directory to the path
from create_database import load_documents
from query_chatbot import calculate_nb_tokens

## Settings

Load OpenAI API key

In [184]:
# Load environment variables from a local .env file; the OpenAI API key
# used by the client below is expected to be defined there.
load_dotenv()

True

In [185]:
# Directory of processed Markdown chapters; passed to load_documents().
# NOTE(review): absolute, machine-specific path — adjust before running
# this notebook on another machine.
CHAPTERS_PATH = "/home/essmay/Bureau/biopyassistant-sandbox/data/markdown_processed"

In [186]:
# Embedding model used for every chapter
EMBEDDING_MODEL = "text-embedding-3-large"
# Token budget per chapter: longer chapters are truncated before embedding.
# NOTE(review): presumably the model's maximum input length — confirm
# against the OpenAI documentation.
MAX_TOKENS = 8191
# OpenAI API client shared by the embedding helpers
client = OpenAI()

## Tooling

In [187]:
def get_chapters_content():
    """Collect the text of every chapter, without its exercises section.

    Documents are loaded from ``CHAPTERS_PATH`` with ``load_documents``.
    Files whose base name starts with ``annexe`` are skipped. For each
    remaining document, lines are kept up to (not including) the first
    heading that matches ``## <number> Exercices``.

    A chapter whose token count (as reported by ``calculate_nb_tokens``)
    exceeds ``MAX_TOKENS`` is truncated. The cut is first made with the
    character-based heuristic ``MAX_TOKENS * 3 - 700``; because that is only
    an estimate, the result is then re-measured and shrunk further until it
    really fits the token budget.

    Returns:
    --------
        chapters (dict): Chapter name (source file name without extension)
        mapped to the chapter text.
    """
    # Compiled once instead of once per line.
    exercises_heading = re.compile(r"##\s+[\d.]+\s+Exercices")

    def _chapter_stem(document):
        # File name of the source path, without directory or extension.
        return document.metadata.get("source", "").split("/")[-1].split(".")[0]

    chapters = {}
    for document in load_documents(CHAPTERS_PATH):
        chapter_name = _chapter_stem(document)
        # Annexes are not chapters: leave them out of the analysis.
        if chapter_name.startswith("annexe"):
            continue

        # Keep only the content that precedes the exercises section.
        kept_lines = []
        for line in document.page_content.splitlines():
            if exercises_heading.match(line):
                break  # Stop extracting content once the exercises start
            kept_lines.append(line + "\n")
        chapter_content = "".join(kept_lines)

        chapter_tokens = calculate_nb_tokens(chapter_content)
        if chapter_tokens <= MAX_TOKENS:
            print(f"Chapter {chapter_name} has {chapter_tokens} tokens.\n")
        else:
            print(f"Chapter {chapter_name} has {chapter_tokens} tokens. Limit is {MAX_TOKENS} tokens. {chapter_tokens - MAX_TOKENS} tokens will be ignored.\n")
            # First cut: character-based estimate of the token budget.
            chapter_content = chapter_content[:MAX_TOKENS * 3 - 700]
            # The estimate is not a guarantee (token density varies with the
            # text), so keep trimming 10% at a time until the limit is met.
            while chapter_content and calculate_nb_tokens(chapter_content) > MAX_TOKENS:
                chapter_content = chapter_content[:int(len(chapter_content) * 0.9)]
        chapters[chapter_name] = chapter_content

    return chapters

In [188]:
def get_embeddings_raw(text: str, model: str = EMBEDDING_MODEL) -> list:
    """Get the raw embedding of a text from OpenAI.

    Parameters:
    -----------
        text (str): Text to embed; sent as a single-item input batch.
        model (str): Name of the embedding model to use.

    Returns:
    --------
        list: The ``embedding`` field of the first (and only) item in the
        response, i.e. the embedding vector (a list of floats), not a string.
    """
    response = client.embeddings.create(input=[text], model=model)
    # One input was sent, so the vector of interest is the first result.
    return response.data[0].embedding

In [189]:
def get_embeddings(chapters: dict):
    """Embed every chapter and return the vectors keyed by chapter name.

    Parameters:
    -----------
        chapters (dict): Chapter name mapped to chapter text.

    Returns:
    --------
        dict: Chapter name mapped to the value returned by
        ``get_embeddings_raw`` for that chapter's text (one call per chapter,
        in the iteration order of ``chapters``).
    """
    return {
        chapter_name: get_embeddings_raw(content)
        for chapter_name, content in chapters.items()
    }

In [190]:
def interpolate_color(start_color, end_color, factor: float):
    """
    Linearly blend two colors, channel by channel.

    Only the first three components (red, green, blue) of each color are
    read; every blended channel is truncated to an integer and the alpha of
    the result is always 1.

    :param start_color: Sequence of RGB values for the start color.
    :param end_color: Sequence of RGB values for the end color.
    :param factor: How far to interpolate between the colors (0.0 to 1.0).
    :return: Interpolated color as an 'rgba' string.
    """
    blended = [
        int(start_color[channel] + (end_color[channel] - start_color[channel]) * factor)
        for channel in range(3)
    ]
    return 'rgba({},{},{}, 1)'.format(*blended)


In [191]:
def plot_embeddings_2D(chapters, embeddings_2d):
    """ Plot the 2D embeddings of the chapters with plotly.

    Each chapter is drawn as a gray, labelled marker. Consecutive chapters
    (in the order given) are joined by a line segment whose color follows a
    light pink -> dark blue gradient, and a colorbar listing the chapters is
    attached to the figure.

    Parameters:
    -----------
        chapters (list): List of chapter names.
        embeddings_2d (numpy.ndarray): 2D embeddings of the chapters; column 0
            is used as x and column 1 as y, with rows in the same order as
            ``chapters``.

    Returns:
    --------
        fig (plotly.graph_objs.Figure): Plotly figure object.
    """
    # Define start and end colors for the gradient (light pink to dark blue)
    start_color = (238, 176, 238, 1)   # Light pink
    end_color = (3, 3, 117, 1)   # Dark blue
    num_chapters = len(chapters)
    # Gradient positions are i / (num_chapters - 1); with a single chapter
    # that denominator would be 0, so fall back to 1.
    gradient_span = max(num_chapters - 1, 1)

    def _gradient_color(position):
        # Color of the gradient at the given chapter index.
        return interpolate_color(start_color, end_color, position / gradient_span)

    # interactive plot with plotly
    fig = go.Figure()
    fig.add_trace(go.Scatter(
        x=embeddings_2d[:, 0],
        y=embeddings_2d[:, 1],
        mode='markers+text',
        text=chapters,
        textposition='top center',
        marker=dict(size=10, color='gray')
    ))

    # add line to connect each chapter to the next one
    for i in range(num_chapters - 1):
        fig.add_trace(go.Scatter(
            x=[embeddings_2d[i, 0], embeddings_2d[i + 1, 0]],
            y=[embeddings_2d[i, 1], embeddings_2d[i + 1, 1]],
            mode='lines',
            line=dict(color=_gradient_color(i), width=3)
        ))

    # Add gradient legend
    gradient_colors = [_gradient_color(i) for i in range(num_chapters)]
    if len(gradient_colors) == 1:
        # A colorscale needs at least a lowest and a highest stop.
        gradient_colors = gradient_colors * 2
    # Dummy trace without visible points: it only carries the colorbar.
    fig.add_trace(go.Scatter(
        x=[None], y=[None], mode='markers',
        marker=dict(
            colorscale=gradient_colors,
            cmin=0,
            cmax=num_chapters - 1,
            colorbar=dict(
                title='Chapters',
                tickvals=list(range(num_chapters)),
                ticktext=chapters,
                lenmode='fraction',
                len=1,
                yanchor='middle',
                y=0.5
            )
        ),
        showlegend=False
    ))

    return fig


In [192]:
def visualize_embeddings_tsne(embeddings_dict: dict):
    """Reduce the chapter embeddings to 2D with t-SNE and show the plot.

    Parameters:
    -----------
        embeddings_dict (dict): Chapter name mapped to its embedding vector.
    """
    # Split the mapping into parallel names / vectors
    chapter_names = list(embeddings_dict.keys())
    vectors = np.array(list(embeddings_dict.values()))

    # t-SNE projection; perplexity is set to (number of chapters - 1)
    projection = TSNE(
        n_components=2,
        perplexity=(len(chapter_names) - 1),
        random_state=42,
    ).fit_transform(vectors)

    # Transparent background, no gridlines or zero lines, 1200x1000 canvas
    layout_options = {
        "title": 't-SNE Visualization of Chapter Embeddings',
        "showlegend": False,
        "plot_bgcolor": 'rgba(0,0,0,0)',
        "xaxis": {"showgrid": False, "zeroline": False},
        "yaxis": {"showgrid": False, "zeroline": False},
        "width": 1200,
        "height": 1000,
    }

    # interactive plot with plotly
    figure = plot_embeddings_2D(chapter_names, projection)
    figure.update_layout(**layout_options)
    figure.show()

In [193]:
def visualize_embeddings_umap(embeddings_dict: dict):
    """Reduce the chapter embeddings to 2D with UMAP and show the plot.

    Parameters:
    -----------
        embeddings_dict (dict): Chapter name mapped to its embedding vector.
    """
    # Split the mapping into parallel names / vectors
    chapter_names = list(embeddings_dict.keys())
    vectors = np.array(list(embeddings_dict.values()))

    # UMAP projection with a fixed seed
    projection = umap.UMAP(n_components=2, random_state=42).fit_transform(vectors)

    # Transparent background, no gridlines or zero lines, 1200x1000 canvas
    layout_options = {
        "title": 'UMAP Visualization of Chapter Embeddings',
        "showlegend": False,
        "plot_bgcolor": 'rgba(0,0,0,0)',
        "xaxis": {"showgrid": False, "zeroline": False},
        "yaxis": {"showgrid": False, "zeroline": False},
        "width": 1200,
        "height": 1000,
    }

    # interactive plot with plotly
    figure = plot_embeddings_2D(chapter_names, projection)
    figure.update_layout(**layout_options)
    figure.show()

## Analysis

In [194]:
# Build a dictionary mapping each chapter name to its text (annexes and
# exercises sections excluded, over-long chapters truncated)
chapters = get_chapters_content()

[32m2024-07-08 17:18:31.851[0m | [1mINFO    [0m | [36mcreate_database[0m:[36mload_documents[0m:[36m157[0m - [1mLoading Markdown documents...[0m
100%|██████████| 29/29 [00:00<00:00, 11156.09it/s]
[32m2024-07-08 17:18:31.857[0m | [32m[1mSUCCESS [0m | [36mcreate_database[0m:[36mload_documents[0m:[36m166[0m - [32m[1mMarkdown document loading complete.
[0m


Chapter 01_introduction has 4583 tokens.

Chapter 02_variables has 5072 tokens.

Chapter 03_affichage has 4455 tokens.

Chapter 04_listes has 4323 tokens.

Chapter 05_boucles_comparaisons has 4501 tokens.

Chapter 06_tests has 2901 tokens.

Chapter 07_fichiers has 3667 tokens.

Chapter 08_dictionnaires_tuples has 5835 tokens.

Chapter 09_modules has 6176 tokens.

Chapter 10_fonctions has 5547 tokens.

Chapter 11_plus_sur_les_chaines_de_caracteres has 6050 tokens.

Chapter 12_plus_sur_les_listes has 6244 tokens.

Chapter 13_plus_sur_les_fonctions has 4859 tokens.

Chapter 14_conteneurs has 13625 tokens. Limit is 8191 tokens. 5434 tokens will be ignored.

Chapter 15_creation_modules has 3002 tokens.

Chapter 16_bonnes_pratiques has 9267 tokens. Limit is 8191 tokens. 1076 tokens will be ignored.

Chapter 17_expressions_regulieres has 5130 tokens.

Chapter 18_jupyter has 4204 tokens.

Chapter 19_module_biopython has 2825 tokens.

Chapter 20_module_numpy has 13142 tokens. Limit is 8191 toke

In [195]:
# Get the embeddings of the chapters (one embedding request per chapter)
embeddings = get_embeddings(chapters)
# Echo the full dictionary as the cell output
embeddings

{'01_introduction': [-0.010491590015590191,
  -0.003078479552641511,
  -0.004618544597178698,
  0.055462151765823364,
  0.005612241569906473,
  -0.03008161298930645,
  0.025855926796793938,
  0.04719564691185951,
  -0.028285697102546692,
  0.031877532601356506,
  -0.010293510742485523,
  -0.0051104407757520676,
  0.03758220747113228,
  -0.039325304329395294,
  0.00691956328228116,
  0.010247292928397655,
  0.016651850193738937,
  0.02781030721962452,
  -0.025499382987618446,
  -0.029025191441178322,
  0.015252090990543365,
  -0.016414156183600426,
  -0.051236461848020554,
  0.035390134900808334,
  0.0031313004437834024,
  -0.002507351338863373,
  0.005955578293651342,
  -0.0012239638017490506,
  0.028470570221543312,
  -0.03755579888820648,
  -0.007645853329449892,
  -0.020032400265336037,
  -0.009098433889448643,
  0.014895549044013023,
  0.0027301902882754803,
  -0.004291714169085026,
  0.02841774933040142,
  -0.002939824014902115,
  0.004595435224473476,
  0.005496695172041655,
  0.

In [196]:
# Display the t-SNE projection of the chapter embeddings
visualize_embeddings_tsne(embeddings)

In [197]:
# Display the UMAP projection of the chapter embeddings
visualize_embeddings_umap(embeddings)


n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.

