# Chapter content links analysis

In [63]:
# LIBRARIES IMPORT
import os
import sys

import numpy as np
import plotly as py
import plotly.graph_objs as go
import umap
from dotenv import load_dotenv
from openai import OpenAI
from sklearn.manifold import TSNE

In [41]:
# Modules import
sys.path.append('../') # Add parent directory to the path
from create_database import load_documents
from query_chatbot import calculate_nb_tokens

## Settings

Load the OpenAI API key from the `.env` file.

In [42]:
# Load the variables defined in the .env file into the process environment
load_dotenv()

True

In [43]:
# Directory holding the processed Markdown chapters.
# The CHAPTERS_PATH environment variable (which may be set in the .env file)
# overrides the location, so the notebook can run on another machine; when it
# is not set, the original local path is used.
CHAPTERS_PATH = os.environ.get(
    "CHAPTERS_PATH",
    "/home/essmay/Bureau/biopyassistant-sandbox/data/markdown_processed",
)

In [44]:
# Initialize the OpenAI client; no API key is passed explicitly, so it is
# taken from the environment.
client = OpenAI()
# Embedding model used for the chapters
EMBEDDING_MODEL = "text-embedding-3-large"
# Token limit: chapters with more tokens than this are skipped before embedding
MAX_TOKENS = 8191

## Tooling

In [45]:
def get_chapters_content():
    """Collect the text of every chapter that fits within the token limit.

    Loads the documents found under ``CHAPTERS_PATH`` and derives each chapter
    name from the last ``/``-separated component of the document's ``source``
    metadata, keeping the text before the first dot. The token count of every
    chapter is printed; chapters with more than ``MAX_TOKENS`` tokens are
    reported and left out of the result.

    :return: Dictionary mapping chapter name to chapter text.
    """
    kept_chapters = {}

    for doc in load_documents(CHAPTERS_PATH):
        source_path = doc.metadata.get("source", "")
        file_name = source_path.split("/")[-1]
        chapter_name = file_name.split(".")[0]

        text = doc.page_content
        chapter_tokens = calculate_nb_tokens(text)
        within_limit = chapter_tokens <= MAX_TOKENS

        # Guard clause: report and skip chapters that exceed the limit.
        if not within_limit:
            print(f"Chapter {chapter_name} has {chapter_tokens} tokens, so {chapter_tokens - MAX_TOKENS} tokens more than the limit. It will be ignored.\n")
            continue

        print(f"Chapter {chapter_name} has {chapter_tokens} tokens.\n")
        kept_chapters[chapter_name] = text

    return kept_chapters

In [46]:
def get_embeddings_raw(text: str, model: str = EMBEDDING_MODEL) -> list:
    """Request the embedding of a single text from the OpenAI embeddings API.

    :param text: Text to embed; it is sent as a one-element input batch.
    :param model: Name of the embedding model (defaults to ``EMBEDDING_MODEL``).
    :return: The ``embedding`` field of the first item in the response,
        i.e. the embedding vector of ``text`` as a list of floats.
    """
    response = client.embeddings.create(input=[text], model=model)
    # Only one input was sent, so its result is the item at index 0.
    return response.data[0].embedding

In [47]:
def get_embeddings(chapters: dict):
    """Compute the embedding of every chapter.

    :param chapters: Dictionary mapping chapter name to chapter text.
    :return: Dictionary mapping each chapter name to the value returned by
        ``get_embeddings_raw`` for its text, in the iteration order of
        ``chapters``.
    """
    return {name: get_embeddings_raw(text) for name, text in chapters.items()}

In [48]:
def interpolate_color(start_color, end_color, factor: float):
    """
    Blend two RGB colors linearly.

    :param start_color: Tuple of RGB values for the start color.
    :param end_color: Tuple of RGB values for the end color.
    :param factor: Position between the two colors (0.0 gives the start
        color, 1.0 gives the end color).
    :return: The blended color as an 'rgba' string with alpha 1; each channel
        is truncated to an integer.
    """
    def _blend(channel: int) -> int:
        # Linear interpolation of one channel, truncated by int().
        low = start_color[channel]
        high = end_color[channel]
        return int(low + (high - low) * factor)

    red, green, blue = (_blend(channel) for channel in range(3))
    return f'rgba({red},{green},{blue}, 1)'


In [60]:
def plot_embeddings_2D(chapters, embeddings_2d):
    """Build an interactive Plotly figure of the chapters in a 2-D space.

    Every chapter is drawn as a gray marker labelled with its name.
    Consecutive chapters (in the order of ``chapters``) are joined by a line
    segment whose color goes from light blue for the first chapter to dark
    blue for the last one. A final trace without data points carries the
    colorbar settings, with one tick per chapter name.

    :param chapters: Sequence of chapter names, one per row of ``embeddings_2d``.
    :param embeddings_2d: Array indexed as ``[row, column]``; column 0 is used
        as x and column 1 as y.
    :return: The ``go.Figure``; the figure is not displayed here.
    """
    # Define start and end colors for the gradient (light blue to dark blue)
    start_color = (173, 216, 230)  # Light blue
    end_color = (0, 0, 139)        # Dark blue
    num_chapters = len(chapters)
    # Number of gradient steps, clamped to 1 so that a single chapter does not
    # cause a division by zero.
    num_steps = max(num_chapters - 1, 1)

    # One color per chapter; segment i (chapter i -> chapter i + 1) reuses color i.
    gradient_colors = [
        interpolate_color(start_color, end_color, i / num_steps)
        for i in range(num_chapters)
    ]

    # interactive plot with plotly
    fig = go.Figure()
    fig.add_trace(go.Scatter(
        x=embeddings_2d[:, 0],
        y=embeddings_2d[:, 1],
        mode='markers+text',
        text=chapters,
        textposition='top center',
        marker=dict(size=10, color='gray')
    ))

    # add line to connect the points
    for i in range(num_chapters - 1):
        fig.add_trace(go.Scatter(
            x=[embeddings_2d[i, 0], embeddings_2d[i + 1, 0]],
            y=[embeddings_2d[i, 1], embeddings_2d[i + 1, 1]],
            mode='lines',
            line=dict(color=gradient_colors[i], width=2)
        ))

    # A colorscale needs a stop for both ends of its range, so the color is
    # repeated when there is only one chapter.
    colorscale = gradient_colors * 2 if num_chapters == 1 else gradient_colors

    # Add gradient legend
    fig.add_trace(go.Scatter(
        x=[None], y=[None], mode='markers',
        marker=dict(
            colorscale=colorscale,
            cmin=0,
            cmax=num_chapters - 1,
            colorbar=dict(
                title='Chapters',
                tickvals=list(range(num_chapters)),
                ticktext=chapters,
                lenmode='fraction',
                len=1,
                yanchor='middle',
                y=0.5
            )
        ),
        showlegend=False
    ))

    return fig
    

In [56]:
def visualize_embeddings_tsne(embeddings_dict: dict):
    """Project the chapter embeddings to 2-D with t-SNE and show the figure.

    :param embeddings_dict: Dictionary mapping chapter name to embedding vector.
    :return: None; the Plotly figure is displayed with ``show()``.
    """
    # Chapter names and their vectors, in the dictionary's iteration order
    chapter_names = list(embeddings_dict)
    vectors = np.array([*embeddings_dict.values()])

    # t-SNE with the perplexity set to the number of chapters minus one
    projector = TSNE(
        n_components=2,
        perplexity=len(chapter_names) - 1,
        random_state=42,
    )
    points_2d = projector.fit_transform(vectors)

    # Build the interactive figure, set its title and display it
    figure = plot_embeddings_2D(chapter_names, points_2d)
    figure.update_layout(
        title='t-SNE Visualization of Chapter Embeddings',
        showlegend=False,
    )
    figure.show()

In [51]:
def visualize_embeddings_umap(embeddings_dict: dict):
    """Project the chapter embeddings to 2-D with UMAP and show the figure.

    :param embeddings_dict: Dictionary mapping chapter name to embedding vector.
    :return: None; the Plotly figure is displayed with ``show()``.
    """
    # Chapter names and their vectors, in the dictionary's iteration order
    chapter_names = list(embeddings_dict)
    vectors = np.array([*embeddings_dict.values()])

    # UMAP reduction to two components with a fixed seed
    projector = umap.UMAP(n_components=2, random_state=42)
    points_2d = projector.fit_transform(vectors)

    # Build the interactive figure, set its title and display it
    figure = plot_embeddings_2D(chapter_names, points_2d)
    figure.update_layout(
        title='UMAP Visualization of Chapter Embeddings',
        showlegend=False,
    )
    figure.show()

## Analysis

In [52]:
# Get the content of the chapters as a {chapter name: text} dictionary;
# chapters longer than MAX_TOKENS are reported and left out.
chapters = get_chapters_content()

[32m2024-07-08 12:54:26.077[0m | [1mINFO    [0m | [36mcreate_database[0m:[36mload_documents[0m:[36m157[0m - [1mLoading Markdown documents...[0m
100%|██████████| 29/29 [00:00<00:00, 11001.70it/s]
[32m2024-07-08 12:54:26.083[0m | [32m[1mSUCCESS [0m | [36mcreate_database[0m:[36mload_documents[0m:[36m166[0m - [32m[1mMarkdown document loading complete.
[0m


Chapter 01_introduction has 4583 tokens.

Chapter 02_variables has 5522 tokens.

Chapter 03_affichage has 5268 tokens.

Chapter 04_listes has 4834 tokens.

Chapter 05_boucles_comparaisons has 6790 tokens.

Chapter 06_tests has 5628 tokens.

Chapter 07_fichiers has 5008 tokens.

Chapter 08_dictionnaires_tuples has 6944 tokens.

Chapter 09_modules has 8315 tokens, so 124 tokens more than the limit. It will be ignored.

Chapter 10_fonctions has 7856 tokens.

Chapter 11_plus_sur_les_chaines_de_caracteres has 10684 tokens, so 2493 tokens more than the limit. It will be ignored.

Chapter 12_plus_sur_les_listes has 7503 tokens.

Chapter 13_plus_sur_les_fonctions has 5347 tokens.

Chapter 14_conteneurs has 16072 tokens, so 7881 tokens more than the limit. It will be ignored.

Chapter 15_creation_modules has 3002 tokens.

Chapter 16_bonnes_pratiques has 9267 tokens, so 1076 tokens more than the limit. It will be ignored.

Chapter 17_expressions_regulieres has 6639 tokens.

Chapter 18_jupyter ha

In [53]:
# Get the embeddings of the chapters as a {chapter name: embedding} dictionary
embeddings = get_embeddings(chapters)
# NOTE(review): displaying the whole dictionary prints every embedding vector
# in full; consider showing only a summary (e.g. the vector lengths) instead.
embeddings

{'01_introduction': [-0.01025866437703371,
  -0.003038320690393448,
  -0.004657328128814697,
  0.05550505220890045,
  0.0055518257431685925,
  -0.03004983440041542,
  0.02581169828772545,
  0.047160811722278595,
  -0.028227832168340683,
  0.03195105493068695,
  -0.010351085104048252,
  -0.004947792273014784,
  0.037628307938575745,
  -0.03934468701481819,
  0.006964537315070629,
  0.010278468951582909,
  0.016662077978253365,
  0.027858149260282516,
  -0.02548162452876568,
  -0.0291520357131958,
  0.015341786667704582,
  -0.016437629237771034,
  -0.05125371366739273,
  0.03551584109663963,
  0.0032677212730050087,
  -0.0025498128961771727,
  0.005957815330475569,
  -0.0012567524099722505,
  0.028412671759724617,
  -0.03736424818634987,
  -0.0076048788614571095,
  -0.019909994676709175,
  -0.009103409945964813,
  0.014760858379304409,
  0.002698345575481653,
  -0.0043074507266283035,
  0.02825423702597618,
  -0.0028930886182934046,
  0.004680433310568333,
  0.005505615379661322,
  0.000

In [61]:
# Show the 2-D t-SNE projection of the chapter embeddings
visualize_embeddings_tsne(embeddings)

In [62]:
# Show the 2-D UMAP projection of the chapter embeddings
visualize_embeddings_umap(embeddings)


n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.

