# Chapter content links analysis

In [79]:
# LIBRARIES IMPORT
import sys
import pandas as pd
import numpy as np



import plotly as py
from openai import OpenAI
from dotenv import load_dotenv
import plotly.graph_objs as go
from sklearn.manifold import TSNE
import umap

In [78]:
reducer = umap.UMAP()


AttributeError: module 'umap' has no attribute 'UMAP'

In [40]:
# Modules import
sys.path.append('../') # Add parent directory to the path
from create_database import load_documents
from query_chatbot import calculate_nb_tokens

## Settings

Load OpenAI API key

In [9]:
load_dotenv()

True

In [24]:
CHAPTERS_PATH = "/home/tess01hp/Desktop/biopyassistant-sandbox/data/markdown_processed"

/home/tess01hp/Desktop/biopyassistant-sandbox/data/markdown_processed


In [52]:
# Initialize OpenAI API
client = OpenAI()
# Define the embdedding model
EMBEDDING_MODEL = "text-embedding-3-large"
MAX_TOKENS = int(8191) 

## Tooling

In [57]:
def get_chapters_content():
    """ Get the content of the chapters from the markdown files, 
    and return a dictionary with the chapter name as key and the content as value.
    """
    chapters = {}
    # Load the documents
    documents = load_documents(CHAPTERS_PATH)

    for document in documents:
        chapter_name = document.metadata.get("source", "").split("/")[-1].split(".")[0]
        chapter_content = document.page_content
        chapter_tokens = calculate_nb_tokens(chapter_content)
        if chapter_tokens <= MAX_TOKENS:
            print(f"Chapter {chapter_name} has {chapter_tokens} tokens.\n")
            chapters[chapter_name] = chapter_content
        else:
            print(f"Chapter {chapter_name} has {chapter_tokens} tokens, so {chapter_tokens - MAX_TOKENS} tokens more than the limit. It will be ignored.\n")
  
    return chapters

In [33]:
def get_embeddings_raw(text: str, model: str = EMBEDDING_MODEL) -> str:
    """Get raw embeddings from OpenAi."""
    return client.embeddings.create(
           input=[text],
           model=model
        ).data[0].embedding

In [34]:
def get_embeddings(chapters: dict):
    embeddings = {}
    for chapter_name, content in chapters.items():
        embedding_chapter = get_embeddings_raw(content)
        embeddings[chapter_name] = embedding_chapter

    return embeddings

In [70]:
def visualize_embeddings_tsne(embeddings_dict: dict):
    # Extract the chapters names and embeddings
    chapters = list(embeddings_dict.keys())
    embeddings = np.array(list(embeddings_dict.values()))
    
    # Apply t-SNE
    tsne = TSNE(n_components=2, perplexity=(len(chapters) - 1), random_state=42)
    embeddings_2d = tsne.fit_transform(embeddings)
    
    # interactive plot with plotly
    fig = go.Figure()
    fig.add_trace(go.Scatter(
        x=embeddings_2d[:, 0],
        y=embeddings_2d[:, 1],
        mode='markers+text',
        text=chapters,
        textposition='top center',
        marker=dict(size=10, color='blue')
    ))

    # add line to connect the points
    for i in range(len(chapters) - 1):
        fig.add_trace(go.Scatter(
            x=[embeddings_2d[i, 0], embeddings_2d[i + 1, 0]],
            y=[embeddings_2d[i, 1], embeddings_2d[i + 1, 1]],
            mode='lines',
            line=dict(color='gray', width=2)
        ))

    fig.update_layout(title='t-SNE Visualization of Chapter Embeddings')
    fig.show()

In [72]:
def visualize_embeddings_umap(embeddings_dict: dict):
    # Extract the chapters names and embeddings
    chapters = list(embeddings_dict.keys())
    embeddings = np.array(list(embeddings_dict.values()))
    
    # Apply t-SNE
    reducer = umap.UMAP(n_components=2, random_state=42)
    embeddings_2d = reducer.fit_transform(embeddings)
    
    # interactive plot with plotly
    fig = go.Figure()
    fig.add_trace(go.Scatter(
        x=embeddings_2d[:, 0],
        y=embeddings_2d[:, 1],
        mode='markers+text',
        text=chapters,
        textposition='top center',
        marker=dict(size=10, color='blue')
    ))

    # add line to connect the points
    for i in range(len(chapters) - 1):
        fig.add_trace(go.Scatter(
            x=[embeddings_2d[i, 0], embeddings_2d[i + 1, 0]],
            y=[embeddings_2d[i, 1], embeddings_2d[i + 1, 1]],
            mode='lines',
            line=dict(color='gray', width=2)
        ))

    fig.update_layout(title='UMAP Visualization of Chapter Embeddings')
    fig.show()

## Analysis

In [58]:
# Get the content of all the chapters in a dictionary
chapters = get_chapters_content()

2024-07-07 23:45:05.691 | INFO     | create_database:load_documents:157 - Loading Markdown documents...
100%|██████████| 29/29 [00:00<00:00, 3752.66it/s]
2024-07-07 23:45:05.708 | SUCCESS  | create_database:load_documents:166 - Markdown document loading complete.



Chapter 01_introduction has 4583 tokens.

Chapter 02_variables has 5522 tokens.

Chapter 03_affichage has 5268 tokens.

Chapter 04_listes has 4834 tokens.

Chapter 05_boucles_comparaisons has 6790 tokens.

Chapter 06_tests has 5628 tokens.

Chapter 07_fichiers has 5008 tokens.

Chapter 08_dictionnaires_tuples has 6944 tokens.

Chapter 09_modules has 8315 tokens, so 124 tokens more than the limit. It will be ignored.

Chapter 10_fonctions has 7856 tokens.

Chapter 11_plus_sur_les_chaines_de_caracteres has 10684 tokens, so 2493 tokens more than the limit. It will be ignored.

Chapter 12_plus_sur_les_listes has 7503 tokens.

Chapter 13_plus_sur_les_fonctions has 5347 tokens.

Chapter 14_conteneurs has 16072 tokens, so 7881 tokens more than the limit. It will be ignored.

Chapter 15_creation_modules has 3002 tokens.

Chapter 16_bonnes_pratiques has 9267 tokens, so 1076 tokens more than the limit. It will be ignored.

Chapter 17_expressions_regulieres has 6639 tokens.

Chapter 18_jupyter ha

In [65]:
# get the embeddings of the chapters
embeddings = get_embeddings(chapters)
embeddings

{'01_introduction': [-0.010366996750235558,
  -0.003003922989591956,
  -0.0046987817622721195,
  0.05534340441226959,
  0.005645133089274168,
  -0.030045824125409126,
  0.025877922773361206,
  0.04690208658576012,
  -0.02835755981504917,
  0.03176046907901764,
  -0.010320832952857018,
  -0.005035115871578455,
  0.03745836019515991,
  -0.03906748443841934,
  0.00704322662204504,
  0.010347211733460426,
  0.016777124255895615,
  0.02798825316131115,
  -0.025561373680830002,
  -0.02917531318962574,
  0.015313082374632359,
  -0.01646057516336441,
  -0.05114912614226341,
  0.03545354679226875,
  0.0034523680806159973,
  -0.002393905771896243,
  0.006040819920599461,
  -0.0014178779674693942,
  0.028252044692635536,
  -0.03751111775636673,
  -0.007828005589544773,
  -0.020219599828124046,
  -0.009384374134242535,
  0.014745931141078472,
  0.0028604865074157715,
  -0.004200875759124756,
  0.028331181034445763,
  -0.003106142161414027,
  0.004448180086910725,
  0.005509939976036549,
  0.000714

In [71]:
visualize_embeddings_tsne(embeddings)

In [73]:
visualize_embeddings_umap(embeddings)

AttributeError: module 'umap' has no attribute 'UMAP'