In [1]:
import json

def load_jsonl(file_path):
    """Load a JSON Lines file into a list of parsed objects.

    Args:
        file_path: Path to a text file holding one JSON document per line.

    Returns:
        A list with one parsed object per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    data = []
    # Decode as UTF-8 explicitly: the default text encoding depends on the
    # platform locale and can garble or reject accented characters.
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            # Blank lines (for instance a trailing empty line) hold no record,
            # and json.loads would raise on them.
            if not line.strip():
                continue
            # Parse each remaining line as a JSON object
            data.append(json.loads(line))
    return data

# Usage: read the slide summaries and take a quick look at them.
file_path = '../data/GES800/embeds/summaries_GES800.jsonl'  # Replace with the path to your JSONL file
contents = load_jsonl(file_path)

# Record count, then the first record and its type.
print(len(contents))
print(contents[0], type(contents[0]), sep="\n")

# Keep only the 'content' text of every record.
text_contents = [record['content'] for record in contents]

# Same sanity checks on the extracted texts.
print(len(text_contents), type(text_contents), sep="\n")
print(type(text_contents[0]), text_contents[0], sep="\n")



252
{'image_path': 'data/GES800/M01_Présentation_du_cours/page_18.png', 'module': 'M01_Présentation_du_cours', 'page': 'page_18.png', 'content': "La diapositive présente les informations concernant les modalités pour contacter la personne qui a créé la présentation. Elle indique qu'elle n'a pas de bureau à l'ÉTS (École de technologie supérieure) et qu'il est possible de la rencontrer sur rendez-vous avant le cours."}
<class 'dict'>
252
<class 'list'>
<class 'str'>
La diapositive présente les informations concernant les modalités pour contacter la personne qui a créé la présentation. Elle indique qu'elle n'a pas de bureau à l'ÉTS (École de technologie supérieure) et qu'il est possible de la rencontrer sur rendez-vous avant le cours.


In [2]:
import os
import cohere
import numpy as np
from dotenv import load_dotenv

load_dotenv()

# Embedding configuration for the document side of the search.
model_name = "embed-multilingual-v3.0"
input_type_embed = "search_document"
api_key: str | None = os.environ.get("COHERE_API_KEY")

# Stop early when no key is configured, before the client is created.
if api_key is None:
    raise ValueError("Please set the COHERE_API_KEY environment variable.")
co = cohere.Client(api_key)

# Request the embeddings for all texts collected in `text_contents`.
embeds: list[list[float]] = co.embed(
    model=model_name,
    input_type=input_type_embed,
    texts=text_contents,
).embeddings

# Container type, number of embeddings, and length of the first embedding.
print(type(embeds), len(embeds), len(embeds[0]))

# Convert to an array, show its shape, and save it for later cells to load.
array_embeds = np.array(embeds)
print(array_embeds.shape)
np.save('../data/GES800/embeds/embeddings_GES800.npy', array_embeds)

<class 'list'> 252 1024
(252, 1024)


In [None]:
import numpy as np

def cosine_similarity_matrix(vectors, query_vec):
    """Cosine similarity between each row of `vectors` and `query_vec`.

    Args:
        vectors: 2-D array-like with one vector per row (norms are taken
            along axis 1).
        query_vec: Vector that every row is compared against.

    Returns:
        The dot product of each row with `query_vec`, divided by the product
        of that row's norm and the norm of `query_vec`.

    Raises:
        ValueError: If `query_vec` or any row of `vectors` has zero norm.
    """
    projections = np.dot(vectors, query_vec)

    row_lengths = np.linalg.norm(vectors, axis=1)
    query_length = np.linalg.norm(query_vec)

    # Division by a zero norm is undefined, so reject such inputs up front.
    every_norm_nonzero = query_length != 0 and not np.any(row_lengths == 0)
    if not every_norm_nonzero:
        raise ValueError("Cosine similarity is not defined when one or both vectors are zero vectors.")

    return projections / (row_lengths * query_length)


In [None]:
import os
import cohere
from dotenv import load_dotenv

load_dotenv()

model_name = "embed-multilingual-v3.0"
api_key: str | None = os.getenv("COHERE_API_KEY")
# The query is embedded with the "search_query" input type, whereas the slide
# summaries were embedded with "search_document".
input_type_embed = "search_query"

QUERY="Comment gérer un projet de développement de produits ?"

# Now we'll set up the cohere client.
if api_key is None:
    raise ValueError("Please set the COHERE_API_KEY environment variable.")
co = cohere.Client(api_key)

# Get the embeddings
query_embed: list[list[float]] = co.embed(
    texts=[QUERY], model=model_name, input_type=input_type_embed
).embeddings
# A single text was sent, so flatten the response into one 1-D vector.
query_array = np.array(query_embed).reshape(-1)

embeds_dataset = np.load('../data/GES800/embeds/embeddings_GES800.npy')

# Rows of the saved embeddings are used below as indices into `contents`, so
# the two must describe the same records.
if embeds_dataset.shape[0] != len(contents):
    raise ValueError(
        f"Embeddings file has {embeds_dataset.shape[0]} rows but {len(contents)} "
        "records are loaded in `contents`; regenerate the embeddings."
    )

# Let a ValueError (zero-norm vector) propagate: continuing with all-zero
# similarities would rank the slides arbitrarily and present them as relevant.
similarity_results = cosine_similarity_matrix(embeds_dataset, query_array)

# Indices of the records, from most to least similar to the query.
sorted_indices = np.argsort(similarity_results)[::-1]
relevant_contents: list[dict] = [contents[i] for i in sorted_indices[:5]]
print(relevant_contents)
# Preview of the best matches with their scores; slicing keeps this safe when
# fewer than 8 records exist.
for idx in sorted_indices[:8]:
    print(similarity_results[idx], contents[idx])

# -----------------------------------------------
from openai import OpenAI
def extract_tags_function_call(input: str, model: str, max_retries: int = 1):
    """Send `input` to an OpenAI chat model primed as a management teacher.

    Args:
        input: Text sent as the user message.
        model: Name of the chat model to call.
        max_retries: Accepted for callers' convenience; not used by this
            function.

    Returns:
        The chat completion object returned by the API. If the request raises,
        the exception is printed and a string describing the error is returned
        instead, so callers must check the type before reading `.choices`.
    """
    system_prompt = "You are a world class teacher in the field of management and supervision of teams working on projects. You will answer questions asked by students. You will use theory provided to you in the form of slides."
    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": input},
    ]

    # The client is built outside the try block, so construction errors
    # propagate to the caller rather than being turned into a string.
    client = OpenAI()
    try:
        return client.chat.completions.create(
            model=model,
            messages=conversation,
            max_tokens=1000,
        )
    except Exception as e:
        print(e)
        return f"Error generating post with the OpenAI API: {e}"

# Leave the image path out of what is sent to the model. New dicts are built
# instead of popping the key in place: `relevant_contents` holds the same dict
# objects as `contents`, so an in-place pop would also strip the key from the
# loaded dataset.
relevant_contents = [
    {key: value for key, value in d.items() if key != "image_path"}
    for d in relevant_contents
]

input_prompt = f"REMEMBER TO USE THE THEORY GIVEN TO YOU. But not all of it might be relevant to the question. Here is the student's question: {QUERY}\n, and the relevant theory: {relevant_contents}. Give a clear answer that helps the student understand the course material. Cite the sources you used to answer the question. Cite the module and the page you used."
answer = extract_tags_function_call(input_prompt, "gpt-3.5-turbo-0125")
print('\n')
# extract_tags_function_call returns a plain error string instead of a
# completion object when the API call fails; show that message rather than
# failing on `.choices`.
if isinstance(answer, str):
    print(answer)
else:
    print(answer.choices[0].message.content)