# RAG applied to education

#### First of all, mount Google Drive so that the notebook can access its content.

In [1]:
# Mount Google Drive at /content/drive/ so that files under MyDrive can be
# read and written by the cells below.
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


#### Install the dependencies.

In [2]:
!pip install langchain
!pip install openai
!pip install tiktoken
!pip install faiss-gpu
!pip install langchain_experimental
!pip install "langchain[docarray]"
!pip install PyPDF2;

Collecting langchain
  Downloading langchain-0.1.16-py3-none-any.whl (817 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/817.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m276.5/817.7 kB[0m [31m8.2 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m817.7/817.7 kB[0m [31m15.3 MB/s[0m eta [36m0:00:00[0m
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain)
  Downloading dataclasses_json-0.6.4-py3-none-any.whl (28 kB)
Collecting jsonpatch<2.0,>=1.33 (from langchain)
  Downloading jsonpatch-1.33-py2.py3-none-any.whl (12 kB)
Collecting langchain-community<0.1,>=0.0.32 (from langchain)
  Downloading langchain_community-0.0.34-py3-none-any.whl (1.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m50.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting langchain-core<0.2.0,>=0.1.42 (from langchain)
  Downl

#### Import the dependencies.

In [3]:
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import TextLoader
from langchain.vectorstores import DocArrayInMemorySearch
from IPython.display import display, Markdown
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings, HuggingFaceInstructEmbeddings
from langchain.vectorstores import FAISS
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from langchain.indexes import VectorstoreIndexCreator
from langchain_experimental.agents.agent_toolkits.csv.base import create_csv_agent
from langchain.agents.agent_types import AgentType
from langchain.memory import ConversationBufferMemory
import tiktoken
from langchain.text_splitter import CharacterTextSplitter


#### Handle the saving of the API Key of OpenAI

In [None]:
import os
from getpass import getpass

# Prompt for the OpenAI API key without echoing it, so the secret never ends
# up in the notebook's saved output.
api_key = getpass("Please enter your OpenAI API key: ").strip()
if not api_key:
    # Fail here with a clear message instead of later, inside an API call.
    raise ValueError("No OpenAI API key was provided.")

# Set the API key as an environment variable; the OpenAI wrappers used below
# are created without an explicit key.
os.environ["OPENAI_API_KEY"] = api_key

print("OPENAI_API_KEY has been set!")
# Chat model name; the same value gen_chain uses.
llm_model = "gpt-4-turbo"

#### Define a function that reads the PDF file containing the external knowledge base from which RAG retrieves the relevant information. The function loads the text, splits it into chunks, embeds them, and inserts them into a vector store. A second function creates the conversation chain.

In [5]:
# import a PDF file and convert it to text
import os

import PyPDF2


def pdf_to_vectorstore(pdf_path):
    """Build a FAISS vector store from the text of a PDF file.

    The PDF text is extracted page by page, written as a UTF-8 ``.txt`` file
    next to the PDF (same base name), reloaded through ``TextLoader``, split
    into chunks, embedded with ``OpenAIEmbeddings`` and indexed with FAISS.

    Args:
        pdf_path: Path of the PDF file to index.

    Returns:
        The FAISS vector store built from the document chunks.

    Raises:
        ValueError: If no chunk could be produced from the PDF text.
    """
    # Read the file named by the argument, not a notebook-level global.
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        # Pages without extractable text contribute an empty string.
        text = ''.join(page.extract_text() or '' for page in reader.pages)

    # Derive the text dump path from the PDF path so the function does not
    # depend on a global `name` variable.
    txt_file_path = os.path.splitext(pdf_path)[0] + '.txt'
    with open(txt_file_path, 'w', encoding='utf-8') as txt_file:
        txt_file.write(text)

    loader = TextLoader(file_path=txt_file_path, encoding="utf-8")
    data = loader.load()
    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    split_text = text_splitter.split_documents(data)
    if not split_text:
        raise ValueError(f"No text could be extracted from {pdf_path!r}.")
    # Sanity check: report the size of the first chunk.
    print(len(split_text[0].page_content))
    embeddings = OpenAIEmbeddings()
    vectorstore = FAISS.from_documents(split_text, embedding=embeddings)
    return vectorstore

def gen_chain(vectorstore, model_name="gpt-4-turbo", temperature=0.35):
    """Create a conversational retrieval chain over a vector store.

    Args:
        vectorstore: Vector store whose retriever supplies the context
            documents for each question.
        model_name: Name of the OpenAI chat model to use.
        temperature: Sampling temperature passed to the chat model.

    Returns:
        A ``ConversationalRetrievalChain`` that answers questions from the
        retrieved documents and records the exchange in a buffer memory.
    """
    llm = ChatOpenAI(temperature=temperature, model_name=model_name)
    # The memory stores previous turns under 'chat_history' as message objects.
    memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)
    return ConversationalRetrievalChain.from_llm(
        llm=llm,
        chain_type="stuff",
        retriever=vectorstore.as_retriever(),
        memory=memory,
    )

#### Next, a document from a university course called "Advanced Operating Systems (AOS)" is taken as input and stored in a vector store.

In [None]:
# Base name shared by the course PDF and its extracted-text dump on Google Drive.
name = "aos"
pdf_file_path = f"/content/drive/MyDrive/{name}.pdf"
# Extract, chunk, embed and index the PDF content.
vectorstore = pdf_to_vectorstore(pdf_file_path)

#### The conversation chain for the knowledge base contained in the vector store is created.

In [None]:
# Create the conversation chain: a retrieval-backed chat over `vectorstore`
# that is reused by every query in the examples below.
conversation_chain = gen_chain(vectorstore)

## Example 1:

#### The goal is to create a mind map of a specific knowledge base. The first step is a prompt that tells the LLM to keep the same JSON structure across all of its outputs, so that the resulting mind maps stay consistent.

#### Then the LLM is asked to summarise the knowledge base into a mind map as JSON.

In [8]:
# First turn: ask the model to keep a consistent JSON structure. Only its
# effect on the chat history matters, so the answer is discarded.
query = "Be always consistent with the JSON representation of data you provide me with."
result = conversation_chain.invoke({"question": query})

# Second turn: request the mind map itself.
# `.invoke` replaces the deprecated direct call of the chain object.
query = "Summarize the file into a mindmap, generating it in JSON format. Clearly separate different topics."
result = conversation_chain.invoke({"question": query})
answer = result["answer"]

#### The JSON object is then extracted from the answer, parsed, and printed as a tree.

In [10]:
import json

# Keep only the JSON object: everything from the first '{' to the last '}',
# dropping any text the model put around it.
# NOTE: the name `map` shadows the Python builtin; it is kept because the rest
# of this cell reads it.
map = answer[answer.index('{') : answer.rindex('}') + 1]
# Print after the assignment so the extracted JSON is shown, not the builtin.
print(map)
def print_json_tree(data, indent=0):
    """Print a parsed JSON value as an indented tree.

    Dictionary keys are printed at the current depth and their values one
    level deeper. List items are printed at the same depth as the list
    itself. Any other value is printed as a leaf.

    Args:
        data: Parsed JSON value (dict, list or scalar).
        indent: Current depth; each level adds four spaces of indentation.
    """
    branch = '    ' * indent + '└── '

    if isinstance(data, dict):
        for label, child in data.items():
            print(branch + str(label))
            print_json_tree(child, indent + 1)
        return

    if isinstance(data, list):
        for element in data:
            print_json_tree(element, indent)
        return

    print(branch + str(data))

# Parse the extracted JSON text into Python objects.
data = json.loads(map)

# Render the parsed mind map as an indented tree, starting at depth 0.
print_json_tree(data,0)

## Example 2:

#### Profiling is performed by collecting various pieces of information about a person. By passing this profile to the RAG pipeline and the LLM, it is possible to create a mind map in JSON related to the user's profile.

In [None]:
# PROFILE: learner attribute, to be discussed at the beginning of the conversation.
age = 20
# NOTE: the prompt is loosely worded (original note: "LASCO").
query = f"Suggest a method for a {age}-year-old student to effectively learn what the file is about. Return a list, with bulletpoints and comments, in JSON."
# `.invoke` replaces the deprecated direct call of the chain object.
result = conversation_chain.invoke({"question": query})
answer = result["answer"]
print(answer)

## Example 3:

#### RAG is used to ask the LLM for a specific learning path through the knowledge base (the AOS university course), based on information such as the user's current enrollment, which in this case is university.

In [11]:
# LEARNING JOURNEY --> POSSIBLE FUTURE EXTENSION: integrate texts about learning
# disorders and build tailor-made learning journeys for students with difficulties.
# We did prompt engineering to ensure the generation of answers in JSON format. This is very important to realize the integration.
schoolOrJob = "university"
# The enrollment is interpolated so the profile variable actually drives the
# prompt; with "university" the text is identical to the previous literal.
query = f"Propose a learning journey for a {schoolOrJob} student to learn about the topics in the document. Specify timeline information. Assume the student has little or no prior knowledge about the topics in the document. Return the result in a JSON."
# `.invoke` replaces the deprecated direct call of the chain object.
result = conversation_chain.invoke({"question": query})
answer = result["answer"]
print(answer)

  warn_deprecated(


```json
{
  "learning_journey": {
    "duration": "2 academic years",
    "semesters": [
      {
        "semester": 1,
        "duration": "6 months",
        "courses": [
          {
            "course_title": "Introduction to Computer Science",
            "description": "Covers basic concepts of computing, programming fundamentals, and an introduction to algorithms.",
            "topics": [
              "Basic programming",
              "Data structures",
              "Algorithms"
            ]
          },
          {
            "course_title": "Introduction to Operating Systems",
            "description": "Introduces basic concepts of operating systems, including process management, memory management, and file systems.",
            "topics": [
              "Process management",
              "Memory management",
              "File systems"
            ]
          }
        ]
      },
      {
        "semester": 2,
        "duration": "6 months",
        "courses": [
   

## Example 4:

#### In this example, the prompt names a specific element of the knowledge base and asks the LLM to generate exercises about it.

In [12]:
# EXERCISE GENERATION
# Topic of the knowledge base the exercises should cover.
context = "concurrency primitives in operating systems"
query = f"Test my knowledge about {context}. Propose some exercise, possibly. Return the result in a JSON."
# `.invoke` replaces the deprecated direct call of the chain object.
result = conversation_chain.invoke({"question": query})
answer = result["answer"]
print(answer)

Certainly! Below are some exercises to test your knowledge about concurrency primitives in operating systems. Each exercise includes a question and a set of possible answers. The correct answer is indicated in the JSON format provided after the exercises.

### Exercises

1. **What is a semaphore used for in an operating system?**
   - A) To allocate memory efficiently.
   - B) To manage access to shared resources by multiple processes.
   - C) To increase the priority of a process.
   - D) To send messages between processes.

2. **Which of the following is NOT a typical use of mutexes?**
   - A) To ensure that only one thread accesses a critical section at a time.
   - B) To prevent deadlock between processes.
   - C) To count the number of times a resource is used.
   - D) To control access to a shared resource like a file or memory segment.

3. **What is a deadlock in the context of operating systems?**
   - A) When the system is locked down for security reasons.
   - B) When two or 

## Example 5:

#### The new example is based on asking the LLM to find extra resources related to the knowledge base.

In [13]:
# ADDITIONAL RESOURCES
# Plain string: the prompt has no placeholders, so no f-string is needed.
query = "Provide extra resources, possibly searching the internet for them, to help a student learn about the topics of the document. Return the result in a JSON."
# `.invoke` replaces the deprecated direct call of the chain object.
result = conversation_chain.invoke({"question": query})
answer = result["answer"]
print(answer)

```json
{
  "resources": [
    {
      "title": "Operating Systems: Three Easy Pieces",
      "author": "Remzi H. Arpaci-Dusseau and Andrea C. Arpaci-Dusseau",
      "description": "A comprehensive textbook covering the fundamentals of operating systems, including concurrency and memory management.",
      "url": "http://pages.cs.wisc.edu/~remzi/OSTEP/"
    },
    {
      "title": "Introduction to Operating Systems",
      "platform": "MIT OpenCourseWare",
      "description": "This course teaches the basic operating system abstractions, mechanisms, and their implementations.",
      "url": "https://ocw.mit.edu/courses/electrical-engineering-and-computer-science/6-828-operating-system-design-and-implementation-fall-2012/"
    },
    {
      "title": "Linux Kernel Teaching",
      "author": "Robert Love",
      "description": "This is a collection of lectures and labs Linux kernel topics. The lectures focus on theoretical and Linux kernel exploration.",
      "url": "https://github.com/

## Example 6:

#### The LLM is asked to create some specific exercises regarding the knowledge base.

In [16]:
# TAKE ONE STEP OF THE LEARNING JOURNEY AND GENERATE AN EXAMPLE EXERCISE
context = "Advanced Topics and Practical Implementation of Operating Systems"
difficulty = "hard"
query = f"Test my knowledge about {context}. Propose some exercise with difficulty {difficulty}. Be very specific in the task description. Return the result in a JSON."
# The answer is vague; ask for more depth with a follow-up prompt if needed
# --> EXTENSION: add buttons to request a given level of detail and clarifications.
# `.invoke` replaces the deprecated direct call of the chain object.
result = conversation_chain.invoke({"question": query})
answer = result["answer"]
# Keep the generated exercises under a dedicated name.
exercises = answer
print(answer)

Certainly! Here are some challenging exercises related to advanced topics in operating systems, formatted in JSON:

```json
{
  "exercises": [
    {
      "topic": "Concurrency and Synchronization",
      "description": "Implement a multithreaded program using POSIX threads that simulates the dining philosophers problem. Ensure that your solution avoids both deadlock and starvation. Use mutexes and condition variables to synchronize access to resources (forks).",
      "tasks": [
        "Create a structure to represent each philosopher and fork.",
        "Implement functions for philosopher actions: think, pick up forks, eat, and put down forks.",
        "Ensure proper synchronization to avoid deadlock and starvation.",
        "Test the program with different numbers of philosophers and varying think/eat times to ensure robustness."
      ]
    },
    {
      "topic": "Memory Management",
      "description": "Write a simple memory allocator in C that mimics the behavior of 'malloc