In [1]:
import os
import wget
import PyPDF2
from dotenv import load_dotenv
from langchain.text_splitter import SpacyTextSplitter
from spacy.matcher import PhraseMatcher

import spacy
import re

load_dotenv()

from langchain.agents import load_tools, Tool
from langchain.utilities.google_search import GoogleSearchAPIWrapper
from langchain.agents import initialize_agent
from langchain.llms import OpenAI
from langchain.embeddings import HuggingFaceEmbeddings
from random import sample

from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.chains.summarize import load_summarize_chain
from langchain.chains.question_answering import load_qa_chain

from langchain.prompts import PromptTemplate

from langchain.chains import LLMRequestsChain, LLMChain

from langchain.docstore.document import Document


# First, let's load the language model we're going to use to control the agent.
llm = OpenAI(temperature=0)


google_search = GoogleSearchAPIWrapper()




def select_snippets_with_keywords(keyword: str, text: str, window_size: int = 500):
    # Window size is in characters
    all_keyword_indexes = [m.start() for m in re.finditer(f"({keyword}).*[\\.]", text)]
    output = []
    for index in all_keyword_indexes:
        left_window = max(0, index - window_size)
        right_window = min(len(text), index + window_size)
        snippet = text[left_window:right_window]
        output.append(snippet)
    
    return output

def get_keywords(query):
    template = """Select at most 3 most meaningful keywords in lowercase and separated by a comma from this text : {text}"""
    prompt = PromptTemplate(template=template, input_variables=["text"])
    get_keywords_chain = LLMChain(prompt=prompt, llm=OpenAI(temperature=0), verbose=True)
    
    return [k.strip() for k in get_keywords_chain.run(query).split(",")]


def google_search_about_education(query: str) -> str:
    keywords = get_keywords(query)
    print(keywords)

    r = google_search._google_search_results(
        f"{query} filetype:pdf site:eduscol.education.fr"
    )
    if len(r) == 0:
        return "No link"
    else:
        all_docs = []
        embeddings = HuggingFaceEmbeddings()
        vectorstore = None
        text_splitter = SpacyTextSplitter.from_tiktoken_encoder(
            chunk_size=2000, chunk_overlap=0, pipeline="fr_core_news_sm"
        )

        all_relevant_snippets = []

        for result in r[:3]:
            pdf_url = result["link"]  # Take the URL of the first result

            # Download file
            wget.download(url=pdf_url, out="data/result.pdf")
            document_text = ""
            with open("data/result.pdf", "rb") as pdf_file:
                # creating a pdf reader object
                pdf_reader = PyPDF2.PdfReader(pdf_file)
                for page in pdf_reader.pages:
                    # extracting text from page
                    document_text += page.extract_text()
            os.remove("data/result.pdf")

            relevant_snippets = []

            snippets_with_keyword = []
            for keyword in keywords:
                snippets_with_keyword = select_snippets_with_keywords(keyword, document_text, window_size=500)
            snippets_with_keyword = text_splitter.split_text("\n".join(snippets_with_keyword))

            min_snippets = 1
            max_snippets = 3
            max_vector_store = 50

            if len(snippets_with_keyword) > max_snippets:
                if len(snippets_with_keyword) > max_vector_store:
                    snippets_with_keyword = sample(snippets_with_keyword, max_vector_store)
                vectorstore = FAISS.from_texts(snippets_with_keyword, embeddings)
                relevant_snippets = vectorstore.similarity_search(query, max_snippets)
            if len(snippets_with_keyword) < min_snippets: 
                all_snippets = text_splitter.split_text(document_text)
                random_snippets = sample(all_snippets, max_vector_store - len(snippets_with_keyword))
                vectorstore = FAISS.from_texts(snippets_with_keyword, embeddings)
                relevant_snippets = text_splitter.create_documents(snippets_with_keyword) 
                relevant_snippets += vectorstore.similarity_search(query, max_snippets - len(snippets_with_keyword))

            all_relevant_snippets += relevant_snippets

        # Select the most relevant snippets from the collection
        vectorstore = FAISS.from_documents(all_relevant_snippets, embeddings)
        docs = vectorstore.similarity_search(query, k=3)
        # Reply to the question
        chain = load_qa_chain(llm, chain_type="stuff", verbose=True)
        summary = chain.run({"question": query, "docs": docs})
        return summary


google_search_education = Tool(
    "Google Search about education",
    google_search_about_education,
    "A wrapper around Google Search, that returns only extracts of pdf about Education from the French government. Useful for when you need to answer questions about teaching. Input should be a short question in French.",
)

tools = [google_search_education]


# Finally, let's initialize an agent with the tools, the language model, and the type of agent we want to use.
agent = initialize_agent(tools, llm, agent="zero-shot-react-description", verbose=True)

agent.run("quelle activité organiser pour favoriser apprentissage trandisciplinaire ?")


  from .autonotebook import tqdm as notebook_tqdm




[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m I should look for activities that promote interdisciplinary learning
Action: Google Search about education
Action Input: activities to promote interdisciplinary learning[0m

[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mSelect at most 3 most meaningful keywords separated by a comma from this text : activities to promote interdisciplinary learning[0m

[1m> Finished chain.[0m
['Interdisciplinary', 'Learning', 'Activities']


ValueError: Sample larger than population or is negative

In [None]:
from 

In [36]:
agent.run("autonomie en CE1")



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m I need to find out what autonomy means in CE1
Action: Google Search about education
Action Input: autonomie CE1[0m

Created a chunk of size 311, which is longer than the specified 200
Created a chunk of size 208, which is longer than the specified 200
Created a chunk of size 575, which is longer than the specified 200
Created a chunk of size 339, which is longer than the specified 200
Created a chunk of size 259, which is longer than the specified 200
Created a chunk of size 506, which is longer than the specified 200
Created a chunk of size 283, which is longer than the specified 200
Created a chunk of size 236, which is longer than the specified 200
Created a chunk of size 265, which is longer than the specified 200
Created a chunk of size 210, which is longer than the specified 200
Created a chunk of size 296, which is longer than the specified 200
Created a chunk of size 244, which is longer than the specified 200
Created a chunk of size 226, which is longer than the specified 200
Created a chunk of size 285, which is longer than the specified 200
Created a chunk of size 327, which is longer tha


Observation: [36;1m[1;3mSi le travail en autonomie permet une meilleure prise en compte de l’hétérogénéité des élèves, il requiert 
la vigilance du professeur.

Il n’est pas envisageable que des élèves de cours préparatoire soient livrés à eux-mêmes plus d’une quinzaine de minutes.

Le professeur ne peut s’isoler avec un groupe pendant un trop long moment, laissant les autres élèves, inoccupés ou bloqués par les obstacles rencontrés.


Certaines tâches ne peuvent être réalisées sans l’aide du professeur. | Les automatismes en écriture (copie, dictée, rédaction) s’obtiennent en privilégiant 
la répétition au  travers d’entraînements très réguliers, notamment lors d’activités 
ritualisées (dictée quotidienne, autodictée, phrase du jour, « jogging d’écriture »

).




Toute séance portant sur une nouvelle notion ne doit pas faire l’économie, en guise 
d’introduction, du brassage des acquis. | Le recours à cette 
classification doit devenir automatique pour amener les élèves à réfléchir

"Pour permettre aux élèves de CE1 d'atteindre l'autonomie, le professeur doit leur offrir un environnement adapté et structuré, avec des activités variées et des entraînements réguliers qui alternent des temps dirigés et des temps de travail en autonomie. Des outils d'aide et des jeux pédagogiques peuvent également être mis en place pour faciliter l'apprentissage."

In [1]:
from langchain.text_splitter import CharacterTextSplitter

text_splitter = CharacterTextSplitter.from_tiktoken_encoder(chunk_size=100, chunk_overlap=0)
texts = text_splitter.split_text(state_of_the_union)

  from .autonotebook import tqdm as notebook_tqdm
Downloading: 100%|██████████| 1.02k/1.02k [00:00<00:00, 630kB/s]
Downloading:  38%|███▊      | 424M/1.11G [12:00<23:39, 485kB/s]     

KeyboardInterrupt: 

Downloading:  38%|███▊      | 425M/1.11G [12:17<23:39, 485kB/s]