In [8]:
%load_ext autoreload
%autoreload 2

import os
import sys
from pathlib import Path
import dotenv
from llama_index.core.node_parser import SentenceSplitter

# Add the parent directory to the import path so the `src` package below can be imported.
sys.path.append('../')

import src.utils as ut
import src.qa_gen as qag

# Load environment variables from `profile.env` in the user's home directory.
# `Path.home()` is used instead of `os.environ['HOME']` so the lookup does not
# raise KeyError on platforms where HOME is not set.
# `override=True` lets values from the file replace variables already set in the environment.
dotenv.load_dotenv(Path.home() / 'profile.env', override=True)


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


True

In [4]:
# Groq-hosted model used to generate the questions.
# Other models to try here:
# https://console.groq.com/docs/models
GROQ_MODEL = "llama-3.1-70b-versatile"
# GROQ_MODEL = "llama-3.1-70b-specdec"

# Build the question generator used by the smoke test in the following cells.
# `api_keys_var` is the name of the environment variable the API keys are read from.
# `prompt_ver` presumably selects the prompt template (the init output prints a Spanish one) - confirm in src.qa_gen.
qa_generator = qag.QAGenerator(
                    api_keys_var="GROQ_API_KEYS",
                    prompt_ver="spa-v0",
                    groq_model=GROQ_MODEL,
                    pause_secs=1.1, # No more than 60 reqs per minute.
                    cache_enabled=False,
                    temperature=0.1
                )

Returning secret from environment variable `GROQ_API_KEYS`=`gs...a9`
6 Groq api keys loaded.
QAGenerator.init: prompt_tmpl=
'''
Por favor genera una pregunta de opción múltiple (con una única respuesta correcta y 3 respuestas incorrectas) a partir del fragmento de texto que te doy más abajo.
La pregunta NO DEBE ser sobre números o títulos de artículos o capítulos.

Es decir, el formato que esperamos es:

```
Pregunta: <pregunta aquí, no más de 10 o 15 palabras y termina en signo de interrogación>
Respuesta correcta: <respuesta correcta aquí>
Respuesta incorrecta 1: <respuesta incorrecta 1 aquí>
Respuesta incorrecta 2: <respuesta incorrecta 2 aquí>
Respuesta incorrecta 3: <respuesta incorrecta 3 aquí>
```

El texto a partir del cuál debes generar la pregunta es:
```
{chunk}
```
Pregunta: '''


In [5]:
# Sample English chunk; not used by the call at the end of this cell.
chunk_eng = """
The corporation shall be dissolved:
For reasons stipulated by law.
By decision of the Superior Council, taken by two-thirds of its ordinary councilors in office, in two sessions of the Superior Council that must take place with an interval of no less than thirty calendar days, when in their judgment the Institution is unable to fulfill its purposes.
"""

# Sample Spanish chunk used to smoke-test the Spanish prompt.
chunk_spa="""
La Universidad tendrá un revisor fiscal, designado por el Consejo Superior para períodos de dos años. La función de revisor fiscal deberá ser ejercida por una sociedad profesional del ramo.
"""

# Generate a single question; verbose=True prints the filled-in prompt and the raw model answer
# (see the recorded cell output).
generated_qa = qa_generator.gen_question(chunk_spa, verbose=True)



Por favor genera una pregunta de opción múltiple (con una única respuesta correcta y 3 respuestas incorrectas) a partir del fragmento de texto que te doy más abajo.
La pregunta NO DEBE ser sobre números o títulos de artículos o capítulos.

Es decir, el formato que esperamos es:

```
Pregunta: <pregunta aquí, no más de 10 o 15 palabras y termina en signo de interrogación>
Respuesta correcta: <respuesta correcta aquí>
Respuesta incorrecta 1: <respuesta incorrecta 1 aquí>
Respuesta incorrecta 2: <respuesta incorrecta 2 aquí>
Respuesta incorrecta 3: <respuesta incorrecta 3 aquí>
```

El texto a partir del cuál debes generar la pregunta es:
```

La Universidad tendrá un revisor fiscal, designado por el Consejo Superior para períodos de dos años. La función de revisor fiscal deberá ser ejercida por una sociedad profesional del ramo.

```
Pregunta: 

=== GENERATED QUESTION AND ANSWERS ====
Pregunta: ¿Quién designa al revisor fiscal de la Universidad?
Respuesta correcta: El Consejo Superior
R

In [6]:
# Parse the raw model output from the previous cell into a record with the keys
# 'question', 'correct_answer' and 'incorrect_answers' (see the recorded output).
in_rec = qag.parse_generated_question(generated_qa=generated_qa, match_strs=qa_generator.match_strs)
# Bare last expression so the notebook displays the parsed record.
in_rec

{'question': '¿Quién designa al revisor fiscal de la Universidad?',
 'correct_answer': ' El Consejo Superior',
 'incorrect_answers': ['El Rector de la Universidad',
  'El Ministerio de Educación',
  'La Asamblea General de la Universidad']}

In [9]:
# Authenticate against the Hugging Face Hub before loading the tokenizer below
# (presumably the model repository is gated - confirm).
ut.login_to_hf_hub()

from transformers import AutoTokenizer
# Tokenizer of the model named by `ut.LLAMA_MODEL_ID`; passed to `show_chunks` later
# to count tokens per chunk.
TOKENIZER = AutoTokenizer.from_pretrained(ut.LLAMA_MODEL_ID)


Returning secret from environment variable `HF_API_KEY`=`hf...Gk`
The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /home/teo/.cache/huggingface/token
Login successful


In [15]:
import src.utils as ut
import json
from llama_index.core import SimpleDirectoryReader, Document
from llama_index.core.schema import TextNode

def get_chunks(documents: list[Document], splitter: SentenceSplitter) -> list[list[str]]:
    """Split every document into text chunks and print a size summary per document.

    Args:
        documents: Loaded documents; each must expose ``text`` and a ``metadata``
            mapping that contains the key ``"file_path"``.
        splitter: Object whose ``split_text(text)`` returns the chunks of a text.

    Returns:
        One entry per input document, in input order. Each entry is the list
        returned by ``splitter.split_text`` for that document, so the result is
        nested rather than flattened.
    """
    text_chunks = []

    for doc in documents:
        doc_chunks = splitter.split_text(doc.text)
        # `metadata` is a mapping, so the path has to be read by key, not as an attribute.
        print(f"doc: {doc.metadata['file_path']},  len(text)={len(doc.text)} n_chunks: {len(doc_chunks)}")
        text_chunks.append(doc_chunks)

    return text_chunks


def generate_qas_files(qa_generator: qag.QAGenerator,
                      input_paths: list[Path],
                      splitter: SentenceSplitter,
                      out_fpath: Path,
                      limit: int = None,
                      log_path: Path = Path("generate_qas_files.log.txt")):
    """Generate one multiple-choice question per chunk and save them as JSON lines.

    Every file in ``input_paths`` is loaded with ``SimpleDirectoryReader`` and split
    into chunks with ``splitter``; each chunk is passed to
    ``qa_generator.gen_question``. The chunk and the raw generated text are
    written to ``log_path``; the parsed and enriched record is written to
    ``out_fpath`` as one JSON object per line.

    Record ids have the form ``<key>-chunk<i>``, where ``<key>`` is the lower-cased
    part of the source file name before its first ``-`` and ``<i>`` is the chunk
    index within that document.

    Args:
        qa_generator: Generator providing ``gen_question(chunk, verbose=...)`` and
            the ``match_strs`` used to parse its output.
        input_paths: Source files to read.
        splitter: Object whose ``split_text(text)`` returns the chunks of a text.
        out_fpath: Destination ``.jsonl`` file; overwritten if it already exists.
        limit: Maximum number of chunks processed per document. ``None`` (or ``0``)
            means no limit.
        log_path: File receiving every chunk and its raw generated text;
            overwritten if it already exists.
    """
    documents = SimpleDirectoryReader(input_files=input_paths).load_data()

    # Explicit UTF-8: the log receives the raw (accented) chunk text, which would
    # fail to encode under a non-UTF-8 default locale.
    with out_fpath.open("wt", encoding="utf-8") as f_out, \
            log_path.open("wt", encoding="utf-8") as log:
        for doc in documents:
            fpath = Path(doc.metadata["file_path"])
            src_file_key = fpath.name.split('-')[0].lower()

            chunks = splitter.split_text(doc.text)
            print(f"doc: {fpath.name} n_chunks: {len(chunks)}")

            for i, src_chunk in enumerate(chunks):
                # `>=` so that exactly `limit` chunks (indices 0..limit-1) are processed.
                if limit and i >= limit:
                    print(f"LIMIT ({limit}) REACHED - breaking out ")
                    break

                # Word count is only informational (logged); it splits on single spaces.
                n_words = len(src_chunk.split(" "))
                generated_qa = qa_generator.gen_question(src_chunk, verbose=False)

                log.write(f"\n**************** CHUNK: {i}   n_chars: {len(src_chunk)} n_words: {n_words}\n"
                          f"{src_chunk}\n\n****genreated_qa:\n{generated_qa}\n")

                qa_record = qag.parse_generated_question(generated_qa=generated_qa,
                                                         match_strs=qa_generator.match_strs)
                example_id = f"{src_file_key}-chunk{i}"
                out_rec = qag.enrich_generated_qa(qa_record, src_chunk, example_id)
                print(json.dumps(out_rec), file=f_out)


def show_chunks(input_paths: list[Path], splitter, tokenizer):
    """Print, for each input file, the chunk count and average chunk sizes.

    Each file is loaded with ``SimpleDirectoryReader`` and split with ``splitter``.
    For every document the average number of characters, words (split on single
    spaces) and tokens per chunk is printed. A document that yields no chunks is
    reported with ``n_chunks: 0`` instead of raising ``ZeroDivisionError``.

    Args:
        input_paths: Source files to read.
        splitter: Object whose ``split_text(text)`` returns the chunks of a text.
        tokenizer: Callable returning a mapping with an ``'input_ids'`` sequence
            for a given string; its length is taken as the token count.
    """
    documents = SimpleDirectoryReader(input_files=input_paths).load_data()

    for doc in documents:
        fpath = Path(doc.metadata['file_path'])
        chunks = splitter.split_text(doc.text)
        n_chunks = len(chunks)

        if n_chunks == 0:
            # Nothing to average for this document.
            print(f"doc: {fpath.name}\n\t\tn_chunks: 0\n")
            continue

        avg_nchars = sum(len(chunk) for chunk in chunks) / n_chunks
        avg_nwords = sum(len(chunk.split(" ")) for chunk in chunks) / n_chunks
        avg_ntokens = sum(len(tokenizer(chunk)['input_ids']) for chunk in chunks) / n_chunks
        print(f"doc: {fpath.name}\n\t\tn_chunks: {n_chunks}  avg_nchars:{avg_nchars:.0f}  avg_nwords:{avg_nwords:.0f}   avg_ntokens:{avg_ntokens:.0f}\n")

# Base names (without the ".txt" extension) of the two preprocessed source documents in ../data.
file_preffix1 = "Estatutos-Universidad-de-los-Andes-2020-ratificados-MEN-RQ.preprocessed2"
file_preffix2 = "reglamento-maestria-web-2024.preprocessed2"
input_paths =  [ Path(f"../data/{file_preffix1}.txt"), Path(f"../data/{file_preffix2}.txt") ]

# Splitter settings; these values are also embedded in the output file name of the generation cell.
# Per the llama_index docs SentenceSplitter measures both in tokens - TODO confirm for the installed version.
# NOTE(review): an overlap of 360 out of 400 makes consecutive chunks share most of their text -
# confirm this is intentional.
chunk_size = 400
chunk_overlap = 360

text_splitter = SentenceSplitter(chunk_size=chunk_size,
                                 chunk_overlap=chunk_overlap)

# Print per-document chunk statistics for the chosen splitter settings.
show_chunks(input_paths, text_splitter, TOKENIZER)

doc: Estatutos-Universidad-de-los-Andes-2020-ratificados-MEN-RQ.preprocessed2.txt
		n_chunks: 283  avg_nchars:1523  avg_nwords:228   avg_ntokens:386

doc: reglamento-maestria-web-2024.preprocessed2.txt
		n_chunks: 352  avg_nchars:1484  avg_nwords:220   avg_ntokens:383



In [18]:
# Prompt version; also embedded in the output file name of the generation cell.
PROMPT_VER = "spa-v0"
# Generator used for the full run; this rebinds `qa_generator` from the earlier smoke-test cell.
# `api_keys_var` is the name of the environment variable the API keys are read from.
qa_generator = qag.QAGenerator(
    api_keys_var="GROQ_API_KEYS",
    prompt_ver=PROMPT_VER,
    groq_model=GROQ_MODEL,
    pause_secs=1.1, # No more than 60 reqs per minute.
    cache_enabled=False,
    temperature=0.1
)

Returning secret from environment variable `GROQ_API_KEYS`=`gs...a9`
6 Groq api keys loaded.
QAGenerator.init: prompt_tmpl=
'''
Por favor genera una pregunta de opción múltiple (con una única respuesta correcta y 3 respuestas incorrectas) a partir del fragmento de texto que te doy más abajo.
La pregunta NO DEBE ser sobre números o títulos de artículos o capítulos.

Es decir, el formato que esperamos es:

```
Pregunta: <pregunta aquí, no más de 10 o 15 palabras y termina en signo de interrogación>
Respuesta correcta: <respuesta correcta aquí>
Respuesta incorrecta 1: <respuesta incorrecta 1 aquí>
Respuesta incorrecta 2: <respuesta incorrecta 2 aquí>
Respuesta incorrecta 3: <respuesta incorrecta 3 aquí>
```

El texto a partir del cuál debes generar la pregunta es:
```
{chunk}
```
Pregunta: '''


In [19]:
# Full run over both source documents (one model request per chunk).
# The output file name records the prompt version and the splitter settings used.
# NOTE: the recorded output of this cell shows HTTP 429 rate-limit errors from Groq during the run.
generate_qas_files(
    qa_generator,
    splitter=text_splitter,
    input_paths=[ Path(f"../data/{file_preffix1}.txt"), Path(f"../data/{file_preffix2}.txt") ],
    out_fpath=Path(f"../data/both-files.spanish.{PROMPT_VER}-ch{chunk_size}-ol{chunk_overlap}.qa.jsonl"),
    limit=None,
)


doc: Estatutos-Universidad-de-los-Andes-2020-ratificados-MEN-RQ.preprocessed2.txt n_chunks: 283
doc: reglamento-maestria-web-2024.preprocessed2.txt n_chunks: 352
Rate limit error: Error code: 429 - {'error': {'message': 'Rate limit reached for model `llama-3.1-70b-versatile` in organization `org_01j5jtd6hnfm6vdxwjtzjsegay` on : Limit 200000, Used 200001, Requested 659. Please try again in 4m45.465s. Visit https://console.groq.com/docs/rate-limits for more information.', 'type': '', 'code': 'rate_limit_exceeded'}} retrying with new client: 1
Rate limit error: Error code: 429 - {'error': {'message': 'Rate limit reached for model `llama-3.1-70b-versatile` in organization `org_01hygc9mqtff6bvxz0jf2g6cs7` on : Limit 200000, Used 200168, Requested 490. Please try again in 4m44.27s. Visit https://console.groq.com/docs/rate-limits for more information.', 'type': '', 'code': 'rate_limit_exceeded'}} retrying with new client: 2


In [None]:
# NOTE(review): `generate_qas_file` (singular) is not defined in any cell shown in this notebook,
# and its keyword arguments (input_path, src_file_key, start_idx) do not match `generate_qas_files`.
# Presumably a leftover from an earlier, line-based version - on a fresh kernel this cell would
# raise NameError. Confirm and remove or update.
generate_qas_file(
    qa_generator,
    input_path = Path("../data/reglamento-maestria-web-2024.preprocessed2.txt"),
    src_file_key = "maestria",
    start_idx = 130,
    out_fpath = Path("../data/reglamento-maestria-web-2024.preprocessed2.qa.jsonl")
)


687 lines
   0 - start: 130 end: 134 len:372 '\n## DEFINITIONS\n\nUniversidad de los Andes is '...' well as their commitment to the environment.'

Please generate one multiple choice question, a correct answer for it and three (3) incorrect answers, based on the following text:
```

## DEFINITIONS

Universidad de los Andes is an autonomous, independent, and innovative institution that fosters pluralism, tolerance, and respect for ideas; it seeks academic excellence and provides its students with a critical and ethical education to strengthen their awareness of their social and civic responsibilities, as well as their commitment to the environment.
```
The question should be short, i.e. between about 8 and 15 words long and end on question mark, and should be contained in a SINGLE line
DO NOT preface the question with any introductory remark, except "QUESTION:"
The question should not be about chapter/section/article numbers or titles.
You should return the question in the first line, f

In [None]:
# Remove cached entries whose key contains a given text fragment.
# The matching keys are collected into a list first, so nothing is deleted while iterating the cache.
# NOTE(review): assumes `qa_generator.cache` is a mapping with string keys - confirm in src.qa_gen.
keys = [ key for key in qa_generator.cache.keys() if 'When a second call is necessary,' in key]
# Number of matching entries about to be deleted.
print(len(keys))
for k in keys:
    del qa_generator.cache[k]

0


In [None]:
# Hand-written model output used to exercise the parser on awkward input: an introductory
# sentence before "QUESTION:" and a stray leading space before "INCORRECT ANSWER 2".
# This rebinds `generated_qa` from the earlier cell.
generated_qa = """Here's the generated question based on the provided text.

QUESTION: What type of relationships does the institution develop?
CORRECT ANSWER: academic, scientific, artistic
INCORRECT ANSWER 1: Only financial relationships
 INCORRECT ANSWER 2: No relationships whatsoever
INCORRECT ANSWER 3: Sports relationships
"""

# NOTE(review): called without `match_strs`, unlike the other calls in this notebook - confirm the
# parameter has a default. In the recorded output the introductory sentence ends up inside 'question'.
qag.parse_generated_question(generated_qa)

{'question': "Here's the generated question based on the provided text. What type of relationships does the institution develop?",
 'correct_answer': 'academic, scientific, artistic',
 'incorrect_answers': ['Only financial relationships',
  'No relationships whatsoever',
  'Sports relationships']}