In [1]:
import re
import os
import json
import torch
import spacy
import string

from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer

In [2]:
%%capture
# Download the small spaCy pipeline package for each language code that
# generate_question_answer_pairs can load with spacy.load (en, zh, nl, fr, de,
# ja, pl, ru, es). %%capture keeps the downloader's console output out of the
# notebook.
! spacy download en_core_web_sm
! spacy download zh_core_web_sm
! spacy download nl_core_news_sm
! spacy download fr_core_news_sm
! spacy download de_core_news_sm
! spacy download ja_core_news_sm
! spacy download pl_core_news_sm
! spacy download ru_core_news_sm
! spacy download es_core_news_sm

In [3]:
# spaCy pipeline package to load for each supported language code.
_SPACY_MODEL_NAMES = {
    'en': 'en_core_web_sm',   # English
    'zh': 'zh_core_web_sm',   # Chinese
    'nl': 'nl_core_news_sm',  # Dutch
    'fr': 'fr_core_news_sm',  # French
    'de': 'de_core_news_sm',  # German
    'ja': 'ja_core_news_sm',  # Japanese
    'pl': 'pl_core_news_sm',  # Polish
    'ru': 'ru_core_news_sm',  # Russian
    'es': 'es_core_news_sm',  # Spanish
}

# The pipelines do not share one label scheme: the OntoNotes-style ones
# (en, zh, nl, ja) tag people as PERSON, the WikiNER-style ones (de, fr, es, ru)
# as PER, and the Polish pipeline as persName.
_PERSON_LABELS = frozenset({'PERSON', 'PER', 'persName'})


def _read_text_files(paths):
    """Return the concatenated contents of every path ending in 'txt'."""
    parts = []
    for path in paths:
        if path.endswith('txt'):
            # Context manager closes the handle; UTF-8 avoids locale-dependent decoding.
            with open(path, encoding='utf-8') as handle:
                parts.append(handle.read())
    return ''.join(parts)


def _split_fixed(text, size):
    """Split text into consecutive slices of at most `size` characters."""
    return [text[start:start + size] for start in range(0, len(text), size)]


def _find_person_names(text, ner_models, chunk_length):
    """Return the distinct person names found by the NER models, longest first."""
    chunks = _split_fixed(text, chunk_length)  # original author's note: limit is around 49K bytes
    names = set()
    for ner_model in ner_models:
        for chunk in tqdm(chunks, total=len(chunks)):
            for ent in ner_model(chunk).ents:
                if ent.label_ in _PERSON_LABELS:
                    names.add(str(ent))
    names.discard('')
    # Longest first so a full name ("John Smith") is removed before one of its
    # parts ("John"); the secondary key keeps the order deterministic.
    return sorted(names, key=lambda name: (-len(name), name))


def _clean_generation(decoded, chunk, removal_prompt, start_inst, end_inst):
    """Strip the echoed prompt, passage, chat markers and Q/A labels from decoded output."""
    for fragment in (removal_prompt, chunk, start_inst, end_inst, 'Question: ', 'Answer: '):
        decoded = decoded.replace(fragment, '')
    return decoded.lstrip('\n ')


def generate_question_answer_pairs(
        paths,
        languages=['en', 'zh', 'nl', 'fr', 'de', 'ja', 'pl', 'ru', 'es'],
        ner_chunk_length=10240,
        model_chunk_length=512,
        max_chunk_input_length=300,
        max_chunk_output_length=400,
        punctuations=['.', ',', ' '],
        max_chunks=7000,
        output_path='question_answer_pairs.jsonl',
):
    """Generate question-answer pairs from text files and write them as JSON Lines.

    Steps: read every ``*txt`` path, remove person names found by spaCy NER,
    remove links, delete every character that is not an ASCII letter, a digit
    or one of ``punctuations`` (this also drops newlines and non-Latin
    scripts), cut the cleaned text into fixed-size chunks, prompt
    ``microsoft/Phi-3-mini-128k-instruct`` once per chunk, and write one
    ``{"text": ...}`` object per generated pair to ``output_path``.

    Args:
        paths: Iterable of file paths; entries not ending in ``'txt'`` are ignored.
        languages: Language codes whose spaCy pipelines are used for person
            detection. Only the requested pipelines are loaded.
        ner_chunk_length: Number of characters per slice fed to the NER models.
        model_chunk_length: Number of characters of cleaned text per prompt.
        max_chunk_input_length: Prompts that tokenize to more tokens than this
            are skipped.
        max_chunk_output_length: Passed to ``model.generate`` as ``max_length``.
            NOTE(review): per the transformers docs ``max_length`` counts the
            prompt tokens as well as the new ones - confirm this is intended.
        punctuations: Non-alphanumeric characters kept in the cleaned text.
        max_chunks: Upper bound on the number of chunks sent to the model;
            ``None`` processes all of them.
        output_path: Destination of the JSON Lines file (overwritten).

    Raises:
        KeyError: If ``languages`` contains an unsupported language code.
    """

    prompt = '''

    <|user|>
    Instruct :
    * You will be given a passage of text as input.
    * Your task is to generate one question-answer pair based on the information in the passage.
    * Only ask direct subject question, not true and False or Multiple Choice
    * Focus on factual questions that can be directly answered from the provided text.
    * Keep the answers accurate and informative, directly addressing the corresponding question.
    * The question should not be based on names, emails etc
    * Do not ask vague questions like What is the context of provided text
    * Only return the question answer pair and nothing else

    Passage : {}

    <|end|>
    <|assistant|>
    '''

    removal_prompt = '''<s>

    <|user|> Instruct :
    * You will be given a passage of text as input.
    * Your task is to generate one question-answer pair based on the information in the passage.
    * Only ask direct subject question, not true and False or Multiple Choice
    * Focus on factual questions that can be directly answered from the provided text.
    * Keep the answers accurate and informative, directly addressing the corresponding question.
    * The question should not be based on names, emails etc
    * Do not ask vague questions like What is the context of provided text
    * Only return the question answer pair and nothing else

    Passage :'''# Do not change

    start_inst = '<|end|><|assistant|>'
    end_inst = '<|end|>'

    if torch.cuda.is_available():
        device = "cuda"
    else:
        # Fall back instead of crashing, but make the cost visible.
        import warnings
        warnings.warn("CUDA is not available; generation will run on CPU and be very slow.")
        device = "cpu"
    torch.set_default_device(device)

    # Negated character class: anything outside this keep-set is deleted.
    kept_characters = string.ascii_letters + string.digits + ''.join(punctuations)
    usefull_pattern = f"[^{re.escape(kept_characters)}]"
    link_pattern = r"(https?:\/\/)?(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)"

    # Load only the pipelines that were asked for; an unknown code raises KeyError.
    ner_models = [spacy.load(_SPACY_MODEL_NAMES[language]) for language in languages]

    # Prepare the text before loading the multi-GB language model so that a bad
    # path or unsupported language fails fast.
    corpus = _read_text_files(paths)
    for name in _find_person_names(corpus, ner_models, ner_chunk_length):
        corpus = corpus.replace(name, '')
    corpus = re.sub(link_pattern, "", corpus)
    corpus = re.sub(usefull_pattern, "", corpus)

    chunks = _split_fixed(corpus, model_chunk_length)[:max_chunks]

    model = AutoModelForCausalLM.from_pretrained("microsoft/Phi-3-mini-128k-instruct", torch_dtype="auto", trust_remote_code=True)
    tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-128k-instruct", trust_remote_code=True)
    tokenizer.pad_token_id = tokenizer.eos_token_id

    questions = []
    for chunk in tqdm(chunks, total=len(chunks), desc='Getting Question Answer Pairs'):
        inputs = tokenizer(
            prompt.format(chunk),
            return_tensors='pt',
            return_attention_mask=True,
        )

        # Over-long prompts are skipped rather than truncated.
        if inputs['input_ids'].shape[1] > max_chunk_input_length:
            continue

        outputs = model.generate(**inputs, max_length=max_chunk_output_length)
        decoded = tokenizer.batch_decode(outputs)[0]
        questions.append(
            _clean_generation(decoded, chunk, removal_prompt, start_inst, end_inst)
        )

    with open(output_path, 'w', encoding='utf-8') as f:
        for question in questions:
            f.write(json.dumps({'text': question}) + '\n')

In [4]:
# Run the pipeline on the single Kaggle input file, restricting person-name
# detection to the English pipeline.
generate_question_answer_pairs(
    paths=['/kaggle/input/dsjfkvjsdxcv/text.txt'],
    languages=['en'],
)

config.json:   0%|          | 0.00/3.35k [00:00<?, ?B/s]

configuration_phi3.py:   0%|          | 0.00/10.4k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-3-mini-128k-instruct:
- configuration_phi3.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_phi3.py:   0%|          | 0.00/73.8k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-3-mini-128k-instruct:
- modeling_phi3.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors.index.json:   0%|          | 0.00/16.3k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.67G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/172 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/3.17k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/568 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
100%|██████████| 573/573 [02:41<00:00,  3.56it/s]
Getting Question Answer Pairs: 100%|██████████| 7000/7000 [8:27:49<00:00,  4.35s/it]
