In [None]:
%%time
from IPython.display import clear_output

! pip install -q -U transformers
! pip install -q -U accelerate
! pip install -q -U bitsandbytes
! pip install -q -U pypdf
! pip install -qq -U langchain

clear_output()

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.8/8.8 MB[0m [31m31.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m297.6/297.6 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
%%time

# All shared imports for the notebook live in this cell.
# NOTE(review): pandas, matplotlib, time, DirectoryLoader, PyPDFLoader and
# RecursiveCharacterTextSplitter are not referenced by the cells visible here.

from IPython.display import clear_output
import warnings
# Blanket-suppress every warning category for the rest of the session.
warnings.filterwarnings("ignore")
import gc
import time

import pandas as pd
import matplotlib.pyplot as plt

import torch

### transformers
import transformers
from transformers import (
    AutoTokenizer, AutoModelForCausalLM,
    BitsAndBytesConfig,
    pipeline
)

### quantization
import bitsandbytes as bnb

### langchain
from langchain.document_loaders import DirectoryLoader, PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain import PromptTemplate, LLMChain
from langchain.llms import HuggingFacePipeline
import langchain

# Clear anything the imports printed so only the %%time summary remains.
clear_output()

CPU times: user 9.85 s, sys: 1.44 s, total: 11.3 s
Wall time: 22.4 s


In [None]:
# Record the versions of the key libraries so a run can be reproduced later.
# The torch line keeps its two spaces after the colon.
print(
    f'torch version:  {torch.__version__}',
    f'transformers version: {transformers.__version__}',
    f'bnb version: {bnb.__version__}',
    f'langchain version: {langchain.__version__}',
    sep='\n',
)

torch version:  2.2.1+cu121
transformers version: 4.39.3
bnb version: 0.43.1
langchain version: 0.1.16


In [None]:
class CFG:
    """Notebook-wide settings, read as plain class attributes (e.g. ``CFG.MODEL_ID``)."""

    ### debug
    # When DEBUG is on, MAX_CHUNKS is a small cap meant to keep runs short.
    DEBUG = True
    MAX_CHUNKS = 50 if DEBUG else 1_000_000

    ### synthetic data settings
    # NOTE(review): neither value is referenced by the cells visible here.
    MIN_CHUNK_SIZE = 1_000
    PAGES_TO_SKIP = 0

    ### split
    # NOTE(review): not referenced by the cells visible here; chunk_text()
    # is called with its own default size instead.
    CHUNK_SIZE = 3000
    OVERLAP = 500

    ### model
    # Hugging Face model id loaded by the model cell.
    MODEL_ID = 'mistralai/Mistral-7B-Instruct-v0.1'

    ### output
    # Folder the generated prompts are written to. The save cell builds the
    # path as f'{CFG.OUTPUT_FOLDER}prompts.json', so keep the trailing slash.
    # Previously missing, which made the save step raise AttributeError.
    OUTPUT_FOLDER = './'


In [None]:
%%time

### quantization
bnb_config = BitsAndBytesConfig(
    load_in_4bit = True,
    bnb_4bit_quant_type = "nf4",
    bnb_4bit_compute_dtype = torch.float16,
    bnb_4bit_use_double_quant = True,
    llm_int8_enable_fp32_cpu_offload = True,
)

### model
model = AutoModelForCausalLM.from_pretrained(
    CFG.MODEL_ID,
    quantization_config = bnb_config,
    device_map = "auto",
    trust_remote_code = True,
#     attn_implementation = 'flash_attention_2',
)

### tokenizer
tokenizer = AutoTokenizer.from_pretrained(
    CFG.MODEL_ID,
    trust_remote_code = True
)
tokenizer.pad_token = tokenizer.eos_token

### pipeline
pipe = pipeline(
    task = "text-generation",
    model = model,
    tokenizer = tokenizer,
    eos_token_id = tokenizer.eos_token_id,
    do_sample = True,
    max_new_tokens = 512,
    temperature = 0.8,
    top_p = 0.90,
    repetition_penalty = 1.2
)

### langchain pipeline
llm = HuggingFacePipeline(pipeline = pipe)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

CPU times: user 18.7 s, sys: 20.4 s, total: 39.1 s
Wall time: 1min 22s


In [None]:
# Force a garbage-collection pass after the heavy model load. As the cell's
# last expression, gc.collect()'s integer return value is echoed as output.
gc.collect()

81

In [None]:
import spacy

# Name of the spaCy model package to load.
SPACY_MODEL = "en_core_web_md"

# Download the model only when it is not already installed, so re-running
# the notebook does not repeat the download every time.
if not spacy.util.is_package(SPACY_MODEL):
    spacy.cli.download(SPACY_MODEL)

nlp = spacy.load(SPACY_MODEL)

# NOTE(review): `at` is never read in the cells visible here.
at = []
# Define the filename
# NOTE(review): hard-coded absolute path; it only exists where the file was uploaded.
filename = '/content/raw_data.txt'

# Function to chunk the text
def chunk_text(text, chunk_size=1000):
    """Cut ``text`` into consecutive, non-overlapping slices.

    Args:
        text: Sequence to split (a string in this notebook).
        chunk_size: Maximum length of each slice; the last slice may be shorter.

    Returns:
        A list of slices in original order; empty when ``text`` is empty.
    """
    starts = range(0, len(text), chunk_size)
    return [text[start:start + chunk_size] for start in starts]


# Module-level counter, initialised to zero.
j = 0

# Pull the whole source document into memory as a single UTF-8 string.
with open(filename, encoding='utf-8') as file:
    text_data = file.read()

# Slice the document using chunk_text's default chunk size.
chunks = chunk_text(text=text_data)

In [None]:
# Instruction prompt for the prompt-generation chain.
#
# PromptTemplate treats this as an f-string-style template, so `{text}` is the
# only real placeholder. Every literal brace in the JSON examples is doubled
# ({{ and }}) so it renders as a single brace instead of being parsed as a
# template variable. Left unescaped, the JSON objects were read as extra input
# variables and the chain could not be invoked with just `text`.
#
# The inner quotes in the second example are written as \\" so the rendered
# prompt contains \" (valid JSON). A bare \" in a normal Python string drops
# the backslash and shows the model a malformed example.
#
# NOTE(review): the example list is never closed with `]`; left as written.
prompt_to_generate_question = """

You are a helpful assistant that creates prompts to assess a models knowledge to the given text.

For this text

{text}

For the text, give me prompts to test the models knowledge on it.

give the output in this manner,

'''
{{"prompt": {{"references": [], "prompt": "", "subtlety": }}}}
'''

These are some examples of how i want the output

[
    {{
        "prompt": {{
            "references": [
                "Eowyn",
                "Theoden",
                "Rohan"
            ],
            "prompt": "Eowyn stood by Theoden's side, her heart filled with pride and fear as the warriors of Rohan prepared for battle. Theoden placed a hand on her shoulder and said,",
            "subtlety": 7
        }}
    }},
    {{
        "prompt": {{
            "references": [
                "Faramir",
                "Boromir",
                "Gondor"
            ],
            "prompt": "Faramir knelt by Boromir's side, his heart heavy with grief. \\"Gondor has lost its greatest warrior,\\" he whispered. Boromir's eyes fluttered open and",
            "subtlety": 7
        }}
    }},
"""


# Wrap the prompt in [INST] ... [/INST] tags and declare the one variable the
# template actually uses (it was previously declared as "anchor_terms").
QUESTION_PROMPT = PromptTemplate(
    template='[INST]' + prompt_to_generate_question + '[/INST]',
    input_variables=["text"],
)

# Chain that fills the template with a text chunk and sends it to the LLM.
question_chain = LLMChain(prompt=QUESTION_PROMPT, llm=llm)

In [None]:
# Smoke-test the chain on the first chunk before running the full loop.
# The original passed an undefined name `chunk`, which raises NameError on a
# fresh kernel; the first entry of `chunks` is used explicitly instead.
sample_chunk = chunks[0]

generated_question = question_chain.invoke({"text": sample_chunk})

# The chain returns a dict; the generated text is under the 'text' key.
print(generated_question['text'])

In [None]:
import json
import os

# Apply the configured cap so a DEBUG run stays short; CFG.MAX_CHUNKS was
# defined for this purpose but never used.
chunks_to_process = chunks[:CFG.MAX_CHUNKS]

# One entry per processed chunk: the raw text generated by the model.
all_prompts = []

for chunk in chunks_to_process:
    # Invoke the model to generate prompts for this chunk
    generated_prompts = question_chain.invoke({"text": chunk})

    # generated_prompts['text'] is a single string. list.extend() on a string
    # would add it one character at a time, so append it as one item.
    all_prompts.append(generated_prompts['text'])

# Fall back to the current directory when CFG has no OUTPUT_FOLDER, and make
# sure the target folder exists before writing.
output_folder = getattr(CFG, 'OUTPUT_FOLDER', '') or '.'
os.makedirs(output_folder, exist_ok=True)
output_path = os.path.join(output_folder, 'prompts.json')

# Save all generated outputs as a JSON list of strings
with open(output_path, 'w') as f:
    json.dump(all_prompts, f)
