In [None]:
%%time
from IPython.display import clear_output

! pip install -q -U transformers
! pip install -q -U accelerate
! pip install -q -U bitsandbytes
! pip install -q -U pypdf
! pip install -qq -U langchain

clear_output()

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.8/8.8 MB[0m [31m31.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m297.6/297.6 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
%%time

from IPython.display import clear_output
import warnings
warnings.filterwarnings("ignore")
import gc
import time

import pandas as pd
import matplotlib.pyplot as plt

import torch

### transformers
import transformers
from transformers import (
    AutoTokenizer, AutoModelForCausalLM,
    BitsAndBytesConfig,
    pipeline
)

### quantization
import bitsandbytes as bnb

### langchain
from langchain.document_loaders import DirectoryLoader, PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain import PromptTemplate, LLMChain
from langchain.llms import HuggingFacePipeline
import langchain

clear_output()

CPU times: user 9.85 s, sys: 1.44 s, total: 11.3 s
Wall time: 22.4 s


In [None]:
# Report the versions of the core libraries used in this run.
# (The torch line keeps its original two-argument form, hence the double space.)
print('torch version: ', torch.__version__)
for lib_label, lib_module in (
    ('transformers', transformers),
    ('bnb', bnb),
    ('langchain', langchain),
):
    print(f'{lib_label} version: {lib_module.__version__}')

torch version:  2.2.1+cu121
transformers version: 4.39.3
bnb version: 0.43.1
langchain version: 0.1.16


In [None]:
class CFG:
    """Notebook-wide configuration constants, grouped by purpose."""

    ### debug
    DEBUG = True
    # presumably a cap on how many chunks get processed (small while debugging);
    # not referenced in the visible cells -- verify against its users.
    MAX_CHUNKS = 50 if DEBUG else 1_000_000

    ### synthetic data settings
    MIN_CHUNK_SIZE = 1_000
    PAGES_TO_SKIP = 0

    ### split
    CHUNK_SIZE = 3000
    OVERLAP = 500

    ### model
    MODEL_ID = 'mistralai/Mistral-7B-Instruct-v0.1'

    ### output
    # Prefix for files written by the notebook. The results cell builds the CSV
    # path as f'{CFG.OUTPUT_FOLDER}synth_raw.csv', so this attribute must exist
    # and must end with a path separator.
    OUTPUT_FOLDER = './'


In [None]:
%%time

# Load the model named by CFG.MODEL_ID with 4-bit quantization, wrap it in a
# transformers text-generation pipeline, and expose that pipeline to LangChain
# as `llm`.

### quantization
# Settings handed to from_pretrained() below via `quantization_config`.
bnb_config = BitsAndBytesConfig(
    load_in_4bit = True,
    bnb_4bit_quant_type = "nf4",
    bnb_4bit_compute_dtype = torch.float16,
    bnb_4bit_use_double_quant = True,
    llm_int8_enable_fp32_cpu_offload = True,
)

### model
# NOTE(review): trust_remote_code=True presumably lets the hub repository run
# its own Python code at load time -- confirm it is actually required for
# this model.
model = AutoModelForCausalLM.from_pretrained(
    CFG.MODEL_ID,
    quantization_config = bnb_config,
    device_map = "auto",
    trust_remote_code = True,
#     attn_implementation = 'flash_attention_2',
)

### tokenizer
tokenizer = AutoTokenizer.from_pretrained(
    CFG.MODEL_ID,
    trust_remote_code = True
)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

### pipeline
# Sampling is enabled (do_sample=True) and no seed is set in this cell, so
# generations are presumably not reproducible from run to run -- TODO confirm.
pipe = pipeline(
    task = "text-generation",
    model = model,
    tokenizer = tokenizer,
    eos_token_id = tokenizer.eos_token_id,
    do_sample = True,
    max_new_tokens = 512,
    temperature = 0.8,
    top_p = 0.90,
    repetition_penalty = 1.2
)

### langchain pipeline
# `llm` is the object the LLMChain further down is built on.
llm = HuggingFacePipeline(pipeline = pipe)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

CPU times: user 18.7 s, sys: 20.4 s, total: 39.1 s
Wall time: 1min 22s


In [None]:
# Run a garbage-collection pass after the model-loading cell; the number shown
# as the cell output is the return value of gc.collect().
gc.collect()

81

In [None]:
import spacy

# Name of the spaCy pipeline used for named-entity recognition.
SPACY_MODEL = "en_core_web_md"

# Download the model only when it is not installed yet, so re-running this
# cell does not fetch the package again every time.
if not spacy.util.is_package(SPACY_MODEL):
    spacy.cli.download(SPACY_MODEL)

nlp = spacy.load(SPACY_MODEL)

# Accumulator for the unique anchor terms collected from the text.
at = []
# Define the filename
filename = '/content/raw_data.txt'

# Function to chunk the text
def chunk_text(text, chunk_size=1000):
    """Split ``text`` into consecutive slices of at most ``chunk_size`` items.

    The split is purely positional (by offset), so a boundary can fall in the
    middle of a word. The final slice may be shorter than ``chunk_size``, and
    an empty ``text`` produces an empty list.

    Args:
        text: The sequence (normally a string) to cut up.
        chunk_size: Length of each slice.

    Returns:
        A list with the slices in their original order.
    """
    offsets = range(0, len(text), chunk_size)
    return [text[start:start + chunk_size] for start in offsets]


# Read the text file
with open(filename, 'r', encoding='utf-8') as file:
    text_data = file.read()

# Entity labels that are kept as anchor terms.
ANCHOR_LABELS = {'PERSON', 'ORG', 'GPE', 'LOC'}

# Set mirror of `at` for O(1) membership checks (also covers terms that were
# already in `at` before this cell ran).
seen_terms = set(at)

# Running count of terms added by this cell.
j = 0
# Process the text in chunks
for chunk in chunk_text(text_data):
    doc = nlp(chunk)

    # Keep the entities with a wanted label that have not been seen before.
    # Terms are added in order of first occurrence (instead of going through
    # an unordered set), so `at` comes out in the same order on every run;
    # later cells slice `at` by position and depend on that.
    anchor_terms = []
    for ent in doc.ents:
        if ent.label_ in ANCHOR_LABELS and ent.text not in seen_terms:
            seen_terms.add(ent.text)
            anchor_terms.append(ent.text)

    # Add the new terms to at
    at.extend(anchor_terms)

    j += len(anchor_terms)
    print(j)

In [None]:
# Display the full list of collected anchor terms.
at

['the Order of the Phoenix\nJ K Rowling\n\n- CHAPTER',
 'Dudley Demented',
 'hosepipes',
 'Harry Potter',
 'Harry',
 'Petunia',
 'Harry Potters',
 'bush',
 'Vernon',
 'Vernon Dursley',
 'Dudley',
 'Dursleys',
 'Aunt Petunia',
 'Mrs Figg',
 'Polkisses',
 'Dudders',
 'Uncle Vernon',
 'Bungy',
 'Barnsley',
 'Mary Dorkins',
 'Surrey',
 'Southeast',
 'china',
 'ontinued',
 "Uncle Vernon's",
 'Apparating',
 'Dobby',
 'Aunt Petunia.',
 'Hermione',
 'Ron',
 'Voldemort',
 'Honeydukes',
 'Burrow',
 'Sirius',
 'Cedric',
 'Magnolia Crescent',
 'Hippogriff',
 'Azkaban',
 'Privet Drive',
 'Dumbledore',
 'the Junior Heavyweight Inter-school Boxing Champion of the Southeast',
 'Dudley Dursley',
 "St Brutus's",
 'Potter',
 'Centre for Incurably Criminal Boys',
 'Malcolm',
 'Gordon',
 'Piers',
 'SHUT IT',
 'Ickle Diddykins',
 'Dinky Diddydums',
 'Mark Evans',
 'Wisteria Walk',
 'Dud',
 'Diddykins',
 'shudderi',
 'Dad',
 'Point',
 'Mum',
 'Boo hoo',
 'goose bumps',
 'KEEP',
 "YOU'RE",
 'DUDLEY',
 'Dement

In [None]:
# Instruction text sent to the model. `{anchor_terms}` is the only template
# variable; everything else is passed through verbatim.
# NOTE(review): the format example and the sample pairs below show bare
# "key": "value" lines without surrounding curly braces.
prompt_to_generate_question = """

You are a helpful assistant that creates a alternative terms for the given terms. Try to keep the number of words in the alternative text the same as the anchor terms.

For these anchor terms:

{anchor_terms}

For each term, provide an alternative terms using your best judgment. do not give code.

give the output as a python dictionary in this manner,

'''
  "unique_term_1": "generic_translation_1",
  "unique_term_2": "generic_translation_2",
  ...
'''

Theses are some examples of how i want the output

"Harry Potter": "John Smith",
"Dursleys": "Smith family",
"Dudley Dursley": "Tom Johnson",
"Aunt Petunia": "Aunt Mary"

Question:
"""


# Wrap the instructions in the [INST] ... [/INST] markers before templating.
QUESTION_PROMPT = PromptTemplate(
    input_variables = ["anchor_terms"],
    template = f'[INST]{prompt_to_generate_question}[/INST]',
)

# Chain that fills the prompt and runs it through `llm`.
question_chain = LLMChain(llm=llm, prompt=QUESTION_PROMPT)

In [None]:
# Try the chain on one slice of the anchor terms and print the generated text.
chunk = at[30:45]
generated_question = question_chain.invoke({"anchor_terms": chunk})

print(generated_question['text'])


[INST]

You are a helpful assistant that creates a alternative terms for the given terms. Try to keep the number of words in the alternative text the same as the anchor terms.

For these anchor terms:

['Hermione', 'Honeydukes', 'Burrow', 'Cedric', 'Sirius', 'Magnolia Crescent', 'Azkaban', 'Hippogriff', 'Privet Drive', 'Dumbledore', 'the Junior Heavyweight Inter-school Boxing Champion of the Southeast', 'Dudley Dursley', 'Centre for Incurably Criminal Boys', "St Brutus's", 'Potter']

For each term, provide an alternative terms using your best judgment. do not give code.

give the output as a python dictionary in this manner,

'''
  "unique_term_1": "generic_translation_1",
  "unique_term_2": "generic_translation_2",
  ...
'''

Theses are some examples of how i want the output

"Harry Potter": "John Smith",
"Dursleys": "Smith family",
"Dudley Dursley": "Tom Johnson",
"Aunt Petunia": "Aunt Mary"

Question:
[/INST] Sure! Here's my attempt at providing alternative terms for each of the giv

In [None]:
# Print the whole result mapping returned by the chain; the recorded output
# shows an 'anchor_terms' entry and a 'text' entry.
# NOTE(review): the recorded output lists 20 anchor terms, while the preceding
# cell slices at[30:45] (15 terms) -- presumably stale output from an earlier
# run; re-run to refresh.
print(generated_question)

{'anchor_terms': ['Hermione', 'Honeydukes', 'Burrow', 'Cedric', 'Sirius', 'Magnolia Crescent', 'Azkaban', 'Hippogriff', 'Privet Drive', 'Dumbledore', 'the Junior Heavyweight Inter-school Boxing Champion of the Southeast', 'Dudley Dursley', 'Centre for Incurably Criminal Boys', "St Brutus's", 'Potter', 'Piers', 'Gordon', 'Malcolm', 'SHUT IT', 'Ickle Diddykins'], 'text': '[INST]\n\nYou are a helpful assistant that creates a alternative terms for the given terms. Try to keep the number of words in the alternative text the same as the anchor terms.\n\nFor these anchor terms:\n\n[\'Hermione\', \'Honeydukes\', \'Burrow\', \'Cedric\', \'Sirius\', \'Magnolia Crescent\', \'Azkaban\', \'Hippogriff\', \'Privet Drive\', \'Dumbledore\', \'the Junior Heavyweight Inter-school Boxing Champion of the Southeast\', \'Dudley Dursley\', \'Centre for Incurably Criminal Boys\', "St Brutus\'s", \'Potter\', \'Piers\', \'Gordon\', \'Malcolm\', \'SHUT IT\', \'Ickle Diddykins\']\n\nFor each term, provide an alter

In [None]:
import spacy
from collections import defaultdict
import ast
import re

import numpy as np

# Number of anchor terms sent to the model per request.
BATCH_SIZE = 15
# Placeholder stored when no term mapping can be recovered from a reply.
NOT_FOUND = "Dictionary not found in the text."

# A brace-delimited block of "key": "value" pairs.
DICT_PATTERN = r"\{(?:\s*\".+?\"\s*:\s*\".+?\"\s*,?)+\}"
# A single bare "key": "value" pair.
PAIR_PATTERN = r'"([^"\n]+)"\s*:\s*"([^"\n]*)"'


def _extract_term_dict(generated_text):
    """Recover the term -> alternative mapping from one model reply.

    Args:
        generated_text: Text returned by the chain. It may start with an echo
            of the prompt, terminated by '[/INST]'.

    Returns:
        A dict of the pairs found, or None when nothing usable is present.
    """
    # Only look at what follows the last '[/INST]': the echoed prompt contains
    # example pairs of its own that must not be mistaken for the answer.
    reply = generated_text.rsplit('[/INST]', 1)[-1]

    match = re.search(DICT_PATTERN, reply)
    if match:
        # literal_eval only accepts Python literals, so model output can never
        # execute code here (unlike eval); malformed text falls through.
        try:
            parsed = ast.literal_eval(match.group())
        except (ValueError, SyntaxError):
            parsed = None
        if isinstance(parsed, dict):
            return parsed

    # The prompt's own examples show bare pairs without curly braces, so the
    # reply may omit the braces as well; collect the pairs directly.
    pairs = re.findall(PAIR_PATTERN, reply)
    return dict(pairs) if pairs else None


# One record per batch; the DataFrame is built once after the loop instead of
# being re-concatenated on every iteration.
records = []

for i in range(0, len(at), BATCH_SIZE):
    batch_anchor_terms = at[i:i + BATCH_SIZE]

    generated_question = question_chain.invoke(
        {
            "anchor_terms": batch_anchor_terms
        }
    )
    print(generated_question['text'])

    extracted_dict = _extract_term_dict(generated_question['text'])
    if extracted_dict is None:
        extracted_dict = NOT_FOUND
    print(extracted_dict)

    ### store the result of this batch
    records.append({'Terms': extracted_dict})

df = pd.DataFrame(records, columns=['Terms'])

### persist synthetic data
# Fall back to the working directory when CFG does not define OUTPUT_FOLDER.
output_folder = getattr(CFG, 'OUTPUT_FOLDER', './')
df.to_csv(f'{output_folder}synth_raw.csv', index=False)

dict_list = df['Terms'].tolist()

# Save the list of dictionaries as a .npy file (object array: entries are
# dicts, or the NOT_FOUND string).
np.save('extracted_dicts.npy', np.array(dict_list, dtype=object))

#clear_output()

In [None]:
# Display the collected results.
# NOTE(review): the recorded output shows 'Unnamed: 0', 'Book', 'Page' and
# 'Seconds' columns, whereas the cell that builds `df` only creates a 'Terms'
# column -- presumably output from an earlier version of this notebook;
# re-run to refresh.
df

Unnamed: 0,Terms,Book,Page,Seconds
0,"{'Neil': 'John', 'Mary Grandpré': 'Mary', 'War...",Book 5.pdf,7,17.64
1,Dictionary not found in the text.,Book 5.pdf,16,9.46
2,Dictionary not found in the text.,Book 5.pdf,17,14.7
3,Dictionary not found in the text.,Book 5.pdf,18,7.07
4,Dictionary not found in the text.,Book 5.pdf,19,8.97
5,Dictionary not found in the text.,Book 5.pdf,20,29.6
6,Dictionary not found in the text.,Book 5.pdf,21,16.23
7,Dictionary not found in the text.,Book 5.pdf,22,9.27
8,Dictionary not found in the text.,Book 5.pdf,23,10.57
9,Dictionary not found in the text.,Book 5.pdf,24,6.33
