In [1]:
from langchain.document_loaders import DirectoryLoader
from transformers import AutoTokenizer
from langchain.text_splitter import TextSplitter, RecursiveCharacterTextSplitter,CharacterTextSplitter
from langchain.vectorstores import Chroma, FAISS
from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain.llms import HuggingFaceHub
from langchain import LLMChain
import os
from tqdm.notebook import tqdm
import uuid
from uuid import uuid4
from langchain.chat_models import ChatAnthropic
from langchain.schema import HumanMessage
import re
import json

In [2]:
os.environ['HUGGINGFACEHUB_API_TOKEN'] = ''

In [3]:
doc_path="/Users/lichenghu/Desktop/DSC-291-temp"

In [4]:
def load_docs(doc_path):
    docs = DirectoryLoader(doc_path, glob="*.txt", show_progress=True, use_multithreading=True,max_concurrency=16).load()
    tokenizer = AutoTokenizer.from_pretrained("hkunlp/instructor-large", max_length=512,truncation=True)
    text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
                                        tokenizer,
                                        chunk_size=300, 
                                        chunk_overlap=0, 
                                        separators=[". "],
                                        keep_separator=False) ### hyperparams  
    splitted_documents = text_splitter.split_documents(docs)

    content={}
    for doc in splitted_documents:
        content[str(uuid.uuid4())]= doc.page_content
    return content

In [5]:
document=load_docs(doc_path)

100%|███████████████████████████████████████████| 39/39 [00:13<00:00,  2.85it/s]
Token indices sequence length is longer than the specified maximum sequence length for this model (528 > 512). Running this sequence through the model will result in indexing errors


In [6]:
from sklearn.model_selection import train_test_split
import pandas as pd

In [7]:
serie = pd.Series(document)

In [8]:
training_data , test_data  = [i.to_dict() for i in train_test_split(serie, train_size=0.7)]

In [9]:
len(training_data),len(test_data)

(3455, 1482)

In [10]:
training_data

{'8990730e-f6b0-4831-8ac6-6b5510ee0f93': 'There Section 10.1.4 is, however, one subtlety that needs to be addressed.\n\nFor any given setting of the parameters in a Gaussian mixture model (except for speciﬁc degenerate settings), there will exist other parameter settings for which the density over the observed variables will be identical. These parameter values differ only through a re-labelling of the components. For instance, consider a mixture of two Gaussians and a single observed variable x, in which the parameters have the values π1 = a, π2 = b, µ1 = c, µ2 = d, σ1 = e, σ2 = f. Then the parameter values π1 = b, π2 = a, µ1 = d, µ2 = c, σ1 = f, σ2 = e, in which the two components have been exchanged, will by symmetry give rise to the same value of p(x). If we have a mixture model comprising K components, then each parameter setting will be a member of a family of K! equivalent settings',
 '7c6c931f-1736-4ca8-b94f-b0cc2c037656': 'Chapter 19  Approximate Inference  Many probabilistic 

In [11]:
with open("/Users/lichenghu/Desktop/embedding_finetune/training_data.json", 'w+') as f:
    json.dump(training_data, f)

with open("/Users/lichenghu/Desktop/embedding_finetune/test_data.json", 'w+') as f:
    json.dump(test_data, f)

In [12]:
os.environ["ANTHROPIC_API_KEY"]=""

In [13]:
series_of_training_data=pd.Series(training_data)

In [14]:
train_data_portion_1=series_of_training_data[:1000].to_dict()

In [15]:
train_data_portion_2=series_of_training_data[1000:2000].to_dict()

In [16]:
train_data_portion_3=series_of_training_data[2000:3000].to_dict()

In [17]:
train_data_portion_4=series_of_training_data[3000:].to_dict()

In [18]:
len(train_data_portion_1)+len(train_data_portion_2)+len(train_data_portion_3)+len(train_data_portion_4)==len(training_data)

True

In [124]:
def generate_queries(
    docs,
    num_questions_per_chunk=2,
):
    """
    Automatically generate hypothetical questions that could be answered with
    doc in the corpus.
    """
    chat = ChatAnthropic(model='claude-2')


    queries = {}
    relevant_docs = {}
    for doc_id, text in tqdm(docs.items()):
        context_str=text
        massage= [HumanMessage(
        content=(
            f"""\
                    Context information is below.

                    ---------------------
                    {context_str}
                    ---------------------

                    Given the context information and not prior knowledge.generate only questions based on the below query.

                    You are a Teacher/ Professor. Your task is to setup \
                    {num_questions_per_chunk} questions for an upcoming \
                    quiz/examination. The questions should be diverse in nature \
                    across the document. Restrict the questions to the \
                    context information provided."
                    """
                )
        )]
        response = chat(massage)
 
        result = response.content.strip().split("\n")
        question = [
            re.sub(r'^[0-9a-zA-Z]+\)?.?', "", question).strip() for question in result
        ]
#         print(question)
#         print("----------")
        questions = [q for q in question[1:] if len(q) > 0]
        
        for question in questions:
            question_id = str(uuid.uuid4())
            queries[question_id] = question
            relevant_docs[question_id] = [doc_id]
    return queries, relevant_docs

In [101]:
os.environ['TOKENIZERS_PARALLELISM']="false"

In [102]:
train_queries_portion_1, train_relevant_docs_portion_1 = generate_queries(train_data_portion_1)

  0%|          | 0/1000 [00:00<?, ?it/s]

In [126]:
train_queries_portion_2, train_relevant_docs_portion_2 = generate_queries(train_data_portion_2)

  0%|          | 0/1000 [00:00<?, ?it/s]

In [127]:
train_queries_portion_2

{'d7050b0e-3f20-45d0-ae4e-9aca5aa278f8': 'According to the passage, what is the philosophy behind deep learning?',
 '45d5af18-19fd-4725-ac6e-08fbf59ae87a': 'The passage mentions that deep learning has been used to solve problems in areas like computer vision and speech recognition. Can you name one more application area where deep learning has been applied?',
 '9b4a36c0-6cd6-439b-98f0-c9cf5b0a2b63': 'How does the approximate value function simplify computation compared to a standard weighted sum, given that each component is either 0 or 1?',
 '11fb9594-9c72-4c71-ba8b-d914e930975d': 'What causes the qualitatively different generalization patterns shown in the upper half of Figure 9.11?',
 '3c708ae3-1f86-41a4-a551-5338e15278b1': 'What paper by Sutton proposed an incremental version of the delta-bar delta algorithm for adapting bias by gradient descent?',
 '0eefc08b-8124-409a-bd12-13b28f41cb03': 'In what publication did Sutton discuss modeling the world at different timescales using TD mo

In [129]:
train_queries_portion_3, train_relevant_docs_portion_3 = generate_queries(train_data_portion_3)

  0%|          | 0/1000 [00:00<?, ?it/s]

In [130]:
train_queries_portion_4, train_relevant_docs_portion_4 = generate_queries(train_data_portion_4)

  0%|          | 0/455 [00:00<?, ?it/s]

In [131]:
test_queries,test_relevant_docs=generate_queries(test_data)

  0%|          | 0/1482 [00:00<?, ?it/s]

In [138]:
train_queries={**train_queries_portion_1,**train_queries_portion_2,**train_queries_portion_3,**train_queries_portion_4}

In [140]:
len(train_queries)==len(train_queries_portion_1)+len(train_queries_portion_2)+len(train_queries_portion_3)+len(train_queries_portion_4)

True

In [141]:
train_relevant_docs={**train_relevant_docs_portion_1,**train_relevant_docs_portion_2,**train_relevant_docs_portion_3,**train_relevant_docs_portion_4}

In [142]:
len(train_relevant_docs)==len(train_relevant_docs_portion_1)+len(train_relevant_docs_portion_2)+len(train_relevant_docs_portion_3)+len(train_relevant_docs_portion_4)

True

In [143]:
with open("/Users/lichenghu/Desktop/embedding_finetune/train_queries.json", 'w+') as f:
    json.dump(train_queries, f)

with open("/Users/lichenghu/Desktop/embedding_finetune/train_relevant_docs.json", 'w+') as f:
    json.dump(train_relevant_docs, f)

with open("/Users/lichenghu/Desktop/embedding_finetune/val_queries.json", 'w+') as f:
    json.dump(test_queries, f)

with open("/Users/lichenghu/Desktop/embedding_finetune/val_relevant_docs.json", 'w+') as f:
    json.dump(test_relevant_docs, f)

In [149]:
train_dataset = {
    'queries': train_queries,
    'docs': training_data,
    'relevant_docs': train_relevant_docs,
}

val_dataset = {
    'queries': test_queries,
    'docs': test_data,
    'relevant_docs': test_relevant_docs,
}

In [150]:
with open("/Users/lichenghu/Desktop/embedding_finetune/train_dataset.json", 'w+') as f:
    json.dump(train_dataset, f)

with open("/Users/lichenghu/Desktop/embedding_finetune/val_dataset.json", 'w+') as f:
    json.dump(val_dataset, f)