In [1]:
from langchain.document_loaders import UnstructuredPDFLoader, OnlinePDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.llms import OpenAI
from langchain.chains.qa_with_sources import load_qa_with_sources_chain
from langchain.docstore.document import Document
import requests

In [2]:
import os
# Prefer a key already exported in the environment; the "..." placeholder is
# only installed when the variable is missing, so a real key is never
# overwritten. Do not paste a real key into the notebook.
os.environ.setdefault("OPENAI_API_KEY", "...")

In [3]:
from langchain.text_splitter import CharacterTextSplitter

In [4]:
# Question-answering-with-sources chain driven by an OpenAI LLM at temperature 0.
chain = load_qa_with_sources_chain(
    OpenAI(temperature=0),
)

def print_answer(question, index=None, k=4):
    """Print the chain's answer to ``question`` using retrieved documents.

    Args:
        question: Natural-language question, used both as the similarity
            query and as the chain's ``"question"`` input.
        index: Object exposing ``similarity_search(query, k=...)``. When
            omitted, the module-level ``search_index`` is used; no visible
            cell defines that name, so pass the vector store explicitly.
        k: Number of documents to retrieve (default 4, as before).

    Raises:
        NameError: If ``index`` is omitted and ``search_index`` is undefined.
    """
    if index is None:
        # Backward-compatible fallback to the global the original relied on.
        index = search_index
    retrieved_docs = index.similarity_search(question, k=k)
    # `chain` is looked up at call time, so whichever object is currently
    # bound to that global name is the one that answers.
    result = chain(
        {
            "input_documents": retrieved_docs,
            "question": question,
        },
        return_only_outputs=True,
    )
    print(result["output_text"])

In [5]:
import pathlib
import subprocess
import tempfile

In [6]:
def get_github_docs(repo_owner, repo_name, patterns=("*/*.md", "*/*.mdx")):
    """Shallow-clone a GitHub repository and yield its Markdown files.

    The repository is cloned (depth 1) into a temporary directory that is
    removed once the generator is exhausted or closed.

    Args:
        repo_owner: GitHub user or organisation that owns the repository.
        repo_name: Repository name (without the ``.git`` suffix).
        patterns: Glob patterns, relative to the repository root, selecting
            the files to load. The defaults match ``.md``/``.mdx`` files
            exactly one directory below the root; pass e.g.
            ``("**/*.md", "**/*.mdx")`` to include files at every depth.

    Yields:
        Document: ``page_content`` is the file's text and
        ``metadata["source"]`` is a GitHub ``blob`` URL pinned to the cloned
        commit SHA.

    Raises:
        subprocess.CalledProcessError: If ``git clone`` or ``git rev-parse``
            exits with a non-zero status.
    """
    clone_url = f"https://github.com/{repo_owner}/{repo_name}.git"
    with tempfile.TemporaryDirectory() as d:
        # Argument lists instead of shell=True: the owner/name values are never
        # parsed by a shell, so they cannot inject extra commands.
        subprocess.check_call(
            ["git", "clone", "--depth", "1", clone_url, "."],
            cwd=d,
        )
        git_sha = (
            subprocess.check_output(["git", "rev-parse", "HEAD"], cwd=d)
            .decode("utf-8")
            .strip()
        )
        repo_path = pathlib.Path(d)
        # Keep the original ordering (all matches of the first pattern, then
        # the next) and skip directories that happen to match a pattern.
        markdown_files = [
            path
            for pattern in patterns
            for path in repo_path.glob(pattern)
            if path.is_file()
        ]
        for markdown_file in markdown_files:
            # as_posix() keeps forward slashes in the URL on every platform.
            relative_path = markdown_file.relative_to(repo_path).as_posix()
            github_url = f"https://github.com/{repo_owner}/{repo_name}/blob/{git_sha}/{relative_path}"
            # Explicit UTF-8 rather than the platform default; read the file
            # fully and close it before handing control back to the caller.
            with open(markdown_file, "r", encoding="utf-8") as f:
                content = f.read()
            yield Document(page_content=content, metadata={"source": github_url})

In [7]:
# GitHub repository whose Markdown files are loaded.
repo_owner, repo_name = "hyperledger", "aries-cloudagent-python"

# Drain the generator into a list so the documents outlive the temporary clone.
documents = [*get_github_docs(repo_owner, repo_name)]


Cloning into '.'...


In [8]:
# Raw text of every loaded document, in load order.
texts = list(loaded.page_content for loaded in documents)

In [11]:
# Splitter producing chunks of at most 1000 characters with no overlap.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=0,
    chunk_size=1000,
)

In [12]:
from dataclasses import dataclass, field

# Split the documents loaded from the repository directly: they already expose
# `page_content` and `metadata`. Redefining `Document` here would shadow the
# class imported from langchain (so a later `get_github_docs` call would build
# the wrong type), and rebuilding documents from bare strings would drop each
# document's "source" URL.
split_texts = text_splitter.split_documents(documents)


In [13]:
# Report how many chunks the splitter produced.
n_chunks = len(split_texts)
print(f'Now you have {n_chunks} documents')

Now you have 170 documents


In [14]:
# Debug aid: uncomment to inspect the chunks (prints every chunk, so the output can be very long).
# print(split_texts)

In [15]:
from langchain.vectorstores import Chroma, Pinecone
from langchain.embeddings.openai import OpenAIEmbeddings
import pinecone
from tqdm.autonotebook import tqdm

  from tqdm.autonotebook import tqdm


In [16]:
# Credentials and Pinecone settings come from the environment so real secrets
# never need to be written into the notebook. The literals are only fallbacks
# used when the corresponding variable is not set.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", '...')
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY", '...')
PINECONE_API_ENV = os.environ.get("PINECONE_API_ENV", 'us-east4-gcp')

In [17]:
# Embedding client authenticated with the OpenAI key configured above.
embeddings = OpenAIEmbeddings(
    openai_api_key=OPENAI_API_KEY,
)

In [19]:
# Name of the Pinecone index used by the cells below.
index_name = "langchain"

# Initialise the Pinecone client.
#   api_key:     find at app.pinecone.io
#   environment: shown next to the API key in the console
pinecone.init(environment=PINECONE_API_ENV, api_key=PINECONE_API_KEY)

In [20]:
# Index the chunks as documents so each chunk's metadata is stored alongside
# its text, instead of being discarded by passing bare strings.
# NOTE(review): presumably this adds vectors to whatever the index already
# holds, so stale content could still be retrieved; confirm the index state.
docsearch = Pinecone.from_documents(split_texts, embeddings, index_name=index_name)

In [21]:
from langchain.llms import OpenAI
from langchain.chains.question_answering import load_qa_chain

In [22]:
# OpenAI LLM at temperature 0, authenticated with the configured key.
llm = OpenAI(openai_api_key=OPENAI_API_KEY, temperature=0)

# Note: this rebinds the name `chain` assigned earlier in the notebook, so
# `print_answer` (which reads the global `chain`) uses this chain from here on.
chain = load_qa_chain(
    llm,
    chain_type="stuff",
)

In [23]:
query = "Could you please explain me about model validation?"
# `include_metadata` is not a `similarity_search` parameter, so it is dropped:
# it has no effect on the results and can be rejected as an unexpected keyword.
docs = docsearch.similarity_search(query)

In [24]:
# Answer the current query from the retrieved documents (value shown as cell output).
chain.run(question=query, input_documents=docs)

' Model validation is the process of assessing how well a model fits the observed data. It is used to answer the question “How well did my hypothesis fit the observed data?” To validate a model, the data is usually split into training, testing and validation sets. The model is constructed with the training data and then evaluated with the testing data. The performance of the model against the testing set is used to further reduce model error. Finally, the model is evaluated on the validation data to assess how well the model generalizes. Statistical methods such as calculating the coefficient of determination, commonly called the R-squared value, are also used to validate models.'

In [35]:
query = "Can you summarize the raw data?"
# `include_metadata` is not a `similarity_search` parameter, so it is dropped:
# it has no effect on the results and can be rejected as an unexpected keyword.
docs = docsearch.similarity_search(query)

In [36]:
# Answer the current query from the retrieved documents (value shown as cell output).
chain.run(question=query, input_documents=docs)

' No, raw data cannot be summarized. It must first be collected or processed in order to assess its truthfulness and accuracy, identify missing or incomplete information, and re-sample from the population to conduct a statistical comparison. Once the data has been collected or processed, it can then be summarized accurately so the decision maker can accurately interpret and compare the data.'