In [1]:
import os
from dotenv import load_dotenv, find_dotenv
# Load variables from the .env file located by find_dotenv(); the return value is not needed.
_ = load_dotenv(find_dotenv())
# Indexing os.environ raises KeyError when OPENAI_API_KEY is not set.
openai_api_key = os.environ["OPENAI_API_KEY"]

## Basic app for question answering (QA) over a code library or GitHub repository

### Load the GitHub repo 'The Fuzz' (a small Python module for string matching)
- [PyPI](https://pypi.org/project/thefuzz/)
- [GitHub](https://github.com/seatgeek/thefuzz)

In [2]:
root_dir = "data/thefuzz-master"

In [3]:
document_chunks = []

In [4]:
# Import from the community integrations package (already used by the DeepLake
# example in this notebook); the legacy `langchain.document_loaders` path is deprecated.
from langchain_community.document_loaders import TextLoader

In [5]:
# Walk the checkout under root_dir and load every file that can be read as
# UTF-8 text, splitting each one into chunks.
if not os.path.isdir(root_dir):
    # os.walk yields nothing for a missing directory, which would otherwise
    # leave an empty chunk list without any hint about the cause.
    raise FileNotFoundError(f"Repository directory not found: {root_dir!r}")

# Rebuild the accumulator here so that re-running this cell does not append
# duplicate chunks to an already populated list.
document_chunks = []
skipped_files = []  # (path, exception) for every file that failed to load

for dirpath, dirnames, filenames in os.walk(root_dir):
    for file in filenames:
        file_path = os.path.join(dirpath, file)
        try:
            loader = TextLoader(file_path, encoding="utf-8")
            document_chunks.extend(loader.load_and_split())
        except Exception as exc:
            # Best-effort loading: files that cannot be read as UTF-8 text are
            # skipped, but recorded so the failures stay visible.
            skipped_files.append((file_path, exc))

if skipped_files:
    print(f"Skipped {len(skipped_files)} file(s) that could not be loaded as UTF-8 text.")

In [6]:
print(f"We have {len(document_chunks)} chunks.")

We have 170 chunks.


In [7]:
print(document_chunks[0].page_content[:300])

import unittest
import re
import pycodestyle

from thefuzz import fuzz
from thefuzz import process
from thefuzz import utils

scorers = [
    fuzz.ratio,
    fuzz.partial_ratio,
    fuzz.token_sort_ratio,
    fuzz.token_set_ratio,
    fuzz.partial_token_sort_ratio,
    fuzz.partial_token_set_ratio,



### Convert text chunks into embeddings and store them in a vector database

In [8]:
# Use the split-out integration packages, matching the other imports in this
# notebook; the legacy `langchain.vectorstores` / `langchain.embeddings` paths
# are deprecated and instantiating the legacy OpenAIEmbeddings emits a warning.
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings

In [9]:
embeddings = OpenAIEmbeddings()

  warn_deprecated(


In [10]:
# Build a FAISS vector store from the document chunks using the OpenAI embeddings object.
stored_embeddings = FAISS.from_documents(document_chunks, embeddings)

### Create RetrievalQA chain

In [11]:
from langchain_openai import ChatOpenAI

In [12]:
chat_model = ChatOpenAI()

In [13]:
from langchain.chains import RetrievalQA

In [14]:
# Wire the chat model and a retriever over the FAISS store into a RetrievalQA
# chain (chain_type "stuff"). `qa_chain` is the object queried in the cells below.
qa_chain = RetrievalQA.from_chain_type(
    llm=chat_model,
    chain_type="stuff",
    retriever=stored_embeddings.as_retriever()
)

### Now we can ask questions about the GitHub library

In [15]:
question = """
What function do I use if I want to find 
the most similar item in a list of items?
"""

In [16]:
# `Chain.run` is deprecated and emits a deprecation warning; `invoke` takes the
# question under the "query" key and returns a dict whose "result" entry holds
# the answer text.
answer = qa_chain.invoke({"query": question})["result"]

  warn_deprecated(


In [17]:
print(answer)

You can use the `process.extractOne()` function from the `thefuzz` library to find the most similar item in a list of items. This function takes a query string and a list of choices, and it returns a tuple containing the best match and its similarity score. Here's an example of how to use it:

```python
from thefuzz import process

choices = ["apple", "banana", "orange"]
query = "aple"

best_match = process.extractOne(query, choices)
print(best_match)
```

Output:
```
('apple', 80)
```

In this example, the best match for the query "aple" is "apple" with a similarity score of 80.


## GitHub Repository Q&A System with DeepLake Vector Store
Complete example that loads data from a GitHub repository into a DeepLake vector store and allows you to ask questions about it.

In [None]:
# !pip install langchain langchain-openai langchain-community langchain-core python-dotenv gitpython deeplake

In [None]:
import os
import tempfile
from git import Repo
from dotenv import load_dotenv
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_community.vectorstores import DeepLake
from langchain_community.document_loaders.generic import GenericLoader
from langchain_community.document_loaders.parsers import LanguageParser
from langchain.chains.retrieval_qa.base import RetrievalQA

# Load environment variables from .env file
load_dotenv()

# Fail fast when the OpenAI API key is missing OR set to an empty string
# (e.g. `OPENAI_API_KEY=` in .env); a bare membership test would accept the
# empty value and defer the failure to the first API call.
if not os.environ.get("OPENAI_API_KEY"):
    raise ValueError("Please set OPENAI_API_KEY in your environment variables or .env file")

# Function to clone a GitHub repository and load its contents
def load_github_repo(repo_url, branch="main", file_extensions=(".py", ".js", ".md", ".txt")):
    """
    Clone a GitHub repository and load its contents as documents.

    The repository is cloned into a temporary directory that is deleted when
    this function returns; the documents are fully loaded before that happens.

    Args:
        repo_url: URL of the GitHub repository
        branch: Branch to clone (default: main)
        file_extensions: Sequence of file extensions to load. The default is an
            immutable tuple so it cannot be mutated between calls.

    Returns:
        List of Document objects
    """
    print(f"Cloning repository: {repo_url}, branch: {branch}")

    # Create a temporary directory for the repo
    with tempfile.TemporaryDirectory() as temp_dir:
        # Shallow clone: only the files at the tip of the branch are read, so
        # the full commit history does not need to be downloaded.
        repo = Repo.clone_from(repo_url, temp_dir, branch=branch, depth=1)
        try:
            # Set up the loader for code files
            loader = GenericLoader.from_filesystem(
                temp_dir,
                glob="**/*",
                suffixes=file_extensions,
                parser=LanguageParser()
            )

            # Load documents eagerly, while the checkout still exists on disk
            documents = loader.load()
        finally:
            # Release the resources held by the Repo object before the
            # temporary directory is removed.
            repo.close()

        print(f"Loaded {len(documents)} documents from repository")

        return documents

# Set up the GitHub repository to load.
# These values are passed positionally to load_github_repo below; note that this
# extension list omits ".js", which the function's own default includes.
repo_url = "https://github.com/langchain-ai/langchain"  # You can change this to any repo
branch = "master"  # Change if needed
file_extensions = [".py", ".md", ".txt"]  # Extensions to load

# Load documents from the GitHub repository (clones it, so this needs network access)
documents = load_github_repo(repo_url, branch, file_extensions)

# Split the documents into smaller chunks for better retrieval.
# Sizes are measured with len(), with an overlap of 200 between chunks of up to 1000.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    length_function=len,
)
split_docs = text_splitter.split_documents(documents)

print(f"Split into {len(split_docs)} chunks")

# Initialize OpenAI embeddings.
# NOTE: this rebinds `embeddings`, which the FAISS example earlier in the notebook also defines.
embeddings = OpenAIEmbeddings()

# Define the DeepLake dataset path
# NOTE(review): presumably resolved as a local path relative to the working directory — confirm.
dataset_path = "deeplake_github_store"

# Create and load data into the DeepLake vector store
vector_store = DeepLake.from_documents(
    split_docs, 
    embeddings, 
    dataset_path=dataset_path,
    overwrite=True  # Set to False if you want to add to existing store
)

print("Vector store created successfully!")

# Create a retriever from the vector store
retriever = vector_store.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 4}  # Return top 4 most relevant documents
)

# Initialize the language model
llm = ChatOpenAI(
    model="gpt-3.5-turbo",
    temperature=0
)

# Create a question-answering chain.
# return_source_documents=True is required by ask_repo, which reads the
# "source_documents" entry of the chain result.
# NOTE: this rebinds `qa_chain`, replacing the FAISS-backed chain defined earlier in the notebook.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True
)

# Function to ask questions about the repository
def ask_repo(question):
    """
    Query the module-level QA chain and print the answer with its sources.

    Args:
        question: Question to ask

    Returns:
        The dict returned by the QA chain (read here via its 'result' and
        'source_documents' entries)
    """
    response = qa_chain.invoke({"query": question})

    print(f"\nQuestion: {question}")

    answer_text = response['result']
    print(f"\nAnswer: {answer_text}")

    print("\nSources:")
    # Number the sources from 1 and show only the first 150 characters of each.
    for position, source_doc in enumerate(response["source_documents"], start=1):
        origin = source_doc.metadata.get('source', 'Unknown source')
        print(f"\n{position}. {origin}")
        preview = source_doc.page_content[:150]
        print(f"   Content: {preview}...")

    return response

# Example questions to ask about the repository
questions = [
    "What is this repository about?",
    "How does the LangChain framework work?",
    "What are the main components of LangChain?",
    "Show me an example of code from this repository"
]

# Ask the questions; ask_repo prints each answer and its sources, and the
# returned result is discarded here.
# NOTE: the loop variable rebinds the module-level name `question` used earlier in the notebook.
for question in questions:
    ask_repo(question)

# Interactive mode
def interactive_qa():
    """Run an interactive Q&A session.

    Reads questions from standard input and forwards each one to ask_repo.
    The session ends when the user enters 'exit', 'quit' or 'q'
    (case-insensitive, surrounding whitespace ignored), or when input ends
    (EOF) or is interrupted. Blank lines are ignored instead of being sent
    to the QA chain.

    Returns:
        None
    """
    print("\n\n--- Interactive Q&A Mode ---")
    print("Type 'exit' to quit")

    while True:
        try:
            question = input("\nQuestion: ").strip()
        except (EOFError, KeyboardInterrupt):
            # No more input is available (or the user interrupted the prompt):
            # end the session cleanly instead of surfacing a traceback.
            break

        if not question:
            # Nothing to ask; prompt again rather than querying the chain.
            continue

        if question.lower() in ("exit", "quit", "q"):
            break

        ask_repo(question)

# Start interactive mode. This call waits on input() and only returns once an
# exit keyword ("exit", "quit" or "q") is entered.
interactive_qa()