# Code QA
Use this template to ask questions about any GitHub repository

In [None]:
%setup langchain deeplake openai tiktoken

In [None]:
import os
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import DeepLake
from langchain.document_loaders import TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.chat_models import ChatOpenAI
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory

In [None]:
!git clone https://github.com/twitter/the-algorithm # replace with any repository of your choice

In [None]:
# Walk the cloned repository, load every file that can be read as UTF-8 text,
# and split the resulting documents into chunks for embedding.
root_dir = './the-algorithm'
docs = []
skipped_files = []  # paths whose loading raised, kept for inspection
for dirpath, dirnames, filenames in os.walk(root_dir):
    # Prune in place so os.walk does not descend into Git's own metadata,
    # which is not part of the repository's source code.
    dirnames[:] = [d for d in dirnames if d != '.git']
    for file in filenames:
        path = os.path.join(dirpath, file)
        try:
            loader = TextLoader(path, encoding='utf-8')
            docs.extend(loader.load_and_split())
        except Exception:
            # Best effort: a file that fails to load (e.g. binary content
            # that is not valid UTF-8) is skipped, but recorded rather than
            # silently dropped.
            skipped_files.append(path)
if skipped_files:
    print(f'Skipped {len(skipped_files)} file(s) that could not be loaded as UTF-8 text')
text_splitter = CharacterTextSplitter(chunk_size=5000, chunk_overlap=0)
texts = text_splitter.split_documents(docs)

In [None]:
# Build the embedding function, then create a DeepLake vector store from the
# split documents in `texts` using those embeddings.
# NOTE(review): no dataset location is passed here, so the store's location
# is whatever DeepLake defaults to — confirm if persistence matters.
embeddings = OpenAIEmbeddings()
db = DeepLake.from_documents(texts, embeddings)

In [None]:
# Expose the vector store as a retriever and set its search options in one go.
# NOTE(review): option semantics are defined by the DeepLake vector store;
# presumably `fetch_k` is the candidate pool size and `k` the number of
# results returned — confirm against its documentation.
retriever = db.as_retriever()
retriever.search_kwargs.update(
    {
        'distance_metric': 'cos',
        'fetch_k': 100,
        'maximal_marginal_relevance': True,
        'k': 20,
    }
)

def filter(x):
    """Decide whether a stored sample should be kept in search results.

    Args:
        x: A sample whose ``'text'`` and ``'metadata'`` entries each expose
            ``.data()['value']``; the metadata value is a mapping with a
            ``'source'`` path.

    Returns:
        ``False`` when the text mentions ``com.google``; otherwise ``True``
        only if the source path ends with ``.scala`` or ``.py``.
    """
    # filter based on source code
    if 'com.google' in x['text'].data()['value']:
        return False

    # filter based on the file extension; a plain substring test would also
    # accept unrelated paths such as 'copy/readme.txt' or 'scala/Foo.java'
    metadata = x['metadata'].data()['value']
    return metadata['source'].endswith(('.scala', '.py'))

In [None]:
# Conversation memory: history is stored under "chat_history" and the user
# input is read from the "question" key.
memory = ConversationBufferMemory(
    memory_key="chat_history",
    input_key="question",
)
model = ChatOpenAI(model='gpt-4') # 'gpt-3.5-turbo',
# Wire the chat model, retriever and memory into a conversational QA chain.
qa = ConversationalRetrievalChain.from_llm(
    model,
    retriever=retriever,
    memory=memory,
    # hand the stored history to the chain unchanged
    get_chat_history=lambda inputs: inputs,
)