# Analyze the LangChain Code Base with LangChain

In [6]:
# ONLY USE IF KEY IS SAVED IN FILE
import os

# Change this path to your key location
path_to_key = "../openai-key.txt"

# Read only the first line of the key file. "utf-8-sig" decodes plain UTF-8/ASCII
# and also drops a leading byte-order mark, which some editors add and which would
# otherwise end up as part of the key.
with open(path_to_key, encoding="utf-8-sig") as fo:
    key = fo.readline().strip()

# Fail here with a clear message instead of exporting an empty key and getting an
# authentication error from a later cell.
if not key:
    raise ValueError(f"No API key found on the first line of {path_to_key!r}")

os.environ["OPENAI_API_KEY"] = key

In [2]:
from langchain.document_loaders import TextLoader

root_dir = '../langchain-code-base'


def iter_python_files(root, excluded_dirs=('.venv',)):
    """Yield the path of every ``.py`` file below ``root``.

    Directories whose name is listed in ``excluded_dirs`` are skipped entirely,
    including the files directly inside them. Matching is done on directory
    names rather than on a '/'-delimited substring of the path, so it works
    with any path separator.
    """
    for dirpath, dirnames, filenames in os.walk(root):
        # Pruning in place stops os.walk from descending into excluded directories.
        dirnames[:] = [name for name in dirnames if name not in excluded_dirs]
        for file in filenames:
            if file.endswith('.py'):
                yield os.path.join(dirpath, file)


docs = []
for source_path in iter_python_files(root_dir):
    try:
        loader = TextLoader(source_path, encoding='utf-8')
        docs.extend(loader.load_and_split())
    except Exception as e:
        # Loading stays best-effort, but skipped files are reported instead of
        # disappearing silently from the index.
        print(f'Skipping {source_path}: {e!r}')
print(f'{len(docs)}')

1064


In [3]:
from langchain.text_splitter import CharacterTextSplitter

# Re-split the loaded documents with a character-based splitter and report how
# many chunks came out. The recorded output below shows that some chunks still
# exceed the configured chunk_size.
splitter_options = {"chunk_size": 1000, "chunk_overlap": 0}
text_splitter = CharacterTextSplitter(**splitter_options)
texts = text_splitter.split_documents(docs)
print(str(len(texts)))

Created a chunk of size 1213, which is longer than the specified 1000
Created a chunk of size 1263, which is longer than the specified 1000
Created a chunk of size 1620, which is longer than the specified 1000
Created a chunk of size 1273, which is longer than the specified 1000
Created a chunk of size 1573, which is longer than the specified 1000
Created a chunk of size 1923, which is longer than the specified 1000
Created a chunk of size 1783, which is longer than the specified 1000
Created a chunk of size 1555, which is longer than the specified 1000
Created a chunk of size 1082, which is longer than the specified 1000
Created a chunk of size 1225, which is longer than the specified 1000
Created a chunk of size 1877, which is longer than the specified 1000
Created a chunk of size 1849, which is longer than the specified 1000
Created a chunk of size 1124, which is longer than the specified 1000
Created a chunk of size 1387, which is longer than the specified 1000
Created a chunk of s

3164


In [7]:
from langchain.embeddings.openai import OpenAIEmbeddings

# Create the embedding client with its default settings; the recorded output
# below shows the resulting configuration (model='text-embedding-ada-002').
embeddings = OpenAIEmbeddings()
# Bare expression so the notebook displays the object's repr.
embeddings

OpenAIEmbeddings(client=<class 'openai.api_resources.embedding.Embedding'>, model='text-embedding-ada-002', deployment='text-embedding-ada-002', embedding_ctx_length=8191, openai_api_key=None, openai_organization=None, allowed_special=set(), disallowed_special='all', chunk_size=1000, max_retries=6)

In [10]:
from langchain.vectorstores import DeepLake

# Embed every chunk in `texts` with `embeddings` and store the result in a Deep Lake vector store.
# NOTE(review): the recorded output shows an existing ./deeplake/ dataset being
# loaded (empty at that point) before ingestion. Presumably re-running this cell
# against an already populated dataset would add the same chunks again; confirm
# before re-running.
db = DeepLake.from_documents(texts, embeddings)
# Bare expression so the notebook displays the store object.
db

./deeplake/ loaded successfully.



Deep Lake Dataset in ./deeplake/ already exists, loading from the storage


Dataset(path='./deeplake/', tensors=['embedding', 'ids', 'metadata', 'text'])

  tensor     htype    shape    dtype  compression
  -------   -------  -------  -------  ------- 
 embedding  generic   (0,)    float32   None   
    ids      text     (0,)      str     None   
 metadata    json     (0,)      str     None   
   text      text     (0,)      str     None   


Evaluating ingest: 100%|██████████| 4/4 [00:41<00:00


Dataset(path='./deeplake/', tensors=['embedding', 'ids', 'metadata', 'text'])

  tensor     htype      shape       dtype  compression
  -------   -------    -------     -------  ------- 
 embedding  generic  (3164, 1536)  float32   None   
    ids      text     (3164, 1)      str     None   
 metadata    json     (3164, 1)      str     None   
   text      text     (3164, 1)      str     None   




<langchain.vectorstores.deeplake.DeepLake at 0x18ae5a88950>

In [11]:
# Wrap the vector store in a retriever and set its search options in one step.
# NOTE(review): fetch_k and k are both 20; presumably maximal-marginal-relevance
# re-ranking is meant to pick k results out of a larger fetch_k pool. Confirm
# that the equal values are intended.
retriever = db.as_retriever()
retriever.search_kwargs.update({
    'distance_metric': 'cos',
    'fetch_k': 20,
    'maximal_marginal_relevance': True,
    'k': 20,
})

In [12]:
def filter(x):
    """Decide whether a stored sample should be kept by the retriever.

    A sample is rejected when its text contains 'something'. Otherwise it is
    kept only when its metadata 'source' contains 'only_this' or 'also_that'.
    The two literals are placeholders to be replaced with real criteria.

    Note: inside this notebook the name shadows Python's built-in ``filter``.
    """
    # Reject based on the source code itself.
    source_code = x['text'].data()['value']
    if 'something' in source_code:
        return False

    # Otherwise decide based on the path, e.g. its extension.
    source_path = x['metadata'].data()['value']['source']
    return any(marker in source_path for marker in ('only_this', 'also_that'))

### turn on below for custom filtering
# retriever.search_kwargs['filter'] = filter

In [13]:
from langchain.chat_models import ChatOpenAI
from langchain.chains import ConversationalRetrievalChain

# Chat model that answers the questions; the trailing comment lists the model
# names the author noted as options.
model = ChatOpenAI(model='gpt-3.5-turbo') # options noted by the author: 'ada', 'gpt-3.5-turbo', 'gpt-4'
# Question-answering chain that combines the chat model with the retriever configured earlier.
qa = ConversationalRetrievalChain.from_llm(model,retriever=retriever)

In [14]:

# Conversation so far, kept as (question, answer) pairs and shared with later cells.
chat_history = []

# Questions to ask in order; uncomment the extra ones to try them.
questions = [
    "What is the class hierarchy?",
    # "What classes are derived from the Chain class?",
    # "What classes and functions in the ./langchain/utilities/ folder are not covered by unit tests?",
    # "What one improvement do you propose in code in relation to the class hierarchy for the Chain class?",
]

for question in questions:
    result = qa({"question": question, "chat_history": chat_history})
    answer = result['answer']
    # Record the exchange so the next question is asked with this context.
    chat_history.append((question, answer))
    print(f"-> **Question**: {question} \n")
    print(f"**Answer**: {answer} \n")

-> **Question**: What is the class hierarchy? 

**Answer**: There are several class hierarchies in the provided context, which one are you specifically interested in? 



In [16]:
# Follow-up question, asked with the conversation accumulated so far.
question = "The Base Model."
result = qa(dict(question=question, chat_history=chat_history))
answer = result['answer']
# Record the exchange so later questions see it.
chat_history.append((question, answer))
print(f"-> **Question**: {question} \n")
print(f"**Answer**: {answer} \n")

-> **Question**: The Base Model. 

**Answer**: The Base Model is a class provided by the Pydantic library. It is not clear which Base Model you are referring to. However, based on the context provided, there are several classes that inherit from the Pydantic BaseModel. Here are some of them:

- ConstitutionalPrinciple
- EmbeddingStore
- Chain
- TracerSessionBase
- BaseRun
- APIRequestBody
- AttributeInfo
- HuggingFaceEmbeddings

The Base Model is used as a base class for creating data models with validation, serialization and deserialization capabilities. 



In [17]:
# Ask the chain to write example code, again passing the running conversation.
question = "Write Python code to get predictions from a simple LLM in langchain"
chain_inputs = {"question": question, "chat_history": chat_history}
result = qa(chain_inputs)
answer = result['answer']
# Record the exchange so later questions see it.
chat_history.append((question, answer))
print(f"-> **Question**: {question} \n")
print(f"**Answer**: {answer} \n")

-> **Question**: Write Python code to get predictions from a simple LLM in langchain 

**Answer**: Here's an example of how you can create an instance of an LLM and generate text:

```
from langchain.llms import OpenAI

llm = OpenAI()
text = "Hello, how are you doing today?"
output = llm.generate(text)
print(output)
```

Note that you can use other LLMs as well by importing them and initializing an instance of the class. 



In [21]:
question = "Write Python code that uses Langchain to get predictions for the name of a company that sells fuzzy socks."
# This question is asked without any prior conversation. An empty list is passed
# so the value has the same type as the list of (question, answer) pairs the
# other cells pass as chat_history.
result = qa({"question": question, "chat_history": []})
# This exchange is not added to chat_history.
print(f"-> **Question**: {question} \n")
print(f"**Answer**: {result['answer']} \n")

-> **Question**: Write Python code that uses Langchain to get predictions for the name of a company that sells fuzzy socks. 

**Answer**: Here's an example of how to use Langchain to generate text:

```python
from langchain.llms import OpenAI
from langchain.agents import initialize_agent, AgentType

llm = OpenAI(model_name="text-davinci-002")

# Define a prompt to generate the text
prompt = "What is the name of a company that sells fuzzy socks?"

# Generate text using Langchain
output = llm(prompt)

# Print the generated text
print(output)
```

This will use the OpenAI API to generate text based on the given prompt. The output will be the generated text, which will hopefully include the name of a company that sells fuzzy socks. Note that the quality of the output will depend on the specific language model being used and the quality of the prompt. 

