Create your own map-reduce summarization in which the map step runs in parallel and the reduce step runs sequentially.

In [1]:
import os
from langchain_community.chat_models import ChatOpenAI
from langchain_community.document_loaders import DirectoryLoader, UnstructuredMarkdownLoader
from langchain_text_splitters import MarkdownHeaderTextSplitter, RecursiveCharacterTextSplitter
from langchain import hub
from langchain_core.prompts.prompt import PromptTemplate
from langchain.chains.llm import LLMChain
from langchain.chains.combine_documents.stuff import StuffDocumentsChain

from langchain.chains.combine_documents.stuff import StuffDocumentsChain
from langchain.chains.llm import LLMChain
from langchain_core.prompts import PromptTemplate


In [2]:
def load_docs(path):
    """Collect the Markdown documents found under ``path`` and its sub-directories.

    The directory tree is traversed with ``os.walk``; every visited directory
    gets its own ``DirectoryLoader`` (glob ``./*.md``, progress bar enabled,
    ``UnstructuredMarkdownLoader`` as the per-file loader).

    Args:
        path: Root directory to scan; relative paths are made absolute first.

    Returns:
        A flat list with the documents returned by every loader, in walk order.
    """
    root_dir = os.path.normpath(os.path.abspath(path))
    collected = []

    for current_dir, _dirnames, _filenames in os.walk(root_dir):
        # os.walk yields paths already rooted at the absolute root_dir,
        # so they can be handed to the loader as they are.
        directory_loader = DirectoryLoader(
            current_dir,
            glob="./*.md",
            show_progress=True,
            loader_cls=UnstructuredMarkdownLoader,
        )
        collected.extend(directory_loader.load())

    return collected


In [3]:
def split_documents(doc, chunk_size=250, chunk_overlap=30):
    """Break one document into chunks in two passes.

    First the text in ``doc.page_content`` is split on Markdown headers of
    level 1 to 3 (``strip_headers=False`` is passed to the header splitter).
    The resulting sections are then passed through a
    ``RecursiveCharacterTextSplitter`` configured with the given sizes.

    Args:
        doc: Object exposing the Markdown text as ``page_content``.
        chunk_size: ``chunk_size`` forwarded to the character splitter.
        chunk_overlap: ``chunk_overlap`` forwarded to the character splitter.

    Returns:
        Whatever ``RecursiveCharacterTextSplitter.split_documents`` returns
        for the header-level sections.
    """
    header_sections = MarkdownHeaderTextSplitter(
        headers_to_split_on=[
            ("#", "Header 1"),
            ("##", "Header 2"),
            ("###", "Header 3"),
        ],
        strip_headers=False,
    ).split_text(doc.page_content)

    size_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    return size_splitter.split_documents(header_sections)

In [4]:
# Root folder holding the Markdown docs to summarize. Set the MARKDOWN_DOCS_DIR
# environment variable to point the notebook at another folder; the original
# hard-coded location is kept as the default.
DOCS_DIR = os.environ.get("MARKDOWN_DOCS_DIR", "C:\\Users\\reply\\RepoAgent\\markdown_docs")
docs = load_docs(DOCS_DIR)

0it [00:00, ?it/s]
0it [00:00, ?it/s]
100%|██████████| 2/2 [00:24<00:00, 12.44s/it]
100%|██████████| 13/13 [00:07<00:00,  1.67it/s]
100%|██████████| 6/6 [00:02<00:00,  2.69it/s]
100%|██████████| 6/6 [00:02<00:00,  2.90it/s]
100%|██████████| 2/2 [00:00<00:00,  3.07it/s]
100%|██████████| 8/8 [00:02<00:00,  3.47it/s]


In [53]:
# Split every loaded document into large chunks (chunk_size=5000, no overlap)
# and tag each chunk with the file name of the document it came from.
all_splits = []
for doc in docs:
    splits = split_documents(doc, chunk_size=5000, chunk_overlap=0)
    if splits:
        # The file name is identical for all chunks of a document, so resolve it
        # once per document. Prefer the explicit "source" key and fall back to
        # the first metadata value (the original positional lookup).
        doc_metadata = doc.metadata
        if "source" in doc_metadata:
            source_path = doc_metadata["source"]
        else:
            source_path = list(doc_metadata.values())[0]
        filename = os.path.basename(source_path)
        for doc_split in splits:
            # Replaces the chunk's metadata entirely; only the file name is kept.
            doc_split.metadata = {'source': filename}
    all_splits.extend(splits)

In [52]:

# Chat model used by the chains built in this cell.
# NOTE(review): no cell in this notebook defined `llm`, so a fresh kernel failed
# here with NameError. Default ChatOpenAI settings are an assumption - set the
# model/temperature actually used for the recorded outputs if they differ.
llm = ChatOpenAI()

# Define prompt: each chunk is inserted as {text} and summarized on its own.
prompt_template = """Write a concise summary of the following:
"{text}"
CONCISE SUMMARY:"""
prompt = PromptTemplate.from_template(prompt_template)

llm_chain = LLMChain(llm=llm, prompt=prompt)

# Define StuffDocumentsChain: the documents it receives are placed into the
# prompt variable named "text".
stuff_chain = StuffDocumentsChain(llm_chain=llm_chain, document_variable_name="text")



In [60]:
def get_summary(stuff_chain, docs):
    """Summarize the documents one after another (sequential map step).

    Args:
        stuff_chain: Chain whose ``invoke`` accepts a one-element list of
            documents and returns a mapping with an ``"output_text"`` entry.
        docs: Iterable of documents to summarize.

    Returns:
        A list with one summary per input document, in input order.
    """
    # One list item per document. The previous `summary.extend(<str>)` iterated
    # over the returned string and stored every character as a separate item.
    return [stuff_chain.invoke([doc])["output_text"] for doc in docs]

In [47]:
# Number of documents held in `docs`.
len(docs)

37

In [None]:
# Leftover scratch cell: the bare `len()` call that was here raises TypeError
# (len() takes exactly one argument) and was never executed, so it is disabled.

In [54]:
# Number of chunks collected in `all_splits`.
len(all_splits)

114

Timings (the number is the `chunk_size`; `mr` = map-reduce):

- s 1500 -> 8.38 min
- s 5000 -> 3.59 min
- mr -> 2.26 min
- s 5000 + parallel -> 18 sec

Reduce step: 4 sec

In [56]:
# Chunk count right before the summarization run.
len(all_splits)

114

In [55]:
# Sequential baseline: summarize every chunk one at a time with the stuff chain.
get_summary(stuff_chain, all_splits)

The main function copies Markdown documentation files from a specified folder to a destination folder, creates a README.md file if it does not exist, and organizes the copied files accordingly. It checks if the destination directory exists and creates it if necessary. It then iterates through the items in the source directory and copies them to the destination directory. After copying all the files, it checks if a README.md file exists in the destination directory and creates one if necessary. The create_book_readme_if_not_exist function creates a README.md file in the specified directory if it does not already exist.
The given code includes four functions: create_readme_if_not_exist, output_markdown, markdown_file_in_dir, and is_markdown_file. 

The create_readme_if_not_exist function creates a README.md file in a specified directory if it does not already exist. 

The output_markdown function generates a markdown summary of files and directories within a specified directory, includin

In [51]:
# Chunk count again. NOTE(review): the recorded output for this cell carries a
# different execution number than the other counts, so it may come from a run
# with other split settings - confirm before relying on it.
len(all_splits)

1174

In [57]:
import concurrent.futures


In [67]:
def get_parallel_summary(stuff_chain, docs, max_workers=None):
    """Summarize the documents concurrently (parallel map step).

    Each document is sent to ``stuff_chain`` from a worker thread of a
    ``ThreadPoolExecutor``.

    Args:
        stuff_chain: Chain whose ``invoke`` accepts a one-element list of
            documents and returns a mapping with an ``"output_text"`` entry.
        docs: Iterable of documents to summarize.
        max_workers: Upper bound on concurrent worker threads. ``None`` (the
            default) keeps the executor's own default, as before; pass a small
            number to throttle concurrent requests.

    Returns:
        A list with one summary per input document, in input order.

    Raises:
        Any exception raised by ``stuff_chain.invoke`` for a document is
        re-raised when its result is collected.
    """
    def summarize_one(doc):
        return stuff_chain.invoke([doc])["output_text"]

    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        # executor.map yields results in the order of `docs`, not completion order.
        return list(executor.map(summarize_one, docs))

In [68]:
# Map step: summarize all chunks concurrently; `summary` holds one result per chunk.
summary = get_parallel_summary(stuff_chain, all_splits)

In [64]:
# Reduce prompt: the per-chunk summaries are inserted as {docs} and the model is
# asked for a single consolidated summary in a "# title / Project Description" layout.
# NOTE(review): the template mentions "the desired language" but has no
# placeholder for one - confirm whether a language variable was intended.
reduce_template = """The following is a set of summaries: {docs}
        Please distill these summaries into a final, consolidated summary of the overall contents. Ensure the final summary captures the main points and key details from each document.
        Helpful Answer:
    
        The standard format is as follows:

        # title: 
        ** Project Description:**  summary of the project

        Please note:
        - Write mainly in the desired language. If necessary, you can write with some english words in the analysis and description to enhance the document's readability because you do not need to translate the function name or variable name into the target language.

        """
reduce_prompt = PromptTemplate.from_template(reduce_template)
# `llm` must already be defined by an earlier cell when this line runs.
reduce_chain = LLMChain(llm=llm, prompt=reduce_prompt)

In [72]:
# Reduce step: merge the per-chunk summaries into one final summary.
# The summaries are joined into plain text and passed under the prompt's "docs"
# variable; handing over the raw list put its Python repr (brackets, quotes,
# escaped newlines) into the prompt.
reduce_chain.invoke({"docs": "\n\n".join(summary)})["text"]

'# title: Code Documentation Generation Project\n\n**Project Description:** This project focuses on generating documentation for code objects in a repository. It involves various functionalities such as copying Markdown files, creating README.md files, organizing files, analyzing file differences, identifying changes in structure, retrieving unstaged files, extracting import statements, generating documentation for functions or classes, handling errors, and managing metadata. The project utilizes libraries and tools such as GitPython, OpenAI, Gradio, and ChromaDB. The code includes classes and functions for different tasks, including file handling, text analysis, chat interactions, and document summarization. Unit tests are provided to ensure the functionality of the implemented methods. The project aims to automate the process of generating comprehensive and accurate documentation for code objects in a repository.'