## Using GitLoader (Langchain) to load git repos.

In [17]:
from langchain.document_loaders import GitLoader
from langchain.schema import Document
import json
from typing import Iterable

In [18]:
# The original approach loads too many files. 
# Instead, focus on Tcl, Cpp files

cpp_exts = ['.cpp', 'cc', 'c++',
            'hpp', 'hh', 'h++', 'h']
other_exts = ['.tcl', '.i', '.py', '.md']
exts = cpp_exts + other_exts

loader = GitLoader(
    clone_url="https://github.com/The-OpenROAD-Project/OpenROAD",
    repo_path="./data",
    branch="master",
    file_filter = lambda file_path: any(file_path.endswith(ext) for ext in exts)
)

data = loader.load_and_split()
data

 Document(page_content="### 3. Temporary Ban\n\n**Community Impact**: A serious violation of community standards, including\nsustained inappropriate behavior.\n\n**Consequence**: A temporary ban from any sort of interaction or public\ncommunication with the community for a specified period of time. No public or\nprivate interaction with the people involved, including unsolicited interaction\nwith those enforcing the Code of Conduct, is allowed during this period.\nViolating these terms may lead to a permanent ban.\n\n### 4. Permanent Ban\n\n**Community Impact**: Demonstrating a pattern of violation of community\nstandards, including sustained inappropriate behavior,  harassment of an\nindividual, or aggression toward or disparagement of classes of individuals.\n\n**Consequence**: A permanent ban from any sort of public interaction within\nthe community.\n\n## Attribution\n\nThis Code of Conduct is adapted from the [Contributor Covenant][homepage],\nversion 2.0, available at\nhttps://ww

In [19]:
len(data)

7037

In [20]:
# Code is adapted from https://github.com/langchain-ai/langchain/issues/3016
def save_docs_to_jsonl(array:Iterable[Document], file_path:str)->None:
    with open(file_path, 'w') as jsonl_file:
        for doc in array:
            jsonl_file.write(doc.json() + '\n')

def load_docs_from_jsonl(file_path)->Iterable[Document]:
    array = []
    with open(file_path, 'r') as jsonl_file:
        for line in jsonl_file:
            data = json.loads(line)
            obj = Document(**data)
            array.append(obj)
    return array
    
save_docs_to_jsonl(data,'tempdata/data.jsonl')
data2=load_docs_from_jsonl('tempdata/data.jsonl')
assert len(data) == len(data2)
