In [1]:
from langchain.document_loaders import JSONLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

This pipeline takes a given `docs.json`, pulls each document out of it, splits each document into chunks, and writes every chunk (serialized as JSON, together with its metadata) to its own `.txt` file.

Before running the pipeline, create the input repo and upload `docs.json` to it:

```
pachctl create repo docs
pachctl put file docs@master -f ask-docs/embeddings/docs.json
```

In [2]:
# Path of the docs index read by the loader; the file is the `docs.json`
# uploaded to the `docs` repo with the pachctl commands above.
docs_index_path = "/pfs/docs/docs.json"

# jq expression applied to the index. The index is expected to be a top-level
# array of objects such as [{"body": ...}], so `.[]` yields one record per
# document; see the JSONLoader docs for more on the schema syntax.
docs_index_schema = ".[]"

In [14]:
# Splitter used to cut each document into chunks: chunk size of 200 with no
# overlap between consecutive chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=200,
    chunk_overlap=0,
)

In [6]:
def metadata_func(record: dict, metadata: dict) -> dict:
    """Copy the record's ``title`` and ``relURI`` into the document metadata.

    Args:
        record: One raw record from the docs index.
        metadata: Metadata dict to extend; it is modified in place.

    Returns:
        The same ``metadata`` dict, now carrying ``title`` and ``relURI``.
        A field missing from ``record`` is stored as ``None``.
    """
    for field in ("title", "relURI"):
        metadata[field] = record.get(field)
    return metadata

In [15]:
# Build the loader for the docs index: the jq schema picks out the records,
# `content_key` names the record field used as the document text, and
# `metadata_func` adds the title / relURI to each document's metadata.
loader = JSONLoader(
    docs_index_path,
    jq_schema=docs_index_schema,
    content_key="body",
    metadata_func=metadata_func,
)

In [16]:
# Run the loader to read the documents out of the docs index.
data = loader.load()

In [21]:
# Sanity check: number of documents loaded from the index.
len(data)

402

In [23]:
# Inspect the first loaded document (page content plus metadata).
data[0]

Document(page_content=' Pachyderm is a data science platform that provides data-driven pipelines with version control and autoscaling. It is container-native, allowing developers to use the languages and libraries that are best suited to their needs, and runs across all major cloud providers and on-premises installations.\nThe platform is built on Kubernetes and integrates with standard tools for CI/CD, logging, authentication, and data APIs, making it scalable and incredibly flexible. Pachyderm’s data-driven pipelines allow you to automatically trigger data processing based on changes in your data, and the platform’s autoscaling capabilities ensure that resource utilization is optimized, maximizing developer efficiency.\n', metadata={'source': '/pfs/docs/docs.json', 'seq_num': 1, 'title': 'Overview', 'relURI': '/latest/overview/'})

In [17]:
# Split every loaded document into chunks; `texts` is the list of resulting chunks.
texts = text_splitter.split_documents(data) 

In [20]:
# Inspect the first chunk; it carries the metadata of the document it came from.
texts[0]

Document(page_content='Pachyderm is a data science platform that provides data-driven pipelines with version control and autoscaling. It is container-native, allowing developers to use the languages and libraries that are', metadata={'source': '/pfs/docs/docs.json', 'seq_num': 1, 'title': 'Overview', 'relURI': '/latest/overview/'})

In [24]:
# Sanity check: total number of chunks produced by the splitter.
len(texts)

4875

In [39]:
# Write every chunk to its own file in the output directory, named after its
# position in `texts` (0.txt, 1.txt, ...). Each file holds the chunk's
# `.json()` serialization, not just the bare text.
dirName = '/pfs/out/'
for idx, text in enumerate(texts):
    file_name = f"{dirName}{idx}.txt"
    # Pin the encoding so the bytes written never depend on the locale of the
    # environment the pipeline happens to run in.
    with open(file_name, 'w', encoding='utf-8') as f:
        f.write(text.json())
