In [1]:
from typing import List, Tuple
import requests

json_data = requests.get("https://api.smith.langchain.com/openapi.json").json()

text_data = """
Text mining, text data mining (TDM) or text analytics is the process of deriving high-quality information from text. 
It involves "the discovery by computer of new, previously unknown information, by automatically extracting information from different written resources."
[1] Written resources may include websites, books, emails, reviews, and articles. 
High-quality information is typically obtained by devising patterns and trends by means such as statistical pattern learning. 
According to Hotho et al. (2005) we can distinguish between three different perspectives of text mining: information extraction, data mining, 
and a knowledge discovery in databases (KDD) process.
[2] Text mining usually involves the process of structuring the input text (usually parsing, 
along with the addition of some derived linguistic features and the removal of others, 
and subsequent insertion into a database), deriving patterns within the structured data, and finally evaluation and interpretation of the output. 
'High quality' in text mining usually refers to some combination of relevance, novelty, and interest. 
Typical text mining tasks include text categorization, text clustering, concept/entity extraction, 
production of granular taxonomies, sentiment analysis, document summarization, and entity relation modeling (i.e., learning relations between named entities).
"""

markdown_data = "# Foo\n\n    ## Bar\n\nHi this is Jim\n\nHi this is Joe\n\n ### Boo \n\n Hi this is Lance \n\n ## Baz\n\n Hi this is Molly"

html_data = """

<h2 id="data">Data</h2>

<p>I searched GitHub using the keywords <code class="language-plaintext highlighter-rouge">gpt</code>, <code class="language-plaintext highlighter-rouge">llm</code>, and <code class="language-plaintext highlighter-rouge">generative ai</code>. If AI feels so overwhelming right now, it’s because it is. There are 118K results for <code class="language-plaintext highlighter-rouge">gpt</code> alone.</p>

<p>To make my life easier, I limited my search to the repos with at least 500 stars. There were 590 results for <code class="language-plaintext highlighter-rouge">llm</code>, 531 for <code class="language-plaintext highlighter-rouge">gpt</code>, and 38 for <code class="language-plaintext highlighter-rouge">generative ai</code>. I also occasionally checked GitHub trending and social media for new repos.</p>

<p>After MANY hours, I found 896 repos. Of these, 51 are tutorials (e.g. <a href="https://github.com/dair-ai/Prompt-Engineering-Guide">dair-ai/Prompt-Engineering-Guide</a>) and aggregated lists (e.g. <a href="https://github.com/f/awesome-chatgpt-prompts">f/awesome-chatgpt-prompts</a>). While these tutorials and lists are helpful, I’m more interested in software. I still include them in the final list, but the analysis is done with the 845 software repositories.</p>

<p>It was a painful but rewarding process. It gave me a much better understanding of what people are working on, how incredibly collaborative the open source community is, and just how much China’s open source ecosystem diverges from the Western one.</p>

<h3 id="add_missing_repos">Add missing repos</h3>

<p>I undoubtedly missed a ton of repos. You can submit the missing repos <a href="https://forms.gle/1ijNSnizgWQaVYK16">here</a>. The list will be automatically updated every day.</p>

<p>Feel free to submit the repos with less than 500 stars. I’ll continue tracking them and add them to the list when they reach 500 stars!</p>

<h2 id="the_new_ai_stack">The New AI Stack</h2>

<p>I think of the AI stack as consisting of 4 layers: infrastructure, model development, application development, and applications.</p>

<center>
    <figure>
    <img alt="Generative AI Stack" src="/assets/pics/ai-oss/1-ai-stack.png" style="float: center; max-width: 100%; margin: 0 0 0em 0em" />
    </figure>
</center>
<p><br /></p>

<ol>
  <li>
    <p><strong>Infrastructure</strong></p>

    <p>At the bottom is the stack is infrastructure, which includes toolings for serving (<a href="https://github.com/vllm-project/vllm">vllm</a>, <a href="https://github.com/triton-inference-server/server">NVIDIA’s Triton</a>), compute management (<a href="https://github.com/skypilot-org/skypilot">skypilot</a>), vector search and database (<a href="https://github.com/facebookresearch/faiss">faiss</a>, <a href="https://milvus.io/">milvus</a>, <a href="https://github.com/qdrant/qdrant">qdrant</a>, <a href="https://github.com/lancedb/lancedb">lancedb</a>), ….</p>
  </li>
  <li>
    <p><strong>Model development</strong></p>

    <p>This layer provides toolings for developing models, including frameworks for modeling &amp; training (transformers, pytorch, DeepSpeed), inference optimization (ggml, openai/triton), dataset engineering, evaluation, ….. Anything that involves changing a model’s weights happens in this layer, including finetuning.</p>
  </li>
  <li>
    <p><strong>Application development</strong>
 With readily available models, anyone can develop applications on top of them. This is the layer that has seen the most actions in the last 2 years and is still rapidly evolving. This layer is also known as AI engineering.</p>

    <p>Application development involves prompt engineering, RAG, AI interface, …</p>
  </li>
  <li>
    <p><strong>Applications</strong></p>

    <p>There are many open sourced applications built on top of existing models. The most popular types of applications are coding, workflow automation, information aggregation, …</p>
  </li>
</ol>
"""


code_data = """
def sample_top_p(probs: torch.Tensor, p: float):
    assert 0 <= p <= 1

    probs_sort, probs_idx = torch.sort(probs, dim=-1, descending=True)
    probs_sum = torch.cumsum(probs_sort, dim=-1)
    mask = probs_sum - probs_sort > p
    probs_sort[mask] = 0.0
    probs_sort.div_(probs_sort.sum(dim=-1, keepdim=True))
    next_token = torch.multinomial(probs_sort, num_samples=1)
    return torch.gather(probs_idx, -1, next_token)


def sample(logits: torch.Tensor, temperature: float, top_p: float):
    if temperature > 0:
        probs = torch.softmax(logits / temperature, dim=-1)
        next_token = sample_top_p(probs, top_p)
    else:
        next_token = torch.argmax(logits, dim=-1).unsqueeze(0)

    return next_token.reshape(-1)
"""

# 1. Character Splitter

In [2]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [3]:
chunk_size = 300
chunk_overlap = 30
separators: List[str] = ['\n\n', '\n', ' ', '']

char_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
    length_function=len,
    is_separator_regex=False,
    separators=separators
)

## 1.1 Split text

In [4]:
text_out = char_splitter.split_text(text_data)
len(text_out)

6

In [5]:
text_out[0]

'Text mining, text data mining (TDM) or text analytics is the process of deriving high-quality information from text. \nIt involves "the discovery by computer of new, previously unknown information, by automatically extracting information from different written resources."'

## 1.2 Create documents

In [6]:
docs = char_splitter.create_documents(text_out)
len(docs)

6

In [7]:
docs

[Document(page_content='Text mining, text data mining (TDM) or text analytics is the process of deriving high-quality information from text. \nIt involves "the discovery by computer of new, previously unknown information, by automatically extracting information from different written resources."'),
 Document(page_content='[1] Written resources may include websites, books, emails, reviews, and articles. \nHigh-quality information is typically obtained by devising patterns and trends by means such as statistical pattern learning.'),
 Document(page_content='According to Hotho et al. (2005) we can distinguish between three different perspectives of text mining: information extraction, data mining, \nand a knowledge discovery in databases (KDD) process.\n[2] Text mining usually involves the process of structuring the input text (usually parsing,'),
 Document(page_content='along with the addition of some derived linguistic features and the removal of others, \nand subsequent insertion into a

# 1.3 Split Documents

In [32]:
small_splitter = RecursiveCharacterTextSplitter(
    chunk_size=50,
    chunk_overlap=0,
)
small_docs = small_splitter.split_documents(docs)
len(small_docs)

23

In [33]:
small_docs[0]

Document(page_content='def sample_top_p(probs: torch.Tensor, p: float):')

# 2. JSON Splitter

In [10]:
from langchain_text_splitters import RecursiveJsonSplitter

In [11]:
max_chunk_size = 300
json_splitter = RecursiveJsonSplitter(max_chunk_size=max_chunk_size)

## 2.1 Split json

In [12]:
json_out = json_splitter.split_json(json_data)
len(json_out)

1139

In [13]:
json_out[0]

{'openapi': '3.1.0',
 'info': {'title': 'LangSmith', 'version': '0.1.0'},
 'servers': [{'url': 'https://api.smith.langchain.com',
   'description': 'LangSmith API endpoint.'}]}

## 2.2 Split as text


In [14]:
text_out = json_splitter.split_text(json_data)
len(text_out)

2277

In [15]:
text_out[0]

'{"openapi": "3.1.0", "info": {"title": "LangSmith", "version": "0.1.0"}, "servers": [{"url": "https://api.smith.langchain.com", "description": "LangSmith API endpoint."}]}'

## 2.3 Create documents

In [16]:
docs = json_splitter.create_documents(texts=[json_data])
len(docs)

3415

In [17]:
docs[0]

Document(page_content='{"openapi": "3.1.0", "info": {"title": "LangSmith", "version": "0.1.0"}, "servers": [{"url": "https://api.smith.langchain.com", "description": "LangSmith API endpoint."}]}')

In [18]:
docs[0].page_content

'{"openapi": "3.1.0", "info": {"title": "LangSmith", "version": "0.1.0"}, "servers": [{"url": "https://api.smith.langchain.com", "description": "LangSmith API endpoint."}]}'

# 3. Markdown Splitter

In [19]:
from langchain_text_splitters import MarkdownHeaderTextSplitter

In [20]:
headers_to_split_on: List[tuple] = [("#", "Header 1"),
                                    ("##", "Header 2"),
                                    ("###", "Header 3"),]

markdown_splitter = MarkdownHeaderTextSplitter(
    headers_to_split_on=headers_to_split_on
)

In [21]:
docs = markdown_splitter.split_text(text=markdown_data)
len(docs)

3

In [22]:
docs[0]

Document(page_content='Hi this is Jim  \nHi this is Joe', metadata={'Header 1': 'Foo', 'Header 2': 'Bar'})

# 4. HTML Splitter

In [23]:
from langchain_text_splitters import HTMLHeaderTextSplitter

In [24]:
headers_to_split_on: List[Tuple[str, str]] = [
    ("h1", "Header 1"),
    ("h2", "Header 2"),
    ("h3", "Header 3"),
]
html_splitter = HTMLHeaderTextSplitter(
    headers_to_split_on=headers_to_split_on
)

In [25]:
docs = html_splitter.split_text(text=html_data)
len(docs)

3

In [26]:
docs[0]

Document(page_content='I searched GitHub using the keywords gpt, llm, and generative ai. If AI feels so overwhelming right now, it’s because it is. There are 118K results for gpt alone.  \nTo make my life easier, I limited my search to the repos with at least 500 stars. There were 590 results for llm, 531 for gpt, and 38 for generative ai. I also occasionally checked GitHub trending and social media for new repos.  \nAfter MANY hours, I found 896 repos. Of these, 51 are tutorials (e.g. dair-ai/Prompt-Engineering-Guide) and aggregated lists (e.g. f/awesome-chatgpt-prompts). While these tutorials and lists are helpful, I’m more interested in software. I still include them in the final list, but the analysis is done with the 845 software repositories.  \nIt was a painful but rewarding process. It gave me a much better understanding of what people are working on, how incredibly collaborative the open source community is, and just how much China’s open source ecosystem diverges from the Wes

# 4. Code Splitter

In [27]:
from langchain_text_splitters import RecursiveCharacterTextSplitter, Language

In [28]:
langaue_dict = {e.value: e for e in Language}
langaue_dict

{'cpp': <Language.CPP: 'cpp'>,
 'go': <Language.GO: 'go'>,
 'java': <Language.JAVA: 'java'>,
 'kotlin': <Language.KOTLIN: 'kotlin'>,
 'js': <Language.JS: 'js'>,
 'ts': <Language.TS: 'ts'>,
 'php': <Language.PHP: 'php'>,
 'proto': <Language.PROTO: 'proto'>,
 'python': <Language.PYTHON: 'python'>,
 'rst': <Language.RST: 'rst'>,
 'ruby': <Language.RUBY: 'ruby'>,
 'rust': <Language.RUST: 'rust'>,
 'scala': <Language.SCALA: 'scala'>,
 'swift': <Language.SWIFT: 'swift'>,
 'markdown': <Language.MARKDOWN: 'markdown'>,
 'latex': <Language.LATEX: 'latex'>,
 'html': <Language.HTML: 'html'>,
 'sol': <Language.SOL: 'sol'>,
 'csharp': <Language.CSHARP: 'csharp'>,
 'cobol': <Language.COBOL: 'cobol'>,
 'c': <Language.C: 'c'>,
 'lua': <Language.LUA: 'lua'>,
 'perl': <Language.PERL: 'perl'>}

In [29]:
chunk_size = 300
chunk_overlap = 30

code_splitter = RecursiveCharacterTextSplitter.from_language(
    language = langaue_dict["python"],
    chunk_size = chunk_size,
    chunk_overlap = chunk_overlap
)


In [30]:
docs = code_splitter.create_documents(texts=[code_data])
len(docs)

5

In [31]:
docs[0]

Document(page_content='def sample_top_p(probs: torch.Tensor, p: float):\n    assert 0 <= p <= 1')