In [1]:
%pip install -qU langchain-text-splitters

Note: you may need to restart the kernel to use updated packages.


In [17]:
# Sample Markdown document (headings, a bullet list, a table, and a fenced
# code block) used as input for the Markdown-aware splitter.
markdown_text = '''
# 🧠 Large Language Models (LLMs)

Large Language Models (LLMs) are machine learning models trained to understand and generate human language. They’re the tech behind things like ChatGPT, Claude, and Google Gemini.

## 🤖 What Makes LLMs Cool?

- Trained on massive datasets (like terabytes of text)
- Can do all kinds of tasks: writing, coding, translating, summarizing
- Powered by deep neural networks (usually Transformers)

## ✨ Popular LLMs

| Model Name     | Creator        | Params (approx.) | Notable Feature              |
|----------------|----------------|------------------|------------------------------|
| GPT-4          | OpenAI         | 1T+?             | Multimodal (text + image)    |
| Claude 3       | Anthropic      | 100B+            | Harmlessness-focused         |
| Gemini 1.5     | Google DeepMind| Unknown          | Long-context reasoning       |
| LLaMA 3        | Meta           | 8B - 400B        | Open-source friendly         |

## 🔧 Basic Use Example (Python)

```python
from transformers import pipeline

generator = pipeline("text-generation", model="gpt2")
output = generator("Once upon a time, ", max_length=30)
print(output[0]['generated_text'])
```
'''

In [18]:
from langchain_text_splitters import MarkdownTextSplitter
from rich import print

# Markdown-aware splitter: chunks are capped at 100 with an overlap of 20
# (sizes are measured by the splitter's length function, character count by default).
markdown_splitter = MarkdownTextSplitter(
    chunk_size=100,
    chunk_overlap=20,
)

# Wrap each chunk of the sample Markdown in a document object.
docs = markdown_splitter.create_documents([markdown_text])

# `print` is rich's print here (imported above), so the list is pretty-printed.
print(docs)

In [22]:
# Sample Python source (one class plus a short usage script) used as input
# for the Python-aware splitter. Each method is defined exactly once.
python_text = '''
class Restaurant:
    def __init__(self, name: str, address: str, phone: str, menu: list[str]):
        self.name = name
        self.address = address
        self.phone = phone
        self.menu = menu

    def get_menu(self):
        print(f"The menu for {self.name} is {self.menu}")
        return self.menu

    def get_name(self):
        print(f"The name of the restaurant is {self.name}")
        return self.name

    def get_address(self):
        print(f"The address of the restaurant is {self.address}")
        return self.address

    def get_phone(self):
        print(f"The phone number of the restaurant is {self.phone}")
        return self.phone

my_restaurant = Restaurant(name="The Restaurant", address="123 Main St", phone="123-456-7890", menu=["Pizza", "Salad", "Soda"])

my_restaurant.get_menu()
my_restaurant.get_name()
my_restaurant.get_address()
my_restaurant.get_phone()
'''

In [23]:
from langchain_text_splitters import PythonCodeTextSplitter
from rich import print

# Python-aware splitter: chunks are capped at 100 with an overlap of 20
# (sizes are measured by the splitter's length function, character count by default).
python_splitter = PythonCodeTextSplitter(
    chunk_size=100,
    chunk_overlap=20,
)

# Split the sample Python source into documents and pretty-print them
# (`print` is rich's print, imported above).
docs = python_splitter.create_documents([python_text])
print(docs)

In [24]:
# Sample JavaScript source (one class plus a short usage script) used as
# input for the JavaScript-aware splitter. This is a plain string, not an
# f-string, so the `${...}` template placeholders are kept literally.
javascript_text = '''
class Restaurant {
    constructor(name, address, phone, menu) {
        this.name = name;
        this.address = address;
        this.phone = phone;
        this.menu = menu;
    }

    getMenu() {
        console.log(`The menu for ${this.name} is ${this.menu}`);
        return this.menu;
    }

    getName() {
        console.log(`The name of the restaurant is ${this.name}`);
        return this.name;
    }

    getAddress() {
        console.log(`The address of the restaurant is ${this.address}`);
        return this.address;
    }

    getPhone() {
        console.log(`The phone number of the restaurant is ${this.phone}`);
        return this.phone;
    }
}


const myRestaurant = new Restaurant("The Restaurant", "123 Main St", "123-456-7890", ["Pizza", "Salad", "Soda"]);

myRestaurant.getMenu();
myRestaurant.getName();
myRestaurant.getAddress();
myRestaurant.getPhone();
'''

In [26]:
from langchain_text_splitters import RecursiveCharacterTextSplitter, Language
from rich import print

# Recursive splitter configured for JavaScript via `Language.JS`; chunks are
# capped at 100 with an overlap of 20 (sizes are measured by the splitter's
# length function, character count by default).
javascript_splitter = RecursiveCharacterTextSplitter.from_language(
    language=Language.JS,
    chunk_size=100,
    chunk_overlap=20,
)

# Split the sample JavaScript source into documents and pretty-print them
# (`print` is rich's print, imported above).
docs = javascript_splitter.create_documents([javascript_text])
print(docs)