In [3]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

**Data**

In [4]:
import requests

In [5]:
url = "https://raw.githubusercontent.com/progit/progit2/main/book/01-introduction/sections/what-is-git.asc"
src_text = requests.get(url).text

In [6]:
len(src_text)

8069

In [9]:
print(src_text[:1000])

[[what_is_git_section]]
=== What is Git?

So, what is Git in a nutshell?
This is an important section to absorb, because if you understand what Git is and the fundamentals of how it works, then using Git effectively will probably be much easier for you.
As you learn Git, try to clear your mind of the things you may know about other VCSs, such as CVS, Subversion or Perforce -- doing so will help you avoid subtle confusion when using the tool.
Even though Git's user interface is fairly similar to these other VCSs, Git stores and thinks about information in a very different way, and understanding these differences will help you avoid becoming confused while using it.(((Subversion)))(((Perforce)))

==== Snapshots, Not Differences

The major difference between Git and any other VCS (Subversion and friends included) is the way Git thinks about its data.
Conceptually, most other systems store information as a list of file-based changes.
These other systems (CVS, Subversion, Perforce, and so o

**Fixed-Size Chunking**

**Without Overlap**

In [10]:
def chunks_fix_size(text, chunk_size):
    words = text.split()

    chunks = []
    for i in range(0,len(words),chunk_size):
        chunk_words = words[i:i+chunk_size]
        chunk = " ".join(chunk_words)
        chunks.append(chunk)
    return chunks

In [11]:
fix_size_chunks = chunks_fix_size(src_text,100)
len(fix_size_chunks)

15

In [12]:
fix_size_chunks[0:3]

["[[what_is_git_section]] === What is Git? So, what is Git in a nutshell? This is an important section to absorb, because if you understand what Git is and the fundamentals of how it works, then using Git effectively will probably be much easier for you. As you learn Git, try to clear your mind of the things you may know about other VCSs, such as CVS, Subversion or Perforce -- doing so will help you avoid subtle confusion when using the tool. Even though Git's user interface is fairly similar to these other VCSs, Git stores and thinks about information in",
 'a very different way, and understanding these differences will help you avoid becoming confused while using it.(((Subversion)))(((Perforce))) ==== Snapshots, Not Differences The major difference between Git and any other VCS (Subversion and friends included) is the way Git thinks about its data. Conceptually, most other systems store information as a list of file-based changes. These other systems (CVS, Subversion, Perforce, and s

**With Overlap**

In [13]:
def chunks_fix_size_overlap(text, chunk_size, ov_fraction):
    words = text.split()
    overlap_int = int(chunk_size * ov_fraction)
    
    chunks = []
    for i in range(0,len(words),chunk_size):
        chunk_words = words[max(0,i-overlap_int):i+chunk_size]
        chunk = " ".join(chunk_words)
        chunks.append(chunk)
    return chunks

In [14]:
for chosen_size in [5,25,100]:
    chunks = chunks_fix_size_overlap(src_text,chosen_size,0.2)
    print(f"\nSize {chosen_size} - {len(chunks)} chunks returned.")
    for i in range(3):
        print(f"Chunk {i+1}: {chunks[i]}")


Size 5 - 281 chunks returned.
Chunk 1: [[what_is_git_section]] === What is Git?
Chunk 2: Git? So, what is Git in
Chunk 3: in a nutshell? This is an

Size 25 - 57 chunks returned.
Chunk 1: [[what_is_git_section]] === What is Git? So, what is Git in a nutshell? This is an important section to absorb, because if you understand what Git
Chunk 2: if you understand what Git is and the fundamentals of how it works, then using Git effectively will probably be much easier for you. As you learn Git, try to
Chunk 3: you learn Git, try to clear your mind of the things you may know about other VCSs, such as CVS, Subversion or Perforce -- doing so will help you avoid

Size 100 - 15 chunks returned.
Chunk 1: [[what_is_git_section]] === What is Git? So, what is Git in a nutshell? This is an important section to absorb, because if you understand what Git is and the fundamentals of how it works, then using Git effectively will probably be much easier for you. As you learn Git, try to clear your mind of

**Variable Size Chunking - Recursive Character Splitting**

In [15]:
def get_chunks_para(src_text):
    return src_text.split("\n\n")

In [16]:
chunks = get_chunks_para(src_text)
print(chunks[:2])

['[[what_is_git_section]]\n=== What is Git?', "So, what is Git in a nutshell?\nThis is an important section to absorb, because if you understand what Git is and the fundamentals of how it works, then using Git effectively will probably be much easier for you.\nAs you learn Git, try to clear your mind of the things you may know about other VCSs, such as CVS, Subversion or Perforce -- doing so will help you avoid subtle confusion when using the tool.\nEven though Git's user interface is fairly similar to these other VCSs, Git stores and thinks about information in a very different way, and understanding these differences will help you avoid becoming confused while using it.(((Subversion)))(((Perforce)))"]


In [17]:
def get_chunks_asciidoc_sections(src_text):
    return src_text.split("\n==")

In [18]:
chunks = get_chunks_asciidoc_sections(src_text)
chunks[:2]

['[[what_is_git_section]]',
 "= What is Git?\n\nSo, what is Git in a nutshell?\nThis is an important section to absorb, because if you understand what Git is and the fundamentals of how it works, then using Git effectively will probably be much easier for you.\nAs you learn Git, try to clear your mind of the things you may know about other VCSs, such as CVS, Subversion or Perforce -- doing so will help you avoid subtle confusion when using the tool.\nEven though Git's user interface is fairly similar to these other VCSs, Git stores and thinks about information in a very different way, and understanding these differences will help you avoid becoming confused while using it.(((Subversion)))(((Perforce)))\n"]

In [19]:
for marker in ["\n\n", "\n=="]:
    chunks = src_text.split(marker)
    # Print outputs to screen
    print(f"\nUsing the marker: {repr(marker)} - {len(chunks)} chunks returned.")
    for i in range(3):
        print(f"Chunk {i+1}: {repr(chunks[i])}")


Using the marker: '\n\n' - 31 chunks returned.
Chunk 1: '[[what_is_git_section]]\n=== What is Git?'
Chunk 2: "So, what is Git in a nutshell?\nThis is an important section to absorb, because if you understand what Git is and the fundamentals of how it works, then using Git effectively will probably be much easier for you.\nAs you learn Git, try to clear your mind of the things you may know about other VCSs, such as CVS, Subversion or Perforce -- doing so will help you avoid subtle confusion when using the tool.\nEven though Git's user interface is fairly similar to these other VCSs, Git stores and thinks about information in a very different way, and understanding these differences will help you avoid becoming confused while using it.(((Subversion)))(((Perforce)))"
Chunk 3: '==== Snapshots, Not Differences'

Using the marker: '\n==' - 7 chunks returned.
Chunk 1: '[[what_is_git_section]]'
Chunk 2: "= What is Git?\n\nSo, what is Git in a nutshell?\nThis is an important section to absorb,

**Mixing Fixed and Variable-sized Chunking**

In [20]:
def mixed_chunking(src_text):
    chunks = src_text.split("\n==")

    new_chunks = []
    chunk_buffer = ""
    min_len = 25

    for chunk in chunks:
        new_buffer = chunk_buffer + chunk
        new_b_words = new_buffer.split(" ")
        if len(new_b_words)<min_len:
            chunk_buffer = new_buffer
        else:
            new_chunks.append(new_buffer)
            chunk_buffer = ""

    if len(chunk_buffer)>0:
        new_chunks.append(chunk_buffer)
    return new_chunks

In [21]:
mixed_chunks = mixed_chunking(src_text)
for i in range(3):
    print(f"Chunk {i+1}: {repr(mixed_chunks[i])}")

Chunk 1: "[[what_is_git_section]]= What is Git?\n\nSo, what is Git in a nutshell?\nThis is an important section to absorb, because if you understand what Git is and the fundamentals of how it works, then using Git effectively will probably be much easier for you.\nAs you learn Git, try to clear your mind of the things you may know about other VCSs, such as CVS, Subversion or Perforce -- doing so will help you avoid subtle confusion when using the tool.\nEven though Git's user interface is fairly similar to these other VCSs, Git stores and thinks about information in a very different way, and understanding these differences will help you avoid becoming confused while using it.(((Subversion)))(((Perforce)))\n"
Chunk 2: "== Snapshots, Not Differences\n\nThe major difference between Git and any other VCS (Subversion and friends included) is the way Git thinks about its data.\nConceptually, most other systems store information as a list of file-based changes.\nThese other systems (CVS, Subv

**Chunking on real data**

**Getting Data**

In [22]:
def get_book_txt_objs():
    text_objs = list()
    api_base_url = 'https://api.github.com/repos/progit/progit2/contents/book'  # Book base URL
    chapter_urls = ['/01-introduction/sections', '/02-git-basics/sections']

    for chap_url in chapter_urls:
        res = requests.get(api_base_url+chap_url)

        for file_info in res.json():
            if file_info['type'] == 'file':
                file_res = requests.get(file_info['download_url'])

                chap_title = file_info['download_url'].split('/')[-3]
                filename = file_info['download_url'].split('/')[-1]
                text_obj = {
                    "body": file_res.text,
                    "chapter_title": chap_title,
                    "filename": filename
                }
                text_objs.append(text_obj)
    return text_objs

In [23]:
book_text_objs = get_book_txt_objs()

In [27]:
book_text_objs[0].keys()

dict_keys(['body', 'chapter_title', 'filename'])

**Chunking Chapters**

In [30]:
def build_chunk_objs(book_text_obj,chunks):
    chunk_objs = list()

    for i,c in enumerate(chunks):
        chunk_obj = {
            "chapter_title": book_text_obj["chapter_title"],
            "filename": book_text_obj["filename"],
            "chunk": c,
            "chunk_index": i
        }

        chunk_objs.append(chunk_obj)
    return chunk_objs

In [32]:
chunk_obj_sets = dict()
for book_text_obj in book_text_objs:
    text = book_text_obj["body"]

    for strat_name, chunks in [
        ["fixed_size_25",chunks_fix_size_overlap(text,25,0.2)],
        ["fixed_size_100",chunks_fix_size_overlap(text,100,0.2)],
        ["para_chunks",get_chunks_para(text)],
        ["para_chunks_min_25",mixed_chunking(text)]
    ]:
        chunk_objs = build_chunk_objs(book_text_obj,chunks)

        if strat_name not in chunk_obj_sets.keys():
            chunk_obj_sets[strat_name] = list()
        chunk_obj_sets[strat_name]+=chunk_objs

In [33]:
chunk_obj_sets.keys()

dict_keys(['fixed_size_25', 'fixed_size_100', 'para_chunks', 'para_chunks_min_25'])

In [34]:
chunk_type = 'fixed_size_25'
chunk_obj_sets[chunk_type][:2]

[{'chapter_title': '01-introduction',
  'filename': 'about-version-control.asc',
  'chunk': '=== About Version Control (((version control))) What is "`version control`", and why should you care? Version control is a system that records changes to a',
  'chunk_index': 0},
 {'chapter_title': '01-introduction',
  'filename': 'about-version-control.asc',
  'chunk': 'that records changes to a file or set of files over time so that you can recall specific versions later. For the examples in this book, you will use software',
  'chunk_index': 1}]

In [36]:
chunk_type = 'para_chunks'
chunk_obj_sets[chunk_type][:2]

[{'chapter_title': '01-introduction',
  'filename': 'about-version-control.asc',
  'chunk': '=== About Version Control',
  'chunk_index': 0},
 {'chapter_title': '01-introduction',
  'filename': 'about-version-control.asc',
  'chunk': '(((version control)))\nWhat is "`version control`", and why should you care?\nVersion control is a system that records changes to a file or set of files over time so that you can recall specific versions later.\nFor the examples in this book, you will use software source code as the files being version controlled, though in reality you can do this with nearly any type of file on a computer.',
  'chunk_index': 1}]

In [37]:
chunk_type = 'para_chunks_min_25'
chunk_obj_sets[chunk_type][:2]

[{'chapter_title': '01-introduction',
  'filename': 'about-version-control.asc',
  'chunk': '=== About Version Control\n\n(((version control)))\nWhat is "`version control`", and why should you care?\nVersion control is a system that records changes to a file or set of files over time so that you can recall specific versions later.\nFor the examples in this book, you will use software source code as the files being version controlled, though in reality you can do this with nearly any type of file on a computer.\n\nIf you are a graphic or web designer and want to keep every version of an image or layout (which you would most certainly want to), a Version Control System (VCS) is a very wise thing to use.\nIt allows you to revert selected files back to a previous state, revert the entire project back to a previous state, compare changes over time, see who last modified something that might be causing a problem, who introduced an issue and when, and more.\nUsing a VCS also generally means t

**Loading Chunks into a Vector Database**

In [43]:
%pip install weaviate
%pip install --upgrade weaviate-client

Note: you may need to restart the kernel to use updated packages.
Collecting weaviate-client
  Downloading weaviate_client-4.16.6-py3-none-any.whl.metadata (3.7 kB)
Collecting validators<1.0.0,>=0.34.0 (from weaviate-client)
  Downloading validators-0.35.0-py3-none-any.whl.metadata (3.9 kB)
Collecting authlib<2.0.0,>=1.2.1 (from weaviate-client)
  Downloading authlib-1.6.1-py2.py3-none-any.whl.metadata (1.6 kB)
Collecting grpcio-health-checking<1.80.0,>=1.59.5 (from weaviate-client)
  Downloading grpcio_health_checking-1.74.0-py3-none-any.whl.metadata (1.0 kB)
Collecting deprecation<3.0.0,>=2.1.0 (from weaviate-client)
  Downloading deprecation-2.1.0-py2.py3-none-any.whl.metadata (4.6 kB)
Collecting protobuf<7.0.0,>=6.31.1 (from grpcio-health-checking<1.80.0,>=1.59.5->weaviate-client)
  Downloading protobuf-6.31.1-cp39-abi3-manylinux2014_x86_64.whl.metadata (593 bytes)
Collecting grpcio<1.80.0,>=1.59.5 (from weaviate-client)
  Downloading grpcio-1.74.0-cp311-cp311-manylinux_2_17_x86_64

In [44]:
import weaviate
from weaviate.classes.config import Configure, Property, DataType, Tokenization
from weaviate.util import generate_uuid5
from weaviate.classes.query import Filter
import tqdm



In [45]:

# Loading the client

try:
    client = weaviate.connect_to_embedded(
        persistence_data_path="/home/jovyan/data/collections/m3/ungraded_lab_2",
        environment_variables={
            "ENABLE_API_BASED_MODULES": "true", # Enable API based modules 
            "ENABLE_MODULES": 'text2vec-transformers', # We will be using a transformer model 
            "TRANSFORMERS_INFERENCE_API":"http://127.0.0.1:5000/", # The endpoint the weaviate API will be using to vectorize
        }
    )
except Exception as e:
    ports = extract_ports(str(e))
    client = weaviate.connect_to_local(port=8079, grpc_port=50050)

{"action":"startup","build_git_commit":"","build_go_version":"go1.24.3","build_image_tag":"","build_wv_version":"1.30.5","level":"info","msg":"Feature flag LD integration disabled: could not locate WEAVIATE_LD_API_KEY env variable","time":"2025-08-09T16:29:42Z"}
{"action":"startup","build_git_commit":"","build_go_version":"go1.24.3","build_image_tag":"","build_wv_version":"1.30.5","default_vectorizer_module":"none","level":"info","msg":"the default vectorizer modules is set to \"none\", as a result all new schema classes without an explicit vectorizer setting, will use this vectorizer","time":"2025-08-09T16:29:42Z"}
{"action":"startup","auto_schema_enabled":{},"build_git_commit":"","build_go_version":"go1.24.3","build_image_tag":"","build_wv_version":"1.30.5","level":"info","msg":"auto schema enabled setting is set to \"\u0026{\u003cnil\u003e {{{} {0 0}} 0 0 {{} 0} {{} 0}} true}\"","time":"2025-08-09T16:29:42Z"}
{"build_git_commit":"","build_go_version":"go1.24.3","build_image_tag":"",

KeyboardInterrupt: 