# Chroma Ruby Gem

## Database

Database operations for Chroma

In [1]:
# Use Bundler's inline Gemfile so the notebook can declare its own dependencies
# without a separate Gemfile on disk.
require "bundler/inline"

gemfile do
  # Load the chroma-db gem from the parent directory (a local checkout) rather
  # than from a released version.
  gem "chroma-db", path: "../"
end

""

In [2]:
require "logger"
require "json"
require "securerandom"

# Require the Chroma Ruby client. Left commented out: Chroma is used below
# without it, presumably because the inline Gemfile cell already loaded the gem.
#require "chroma-db"

# Configure Chroma's host. Here you can specify your own host.
Chroma.connect_host = "http://localhost:8000"
# Send the client's log lines to standard output at INFO level.
Chroma.logger = Logger.new($stdout)
Chroma.log_level = Chroma::LEVEL_INFO

# Check the connection using the Database heartbeat endpoint
response = Chroma::Resources::Database.heartbeat

# The heartbeat value is read from the "nanosecond heartbeat" key of the response.
IRuby.display "Heartbear timestamp #{response["nanosecond heartbeat"]}"

# Check the current Chroma server version
version = Chroma::Resources::Database.version

IRuby.display "Chroma server version #{version}"

# Reset database (DANGER: This deletes all previous data)
# Chroma::Resources::Database.reset

I, [2024-12-11T13:38:10.661531 #16507]  INFO -- : message=Successful response code=200


"Heartbear timestamp 1733945890642513627"

I, [2024-12-11T13:38:10.665746 #16507]  INFO -- : message=Successful response code=200


"Chroma server version 0.5.23"

## Collections operations

Collection operations for Chroma

In [3]:
# Walk through the collection lifecycle: list, delete, create, get and rename.
# The statements are order-dependent: each count displayed below reflects the
# calls made just before it.

# Clean up collections
collections = Chroma::Resources::Collection.list
collections.each { |collection| Chroma::Resources::Collection.delete(collection.name) }

# Confirm that database has no collections
collections = Chroma::Resources::Collection.list

collection_name = "ruby-3.0"

IRuby.display "Collections in database #{collections.size}"

# Create a new collection; the second argument is the collection's metadata
collection = Chroma::Resources::Collection.create(collection_name, {lang: "ruby", gem: "chroma-rb"})

IRuby.display collection

# Confirm that database has collections
collections = Chroma::Resources::Collection.list

IRuby.display "Collections in database #{collections.size}"

# Delete collection
Chroma::Resources::Collection.delete(collection_name)

# Re-Confirm that database has no collections
collections = Chroma::Resources::Collection.list

IRuby.display "Collections in database #{collections.size}"

# Create the collection again
Chroma::Resources::Collection.create(collection_name, {lang: "ruby", gem: "chroma-rb"})

# Get the collection from database
collection = Chroma::Resources::Collection.get(collection_name)
IRuby.display collection

# Modify collection name
new_collection_name = "ruby-3.2"
collection.modify(new_collection_name)

# Get modified collection from database.
# `collection` now refers to the renamed collection and is reused by later cells.
collection = Chroma::Resources::Collection.get(new_collection_name)
IRuby.display collection

I, [2024-12-11T13:38:20.846987 #16507]  INFO -- : message=Successful response code=200
I, [2024-12-11T13:38:20.881005 #16507]  INFO -- : message=Successful response code=200
I, [2024-12-11T13:38:20.884962 #16507]  INFO -- : message=Successful response code=200


"Collections in database 0"

I, [2024-12-11T13:38:20.917927 #16507]  INFO -- : message=Successful response code=200


#<Chroma::Resources::Collection:0x0000000105d0c220 @id="39233aa7-16cb-46a5-84ce-fbcca96cf4af", @name="ruby-3.0", @metadata={"lang"=>"ruby", "gem"=>"chroma-rb"}>

I, [2024-12-11T13:38:20.922331 #16507]  INFO -- : message=Successful response code=200


"Collections in database 1"

I, [2024-12-11T13:38:20.932971 #16507]  INFO -- : message=Successful response code=200
I, [2024-12-11T13:38:20.938969 #16507]  INFO -- : message=Successful response code=200


"Collections in database 0"

I, [2024-12-11T13:38:20.957134 #16507]  INFO -- : message=Successful response code=200
I, [2024-12-11T13:38:20.960461 #16507]  INFO -- : message=Successful response code=200


#<Chroma::Resources::Collection:0x00000001056ba728 @id="dc767e25-55b9-424f-92b1-9bf5aa9841d1", @name="ruby-3.0", @metadata={"gem"=>"chroma-rb", "lang"=>"ruby"}>

I, [2024-12-11T13:38:20.969917 #16507]  INFO -- : message=Successful response code=200
I, [2024-12-11T13:38:20.974476 #16507]  INFO -- : message=Successful response code=200


#<Chroma::Resources::Collection:0x0000000105533fa8 @id="dc767e25-55b9-424f-92b1-9bf5aa9841d1", @name="ruby-3.2", @metadata={"gem"=>"chroma-rb", "lang"=>"ruby"}>

### Naive Ruby helper methods

In [4]:
# A piece of text together with a hash of metadata describing it.
class Document
  # @param content [String] the text held by this document
  # @param metadata [Hash] extra information about the text; defaults to an empty hash
  def initialize(content, metadata = {})
    @content, @metadata = content, metadata
  end

  # Read-only access to the text.
  attr_reader :content

  # Read-only access to the metadata hash.
  attr_reader :metadata
end

# Reads one text file and wraps its contents in a Document.
class TextLoader
  # @param file [String] path of the file to read
  def initialize(file)
    @file = file
  end

  # Reads the whole file into memory.
  #
  # @return [Array<Document>] a one-element array; the document's metadata
  #   records the file path under the :source key
  def load
    contents = File.read(@file)
    document = Document.new(contents, {source: @file})
    [document]
  end
end

# Splits text into overlapping windows of at most +chunk_size+ characters.
# Each full window is trimmed back to its last whitespace character so that a
# chunk does not end in the middle of a word when that can be avoided.
#
# Sizes are measured in characters, not words.
class RecursiveWordTextSplitter
  # @param chunk_size [Integer] maximum window length in characters; must be positive
  # @param chunk_overlap [Integer] characters shared by consecutive windows;
  #   must satisfy 0 <= chunk_overlap < chunk_size
  # @raise [ArgumentError] when the sizes would prevent the splitter from
  #   advancing through the text
  def initialize(chunk_size, chunk_overlap)
    raise ArgumentError, "chunk_size must be positive (got #{chunk_size})" unless chunk_size.positive?
    unless (0...chunk_size).cover?(chunk_overlap)
      raise ArgumentError,
            "chunk_overlap must be between 0 and chunk_size - 1 (got #{chunk_overlap} for chunk_size #{chunk_size})"
    end

    @chunk_size = chunk_size
    @chunk_overlap = chunk_overlap
  end

  # Splits every document's content and wraps each chunk in a new Document
  # that carries the metadata of the document it came from.
  #
  # @param documents [Document, Array<Document>] one document or a list of them
  # @return [Array<Document>] the chunks of all documents, in order
  def split_documents(documents)
    Array(documents).flat_map do |document|
      texts = split_text(document.content)
      # Prints the number of chunks per document; the notebook shows this as cell output.
      puts texts.size
      texts.map { |text| Document.new(text, document.metadata) }
    end
  end

  # Splits a string into chunks.
  #
  # Implemented as a loop rather than recursion so the number of chunks is not
  # limited by the Ruby call stack.
  #
  # @param text [String] the text to split
  # @return [Array<String>] the chunks; full windows are stripped of surrounding
  #   whitespace, the final remainder is returned as-is. A text shorter than
  #   chunk_size (including "") is returned as a single chunk.
  def split_text(text)
    step = @chunk_size - @chunk_overlap
    chunks = []
    start_index = 0

    # Consume full windows while one still fits in the remaining text.
    while start_index + @chunk_size <= text.length
      window = text[start_index...(start_index + @chunk_size)]
      cut = window.rindex(/\s/)

      if cut
        chunks << window[0..cut].strip
        # Normally advance by `step`. When the last whitespace lies before the
        # next window start, resume right after it instead, so the text
        # following the cut is not dropped.
        start_index += [step, cut + 1].min
      else
        # No whitespace in the window: keep it whole.
        chunks << window.strip
        start_index += step
      end
    end

    # Whatever is left is the final chunk. Skip it when it is empty, unless the
    # text produced no full window at all (so "" still yields [""]).
    tail = text[start_index..]
    chunks << tail if chunks.empty? || !tail.empty?
    chunks
  end
end

require "open3"

# Runs a command and collects everything it writes.
#
# Uses Open3.capture3, which closes the child's stdin and drains stdout and
# stderr concurrently, so a command that writes a lot to stderr cannot block
# while stdout is still being read.
#
# @param command [String] the command line to execute
# @return [Array(String, String, Process::Status)] captured stdout, captured
#   stderr, and the process status. A stream with no output is returned as ""
#   rather than nil.
def run_system(command)
  Open3.capture3(command)
end

:run_system

## Transform texts

Use the naive Ruby classes defined above to load a document and split it into chunks.

In [5]:
# Load ruby.txt; TextLoader#load returns the whole file as a single Document.
documents = TextLoader.new("ruby.txt").load
# The cell's value is the number of loaded documents.
documents.size

1

In [6]:
# Split the loaded documents into chunks.
# The constructor takes positional arguments: chunk_size, then chunk_overlap.
text_splitter = RecursiveWordTextSplitter.new(1000, 200)
texts = text_splitter.split_documents(documents)
# The cell's value is the number of chunks produced.
texts.size

2


2

In [7]:
# Inspect the first chunk produced by the splitter.
texts[0]

#<#<Class:0x0000000100a5a798>::Document:0x00000001052787f0 @content="Array#any?\n\nany? → true or false\nany? {|element| ... } → true or false\nany?(obj) → true or false\n\nReturns true if any element of self meets a given criterion.\n\nWith no block given and no argument, returns true if self has any truthy element, false otherwise:\n\n[nil, 0, false].any? # => true\n[nil, false].any? # => false\n[].any? # => false\nWith a block given and no argument, calls the block with each element in self; returns true if the block returns any truthy value, false otherwise:\n\n[0, 1, 2].any? {|element| element > 1 } # => true\n[0, 1, 2].any? {|element| element > 2 } # => false\nIf argument obj is given, returns true if obj.=== any element, false otherwise:\n\n['food', 'drink'].any?(/foo/) # => true\n['food', 'drink'].any?(/bar/) # => false\n[].any?(/foo/) # => false\n[0, 1, 2].any?(1) # => true\n[0, 1, 2].any?(3) # => false\nRelated: Enumerable#any?\n\n\nArray#map Array#map!\n\nmap {|element| ... 

## Using Ollama and Nomic embed text

Ollama is a popular tool for executing LLM and embedding models on your local computer. Its installation process is straightforward, and its usage is intuitive.

To set up Ollama, begin by [downloading](https://ollama.com) and installing the binary from the official website. Once the installation is complete, execute the following command to download the `nomic-embed-text` model:

```
ollama pull nomic-embed-text
```

Ollama comes with a REST API endpoint for using its models. Here is a simple Ruby client that allows us to create embeddings.

In [8]:
# Minimal HTTP client for the embeddings endpoint of an Ollama server.
class OllamaHttpClient
  # @param url [String] base URL of the server
  def initialize(url: "http://localhost:11434")
    @url = url
  end

  # POSTs the prompt to "<url>/api/embeddings" as JSON.
  #
  # @param prompt [String] text to embed
  # @param model [String] name of the embedding model to use
  # @return [Hash] the parsed JSON body when the server answers 200,
  #   otherwise an empty hash
  def embed(prompt, model: "nomic-embed-text")
    body = {"model" => model, "prompt" => prompt}.to_json
    endpoint = URI("#{@url}/api/embeddings")

    post = Net::HTTP::Post.new(endpoint)
    post["Content-Type"] = "application/json"
    post.body = body

    # Enable TLS only when the base URL uses the https scheme.
    secure = endpoint.scheme == "https"
    reply = Net::HTTP.start(endpoint.hostname, endpoint.port, use_ssl: secure) { |http| http.request(post) }

    return {} unless reply.code == "200"

    JSON.parse(reply.body)
  end
end

:embed

## Embeddings in collection

With this client you can create embeddings for each text chunk and store generated embeddings in a Chroma collection.

In [9]:
# Ask Ollama for an embedding of every chunk and wrap each result in a Chroma
# embedding record with a fresh random id.
ollama_client = OllamaHttpClient.new

embeddings = texts.map do |chunk|
  vector = ollama_client.embed(chunk.content)["embedding"]

  Chroma::Resources::Embedding.new(
    id: SecureRandom.uuid,
    embedding: vector,
    metadata: chunk.metadata,
    document: chunk.content
  )
end

IRuby.display embeddings

[#<Chroma::Resources::Embedding:0x0000000105e0fdc0 @id="6d666531-5fbf-46e0-b453-dcf3951c6a71", @embedding=[0.014701676554977894, 0.255524218082428, -2.129739761352539, -0.35414427518844604, 1.5835754871368408, -1.4551807641983032, -0.4776037633419037, 0.6955747604370117, 0.44284117221832275, 0.6864050030708313, -0.5362569093704224, 1.2758055925369263, 1.1534637212753296, 1.2279975414276123, -0.6829479932785034, -0.04986659809947014, -1.466883897781372, -0.7426703572273254, 1.2589280605316162, -0.14932145178318024, 0.38313260674476624, 0.03753571957349777, 0.2980467975139618, 0.31508868932724, 1.8711092472076416, 0.3789035677909851, 0.7525064945220947, 0.01412661001086235, -0.9818822741508484, -0.3709792196750641, -0.1707128882408142, -0.08575621992349625, 0.8792298436164856, -1.595631718635559, -0.28198331594467163, -0.5170110464096069, 1.1986714601516724, 0.6349908113479614, 0.7728958129882812, -0.07641415297985077, 0.2799926996231079, 0.8540084362030029, -0.7315670251846313, -0.25486

In [10]:
# Store the generated embeddings in the collection fetched in the collections cell.
collection.add(embeddings)

I, [2024-12-11T13:38:40.349321 #16507]  INFO -- : message=Successful response code=201


true

In [11]:
# Display the collection's count after the embeddings were added.
IRuby.display collection.count

I, [2024-12-11T13:38:42.259923 #16507]  INFO -- : message=Successful response code=200


2

### Vector Store naive implementation

In [12]:
# Base class for vector stores that embed queries through Ollama.
# Subclasses supply the actual lookup by overriding #similarity_search.
class VectorStore
  # @param store [Object] the underlying storage the subclass queries
  # @param search_type [String] retrieval strategy; only "similarity" is handled
  def initialize(store, search_type = "similarity")
    @store = store
    @search_type = search_type
    @ollama_client = OllamaHttpClient.new
  end

  # Looks up documents relevant to the query using the configured search type.
  #
  # @param query [String] the text to search for
  # @return [Object, nil] the result of #similarity_search, or nil when the
  #   search type is not "similarity"
  def relevant_documents(query)
    # The search is implemented by the vector store itself, not by the wrapped store.
    similarity_search(query) if @search_type == "similarity"
  end

  # Hook for subclasses: run a similarity search for the query.
  #
  # @param _query [String] the text to search for
  # @param k [Integer] number of results requested
  # @param filter [Object, nil] optional filter for the search
  # @raise [NotImplementedError] always, in this base class
  def similarity_search(_query, k: 4, filter: nil)
    raise NotImplementedError, "#{self.class} must implement #similarity_search"
  end

  protected

  # Embeds the query with the Ollama client.
  #
  # @param query [String] the text to embed
  # @return [Array] a one-element array holding the "embedding" value of the response
  def text_to_embeddings(query)
    response = @ollama_client.embed(query)
    [response["embedding"]]
  end
end

# Vector store whose wrapped store is queried through its `query` method.
class ChromaVectorStore < VectorStore
  # Embeds the query text and asks the store for the closest entries.
  #
  # @param query [String] the text to search for
  # @param k [Integer] number of results to request
  # @param filter [Object, nil] passed to the store as its `where` argument
  # @return [Object] whatever the store's `query` call returns
  def similarity_search(query, k: 4, filter: nil)
    @store.query(query_embeddings: text_to_embeddings(query), results: k, where: filter)
  end
end

:similarity_search

### Similarity search

In [13]:
# Wrap the collection in the Chroma vector store and request the 2 closest matches.
vs = ChromaVectorStore.new(collection)
# NOTE: this reassigns `embeddings`, replacing the list built in the embeddings cell.
embeddings =  vs.similarity_search("array any?", k: 2)

I, [2024-12-11T13:38:46.660678 #16507]  INFO -- : message=Successful response code=200


[#<Chroma::Resources::Embedding:0x0000000105dc1198 @id="6d666531-5fbf-46e0-b453-dcf3951c6a71", @embedding=nil, @metadata={"source"=>"ruby.txt"}, @document="Array#any?\n\nany? → true or false\nany? {|element| ... } → true or false\nany?(obj) → true or false\n\nReturns true if any element of self meets a given criterion.\n\nWith no block given and no argument, returns true if self has any truthy element, false otherwise:\n\n[nil, 0, false].any? # => true\n[nil, false].any? # => false\n[].any? # => false\nWith a block given and no argument, calls the block with each element in self; returns true if the block returns any truthy value, false otherwise:\n\n[0, 1, 2].any? {|element| element > 1 } # => true\n[0, 1, 2].any? {|element| element > 2 } # => false\nIf argument obj is given, returns true if obj.=== any element, false otherwise:\n\n['food', 'drink'].any?(/foo/) # => true\n['food', 'drink'].any?(/bar/) # => false\n[].any?(/foo/) # => false\n[0, 1, 2].any?(1) # => true\n[0, 1, 2].any?(3

In [14]:
# Inspect the first result returned by the similarity search.
embeddings[0]

#<Chroma::Resources::Embedding:0x0000000105dc1198 @id="6d666531-5fbf-46e0-b453-dcf3951c6a71", @embedding=nil, @metadata={"source"=>"ruby.txt"}, @document="Array#any?\n\nany? → true or false\nany? {|element| ... } → true or false\nany?(obj) → true or false\n\nReturns true if any element of self meets a given criterion.\n\nWith no block given and no argument, returns true if self has any truthy element, false otherwise:\n\n[nil, 0, false].any? # => true\n[nil, false].any? # => false\n[].any? # => false\nWith a block given and no argument, calls the block with each element in self; returns true if the block returns any truthy value, false otherwise:\n\n[0, 1, 2].any? {|element| element > 1 } # => true\n[0, 1, 2].any? {|element| element > 2 } # => false\nIf argument obj is given, returns true if obj.=== any element, false otherwise:\n\n['food', 'drink'].any?(/foo/) # => true\n['food', 'drink'].any?(/bar/) # => false\n[].any?(/foo/) # => false\n[0, 1, 2].any?(1) # => true\n[0, 1, 2].any?(3)