# Chroma Ruby Gem

## Database

Database operations for Chroma

In [1]:
require "bundler/inline"

gemfile do
  gem "chroma-db", path: "../"
end

""

In [2]:
require "logger"
require "json"
require "securerandom"

# Require the Chroma Ruby client.
# (Left commented out — presumably the inline Gemfile in the first cell already loads the gem.)
#require "chroma-db"

# Configure Chroma's host. Here you can specify your own host.
Chroma.connect_host = "http://localhost:8000"
# Send the client's log lines to standard output at INFO level.
Chroma.logger = Logger.new($stdout)
Chroma.log_level = Chroma::LEVEL_INFO

# Check the connection with the Database heartbeat endpoint
response = Chroma::Resources::Database.heartbeat

# NOTE(review): "Heartbear" in the message below is a typo for "Heartbeat";
# it is left untouched here so the message keeps matching the recorded output.
IRuby.display "Heartbear timestamp #{response["nanosecond heartbeat"]}"

# Check the current Chroma server version
version = Chroma::Resources::Database.version

IRuby.display "Chroma server version #{version}"

# Reset database (DANGER: this deletes all previous data)
# Chroma::Resources::Database.reset

I, [2024-04-13T15:03:57.935833 #35044]  INFO -- : message=Successful response code=200


"Heartbear timestamp 1713042237898295676"

I, [2024-04-13T15:03:57.945292 #35044]  INFO -- : message=Successful response code=200


"Chroma server version 0.4.24"

## Collections operations

Collection operations for Chroma

In [6]:
# Clean up: delete every collection currently in the database
collections = Chroma::Resources::Collection.list
collections.each { |collection| Chroma::Resources::Collection.delete(collection.name) }

# Confirm that the database has no collections
collections = Chroma::Resources::Collection.list

collection_name = "ruby-3.0"

IRuby.display "Collections in database #{collections.size}"

# Create a new collection; the second argument is stored as the collection's metadata
collection = Chroma::Resources::Collection.create(collection_name, {lang: "ruby", gem: "chroma-rb"})

IRuby.display collection

# Confirm that the database now has a collection
collections = Chroma::Resources::Collection.list

IRuby.display "Collections in database #{collections.size}"

# Delete the collection by name
Chroma::Resources::Collection.delete(collection_name)

# Confirm again that the database has no collections
collections = Chroma::Resources::Collection.list

IRuby.display "Collections in database #{collections.size}"

# Create the collection again (the return value is not kept this time)
Chroma::Resources::Collection.create(collection_name, {lang: "ruby", gem: "chroma-rb"})

# Get the collection from the database by name
collection = Chroma::Resources::Collection.get(collection_name)
IRuby.display collection

# Rename the collection
new_collection_name = "ruby-3.2"
collection.modify(new_collection_name)

# Get the renamed collection from the database.
# `collection` keeps pointing at it and is reused by the later cells.
collection = Chroma::Resources::Collection.get(new_collection_name)
IRuby.display collection

I, [2024-04-13T15:08:12.755371 #35044]  INFO -- : message=Successful response code=200
I, [2024-04-13T15:08:12.775993 #35044]  INFO -- : message=Successful response code=200
I, [2024-04-13T15:08:12.783116 #35044]  INFO -- : message=Successful response code=200


"Collections in database 0"

I, [2024-04-13T15:08:12.797436 #35044]  INFO -- : message=Successful response code=200


#<Chroma::Resources::Collection:0x0000000104e91150 @id="6c20a0eb-6735-4c17-98e2-c67d181c61da", @name="ruby-3.0", @metadata={"lang"=>"ruby", "gem"=>"chroma-rb"}>

I, [2024-04-13T15:08:12.803337 #35044]  INFO -- : message=Successful response code=200


"Collections in database 1"

I, [2024-04-13T15:08:12.813811 #35044]  INFO -- : message=Successful response code=200
I, [2024-04-13T15:08:12.818497 #35044]  INFO -- : message=Successful response code=200


"Collections in database 0"

I, [2024-04-13T15:08:12.830041 #35044]  INFO -- : message=Successful response code=200
I, [2024-04-13T15:08:12.833464 #35044]  INFO -- : message=Successful response code=200


#<Chroma::Resources::Collection:0x0000000104e1d840 @id="86ff71c0-47a4-40df-90f9-3a34fcaf888a", @name="ruby-3.0", @metadata={"gem"=>"chroma-rb", "lang"=>"ruby"}>

I, [2024-04-13T15:08:12.838677 #35044]  INFO -- : message=Successful response code=200
I, [2024-04-13T15:08:12.843742 #35044]  INFO -- : message=Successful response code=200


#<Chroma::Resources::Collection:0x0000000104e105a0 @id="86ff71c0-47a4-40df-90f9-3a34fcaf888a", @name="ruby-3.2", @metadata={"gem"=>"chroma-rb", "lang"=>"ruby"}>

### Naive Ruby helper methods

In [7]:
# A piece of text together with a metadata hash
# (TextLoader stores the source file path under :source).
class Document
  attr_reader :content, :metadata

  # content  - the text held by this document.
  # metadata - hash of extra information about the text; defaults to a new empty hash.
  def initialize(content, metadata = {})
    @content, @metadata = content, metadata
  end
end

# Loads the full contents of one file as a single Document.
class TextLoader
  # file - path of the file to read when #load is called.
  def initialize(file)
    @file = file
  end

  # Reads the file and returns a one-element array holding a Document whose
  # metadata records the path under :source.
  def load
    text = File.read(@file)
    [Document.new(text, {source: @file})]
  end
end

# Splits text into overlapping, fixed-width character windows and trims each
# window back to its last whitespace character, so chunks tend to end on a
# word boundary. Sizes are measured in characters, not words.
class RecursiveWordTextSplitter
  # @param chunk_size [Integer] width of each window in characters; must be positive
  # @param chunk_overlap [Integer] characters shared by consecutive windows;
  #   must satisfy 0 <= chunk_overlap < chunk_size
  # @raise [ArgumentError] when the sizes would prevent the splitter from
  #   advancing through the text
  def initialize(chunk_size, chunk_overlap)
    raise ArgumentError, "chunk_size must be positive" unless chunk_size.positive?
    unless (0...chunk_size).cover?(chunk_overlap)
      raise ArgumentError, "chunk_overlap must be >= 0 and smaller than chunk_size"
    end

    @chunk_size = chunk_size
    @chunk_overlap = chunk_overlap
  end

  # Splits every document into chunk documents.
  #
  # @param documents [Array, Object] one document or a list of documents, each
  #   responding to `content` and `metadata`
  # @return [Array<Document>] one Document per chunk; every chunk reuses the
  #   metadata object of the document it came from
  def split_documents(documents)
    Array(documents).flat_map do |document|
      texts = split_text(document.content)
      # Prints the number of chunks produced for this document.
      puts texts.size
      texts.map { |text| Document.new(text, document.metadata) }
    end
  end

  # Splits a string into chunks.
  #
  # Full-width windows are cut at their last whitespace character and stripped;
  # the remaining tail (shorter than chunk_size, possibly empty) is appended
  # as-is, so the result always has at least one element.
  #
  # @param text [String]
  # @return [Array<String>]
  def split_text(text)
    chunks = []
    start_index = 0

    # Iterative on purpose: one stack frame per chunk would overflow the stack
    # on long inputs.
    while start_index + @chunk_size <= text.length
      end_index = start_index + @chunk_size
      window = text[start_index...end_index]
      cut = window.rindex(/\s/)

      # Without any whitespace the whole window becomes the chunk.
      chunks << (cut ? window[0..cut] : window).strip

      overlap_start = end_index - @chunk_overlap
      # Normally the next window starts `chunk_overlap` characters before the
      # end of this one. If the chunk was cut earlier than that, resume right
      # after the cut instead, so the text in between is not dropped.
      start_index = cut ? [overlap_start, start_index + cut + 1].min : overlap_start
    end

    chunks << text[start_index..]
  end
end

require "open3"

# Runs a command and captures everything it writes.
#
# The child's stdin is closed immediately, so commands that read from stdin see
# end-of-file instead of waiting forever. stderr is drained on a separate
# thread while stdout is read, so a command that fills one pipe cannot block
# the read of the other. The block form of popen3 closes any pipes still open
# when it returns.
#
# @param command [String] command line handed to Open3.popen3
# @return [Array] `[stdout_data, stderr_data, exit_code]`; each data element is
#   a String, or nil when the command wrote nothing to that stream;
#   `exit_code` is the Process::Status of the finished command
def run_system(command)
  Open3.popen3(command) do |stdin, stdout, stderr, wait_thr|
    stdin.close
    stderr_reader = Thread.new { stderr.gets(nil) }
    stdout_data = stdout.gets(nil)

    [stdout_data, stderr_reader.value, wait_thr.value]
  end
end

:run_system

## Transform texts

Use the naive Ruby classes above to transform documents into chunks.

In [8]:
# Load ruby.txt into a list of documents; the last expression shows how many were loaded.
documents = TextLoader.new("ruby.txt").load
documents.size

1

In [9]:
# Split into chunks of up to 1000 characters with 200 characters of overlap.
# Both arguments are positional: (chunk_size, chunk_overlap). Writing
# `chunk_size=1000` here would not be a keyword argument in Ruby; it would
# assign a stray local variable and pass its value positionally.
text_splitter = RecursiveWordTextSplitter.new(1000, 200)
texts = text_splitter.split_documents(documents)
texts.size

2


2

In [10]:
# Inspect the first chunk document.
texts[0]

#<#<Class:0x00000001047d8660>::Document:0x000000010498a8c8 @content="Array#any?\n\nany? → true or false\nany? {|element| ... } → true or false\nany?(obj) → true or false\n\nReturns true if any element of self meets a given criterion.\n\nWith no block given and no argument, returns true if self has any truthy element, false otherwise:\n\n[nil, 0, false].any? # => true\n[nil, false].any? # => false\n[].any? # => false\nWith a block given and no argument, calls the block with each element in self; returns true if the block returns any truthy value, false otherwise:\n\n[0, 1, 2].any? {|element| element > 1 } # => true\n[0, 1, 2].any? {|element| element > 2 } # => false\nIf argument obj is given, returns true if obj.=== any element, false otherwise:\n\n['food', 'drink'].any?(/foo/) # => true\n['food', 'drink'].any?(/bar/) # => false\n[].any?(/foo/) # => false\n[0, 1, 2].any?(1) # => true\n[0, 1, 2].any?(3) # => false\nRelated: Enumerable#any?\n\n\nArray#map Array#map!\n\nmap {|element| ... 

## Using Ollama and Nomic embed text

Ollama is a popular tool for executing LLM and embedding models on your local computer. Its installation process is straightforward, and its usage is intuitive.

To set up Ollama, begin by [downloading](https://ollama.com) and installing the binary from the official website. Once the installation is complete, execute the following command to download the `nomic-embed-text` model:

```
ollama pull nomic-embed-text
```

Ollama comes with a REST API endpoint for using the models. Here is a simple Ruby client that allows us to create embeddings.

In [13]:
require "json"
require "net/http"

# Minimal HTTP client for the embeddings endpoint of an Ollama server.
class OllamaHttpClient
  # @param url [String] base URL of the server; a trailing slash is dropped so
  #   the request path is not built with a double slash
  def initialize(url: "http://localhost:11434")
    @url = url.to_s.chomp("/")
  end

  # POSTs `prompt` to `/api/embeddings` as JSON.
  #
  # @param prompt [String] text to embed
  # @param model [String] name of the model sent in the request
  # @return [Hash] the parsed JSON body when the response code is 200,
  #   otherwise an empty hash (callers reading "embedding" then get nil)
  def embed(prompt, model: "nomic-embed-text")
    payload = {
      "model" => model,
      "prompt" => prompt
    }.to_json

    uri = URI("#{@url}/api/embeddings")
    request = Net::HTTP::Post.new(uri)
    request["Content-Type"] = "application/json"
    request.body = payload

    # TLS is switched on only when the base URL uses the https scheme.
    response = Net::HTTP.start(uri.hostname, uri.port, use_ssl: uri.scheme == "https") do |http|
      http.request(request)
    end

    response.code == "200" ? JSON.parse(response.body) : {}
  end
end

:embed

## Embeddings in collection

With this client you can create embeddings for each text chunk and store generated embeddings in a Chroma collection.

In [14]:
ollama_client = OllamaHttpClient.new

# Build one Chroma embedding per chunk: a random UUID as id, the vector
# returned by Ollama, and the chunk's own text and metadata.
embeddings = texts.map do |document|
  vector = ollama_client.embed(document.content)["embedding"]

  Chroma::Resources::Embedding.new(
    id: SecureRandom.uuid,
    embedding: vector,
    metadata: document.metadata,
    document: document.content
  )
end

IRuby.display embeddings

[#<Chroma::Resources::Embedding:0x0000000104cf48d8 @id="408af717-0c7e-48d5-a3f8-567a49f35c93", @embedding=[0.3654118478298187, 0.28818851709365845, -1.6767185926437378, -0.24342317879199982, 1.4833393096923828, -1.3290095329284668, -0.3449559807777405, 0.8983354568481445, 0.4381735026836395, 0.778736412525177, -0.6099660396575928, 0.8626308441162109, 1.2367924451828003, 1.327372431755066, -0.684183657169342, -0.2865280508995056, -1.858189582824707, -0.8947423696517944, 1.1978974342346191, -0.21722902357578278, 0.6811590790748596, 0.031737420707941055, 0.48078030347824097, 0.119293712079525, 1.0430091619491577, 0.5045174360275269, 0.5337985754013062, -0.05444779619574547, -0.9516913294792175, -0.35378336906433105, -0.19843631982803345, -0.0872187688946724, 0.8603364825248718, -1.372267246246338, -0.09822799265384674, -0.38536396622657776, 1.3114744424819946, 0.5037254691123962, 0.46312376856803894, 0.1805170178413391, 0.30307474732398987, 0.7923384308815002, -0.6662959456443787, -0.3526

In [15]:
# Store the embeddings in the collection fetched earlier.
collection.add(embeddings)

I, [2024-04-13T15:09:05.120223 #35044]  INFO -- : message=Successful response code=201


true

In [16]:
# Show how many embeddings the collection now holds.
IRuby.display collection.count

I, [2024-04-13T15:09:08.590406 #35044]  INFO -- : message=Successful response code=200


2

### Vector Store naive implementation

In [17]:
# Base class for vector stores: wraps a backing store, remembers which search
# strategy to use, and turns query text into embeddings through Ollama.
class VectorStore
  # @param store [Object] backing store that searches are run against
  # @param search_type [String] search strategy; only "similarity" is handled
  def initialize(store, search_type = "similarity")
    @store = store
    @search_type = search_type
    @ollama_client = OllamaHttpClient.new
  end

  # Runs the configured search for `query`.
  #
  # The search goes to this object's own `similarity_search` when a subclass
  # defines one, and to the backing store otherwise.
  #
  # @param query [String] text to search for
  # @return [Object, nil] whatever `similarity_search` returns, or nil when the
  #   search type is not "similarity"
  def relevant_documents(query)
    return unless @search_type == "similarity"

    searcher = respond_to?(:similarity_search) ? self : @store
    searcher.similarity_search(query)
  end

  protected

  # Embeds `query` and wraps the vector in an array, i.e. a list holding one
  # embedding.
  def text_to_embeddings(query)
    response = @ollama_client.embed(query)
    [response["embedding"]]
  end
end

# VectorStore whose backing store is queried through its `query` method
# (the notebook passes a Chroma collection).
class ChromaVectorStore < VectorStore
  # Embeds `query` and asks the store for the closest entries.
  #
  # query  - text to search for.
  # k      - number of results requested from the store (default 4).
  # filter - hash passed to the store as its `where` argument (default {}).
  #
  # Returns whatever the store's `query` call returns.
  def similarity_search(query, k: 4, filter: {})
    @store.query(
      query_embeddings: text_to_embeddings(query),
      results: k,
      where: filter
    )
  end
end

:similarity_search

### Search for similarity

In [18]:
# Wrap the collection in a vector store and ask for the 2 nearest matches.
# NOTE: this reuses the name `embeddings`, replacing the list built earlier for collection.add.
vs = ChromaVectorStore.new(collection)
embeddings =  vs.similarity_search("array any?", k: 2)

I, [2024-04-13T15:09:16.758519 #35044]  INFO -- : message=Successful response code=200


[#<Chroma::Resources::Embedding:0x000000010491b590 @id="408af717-0c7e-48d5-a3f8-567a49f35c93", @embedding=nil, @metadata={"source"=>"ruby.txt"}, @document="Array#any?\n\nany? → true or false\nany? {|element| ... } → true or false\nany?(obj) → true or false\n\nReturns true if any element of self meets a given criterion.\n\nWith no block given and no argument, returns true if self has any truthy element, false otherwise:\n\n[nil, 0, false].any? # => true\n[nil, false].any? # => false\n[].any? # => false\nWith a block given and no argument, calls the block with each element in self; returns true if the block returns any truthy value, false otherwise:\n\n[0, 1, 2].any? {|element| element > 1 } # => true\n[0, 1, 2].any? {|element| element > 2 } # => false\nIf argument obj is given, returns true if obj.=== any element, false otherwise:\n\n['food', 'drink'].any?(/foo/) # => true\n['food', 'drink'].any?(/bar/) # => false\n[].any?(/foo/) # => false\n[0, 1, 2].any?(1) # => true\n[0, 1, 2].any?(3

In [19]:
# Inspect the first search result.
embeddings[0]

#<Chroma::Resources::Embedding:0x000000010491b590 @id="408af717-0c7e-48d5-a3f8-567a49f35c93", @embedding=nil, @metadata={"source"=>"ruby.txt"}, @document="Array#any?\n\nany? → true or false\nany? {|element| ... } → true or false\nany?(obj) → true or false\n\nReturns true if any element of self meets a given criterion.\n\nWith no block given and no argument, returns true if self has any truthy element, false otherwise:\n\n[nil, 0, false].any? # => true\n[nil, false].any? # => false\n[].any? # => false\nWith a block given and no argument, calls the block with each element in self; returns true if the block returns any truthy value, false otherwise:\n\n[0, 1, 2].any? {|element| element > 1 } # => true\n[0, 1, 2].any? {|element| element > 2 } # => false\nIf argument obj is given, returns true if obj.=== any element, false otherwise:\n\n['food', 'drink'].any?(/foo/) # => true\n['food', 'drink'].any?(/bar/) # => false\n[].any?(/foo/) # => false\n[0, 1, 2].any?(1) # => true\n[0, 1, 2].any?(3)