# Chroma Ruby Gem

## Database

Database operations for Chroma

In [None]:
require "logger"
require "json"
require "securerandom"
require "open3"

# Require the Chroma Ruby client.
require "chroma-db"

# Configure Chroma's host. Here you can specify your own host.
Chroma.connect_host = "http://localhost:8000"
# Send the client's log output to standard output at Chroma::LEVEL_ERROR.
Chroma.logger = Logger.new($stdout)
Chroma.log_level = Chroma::LEVEL_ERROR

# Check the connection using the Database heartbeat.
response = Chroma::Resources::Database.heartbeat

IRuby.display "Heartbeat timestamp #{response["nanosecond heartbeat"]}"

# Check the current Chroma server version.
version = Chroma::Resources::Database.version

IRuby.display "Chroma server version #{version}"

# Reset database (DANGER: This deletes all previous data)
Chroma::Resources::Database.reset

## Collection operations

Collection operations for Chroma

In [2]:
# Confirm that database has no collections
collections = Chroma::Resources::Collection.list

collection_name = "ruby-3.0"

IRuby.display "Collections in database #{collections.size}"

# Create a new collection with two metadata entries
collection = Chroma::Resources::Collection.create(collection_name, {lang: "ruby", gem: "chroma-rb"})

IRuby.display collection

# Confirm that database now has one collection
collections = Chroma::Resources::Collection.list

IRuby.display "Collections in database #{collections.size}"

# Delete collection
Chroma::Resources::Collection.delete(collection_name)

# Re-confirm that database has no collections
collections = Chroma::Resources::Collection.list

IRuby.display "Collections in database #{collections.size}"

# Create the collection again (the return value is not kept here)
Chroma::Resources::Collection.create(collection_name, {lang: "ruby", gem: "chroma-rb"})

# Get the collection from database
collection = Chroma::Resources::Collection.get(collection_name)
IRuby.display collection

# Modify collection name
new_collection_name = "ruby-3.2"
collection.modify(new_collection_name)

# Get modified collection from database, looking it up by its new name;
# `collection` refers to this renamed collection from here on.
collection = Chroma::Resources::Collection.get(new_collection_name)
IRuby.display collection

"Collections in database 0"

#<Chroma::Resources::Collection:0x000000010551a878 @name="ruby-3.0", @metadata={"lang"=>"ruby", "gem"=>"chroma-rb"}>

"Collections in database 1"

"Collections in database 0"

#<Chroma::Resources::Collection:0x0000000104d98190 @name="ruby-3.0", @metadata={"lang"=>"ruby", "gem"=>"chroma-rb"}>

#<Chroma::Resources::Collection:0x0000000105577078 @name="ruby-3.2", @metadata={"lang"=>"ruby", "gem"=>"chroma-rb"}>

### Naive Ruby helper methods

In [3]:
# A piece of text paired with the metadata that describes it.
class Document
  # @param content the document text
  # @param metadata [Hash] descriptive data for the text; empty when omitted
  def initialize(content, metadata = {})
    @content, @metadata = content, metadata
  end

  # Both attributes are read-only from the outside.
  attr_reader :content, :metadata
end

# Reads a whole text file into a single Document.
class TextLoader
  # @param file path of the file to read
  def initialize(file)
    @file = file
  end

  # Reads the file and wraps its content in one Document whose metadata
  # records the path under the :source key.
  #
  # @return [Array<Document>] a one-element array
  def load
    text = File.read(@file)
    document = Document.new(text, { source: @file })
    [document]
  end
end

# Splits text into overlapping chunks, preferring to end every chunk on a
# whitespace boundary.
#
# Despite the "Word" in the name, sizes are measured in characters
# (String#length), not in words.
class RecursiveWordTextSplitter
  # @param chunk_size [Integer] maximum number of characters per chunk
  # @param chunk_overlap [Integer] number of characters by which consecutive
  #   windows overlap; must be smaller than chunk_size for texts that need
  #   more than one chunk
  def initialize(chunk_size, chunk_overlap)
    @chunk_size = chunk_size
    @chunk_overlap = chunk_overlap
  end

  # Splits every document into chunk documents.
  #
  # @param documents a single document or an array of documents; each must
  #   respond to #content and #metadata
  # @return [Array<Document>] one Document per chunk; chunks of the same
  #   source document share that document's metadata object
  def split_documents(documents)
    Array(documents).flat_map do |document|
      texts = split_text(document.content)
      # Prints the number of chunks produced for each source document.
      puts texts.size
      texts.map { |text| Document.new(text, document.metadata) }
    end
  end

  # Splits a string into chunks.
  #
  # A window of chunk_size characters is cut after its last whitespace
  # character (or kept whole when it has none) and stripped. The next window
  # normally starts chunk_overlap characters before the end of the current
  # one; when the cut falls earlier than that, the next window starts right
  # after the cut so that no text is dropped. The remainder that is shorter
  # than chunk_size is appended unstripped as the final chunk.
  #
  # Implemented as a loop, so stack depth does not grow with text length.
  #
  # @param text [String]
  # @return [Array<String>]
  # @raise [ArgumentError] when the sizes do not let the window advance
  #   (chunk_size not positive, or chunk_overlap >= chunk_size)
  def split_text(text)
    chunks = []
    start_index = 0

    while start_index + @chunk_size <= text.length
      end_index = start_index + @chunk_size
      window = text[start_index...end_index]
      cut = last_whitespace_index(window)

      if cut
        chunks << window[0..cut].strip
        consumed_end = start_index + cut + 1
      else
        chunks << window.strip
        consumed_end = end_index
      end

      # Never start past the text already emitted: that would lose content.
      next_start = [end_index - @chunk_overlap, consumed_end].min
      unless next_start > start_index
        raise ArgumentError,
              "chunk_size must be positive and greater than chunk_overlap"
      end

      start_index = next_start
    end

    chunks << text[start_index..]
    chunks
  end

  private

  # Index of the last whitespace character in +string+, or nil when it
  # contains none (\s already covers line feeds).
  def last_whitespace_index(string)
    string.rindex(/\s/)
  end
end

# Runs a command and captures everything it writes.
#
# The child's stdin is closed immediately so a program that reads standard
# input sees end-of-file instead of waiting forever. stderr is drained on a
# separate thread while stdout is read, so a child that writes a lot to one
# stream cannot block on a full pipe while this method waits on the other.
# The block form of popen3 closes all streams on exit.
#
# @param command [String] command line passed to Open3.popen3
# @return [Array] [stdout_data, stderr_data, exit_status]; each data element
#   is a String, or nil when the stream produced no output; exit_status is a
#   Process::Status
def run_system(command)
  Open3.popen3(command) do |stdin, stdout, stderr, wait_thr|
    stdin.close
    stderr_reader = Thread.new { stderr.gets(nil) }
    stdout_data = stdout.gets(nil)

    [stdout_data, stderr_reader.value, wait_thr.value]
  end
end

:run_system

## Transform texts

Use the naive Ruby classes defined above to transform documents into chunks.

In [4]:
# Load ruby.txt; TextLoader#load returns a one-element array of Document.
documents = TextLoader.new("ruby.txt").load
documents.size

1

In [5]:
# RecursiveWordTextSplitter.new takes positional arguments. Writing
# `chunk_size=1000` inside the call would only assign a stray local variable
# and pass its value positionally, so the sizes are passed directly.
text_splitter = RecursiveWordTextSplitter.new(1000, 200) # chunk_size, chunk_overlap
texts = text_splitter.split_documents(documents)
texts.size

2


2

In [6]:
# Inspect the first chunk document returned by the splitter.
texts[0]

#<#<Class:0x0000000104f880e0>::Document:0x00000001055db370 @content="Array#any?\n\nany? → true or false\nany? {|element| ... } → true or false\nany?(obj) → true or false\n\nReturns true if any element of self meets a given criterion.\n\nWith no block given and no argument, returns true if self has any truthy element, false otherwise:\n\n[nil, 0, false].any? # => true\n[nil, false].any? # => false\n[].any? # => false\nWith a block given and no argument, calls the block with each element in self; returns true if the block returns any truthy value, false otherwise:\n\n[0, 1, 2].any? {|element| element > 1 } # => true\n[0, 1, 2].any? {|element| element > 2 } # => false\nIf argument obj is given, returns true if obj.=== any element, false otherwise:\n\n['food', 'drink'].any?(/foo/) # => true\n['food', 'drink'].any?(/bar/) # => false\n[].any?(/foo/) # => false\n[0, 1, 2].any?(1) # => true\n[0, 1, 2].any?(3) # => false\nRelated: Enumerable#any?\n\n\nArray#map Array#map!\n\nmap {|element| ... 

**Prepare content**

In [7]:
# Collect the text of every chunk document.
contents = texts.map(&:content)
IRuby.display contents

["Array#any?\n\nany? → true or false\nany? {|element| ... } → true or false\nany?(obj) → true or false\n\nReturns true if any element of self meets a given criterion.\n\nWith no block given and no argument, returns true if self has any truthy element, false otherwise:\n\n[nil, 0, false].any? # => true\n[nil, false].any? # => false\n[].any? # => false\nWith a block given and no argument, calls the block with each element in self; returns true if the block returns any truthy value, false otherwise:\n\n[0, 1, 2].any? {|element| element > 1 } # => true\n[0, 1, 2].any? {|element| element > 2 } # => false\nIf argument obj is given, returns true if obj.=== any element, false otherwise:\n\n['food', 'drink'].any?(/foo/) # => true\n['food', 'drink'].any?(/bar/) # => false\n[].any?(/foo/) # => false\n[0, 1, 2].any?(1) # => true\n[0, 1, 2].any?(3) # => false\nRelated: Enumerable#any?\n\n\nArray#map Array#map!\n\nmap {|element| ... } → new_array\nmap → new_enumerator\nCalls the block, if given, wit

In [8]:
# Collect the metadata of every chunk document, in the same order as the texts.
metadatas = texts.map(&:metadata)
IRuby.display metadatas

[{:source=>"ruby.txt"}, {:source=>"ruby.txt"}]

In [9]:
# Persist the chunk texts as a JSON array so the Python step can read them.
# The cell's value is the number of bytes written.
File.write("documents.json", contents.to_json)

2060

## Python glue

We need Python glue to create text embeddings.

I first tried to use the Ruby gem **PyCall** to call HuggingFace's embeddings, but I couldn't make it work. The commented-out code shows what I was trying to do and also shows the error.

I opted to run the Python code as a system command and capture its output to bring it back into Ruby.

Before running it, you need the following Python libraries installed on your system.

In [10]:
# Install the Python dependencies quietly (-q) through the run_system helper,
# then show what the command printed and how it exited.
stdout_data, stderr_data, exit_code = run_system("pip -q install langchain sentence_transformers InstructorEmbedding")

IRuby.display stdout_data
IRuby.display stderr_data
IRuby.display exit_code

#<Process::Status: pid 3263 exit 0>

**WARNING**: The following code might take a long time to run the first time, since it downloads and installs the HuggingFace models before creating embeddings for our texts.

In [14]:
# Abandoned PyCall attempt, kept for reference together with the error it hit:
#
# require "pycall"
# require "pycall/import"
# include PyCall::Import

# pyimport "InstructorEmbedding" import INSTRUCTOR
# pyimport "langchain.embeddings", as: "embeddings"

# from InstructorEmbedding import INSTRUCTOR
# from langchain.embeddings import HuggingFaceInstructEmbeddings

# instructor_embeddings = embeddings.HuggingFaceInstructEmbeddings.new(
#  model_name: "hkunlp/instructor-xl",
#  model_kwargs: { "device" => "cpu" }  # Use cuda as value if you have a GPU.
# )
#
# PyCall::LibPythonFunctionNotFound: Unable to find the required symbol in libpython: _Py_NoneStruct

# Shell command that feeds a Python script to the interpreter on stdin.
# The script reads the chunk texts from documents.json, embeds them, and
# writes the vectors to embeddings.json.
# The shell heredoc delimiter is quoted ('EOF') so the shell passes the Python
# source through verbatim instead of expanding $ and backticks inside it.
command = <<~PYTHON
python - << 'EOF'
import json
from InstructorEmbedding import INSTRUCTOR
from langchain.embeddings import HuggingFaceInstructEmbeddings

instructor_embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl",
                                                      model_kwargs={"device": "cpu"})

with open("documents.json") as f:
    file_content = f.read()

documents = json.loads(file_content)

embeddings = instructor_embeddings.embed_documents(list(documents))

with open("embeddings.json", "w", encoding="utf-8") as file:
    json.dump(embeddings, file, ensure_ascii=False, indent=4)

print("Embeddings at embeddings.json")
EOF
PYTHON

stdout_data, stderr_data, exit_code = run_system(command)

IRuby.display stdout_data
IRuby.display stderr_data
IRuby.display exit_code


"load INSTRUCTOR_Transformer\nmax_seq_length  512\nEmbeddings at embeddings.json\n"

#<Process::Status: pid 3304 exit 0>

## Embeddings in collection

Embedding operations for Chroma

In [15]:
# Read back the vectors written by the Python step.
documents_embeddings = File.read("embeddings.json").then { |raw| JSON.load(raw) }

[[0.03228979930281639, 0.04374200478196144, 0.014979278668761253, -0.008304460905492306, -0.0036412617191672325, -0.04301042854785919, -0.08898357301950455, 0.010292756371200085, -0.04601152241230011, -0.01627073809504509, 0.0423235185444355, 0.035671770572662354, -0.03516373038291931, -0.10667434334754944, -0.0305192731320858, -0.00317957391962409, 0.00898097362369299, -0.04044022411108017, -0.03859616443514824, -2.847463656507898e-05, -0.02551359124481678, 0.015683457255363464, -0.03809453547000885, 0.026084054261446, 0.004630913957953453, -0.05073123052716255, -0.005383108276873827, 0.00900851096957922, -0.022124305367469788, 0.0008553847437724471, 0.03406482934951782, 0.006923138629645109, 0.016204921528697014, -0.0248075183480978, 0.020003201439976692, 0.015132367610931396, -0.01962454617023468, -0.03499720245599747, 0.01819889433681965, -0.0026789242401719093, -0.06468355655670166, 0.023844322189688683, 0.012745407409965992, -0.03798774629831314, 0.03615935146808624, 0.0703736022

In [16]:
# Pair every chunk with its vector and metadata (matched by position) and
# wrap each triple in a Chroma embedding record with a fresh random id.
embeddings = texts.zip(documents_embeddings, metadatas).map do |chunk, vector, chunk_metadata|
  Chroma::Resources::Embedding.new(
    id: SecureRandom.uuid,
    embedding: vector,
    metadata: chunk_metadata,
    document: chunk.content
  )
end

IRuby.display embeddings

[#<Chroma::Resources::Embedding:0x00000001049957f0 @id="92118736-5484-4da4-b669-cde1cc03c369", @embedding=[0.03228979930281639, 0.04374200478196144, 0.014979278668761253, -0.008304460905492306, -0.0036412617191672325, -0.04301042854785919, -0.08898357301950455, 0.010292756371200085, -0.04601152241230011, -0.01627073809504509, 0.0423235185444355, 0.035671770572662354, -0.03516373038291931, -0.10667434334754944, -0.0305192731320858, -0.00317957391962409, 0.00898097362369299, -0.04044022411108017, -0.03859616443514824, -2.847463656507898e-05, -0.02551359124481678, 0.015683457255363464, -0.03809453547000885, 0.026084054261446, 0.004630913957953453, -0.05073123052716255, -0.005383108276873827, 0.00900851096957922, -0.022124305367469788, 0.0008553847437724471, 0.03406482934951782, 0.006923138629645109, 0.016204921528697014, -0.0248075183480978, 0.020003201439976692, 0.015132367610931396, -0.01962454617023468, -0.03499720245599747, 0.01819889433681965, -0.0026789242401719093, -0.0646835565567

In [17]:
# Add the embedding records to the collection.
collection.add(embeddings)

true

In [18]:
# Display the collection's count after the insert.
IRuby.display collection.count

2

### Vector Store naive implementation

In [19]:
# Naive retrieval wrapper around a vector store.
#
# Subclasses implement #similarity_search for a concrete backend and may use
# the protected #text_to_embeddings helper to turn a query into vectors.
class VectorStore
  # @param store the backend object searches are run against
  # @param search_type [String] retrieval strategy; only "similarity" is
  #   handled by #relevant_documents
  def initialize(store, search_type = "similarity")
    @store = store
    # The search type must actually be stored; otherwise #relevant_documents
    # can never match "similarity" and always returns nil.
    @search_type = search_type
  end

  # Returns the documents relevant to +query+.
  #
  # Dispatches to this object's own #similarity_search so that a subclass
  # implementation is honoured.
  #
  # @return the search result, or nil for any search type other than
  #   "similarity"
  def relevant_documents(query)
    similarity_search(query) if @search_type == "similarity"
  end

  # Default similarity search: forwards to the wrapped store. Subclasses
  # whose store has no such method override this.
  def similarity_search(query, **options)
    @store.similarity_search(query, **options)
  end

  protected

  # Embeds +query+ with the HuggingFace Instructor model by running Python.
  #
  # The query travels to Python as JSON on standard input and the script is
  # handed to the interpreter as an argument without going through a shell,
  # so quotes or shell metacharacters in the query cannot alter the script
  # or the command line.
  #
  # @param query the text to embed (converted with #to_s)
  # @return the parsed JSON payload printed by the script (the result of
  #   embed_documents for a one-element list), or nil when nothing was printed
  #   after the delimiter
  # @raise [RuntimeError] when the Python process exits unsuccessfully
  def text_to_embeddings(query)
    stdout_data, stderr_data, status = run_system(
      "python", "-c", embedding_script,
      stdin_data: JSON.generate([query.to_s])
    )
    raise "Embedding command failed (#{status.exitstatus}): #{stderr_data}" unless status.success?

    parse_output(stdout_data)
  end

  private

  # Line printed by the Python script right before the JSON payload; anything
  # the libraries print earlier on stdout is discarded by #parse_output.
  def output_delimiter
    "========"
  end

  # Python source that reads a JSON list of texts from stdin and prints the
  # delimiter followed by their embeddings as JSON.
  def embedding_script
    <<~PYTHON
      import json
      import sys
      from InstructorEmbedding import INSTRUCTOR
      from langchain.embeddings import HuggingFaceInstructEmbeddings

      instructor_embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl",
                                                            model_kwargs={"device": "cpu"})

      texts = json.load(sys.stdin)
      embeddings = instructor_embeddings.embed_documents(texts)

      print("#{output_delimiter}")
      print(json.dumps(embeddings))
    PYTHON
  end

  # Extracts and parses the JSON that follows the last delimiter.
  #
  # @param data [String, nil] captured standard output
  # @return the parsed JSON, or nil when +data+ is nil/empty or nothing
  #   follows the delimiter
  # @raise [JSON::ParserError] when the trailing text is not valid JSON
  def parse_output(data)
    payload = data.to_s.split(output_delimiter).last&.strip
    return nil if payload.nil? || payload.empty?

    JSON.parse(payload)
  end

  # Runs a command, optionally feeding it +stdin_data+, and captures its
  # output. Open3.capture3 closes the child's stdin after writing and reads
  # stdout and stderr concurrently, so neither pipe can fill up and stall.
  #
  # @param command [Array<String>] a shell command line, or a program
  #   followed by its arguments
  # @return [Array] [stdout_string, stderr_string, Process::Status]
  def run_system(*command, stdin_data: nil)
    Open3.capture3(*command, stdin_data: stdin_data.to_s)
  end
end

# VectorStore whose wrapped store is queried through its #query method.
class ChromaVectorStore < VectorStore
  # Embeds the query text and asks the store for matching entries.
  #
  # @param query the text to search for
  # @param k [Integer] passed to the store as results:
  # @param filter [Hash] passed to the store as where:
  # @return whatever the store's #query returns
  def similarity_search(query, k: 4, filter: {})
    vectors = text_to_embeddings(query)

    @store.query(query_embeddings: vectors, results: k, where: filter)
  end
end

:similarity_search

### Search for similarity

In [20]:
# Wrap the collection and ask for 2 results for the query text.
# Note: this rebinds `embeddings`, which earlier held the records passed to
# collection.add.
vs = ChromaVectorStore.new(collection)
embeddings =  vs.similarity_search("array any?", k: 2)

[#<Chroma::Resources::Embedding:0x00000001046ac048 @id="92118736-5484-4da4-b669-cde1cc03c369", @embedding=nil, @metadata={"source"=>"ruby.txt"}, @document="Array#any?\n\nany? → true or false\nany? {|element| ... } → true or false\nany?(obj) → true or false\n\nReturns true if any element of self meets a given criterion.\n\nWith no block given and no argument, returns true if self has any truthy element, false otherwise:\n\n[nil, 0, false].any? # => true\n[nil, false].any? # => false\n[].any? # => false\nWith a block given and no argument, calls the block with each element in self; returns true if the block returns any truthy value, false otherwise:\n\n[0, 1, 2].any? {|element| element > 1 } # => true\n[0, 1, 2].any? {|element| element > 2 } # => false\nIf argument obj is given, returns true if obj.=== any element, false otherwise:\n\n['food', 'drink'].any?(/foo/) # => true\n['food', 'drink'].any?(/bar/) # => false\n[].any?(/foo/) # => false\n[0, 1, 2].any?(1) # => true\n[0, 1, 2].any?(3

In [21]:
# Inspect the first search result.
embeddings[0]

#<Chroma::Resources::Embedding:0x00000001046ac048 @id="92118736-5484-4da4-b669-cde1cc03c369", @embedding=nil, @metadata={"source"=>"ruby.txt"}, @document="Array#any?\n\nany? → true or false\nany? {|element| ... } → true or false\nany?(obj) → true or false\n\nReturns true if any element of self meets a given criterion.\n\nWith no block given and no argument, returns true if self has any truthy element, false otherwise:\n\n[nil, 0, false].any? # => true\n[nil, false].any? # => false\n[].any? # => false\nWith a block given and no argument, calls the block with each element in self; returns true if the block returns any truthy value, false otherwise:\n\n[0, 1, 2].any? {|element| element > 1 } # => true\n[0, 1, 2].any? {|element| element > 2 } # => false\nIf argument obj is given, returns true if obj.=== any element, false otherwise:\n\n['food', 'drink'].any?(/foo/) # => true\n['food', 'drink'].any?(/bar/) # => false\n[].any?(/foo/) # => false\n[0, 1, 2].any?(1) # => true\n[0, 1, 2].any?(3)