# Coding Settings 

In [1]:
import logging
import os
from datetime import timedelta
from functools import lru_cache
from typing import Optional

from dotenv import load_dotenv
from pydantic import BaseModel, Field

load_dotenv(dotenv_path="./.env")


def setup_logging():
    """Set up root logging at INFO level with a timestamped message format."""
    # Layout: timestamp - level name - message
    log_format = "%(asctime)s - %(levelname)s - %(message)s"
    logging.basicConfig(format=log_format, level=logging.INFO)


class LLMSettings(BaseModel):
    """Provider-agnostic generation defaults shared by LLM settings classes."""

    # Sampling temperature; 0.0 by default.
    temperature: float = Field(default=0.0)
    # Optional cap on generated tokens; None by default.
    max_tokens: Optional[int] = Field(default=None)
    # Number of retries; 3 by default.
    max_retries: int = Field(default=3)


class OpenAISettings(LLMSettings):
    """OpenAI configuration: API key plus default chat and embedding model names."""

    # Read from the OPENAI_API_KEY environment variable whenever the settings
    # object is built; the lookup yields None when the variable is unset.
    api_key: str = Field(default_factory=lambda: os.environ.get("OPENAI_API_KEY"))
    # Chat model used when a caller does not name one.
    default_model: str = "gpt-4o"
    # Model used to embed text.
    embedding_model: str = "text-embedding-3-small"


class DatabaseSettings(BaseModel):
    """Connection details for the database behind the vector store."""

    # Looked up from TIMESCALE_SERVICE_URL when the settings object is created;
    # the lookup yields None when the variable is unset.
    service_url: str = Field(
        default_factory=lambda: os.environ.get("TIMESCALE_SERVICE_URL")
    )


class VectorStoreSettings(BaseModel):
    """Table layout parameters handed to the vector-store client."""

    # Name of the table holding the embeddings.
    table_name: str = Field(default="embeddings")
    # Length of each stored embedding vector.
    embedding_dimensions: int = Field(default=1536)
    # Width of each time partition (seven days).
    time_partition_interval: timedelta = Field(default=timedelta(days=7))


class Settings(BaseModel):
    """Top-level configuration grouping the OpenAI, database and vector-store sections."""

    # Each section is produced by a factory, so a fresh sub-settings object is
    # created every time Settings is instantiated.
    openai: OpenAISettings = Field(default_factory=lambda: OpenAISettings())
    database: DatabaseSettings = Field(default_factory=lambda: DatabaseSettings())
    vector_store: VectorStoreSettings = Field(
        default_factory=lambda: VectorStoreSettings()
    )


@lru_cache()
def get_settings() -> Settings:
    """Build the application Settings once and reuse the same object afterwards.

    The first call instantiates Settings and then configures logging through
    setup_logging(); lru_cache makes later calls return the stored instance
    without repeating either step.
    """
    cached_settings = Settings()
    setup_logging()
    return cached_settings

# Coding the Vectorstore from Scratch

In [2]:
from openai import OpenAI
from timescale_vector import client
import pandas as pd

def initialize_clients(settings):
	"""Build the vector-store client, the OpenAI client and the embedding-model name.

	Args:
		settings: Object exposing `database.service_url`, the `vector_store`
			fields (`table_name`, `embedding_dimensions`,
			`time_partition_interval`) and `openai.api_key` /
			`openai.embedding_model`.

	Returns:
		Tuple `(vec_client, openai_client, embedding_model_name)`.
	"""
	store_cfg = settings.vector_store

	# Synchronous timescale-vector client bound to the configured table
	vec_client = client.Sync(
		service_url=settings.database.service_url,
		table_name=store_cfg.table_name,
		num_dimensions=store_cfg.embedding_dimensions,
		time_partition_interval=store_cfg.time_partition_interval,
	)

	# OpenAI client authenticated with the configured API key
	openai_cfg = settings.openai
	openai_client = OpenAI(api_key=openai_cfg.api_key)

	# The embedding model is returned as a plain name, not a client object
	return vec_client, openai_client, openai_cfg.embedding_model


def get_embedding(openai_client, embeddding_model, text):
	"""Return the embedding vector for a piece of text.

	Args:
		openai_client: Client exposing `embeddings.create(input=..., model=...)`.
		embeddding_model: Name of the embedding model to request.
		text: Text to embed; newlines are replaced by spaces before the request.

	Returns:
		The `embedding` of the first item in the response's `data`.
	"""
	# Collapse line breaks into single spaces
	single_line = " ".join(text.split("\n"))

	# One input is sent, so the answer is the first (only) data item
	response = openai_client.embeddings.create(input=[single_line], model=embeddding_model)
	return response.data[0].embedding

def create_tables(vec_client):
	"""Create the vector-store tables through the given client.

	Delegates to `vec_client.create_tables()`; nothing is returned.
	"""
	vec_client.create_tables()
	
def create_index(vec_client):
	"""Create an embedding index of type `client.DiskAnnIndex` on the vector table.

	Nothing is returned.
	"""
	# Index description with the library's default parameters
	disk_ann = client.DiskAnnIndex()
	vec_client.create_embedding_index(disk_ann)
	
def drop_index(vec_client):
	"""Remove the embedding index from the vector table.

	Delegates to `vec_client.drop_embedding_index()`; nothing is returned.
	"""
	vec_client.drop_embedding_index()

def upsert(vec_client, df):
	"""Write every row of a dataframe into the vector table.

	Args:
		vec_client: Client exposing `upsert(records)`.
		df: Dataframe whose rows are the records to store; its index is ignored.
	"""
	# Dataframe rows -> list of records, index column left out
	rows = list(df.to_records(index=False))

	# Hand the whole batch to the client in one call
	vec_client.upsert(rows)

def create_dataframe_from_results(results):
	"""Turn raw search results into a dataframe.

	Args:
		results: Sequence of 5-item rows in the order
			id, metadata, content, embedding, distance.

	Returns:
		Dataframe with those five columns, the `id` column cast to `str`.
	"""
	column_names = ['id', 'metadata', 'content', 'embedding', 'distance']
	frame = pd.DataFrame(results, columns=column_names)

	# Ids are rendered as plain strings
	return frame.astype({'id': str})

def search(vec_client, query_text, limit, metadata_filter, predicates, time_range, return_dataframe, *, openai_client=None, embedding_model=None):
	"""Find the stored records whose embeddings are closest to a text query.

	Args:
		vec_client: Vector-store client exposing `search(embedding, **kwargs)`.
		query_text: Text to embed and search for.
		limit: Maximum number of results, forwarded as `limit`.
		metadata_filter: Forwarded as `metadata_filter` when truthy.
		predicates: Forwarded as `predicates` when truthy.
		time_range: `(start_date, end_date)` pair; when truthy it is wrapped in
			`client.UUIDTimeRange` and forwarded as `uuid_time_filter`.
		return_dataframe: When truthy the results are converted with
			`create_dataframe_from_results`; otherwise they are returned as-is.
		openai_client: Client used to embed the query. When omitted, the
			notebook-level `openai_client` variable is used.
		embedding_model: Embedding model name. When omitted, the notebook-level
			`embeddding_model` variable is used.

	Returns:
		The raw results from `vec_client.search`, or a dataframe built from
		them when `return_dataframe` is truthy.

	Raises:
		ValueError: If no OpenAI client or embedding model can be resolved.
	"""
	# get_embedding needs a client and a model name in addition to the text.
	# Fall back to the variables the notebook creates with initialize_clients()
	# so existing positional calls keep working.
	notebook_scope = globals()
	if openai_client is None:
		openai_client = notebook_scope.get("openai_client")
	if embedding_model is None:
		embedding_model = notebook_scope.get("embeddding_model")
	if openai_client is None or embedding_model is None:
		raise ValueError(
			"search() needs an OpenAI client and an embedding model: pass "
			"openai_client= and embedding_model=, or create them with initialize_clients() first"
		)

	# Converting input query to embedding
	query_embedding = get_embedding(openai_client, embedding_model, query_text)

	# Creating Search arguments dictionary
	search_args = {
		'limit': limit
	}

	# Adding metadata filter to search arguments if available
	if metadata_filter:
		search_args['metadata_filter'] = metadata_filter

	# Adding predicates to search arguments if available
	if predicates:
		search_args['predicates'] = predicates

	# Adding time range to search arguments if available
	if time_range:
		start_date, end_date = time_range
		search_args['uuid_time_filter'] = client.UUIDTimeRange(start_date, end_date)

	# Searching for most similar embeddings
	results = vec_client.search(query_embedding, **search_args)

	# Returning dataframe with similar embeddings if set on
	if return_dataframe:
		return create_dataframe_from_results(results)
	return results
	
def delete_records(vec_client, ids=None, metadata_filter=None, delete_all=None):
	"""Delete records from the vector table by exactly one selection mode.

	Args:
		vec_client: Client exposing `delete_by_ids`, `delete_by_metadata`
			and `delete_all`.
		ids: Ids of the records to delete.
		metadata_filter: Metadata filter selecting the records to delete.
		delete_all: Truthy to delete every record.

	Raises:
		ValueError: If zero or more than one of the three selectors is truthy.
	"""
	# Exactly one selector may be supplied
	supplied = [option for option in (ids, metadata_filter, delete_all) if option]
	if len(supplied) != 1:
		raise ValueError("Provide exactly one from: ids, metadata_filter, or delete_all")

	# Dispatch on whichever selector was given
	if ids:
		vec_client.delete_by_ids(ids)
		return
	if metadata_filter:
		vec_client.delete_by_metadata(metadata_filter)
		return
	# Only delete_all can be the truthy selector at this point
	vec_client.delete_all()

In [3]:
# Load the cached Settings; the first get_settings() call also configures logging.
settings = get_settings()
# Build the vector-store client, the OpenAI client and the embedding-model name
# that the cells below share as notebook-level variables.
vec_client, openai_client, embeddding_model = initialize_clients(settings)

In [4]:
# Destructive: asks the vector-store client to delete every record (delete_all path).
# NOTE(review): this cell runs before create_tables() is called further down —
# confirm it behaves on a fresh database where the table may not exist yet.
delete_records(vec_client, delete_all=True)

# Coding Module to insert data to Vector Database

In [5]:
from datetime import datetime

import pandas as pd
# from database.vector_store import VectorStore
from timescale_vector.client import uuid_from_time

# Read the FAQ dataset; the file is semicolon-separated.
df = pd.read_csv("data/faq_dataset.csv", sep=";")
# Keep only the first 5 rows — prepare_record() makes one embedding request per row.
df = df.head(5)

# Prepare data for insertion
def prepare_record(row):
    """Prepare a record for insertion into the vector store.

    This function creates a record with a UUID version 1 as the ID, which captures
    the current time or a specified time. The same timestamp is stored in the
    record's `created_at` metadata, so the two always describe the same instant.

    The embedding is requested through the notebook-level `openai_client` and
    `embeddding_model` variables.

    Args:
        row: Mapping-like row providing `question`, `answer` and `category`.

    Returns:
        pandas Series with the entries `id`, `metadata`, `contents` and
        `embedding`, in that order.

    Note:
        - By default, this function uses the current time for the UUID.
        - To use a specific time:
          1. Import the datetime module.
          2. Create a datetime object for your desired time.
          3. Pass that datetime to uuid_from_time instead of the current time.

        Example:
            from datetime import datetime
            specific_time = datetime(2023, 1, 1, 12, 0, 0)
            id = str(uuid_from_time(specific_time))

        This is useful when your content already has an associated datetime.
    """
    content = f"Question: {row['question']}\nAnswer: {row['answer']}"
    embedding = get_embedding(openai_client, embeddding_model, content)

    # Take the timestamp once: calling datetime.now() separately for the id and
    # for the metadata would record two slightly different instants.
    created_at = datetime.now()

    return pd.Series(
        {
            "id": str(uuid_from_time(created_at)),
            "metadata": {
                "category": row["category"],
                "created_at": created_at.isoformat(),
            },
            "contents": content,
            "embedding": embedding,
        }
    )


# Build one record (id, metadata, contents, embedding) per dataframe row.
records_df = df.apply(prepare_record, axis=1)
# Create the tables, add the embedding index, then insert the prepared records.
create_tables(vec_client)
create_index(vec_client)  # DiskAnnIndex
upsert(vec_client, records_df)

2025-04-04 23:52:26,949 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2025-04-04 23:52:33,348 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2025-04-04 23:52:35,039 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2025-04-04 23:52:36,089 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2025-04-04 23:52:37,640 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


# Coding LLMHub

In [106]:
import instructor
from openai import OpenAI

from typing import List, Dict, Any, Optional, Tuple

In [25]:
from typing import List, Dict, Any, Type
import instructor
from openai import OpenAI
from pydantic import BaseModel

class LLMHub:
	"""Thin wrapper around a provider-specific, structured-output LLM client.

	The provider name selects both the settings section (an attribute of the
	object returned by `get_settings()`) and the client factory.
	"""

	def __init__(self, provider: str):
		"""Store the provider, load its settings section and build its client.

		NOTE(review): the settings lookup runs before the provider is validated
		in `_initialize_client`, so a name that is not an attribute of the
		settings object fails at the `getattr` call rather than with the
		ValueError raised there.
		"""
		self.provider = provider
		self.settings = getattr(get_settings(), self.provider)
		self.client = self._initialize_client()

	def _initialize_client(self) -> Any:
		"""Return the client for `self.provider`.

		Raises:
			ValueError: If no factory is registered for the provider.
		"""
		# Provider name -> factory taking that provider's settings section
		factories = {
			'openai': lambda cfg: instructor.from_openai(OpenAI(api_key=cfg.api_key))
		}

		build_client = factories.get(self.provider)
		if not build_client:
			raise ValueError("Selected Provider not Available.")
		return build_client(self.settings)

	def create_completion(self, response_model: Type[BaseModel], messages: List[Dict[str, str]], **kwargs) -> Any:
		"""Request a completion parsed into `response_model`.

		Args:
			response_model: Pydantic model class the reply is parsed into.
			messages: Chat messages to send.
			**kwargs: Optional overrides for `model`, `temperature`,
				`max_retries` and `max_tokens`; any other keyword is ignored.

		Returns:
			Whatever `self.client.chat.completions.create` returns.
		"""
		cfg = self.settings

		# Tunable parameters paired with their settings-level fallbacks
		fallbacks = (
			('model', cfg.default_model),
			('temperature', cfg.temperature),
			('max_retries', cfg.max_retries),
			('max_tokens', cfg.max_tokens),
		)
		request = {name: kwargs.get(name, fallback) for name, fallback in fallbacks}
		request['response_model'] = response_model
		request['messages'] = messages

		return self.client.chat.completions.create(**request)