In [66]:
import chromadb
import pandas as pd
from langchain.embeddings import GPT4AllEmbeddings

gpt4all_embd = GPT4AllEmbeddings()

client_chroma = chromadb.PersistentClient(path="db/")

bert_load_from_file: gguf version     = 2
bert_load_from_file: gguf alignment   = 32
bert_load_from_file: gguf data offset = 695552
bert_load_from_file: model name           = BERT
bert_load_from_file: model architecture   = bert
bert_load_from_file: model file type      = 1
bert_load_from_file: bert tokenizer vocab = 30522


In [136]:
import os
import psycopg2

# Get the database credentials from environment variables
host = os.getenv("DB_HOST")
port = os.getenv("DB_PORT")
database = os.getenv("DB_NAME")
user = os.getenv("DB_USER")
password = os.getenv("DB_PASSWORD")

# Connect to the PostgreSQL database
connection = psycopg2.connect(
    host=host,
    port=port,
    database=database,
    user=user,
    password=password
)

cursor = connection.cursor()

SQL_avg_rating = "SELECT movieId, SUM(RATING)/COUNT(*) as AVG_RATING FROM ratings GROUP BY movieId"

SQL_tags_cat = "SELECT movieId, STRING_AGG(tag, ' ') FROM tags GROUP BY movieId"

SQL_movies = "SELECT movieId as mainMovieId, title, genres FROM movies"

QUERY = f"""
SELECT mvTcat.mainMovieId, mvTcat.title, mvTcat.genres, rate.AVG_RATING, mvTcat.STRING_AGG
FROM
(
    ({SQL_movies}) mv
    LEFT JOIN ({SQL_tags_cat}) tcat
    ON mv.mainMovieId = tcat.movieId
) mvTcat
LEFT JOIN ({SQL_avg_rating}) rate
ON mvTcat.mainMovieId = rate.movieId;
"""

cursor.execute(QUERY)
response = cursor.fetchall()
print(len(response))
response


# item_to_be_saved = {title} {genre} {rating} {tag1} {tag2}....{tagN}

9742


[(1,
  'Toy Story (1995)',
  'Adventure|Animation|Children|Comedy|Fantasy',
  3.9209302325581397,
  'pixar fun pixar'),
 (2,
  'Jumanji (1995)',
  'Adventure|Children|Fantasy',
  3.4318181818181817,
  'game fantasy magic board game Robin Williams'),
 (3,
  'Grumpier Old Men (1995)',
  'Comedy|Romance',
  3.2596153846153846,
  'moldy old'),
 (4,
  'Waiting to Exhale (1995)',
  'Comedy|Drama|Romance',
  2.357142857142857,
  None),
 (5,
  'Father of the Bride Part II (1995)',
  'Comedy',
  3.0714285714285716,
  'remake pregnancy'),
 (6, 'Heat (1995)', 'Action|Crime|Thriller', 3.946078431372549, None),
 (7, 'Sabrina (1995)', 'Comedy|Romance', 3.185185185185185, 'remake'),
 (8, 'Tom and Huck (1995)', 'Adventure|Children', 2.875, None),
 (9, 'Sudden Death (1995)', 'Action', 3.125, None),
 (10,
  'GoldenEye (1995)',
  'Action|Adventure|Thriller',
  3.496212121212121,
  None),
 (11,
  'American President, The (1995)',
  'Comedy|Drama|Romance',
  3.6714285714285713,
  'politics president'),
 (1

In [137]:
bak = response.copy()

In [148]:
final_data = list(map(lambda x: [x[0], " ".join([str(k) for k in list(x)[1:] if k is not None])], bak))

In [158]:
client_chroma.delete_collection('movies')

In [144]:
arr = [[1, "cat"], [2, "dog"], [3, "bird"]]

# iterate the array and insert the second item (string) as the document, and the first item (int) as the id
for id, data in arr:
    print(id, data)

1 cat
2 dog
3 bird


In [159]:
from tqdm import tqdm

for id, data in tqdm(final_data):
    doc_result = gpt4all_embd.embed_query(data)

    # Save the embedded document to Chroma
    collection = client_chroma.get_or_create_collection('movies')

    collection.add(
        embeddings = [doc_result],
        documents = [data],
        metadatas = [{"source": "batch"}],
        ids = [str(id)]
    )

100%|██████████| 9742/9742 [02:11<00:00, 73.85it/s]


In [164]:
collection = client_chroma.get_collection('movies')
data = "recommend me movies where Leonardo DiCaprio is the main actor"

collection.query(
    query_texts=data,
    n_results=4
)

{'ids': [['44849', '79132', '99114', '108689']],
 'distances': [[1.022019863128662,
   1.0526598691940308,
   1.0667401552200317,
   1.133256196975708]],
 'metadatas': [[{'source': 'batch'},
   {'source': 'batch'},
   {'source': 'batch'},
   {'source': 'batch'}]],
 'embeddings': None,
 'documents': [['Renaissance (2006) Action|Animation|Film-Noir|Sci-Fi|Thriller 2.5',
   'Inception (2010) Action|Crime|Drama|Mystery|Sci-Fi|Thriller|IMAX 4.066433566433567 visually appealing alternate reality Leonardo DiCaprio sci-fi thought-provoking dreamlike action philosophy thought-provoking cerebral big budget clever complicated dead wife great soundtrack heist intellectual mindfuck philosophy psychological psychology suspense thought-provoking visually appealing surreal dreamlike',
   'Django Unchained (2012) Action|Drama|Western 3.943661971830986 Humour Leonardo DiCaprio Quentin Tarantino Samuel L. Jackson Soundtrack good soundtrack funny Christoph Waltz western Soundtrack Great performances actio