### Baza wektorowa Qdrant

```
docker run -p 6333:6333 -p 6334:6334 \
    -v "$(pwd)/qdrant_storage:/qdrant/storage:z" \
    qdrant/qdrant
```
    

### Dane

In [3]:
import os

import pandas as pd

# Dataset source:
# https://www.kaggle.com/datasets/harshitshankhdhar/imdb-dataset-of-top-1000-movies-and-tv-shows
#
# The CSV location can be overridden with the IMDB_CSV environment variable so
# the notebook is not tied to a single machine; the original absolute path is
# kept as the fallback so existing runs behave exactly as before.
IMDB_CSV = os.environ.get(
    'IMDB_CSV',
    '/home/pawel/projects/systemy_inteligentne/data/imdb_top_1000.csv',
)

df = pd.read_csv(IMDB_CSV)
df.head()

Unnamed: 0,Poster_Link,Series_Title,Released_Year,Certificate,Runtime,Genre,IMDB_Rating,Overview,Meta_score,Director,Star1,Star2,Star3,Star4,No_of_Votes,Gross
0,https://m.media-amazon.com/images/M/MV5BMDFkYT...,The Shawshank Redemption,1994,A,142 min,Drama,9.3,Two imprisoned men bond over a number of years...,80.0,Frank Darabont,Tim Robbins,Morgan Freeman,Bob Gunton,William Sadler,2343110,28341469
1,https://m.media-amazon.com/images/M/MV5BM2MyNj...,The Godfather,1972,A,175 min,"Crime, Drama",9.2,An organized crime dynasty's aging patriarch t...,100.0,Francis Ford Coppola,Marlon Brando,Al Pacino,James Caan,Diane Keaton,1620367,134966411
2,https://m.media-amazon.com/images/M/MV5BMTMxNT...,The Dark Knight,2008,UA,152 min,"Action, Crime, Drama",9.0,When the menace known as the Joker wreaks havo...,84.0,Christopher Nolan,Christian Bale,Heath Ledger,Aaron Eckhart,Michael Caine,2303232,534858444
3,https://m.media-amazon.com/images/M/MV5BMWMwMG...,The Godfather: Part II,1974,A,202 min,"Crime, Drama",9.0,The early life and career of Vito Corleone in ...,90.0,Francis Ford Coppola,Al Pacino,Robert De Niro,Robert Duvall,Diane Keaton,1129952,57300000
4,https://m.media-amazon.com/images/M/MV5BMWU4N2...,12 Angry Men,1957,U,96 min,"Crime, Drama",9.0,A jury holdout attempts to prevent a miscarria...,96.0,Sidney Lumet,Henry Fonda,Lee J. Cobb,Martin Balsam,John Fiedler,689845,4360000


In [4]:
# Keep only the columns used further down. 'text' is the 'Overview' value
# rendered through an f-string, i.e. always a plain string.
filter_cols = ['Series_Title', 'Genre', 'IMDB_Rating', 'Released_Year', 'text']
df = df.assign(
    text=df['Overview'].map(lambda overview: f"{overview}")
)[filter_cols].copy()
df

Unnamed: 0,Series_Title,Genre,IMDB_Rating,Released_Year,text
0,The Shawshank Redemption,Drama,9.3,1994,Two imprisoned men bond over a number of years...
1,The Godfather,"Crime, Drama",9.2,1972,An organized crime dynasty's aging patriarch t...
2,The Dark Knight,"Action, Crime, Drama",9.0,2008,When the menace known as the Joker wreaks havo...
3,The Godfather: Part II,"Crime, Drama",9.0,1974,The early life and career of Vito Corleone in ...
4,12 Angry Men,"Crime, Drama",9.0,1957,A jury holdout attempts to prevent a miscarria...
...,...,...,...,...,...
995,Breakfast at Tiffany's,"Comedy, Drama, Romance",7.6,1961,A young New York socialite becomes interested ...
996,Giant,"Drama, Western",7.6,1956,Sprawling epic covering the life of a Texas ca...
997,From Here to Eternity,"Drama, Romance, War",7.6,1953,"In Hawaii in 1941, a private is cruelly punish..."
998,Lifeboat,"Drama, War",7.6,1944,Several survivors of a torpedoed merchant ship...


### Wektoryzacja

In [5]:
from sentence_transformers import SentenceTransformer, util

2025-04-14 18:30:58.821948: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-04-14 18:30:58.935203: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-04-14 18:30:58.969158: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-04-14 18:30:59.162242: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [6]:
model = SentenceTransformer('all-MiniLM-L6-v2')



In [None]:
# Quantization test: encode the same texts with precision='binary' to inspect
# the quantized embeddings. The result is only displayed by the cell; it is
# not assigned to a variable and is not used by the cells below.
model.encode(df['text'].tolist(), convert_to_tensor=False, precision='binary')

array([[ -58,   54,   -8, ...,   95,   58,  104],
       [ -50,  -24, -126, ...,   79,  126,  -54],
       [  95,  -25,  114, ...,   72, -102,  -50],
       ...,
       [ -34,  -18, -107, ...,  -17,   50,   80],
       [ -34,  -84, -124, ...,   -1,  -90,  101],
       [ -58,  -80,   74, ...,   30,   39,   20]], dtype=int8)

In [7]:
vectors = model.encode(df['text'].tolist(), convert_to_tensor=False)

In [8]:
df['vector'] = vectors.tolist()

In [9]:
# Embedding length of the first row, read from the 'vector' column by name.
len(df['vector'].iloc[0])

384

### Baza Wektorowa

In [10]:
from qdrant_client import QdrantClient

client = QdrantClient(url="http://localhost:6333")

In [13]:
client.get_collections()

CollectionsResponse(collections=[CollectionDescription(name='movies')])

In [12]:
# Create the collection.
# The Qdrant storage is persisted on disk, so the collection may already exist
# from a previous run; creating it again would raise. Guarding the call makes
# the cell safe to re-run.
from qdrant_client import models

if not client.collection_exists("movies"):
    client.create_collection(
        collection_name="movies",
        # size=384 matches the embedding length produced by the model above
        vectors_config=models.VectorParams(size=384, distance=models.Distance.COSINE),
        quantization_config=models.ScalarQuantization(
            scalar=models.ScalarQuantizationConfig(
                type=models.ScalarType.INT8,
                quantile=0.99,
                always_ram=True,
            ),
        ),
    )

# Displayed as the cell result: True once the collection is available.
client.collection_exists("movies")

True

In [15]:
# Add vectors: one point per DataFrame row. The row label becomes the point id,
# the 'vector' column becomes the vector, and every other column goes into the payload.
from qdrant_client.models import PointStruct

points = [
    PointStruct(
        id=row_id,
        vector=row['vector'],
        payload={column: value for column, value in row.items() if column != 'vector'},
    )
    for row_id, row in df.iterrows()
]

In [16]:
# Send all points to the "movies" collection in a single upsert request.
# wait=True is passed so the call presumably returns only once the write has
# been applied (TODO confirm against the client docs); the returned operation
# info is printed to show the resulting status.
operation_info = client.upsert(
    collection_name="movies",
    wait=True,
    points=points
)

print(operation_info)

operation_id=0 status=<UpdateStatus.COMPLETED: 'completed'>


In [17]:
query_vector = df.loc[506, 'vector']

In [18]:
# Search: query the "movies" collection with query_vector and keep the 3 best
# hits, including their payload. `.points` extracts the list of scored points
# from the response object.
search_result = client.query_points(
    collection_name="movies",
    query=query_vector,
    with_payload=True,
    limit=3
).points

search_result

[ScoredPoint(id=506, version=0, score=1.0, payload={'Series_Title': 'Harry Potter and the Prisoner of Azkaban', 'Genre': 'Adventure, Family, Fantasy', 'IMDB_Rating': 7.9, 'Released_Year': '2004', 'text': 'Harry Potter, Ron and Hermione return to Hogwarts School of Witchcraft and Wizardry for their third year of study, where they delve into the mystery surrounding an escaped prisoner who poses a dangerous threat to the young wizard.'}, vector=None, shard_key=None, order_value=None),
 ScoredPoint(id=226, version=0, score=0.5016076, payload={'Series_Title': 'Harry Potter and the Deathly Hallows: Part 2', 'Genre': 'Adventure, Drama, Fantasy', 'IMDB_Rating': 8.1, 'Released_Year': '2011', 'text': "Harry, Ron, and Hermione search for Voldemort's remaining Horcruxes in their effort to destroy the Dark Lord as the final battle rages on at Hogwarts."}, vector=None, shard_key=None, order_value=None),
 ScoredPoint(id=947, version=0, score=0.46186382, payload={'Series_Title': "Harry Potter and the 

In [None]:
# search with filters
from qdrant_client.models import Filter, FieldCondition, MatchValue, MatchText

def get_most_similar_movie(title, genre, min_rate=0.7, top_n=3):
    """Find the movies most similar to ``title`` within a genre.

    The embedding stored for ``title`` in the notebook-level ``df`` is used as
    the query vector against the "movies" collection.

    Args:
        title: Exact ``Series_Title`` of the reference movie.
        genre: Text that the ``Genre`` payload field has to match.
        min_rate: Exclusive lower bound on the ``IMDB_Rating`` payload field.
            NOTE(review): the default of 0.7 is below every rating shown in
            this notebook, so it effectively disables the filter - confirm.
        top_n: Maximum number of hits to return (3 by default).

    Returns:
        A list with one dict per hit (built with ``dict(point)``), or ``None``
        when ``title`` does not occur in ``df``.
    """
    # Single lookup: the title used for the exclusion filter is the one passed in.
    title_vectors = df.loc[df['Series_Title'] == title, 'vector']
    if title_vectors.empty:
        print("Movie not found in the dataset.")
        return None
    query_vector = title_vectors.iloc[0]

    hits = client.query_points(
        collection_name="movies",
        query=query_vector,
        query_filter=Filter(
            must=[
                FieldCondition(key="Genre", match=MatchText(text=genre)),
                FieldCondition(key="IMDB_Rating", range=models.Range(gt=min_rate)),
            ],
            # never return the reference movie itself
            must_not=[FieldCondition(key="Series_Title", match=MatchValue(value=title))],
        ),
        with_payload=True,
        limit=top_n,
    ).points

    return [dict(hit) for hit in hits]

import numpy as np

def get_recommendation(dfhist:pd.DataFrame, min_rate=0.7, top_n=3):
    """Recommend movies close to the average embedding of a watch history.

    Args:
        dfhist: History frame; its ``vector`` and ``Series_Title`` columns are read.
        min_rate: Exclusive lower bound on the ``IMDB_Rating`` payload field.
        top_n: Maximum number of recommendations to return.

    Returns:
        A list with one dict per hit (built with ``dict(point)``). The list is
        empty when ``dfhist`` has no rows.
    """
    # Without any history there is nothing to average: the mean of an empty
    # array is NaN, which is not a usable query vector.
    if dfhist.empty:
        return []

    watched_titles = dfhist['Series_Title'].tolist()
    # Element-wise mean of all history embeddings serves as the query vector.
    profile_vector = np.array(dfhist['vector'].tolist()).mean(axis=0).tolist()

    hits = client.query_points(
        collection_name="movies",
        query=profile_vector,
        query_filter=Filter(
            must=[
                FieldCondition(key="IMDB_Rating", range=models.Range(gt=min_rate)),
            ],
            # exclude every title that is already part of the history
            must_not=[
                FieldCondition(key="Series_Title", match=MatchValue(value=watched))
                for watched in watched_titles
            ],
        ),
        with_payload=True,
        limit=top_n,
    ).points

    return [dict(hit) for hit in hits]


In [27]:
dfhist = df.sample(10)
dfhist

Unnamed: 0,Series_Title,Genre,IMDB_Rating,Released_Year,text,vector
991,Kelly's Heroes,"Adventure, Comedy, War",7.6,1970,A group of U.S. soldiers sneaks across enemy l...,"[-0.11310602724552155, 0.03082294389605522, -0..."
255,Fargo,"Crime, Drama, Thriller",8.1,1996,Jerry Lundegaard's inept crime falls apart due...,"[-0.10501310974359512, -0.06088097020983696, -..."
191,The Treasure of the Sierra Madre,"Adventure, Drama, Western",8.2,1948,Two Americans searching for work in Mexico con...,"[-0.0264334324747324, 0.007136609870940447, -0..."
322,Badhaai ho,"Comedy, Drama",8.0,2018,A man is embarrassed when he finds out his mot...,"[-0.012215599417686462, 0.02075621671974659, -..."
244,Amores perros,"Drama, Thriller",8.1,2000,A horrific car accident connects three stories...,"[-0.007581766694784164, -0.019533252343535423,..."
675,Back to the Future Part II,"Adventure, Comedy, Sci-Fi",7.8,1989,"After visiting 2015, Marty McFly must repeat h...","[0.01250038854777813, 0.02136446349322796, 0.0..."
799,"South Park: Bigger, Longer & Uncut","Animation, Comedy, Fantasy",7.7,1999,When Stan Marsh and his friends go see an R-ra...,"[0.02598275989294052, -0.028161192312836647, -..."
649,The Insider,"Biography, Drama, Thriller",7.8,1999,A research chemist comes under personal and pr...,"[-0.006619413383305073, 0.09721405804157257, -..."
414,Annie Hall,"Comedy, Romance",8.0,1977,Neurotic New York comedian Alvy Singer falls i...,"[-0.03995197266340256, -0.053826164454221725, ..."
578,Kubo and the Two Strings,"Animation, Action, Adventure",7.8,2016,A young boy named Kubo must locate a magical s...,"[-0.052760835736989975, 0.15156415104866028, -..."


In [28]:
get_recommendation(dfhist, min_rate=7.5, top_n=5)

[{'id': 73,
  'version': 0,
  'score': 0.5020988,
  'payload': {'Series_Title': 'The Shining',
   'Genre': 'Drama, Horror',
   'IMDB_Rating': 8.4,
   'Released_Year': '1980',
   'text': 'A family heads to an isolated hotel for the winter where a sinister presence influences the father into violence, while his psychic son sees horrific forebodings from both past and future.'},
  'vector': None,
  'shard_key': None,
  'order_value': None},
 {'id': 91,
  'version': 0,
  'score': 0.49274647,
  'payload': {'Series_Title': 'Miracle in cell NO.7',
   'Genre': 'Drama',
   'IMDB_Rating': 8.3,
   'Released_Year': '2019',
   'text': 'A story of love between a mentally-ill father who was wrongly accused of murder and his lovely six years old daughter. The prison would be their home. Based on the 2013 Korean movie 7-beon-bang-ui seon-mul (2013).'},
  'vector': None,
  'shard_key': None,
  'order_value': None},
 {'id': 505,
  'version': 0,
  'score': 0.48924923,
  'payload': {'Series_Title': 'Mystic

In [23]:
get_most_similar_movie('The Shawshank Redemption', 'Drama', 8.0)

[{'id': 253,
  'version': 0,
  'score': 0.5100496,
  'payload': {'Series_Title': 'Fa yeung nin wah',
   'Genre': 'Drama, Romance',
   'IMDB_Rating': 8.1,
   'Released_Year': '2000',
   'text': 'Two neighbors, a woman and a man, form a strong bond after both suspect extramarital activities of their spouses. However, they agree to keep their bond platonic so as not to commit similar wrongs.'},
  'vector': None,
  'shard_key': None,
  'order_value': None},
 {'id': 263,
  'version': 0,
  'score': 0.42094564,
  'payload': {'Series_Title': 'In the Name of the Father',
   'Genre': 'Biography, Crime, Drama',
   'IMDB_Rating': 8.1,
   'Released_Year': '1993',
   'text': "A man's coerced confession to an I.R.A. bombing he did not commit results in the imprisonment of his father as well. An English lawyer fights to free them."},
  'vector': None,
  'shard_key': None,
  'order_value': None},
 {'id': 27,
  'version': 0,
  'score': 0.40841138,
  'payload': {'Series_Title': 'Se7en',
   'Genre': 'Crim

### Hybrid Query

In [None]:
from qdrant_client import QdrantClient, models

client = QdrantClient(url="http://localhost:6333")

# Illustrative snippet: "{collection_name}" is a literal placeholder string and
# the short vectors below are dummy values, so both have to be replaced before
# this cell can run against a real collection.
# Two prefetch sub-queries fetch up to 20 candidates each, one through the
# vector named "sparse" and one through the vector named "dense"; the outer
# query merges both candidate lists with the RRF fusion method.
client.query_points(
    collection_name="{collection_name}",
    prefetch=[
        models.Prefetch(
            query=models.SparseVector(indices=[1, 42], values=[0.22, 0.8]),
            using="sparse",
            limit=20,
        ),
        models.Prefetch(
            query=[0.01, 0.45, 0.67],  # <-- dense vector
            using="dense",
            limit=20,
        ),
    ],
    query=models.FusionQuery(fusion=models.Fusion.RRF),
)

### Re-scoring example

In [None]:
from qdrant_client import QdrantClient, models

client = QdrantClient(url="http://localhost:6333")

# Illustrative snippet: "{collection_name}" is a literal placeholder string and
# the vectors are dummy values; replace both before running.
# The prefetch step gathers up to 1000 candidates through the vector named
# "mrl_byte"; the outer query then scores those candidates with the vector
# named "full" and keeps the best 10.
client.query_points(
    collection_name="{collection_name}",
    prefetch=models.Prefetch(
        query=[1, 23, 45, 67],  # <------------- small byte vector
        using="mrl_byte",
        limit=1000,
    ),
    query=[0.01, 0.299, 0.45, 0.67],  # <-- full vector
    using="full",
    limit=10,
)

### Batch Query

In [64]:
def get_most_similar_batch(dfhist, min_rate=0.7, top_n=3):
    """Look up similar movies for every row of a history frame in one batch call.

    One query is built per row of ``dfhist``: the row's ``vector`` is the query,
    the ``Genre`` payload field must match the row's genre text, ``IMDB_Rating``
    must exceed ``min_rate`` and the row's own title is excluded.

    Args:
        dfhist: History frame; ``vector``, ``Genre`` and ``Series_Title`` are read.
        min_rate: Exclusive lower bound on the ``IMDB_Rating`` payload field.
        top_n: Maximum number of hits requested per history row (3 by default).

    Returns:
        The payload dicts of all hits from all queries in one flat list, sorted
        by ``IMDB_Rating`` in descending order. Hits are not de-duplicated
        across queries. Empty when ``dfhist`` has no rows.
    """

    def get_filter(serie):
        # Per-row filter: same genre text, rating above the threshold, and not
        # the movie the query vector comes from.
        return Filter(
            must=[
                FieldCondition(key="Genre", match=MatchText(text=serie['Genre'])),
                FieldCondition(key="IMDB_Rating", range=models.Range(gt=min_rate)),
            ],
            must_not=[
                FieldCondition(key="Series_Title", match=MatchValue(value=serie['Series_Title'])),
            ],
        )

    search_queries = [
        models.QueryRequest(
            query=serie['vector'],
            filter=get_filter(serie),
            limit=top_n,
            with_payload=True,
        )
        for _, serie in dfhist.iterrows()
    ]

    # Nothing to ask for: skip the round trip instead of sending an empty batch.
    if not search_queries:
        return []

    search_result = client.query_batch_points(collection_name="movies", requests=search_queries)
    results = [dict(hit)['payload'] for response in search_result for hit in response.points]

    return sorted(results, key=lambda payload: payload['IMDB_Rating'], reverse=True)

In [65]:
get_most_similar_batch(dfhist)

[{'Series_Title': 'Joker',
  'Genre': 'Crime, Drama, Thriller',
  'IMDB_Rating': 8.5,
  'Released_Year': '2019',
  'text': 'In Gotham City, mentally troubled comedian Arthur Fleck is disregarded and mistreated by society. He then embarks on a downward spiral of revolution and bloody crime. This path brings him face-to-face with his alter-ego: the Joker.'},
 {'Series_Title': 'Back to the Future',
  'Genre': 'Adventure, Comedy, Sci-Fi',
  'IMDB_Rating': 8.5,
  'Released_Year': '1985',
  'text': 'Marty McFly, a 17-year-old high school student, is accidentally sent thirty years into the past in a time-traveling DeLorean invented by his close friend, the eccentric scientist Doc Brown.'},
 {'Series_Title': 'Amélie',
  'Genre': 'Comedy, Romance',
  'IMDB_Rating': 8.3,
  'Released_Year': '2001',
  'text': 'Amélie is an innocent and naive girl in Paris with her own sense of justice. She decides to help those around her and, along the way, discovers love.'},
 {'Series_Title': 'Andaz Apna Apna',
