### import statements

In [1]:
from  pprint import pprint
import pandas as pd
from llama_cpp import Llama
import numpy as np
import re
import faiss
import heapq

### defining some variables

In [2]:
# Half-precision dtype for embeddings.
# NOTE(review): not referenced elsewhere in the visible notebook —
# get_embeddings hard-codes np.float16 instead; confirm whether it should use this.
data_type = np.float16
# Number of leading embedding dimensions used for the binary shortlisting stage.
shortlisting_dim = 128
# Number of leading embedding dimensions used for the reranking stage.
reranking_dim = 512

### dataset processing

In [3]:

# Load the action-movie CSV into a DataFrame.
df = pd.read_csv("D:\\Dataset\\movie_imdb_dataset\\action.csv")

# Pull the three text columns out as plain Python lists, capped at the
# first 3000 rows each.
movie_names, genres, descriptions = (
    df[column_name].tolist()[:3000]
    for column_name in ('movie_name', 'genre', 'description')
)

def structure_data(data_lst):
    """Build the ``search_document:`` text for one movie.

    Parameters
    ----------
    data_lst : sequence
        ``[movie_name, genre, description]``. Only the first three entries
        are used. Missing values (``None`` or float ``NaN``, which is what
        pandas yields for empty CSV cells) are treated as empty strings;
        any other non-string value is converted with ``str()``.

    Returns
    -------
    str
        ``'search_document: <name> . <genre> . <description>'`` where every
        ``.`` inside a field has been replaced by a space and runs of
        whitespace are collapsed to a single space.
    """
    fields = []
    for value in data_lst[:3]:
        # NaN is the only float that is not equal to itself.
        is_missing = value is None or (isinstance(value, float) and value != value)
        text = "" if is_missing else str(value)
        # Periods are reserved as the field separator, so strip them from the content.
        fields.append(text.replace(".", " ").strip())

    data_str = re.sub(r'\s+', ' ', " . ".join(fields)).strip()
    return 'search_document: ' + data_str

dataset_arr = []
for entity in map(list, zip(movie_names, genres, descriptions)):
    # Drop rows that have no usable movie name (non-string or empty).
    if type(entity[0]) is not str or not entity[0]:
        continue

    # Placeholder descriptions carry no plot information: blank them out.
    if entity[2] in ("Add a Plot",
                     "Not available at this time.",
                     "The plot is unknown at this time.",
                     "Plot kept under wraps."):
        entity[2] = ""

    dataset_arr.append(structure_data(entity))


### model initialization

In [4]:
# Load the Q4_K_M-quantised nomic-embed GGUF file through llama-cpp in
# embedding mode (embedding=True) with verbose output disabled.
# NOTE(review): use_mmap / use_mlock are presumably forwarded to llama.cpp's
# memory-mapping / memory-locking options — confirm against the llama-cpp-python docs.
model_q4_k_m = Llama("../models/nomic-embed-large-v1.5-Q4_K_M.gguf",
                     embedding=True, verbose=False,use_mmap=True,
                     use_mlock=True)

In [5]:
def get_embeddings(input):
    """Embed ``input`` with the notebook's model and return a float16 array.

    ``input`` is passed unchanged to ``model_q4_k_m.embed``; whatever it
    returns is converted to a NumPy array of dtype ``np.float16``.
    """
    raw_vectors = model_q4_k_m.embed(input)
    return np.array(raw_vectors, dtype=np.float16)


def truncate_embeddings(emb, truncate_dim):
    """Keep only the first ``truncate_dim`` columns of a 2-D embedding array.

    The result comes from basic slicing, so it is a view that shares memory
    with ``emb`` rather than a copy.
    """
    leading_columns = slice(None, truncate_dim)
    return emb[:, leading_columns]


def cosine_similarity(x, y):
    """Cosine similarity: dot product of ``x`` and ``y`` after each is divided by its norm."""
    unit_x, unit_y = (vec / np.linalg.norm(vec) for vec in (x, y))
    return np.dot(unit_x, unit_y)

In [None]:
# Embed every structured document string in dataset_arr (one row per document).
dim_768_emb = get_embeddings(dataset_arr)

In [None]:
# Sanity check: shape and dtype of the embedding matrix.
dim_768_emb.shape,dim_768_emb.dtype

In [None]:
# Persist the embeddings so they can be reloaded with np.load without re-running the embedding cell.
np.save('./dim_768_emb_nomic.npy', dim_768_emb)

In [6]:
# Reload the cached embeddings from disk and display their shape and dtype.
dim_768_emb = np.load('./dim_768_emb_nomic.npy')
dim_768_emb.shape,dim_768_emb.dtype

((3000, 768), dtype('float16'))

In [7]:
# First 512 columns of the full embeddings (the width used for reranking).
# NOTE: truncate_embeddings uses basic slicing, so dim_512_emb is a view that
# shares memory with dim_768_emb — in-place writes to either affect both.
dim_512_emb = truncate_embeddings(dim_768_emb,512)
dim_512_emb.shape

(3000, 512)

In [8]:
# First 128 columns of the full embeddings (the width used for shortlisting).
# NOTE: truncate_embeddings uses basic slicing, so dim_128_emb is a view that
# shares memory with dim_768_emb — in-place writes to either affect both.
dim_128_emb = truncate_embeddings(dim_768_emb,128)
dim_128_emb.shape

(3000, 128)

###  normalizing the vectors

In [9]:
def _l2_normalize_rows(emb):
    """Return a NEW array in which every row of ``emb`` has unit L2 norm.

    Norms are computed in float32 (float16 accumulation loses precision) and
    the result is cast back to ``emb``'s dtype. All-zero rows are returned
    unchanged instead of becoming NaN.
    """
    norms = np.linalg.norm(emb.astype(np.float32), axis=1, keepdims=True)
    norms[norms == 0] = 1.0
    return (emb / norms).astype(emb.dtype)


# dim_512_emb and dim_128_emb are slices (views) of dim_768_emb. Normalising
# them in place would rescale the shared leading columns several times and
# leave the wider arrays no longer unit-norm, so each array is replaced by an
# independent normalised copy instead. Every row of each array is processed,
# regardless of len(dataset_arr).
dim_768_emb = _l2_normalize_rows(dim_768_emb)
dim_512_emb = _l2_normalize_rows(dim_512_emb)
dim_128_emb = _l2_normalize_rows(dim_128_emb)

### converting float => binary 

In [10]:
def modified_signum(x):
    """Binarise an array: 1 where ``x > 0``, 0 everywhere else (including 0 and NaN).

    Returns an array of the same shape as ``x`` with dtype ``np.uint8``.
    """
    is_positive = x > 0
    return is_positive.astype(np.uint8)

In [11]:
# Binarise the 128-d embeddings (1 where > 0, else 0) and pack 8 bits per byte.
# np.packbits is applied to the flattened bit array and then reshaped to one
# row per document; this lines up per row only because the bit width (128)
# is a multiple of 8.
pack_bin_dim_128_emb = np.packbits(
    modified_signum(
        dim_128_emb)).reshape(dim_128_emb.shape[0], -1)
# Display the packed shape and the first packed bytes of document 0.
pack_bin_dim_128_emb.shape,pack_bin_dim_128_emb[0][:20]

((3000, 16),
 array([221,  94,  50, 182, 174, 232,  11, 205, 146, 241, 173, 125,  74,
        217, 107, 189], dtype=uint8))

### persistent storage of binary vectors in faiss (these binary vectors are used for shortlisting)

In [12]:
# Flat binary FAISS index over 128-bit codes, i.e. the 16 packed bytes per
# row of pack_bin_dim_128_emb.
# NOTE(review): 128 is hard-coded here; it has to stay equal to shortlisting_dim.
binary_index = faiss.IndexBinaryFlat(128)
print(binary_index.is_trained)
# Add every packed document code, then report how many vectors are stored.
binary_index.add(pack_bin_dim_128_emb)
print(binary_index.ntotal)
# Persist the index to disk so it can be reloaded with faiss.read_index_binary.
faiss.write_index_binary(binary_index,"pack_bin_dim_128_emb.faiss")

True
3000


In [16]:
# Reload the persisted binary index from disk.
binary_index = faiss.read_index_binary('pack_bin_dim_128_emb.faiss')

### testing out binary vectors

In [17]:
test_query = [
    "search_query: Avatar"
]

# Embed the queries, keep the shortlisting width, and L2-normalise row by row.
test_dim_128_query = truncate_embeddings(get_embeddings(test_query), shortlisting_dim)
for row in range(len(test_query)):
    test_dim_128_query[row] = test_dim_128_query[row]/np.linalg.norm(test_dim_128_query[row])

# Binarise and pack the sign bits: one row of packed bytes per query.
test_pack_bin_dim_128_query = np.packbits(
    modified_signum(test_dim_128_query) > 0
).reshape(len(test_query), -1)
print(test_pack_bin_dim_128_query.shape)

# Nearest-neighbour search in the binary index.
test_k = 10
test_D, test_I = binary_index.search(test_pack_bin_dim_128_query, test_k)

# Print every hit (document text + distance) for each query.
for query_text, hit_ids, hit_distances in zip(test_query, test_I, test_D):
    print(f'query: {query_text}')
    for doc_id, doc_distance in zip(hit_ids, hit_distances):
        print(
            f'result: {dataset_arr[int(doc_id)]}\ndistance: {int(doc_distance)}')
        print()
    print("------ ------")

(1, 16)
query: search_query: Avatar
result: search_document: Avatar 5 . Action, Adventure, Drama . Sequel of Avatar 4 (2026) and last movie of the "Avatar" saga The plot is unknown
distance: 23

result: search_document: Sword Art Online: Progressive - Aria of a Starless Night . Animation, Action, Adventure . High school student Asuna struggles to survive with a young swordsman after its revealed that she is trapped inside the game of Sword Art Online, where if your HP drops to zero, your brain will be destroyed in real life
distance: 23

result: search_document: Batman: Gotham by Gaslight . Animation, Action, Adventure . In an alternative Victorian Age Gotham City, Batman begins his war on crime while he investigates a new series of murders by Jack the Ripper
distance: 24

result: search_document: Samaritan . Action, Drama, Fantasy . A young boy learns that a superhero who was thought to have died after an epic battle twenty-five years ago may in fact still be alive
distance: 25

resul

### persistent storage of binary vectors using numpy built-in methods (these binary vectors are used for reranking)

In [19]:
# Binarise the 512-d embeddings (1 where > 0, else 0) ...
bin_dim_512_emb = modified_signum(dim_512_emb)
# ... and pack 8 bits per byte, reshaped to one row per document.
# The flatten-then-reshape lines up per row only because 512 is a multiple of 8.
pack_bin_dim_512_emb = np.packbits(
    bin_dim_512_emb
).reshape(dim_512_emb.shape[0], -1)
# Display the packed shape and the first 20 packed bytes of document 0.
pack_bin_dim_512_emb.shape, pack_bin_dim_512_emb[0][:20]

((3000, 64),
 array([221,  94,  50, 182, 174, 232,  11, 205, 146, 241, 173, 125,  74,
        217, 107, 189, 181,  24,  39, 231], dtype=uint8))

In [20]:
# Persist the packed 512-bit document codes used by the reranking stage.
np.save('pack_bin_dim_512_emb.npy',pack_bin_dim_512_emb)


In [21]:
# Reload the packed 512-bit document codes from disk.
pack_bin_dim_512_emb = np.load('pack_bin_dim_512_emb.npy')

In [22]:
def convert_query_to_float(query, dim=reranking_dim):
    """Embed a single query string as a truncated float vector.

    Parameters
    ----------
    query : str
        Query text; it is wrapped in a one-element list before embedding.
    dim : int
        Number of leading embedding dimensions to keep.

    Returns
    -------
    (float_emb, norm_float_emb)
        ``float_emb`` is the truncated embedding, shape ``(1, dim)``;
        ``norm_float_emb`` is the same array divided by its norm.
    """
    float_emb = truncate_embeddings(get_embeddings([query]), dim)
    scale = np.linalg.norm(float_emb)
    norm_float_emb = float_emb / scale
    return float_emb, norm_float_emb


def convert_float_to_binary(float_emb, dim=shortlisting_dim):
    """Convert float embeddings into packed binary embeddings.

    Parameters
    ----------
    float_emb : np.ndarray
        Either a single vector of shape ``(n,)`` or a batch of shape
        ``(rows, n)``.
    dim : int
        Number of leading dimensions to keep before binarising.

    Returns
    -------
    (bin_emb, pack_bin_emb)
        ``bin_emb`` holds the 0/1 sign bits, shape ``(rows, dim)`` (``rows``
        is 1 for a 1-D input). ``pack_bin_emb`` holds the same bits packed
        8 per byte, one row per input row: ``(rows, ceil(dim / 8))``.

    Notes
    -----
    Only the sign of each component matters, and scaling by a positive norm
    does not change signs, so no normalisation is performed. This also
    avoids dividing by zero for an all-zero vector.
    """
    if float_emb.ndim == 1:
        s = float_emb[:dim].reshape(1, -1)  # (1,shortlisting_dim)
    else:
        s = truncate_embeddings(float_emb, dim)  # (rows,shortlisting_dim)

    bin_emb = modified_signum(s)
    # Pack along the last axis so every row is packed independently; packing
    # the flattened array and reshaping to (1, -1) would merge all rows of a
    # batch into one long code.
    pack_bin_emb = np.packbits(bin_emb, axis=-1)
    return bin_emb, pack_bin_emb




In [23]:
def simplify_distance_and_index(distance, index):
    """Turn the search output for ONE query into ``(text, distance, index)`` tuples.

    Parameters
    ----------
    distance, index : array-like
        Distances and document ids for a single query, e.g. arrays of shape
        ``(1, k)`` or ``(k,)``.

    Returns
    -------
    list of tuple
        ``(dataset_arr[idx], dist, idx)`` for every hit, in the order given.
        Hits whose id is negative are skipped: FAISS pads the result with
        label ``-1`` when fewer than ``k`` neighbours exist, and ``-1`` would
        otherwise silently select the last document.
    """
    # ravel() rather than squeeze(): squeeze() collapses a (1, 1) result
    # (top_s == 1) into a 0-d array, which cannot be iterated.
    distance = np.asarray(distance).ravel()
    index = np.asarray(index).ravel()
    return [(dataset_arr[idx], dist, idx)
            for dist, idx in zip(distance, index)
            if idx >= 0]


In [30]:
def shortlisting(pack_bin_emb, top_s):
    """Search ``binary_index`` for the ``top_s`` nearest codes to ``pack_bin_emb``.

    The raw ``(distance, index)`` pair returned by the index is handed to
    ``simplify_distance_and_index`` and its result is returned.
    """
    search_output = binary_index.search(pack_bin_emb, top_s)
    return simplify_distance_and_index(*search_output)


def reranking(shortlisting_result, norm_float_emb, top_r):
    """Re-score shortlisted documents against the float query and keep the best.

    Parameters
    ----------
    shortlisting_result : list of (text, distance, index)
        Candidates, where ``index`` is the document's row in
        ``pack_bin_dim_512_emb``.
    norm_float_emb : np.ndarray
        Normalised float query embedding, shape ``(1, reranking_dim)``.
    top_r : int
        Number of results to return.

    Returns
    -------
    list of (score, text)
        The ``top_r`` highest-scoring candidates, best first; ties keep their
        shortlist order. ``score`` is the dot product of the query with the
        document's unpacked 0/1 bit vector, rounded to 3 decimals.
    """
    scored = []
    for text, _distance, index in shortlisting_result:
        bin_emb = np.unpackbits(
            pack_bin_dim_512_emb[index]
        ).reshape(1, -1)  # (1,reranking_dim)
        # Score against the query passed in as an argument, not the
        # notebook-level `norm_float_emb_query` global.
        score = round(float(
            np.matmul(norm_float_emb, bin_emb.T).squeeze()
        ), 3)
        scored.append((score, text))
    return heapq.nlargest(top_r, scored, key=lambda pair: pair[0])

In [31]:

# 1) Embed the query: raw and normalised float vectors at the reranking width.
float_emb_query, norm_float_emb_query = convert_query_to_float(
    "search_query: avatar")
# 2) Derive the packed binary code at the shortlisting width from the raw float vector.
_,pack_bin_emb_query = convert_float_to_binary(float_emb_query)
# 3) Shortlist the 50 nearest documents via the binary index.
shortlisting_result = shortlisting(
    pack_bin_emb_query, 50
)
# 4) Rerank the shortlist with the normalised float query and keep the top 10.
reranking_result = reranking(shortlisting_result, norm_float_emb_query, 10)

In [32]:
# Display the reranked (score, text) pairs, best first.
reranking_result

[(7.41,
  'search_document: Avatar 5 . Action, Adventure, Drama . Sequel of Avatar 4 (2026) and last movie of the "Avatar" saga The plot is unknown'),
 (7.41,
  'search_document: Avatar . Action, Adventure, Fantasy . A paraplegic Marine dispatched to the moon Pandora on a unique mission becomes torn between following his orders and protecting the world he feels is his home'),
 (7.137,
  'search_document: Avatar 4 . Action, Adventure, Fantasy . Sequel of Avatar 3 (2024) The plot is unknown'),
 (7.027,
  'search_document: The Last Airbender . Action, Adventure, Family . Follows the adventures of Aang, a young successor to a long line of Avatars, who must master all four elements and stop the Fire Nation from enslaving the Water Tribes and the Earth Kingdom'),
 (6.996,
  'search_document: Avatar 3 . Action, Adventure, Fantasy . Sequel of Avatar: The Way of Water (2022) The plot is unknown'),
 (6.625,
  'search_document: Sword Art Online: Progressive - Aria of a Starless Night . Animation,

In [33]:
# Display the shortlisted (text, distance, index) tuples for comparison with the reranked list.
shortlisting_result

[('search_document: Avatar 5 . Action, Adventure, Drama . Sequel of Avatar 4 (2026) and last movie of the "Avatar" saga The plot is unknown',
  23,
  2021),
 ('search_document: Sword Art Online: Progressive - Aria of a Starless Night . Animation, Action, Adventure . High school student Asuna struggles to survive with a young swordsman after its revealed that she is trapped inside the game of Sword Art Online, where if your HP drops to zero, your brain will be destroyed in real life',
  23,
  2049),
 ('search_document: Batman: Gotham by Gaslight . Animation, Action, Adventure . In an alternative Victorian Age Gotham City, Batman begins his war on crime while he investigates a new series of murders by Jack the Ripper',
  24,
  2444),
 ('search_document: Samaritan . Action, Drama, Fantasy . A young boy learns that a superhero who was thought to have died after an epic battle twenty-five years ago may in fact still be alive',
  25,
  334),
 ('search_document: Avatar 3 . Action, Adventure, 