### import statements

In [2]:
from  pprint import pprint
import pandas as pd
from llama_cpp import Llama
import numpy as np
import re
import heapq

### defining some variables

In [3]:
data_type = np.float16
shortlisting_dim = 128
reranking_dim = 512

### dataset proocessing

In [4]:

# Read the CSV file into a DataFrame
df = pd.read_csv("D:\\Dataset\\movie_imdb_dataset\\action.csv")

# Extract the columns into separate lists
movie_names = df['movie_name'].tolist()[:3000]
genres = df['genre'].tolist()[:3000]
descriptions = df['description'].tolist()[:3000]

def structure_data(data_lst):
    data_str = ''
 
    for i in range(3):
        if (i == 2):
            data_str += data_lst[i].replace(".", " ").strip()
        else:

            data_str += data_lst[i].replace(".", " ").strip()+" . "
    data_str = re.sub(r'\s+', ' ', data_str).strip()
    data_str = 'search_document: ' + data_str
    return data_str

dataset_arr = []
for entity in zip(movie_names, genres, descriptions):
    entity = list(entity)
    # *** removing the data with no movie name ***
    if (type(entity[0]) != str or len(entity[0]) == 0):
        continue

    # *** converting no description to empty string ***

    if (entity[2] == "Add a Plot" or entity[2] == "Not available at this time." or entity[2] == "The plot is unknown at this time."or entity[2] == "Plot kept under wraps."):
        entity[2] = ""


    dataset_arr.append(structure_data(entity))


### model initalization

In [5]:
model_q4_k_m = Llama("../models/nomic-embed-large-v1.5-Q4_K_M.gguf",
                     embedding=True, verbose=False, use_mmap=True,
                     use_mlock=True)

In [6]:
def get_embeddings(input):
    return np.array(model_q4_k_m.embed(input), dtype=np.float16)


def truncate_embeddings(emb, truncate_dim):
    return emb[:, :truncate_dim]


def cosine_similarity(x, y):
    normalized_x = x/np.linalg.norm(x)
    normalized_y = y/np.linalg.norm(y)
    return np.dot(normalized_x, normalized_y)

In [None]:
dim_768_emb = get_embeddings(dataset_arr)

In [None]:
dim_768_emb.shape,dim_768_emb.dtype

In [None]:
np.save('./dim_768_emb_nomic.npy', dim_768_emb)

In [7]:
dim_768_emb = np.load('./dim_768_emb_nomic.npy')
dim_768_emb.shape,dim_768_emb.dtype

((3000, 768), dtype('float16'))

In [8]:
dim_512_emb = truncate_embeddings(dim_768_emb,512)
dim_512_emb.shape

(3000, 512)

In [9]:
dim_128_emb = truncate_embeddings(dim_768_emb,128)
dim_128_emb.shape

(3000, 128)

###  normalizing the vectors

In [10]:
for i in range(len(dataset_arr)):
    dim_768_emb[i, :] = dim_768_emb[i, :]/np.linalg.norm(dim_768_emb[i, :])
    dim_512_emb[i, :] = dim_512_emb[i, :]/np.linalg.norm(dim_512_emb[i, :])
    dim_128_emb[i, :] = dim_128_emb[i, :]/np.linalg.norm(dim_128_emb[i, :])

### converting float => binary (uint8)

In [11]:
def modified_signum(x):
    output = np.zeros_like(x)
    output[x <= 0] = 0
    output[x > 0] = 1
    return output.astype(np.uint8)

storing packed binary 128 dimensional vectors

In [12]:
bin_dim_128_emb =  modified_signum(
        dim_128_emb)
pack_bin_dim_128_emb = np.packbits(
    bin_dim_128_emb).reshape(dim_128_emb.shape[0], -1)
pack_bin_dim_128_emb.shape,pack_bin_dim_128_emb[0][:20]

In [13]:
np.save('pack_bin_dim_128_emb.npy',pack_bin_dim_128_emb)


In [14]:
pack_bin_dim_128_emb = np.load("pack_bin_dim_128_emb.npy")

In [15]:
pack_bin_dim_128_emb.shape

(3000, 16)

In [16]:
bin_dim_128_emb = np.unpackbits(pack_bin_dim_128_emb).reshape(pack_bin_dim_128_emb.shape[0],-1)

In [17]:
bin_dim_128_emb.shape

(3000, 128)

storing packed binary 512 dimensional vectors

In [18]:
bin_dim_512_emb =  modified_signum(
        dim_512_emb)
pack_bin_dim_512_emb = np.packbits(
    bin_dim_512_emb).reshape(dim_512_emb.shape[0], -1)
pack_bin_dim_512_emb.shape,pack_bin_dim_512_emb[0][:20]

In [19]:
np.save('pack_bin_dim_512_emb.npy',pack_bin_dim_512_emb)


In [20]:
pack_bin_dim_512_emb = np.load('pack_bin_dim_512_emb.npy')

In [21]:
pack_bin_dim_512_emb.shape

(3000, 64)

### testing out binary vector wihout  faiss

In [22]:


def hamming_distance(bin_vec1, bin_vec2):
    bin_vec1, bin_vec2 = bin_vec1.squeeze(), bin_vec2.squeeze()


    h_dist = 0

    len_vec = bin_vec1.shape[0]

    for i in range(len_vec):

        if bin_vec1[i] != bin_vec2[i]:


            h_dist += 1

    return h_dist

In [23]:
exp_query = [
    "search_query: Avengers"

]
exp_dim_128_query = truncate_embeddings(
    get_embeddings(exp_query), shortlisting_dim)
for i in range(len(exp_query)):
    exp_dim_128_query[i] = exp_dim_128_query[i] / \
        np.linalg.norm(exp_dim_128_query[i])

exp_bin_dim_128_query = modified_signum(exp_dim_128_query).squeeze()
exp_pack_bin_dim_128_query = np.packbits(
    exp_bin_dim_128_query
)

exp_indexed_score = []
for i, vec in enumerate(bin_dim_128_emb):
    score = hamming_distance(exp_bin_dim_128_query, vec)
    exp_indexed_score.append((score, i))

exp_top_score = heapq.nsmallest(10, exp_indexed_score, key=lambda x: x[0])
print(f'query :{exp_query[0]}')
for score, index in exp_top_score:
    print(f'result: {dataset_arr[index]}')
    print(f'score: {score}')
    print()
    

query :search_query: Avengers
result: search_document: The Avengers . Action, Adventure, Sci-Fi . Two British Agents team up to stop Sir August de Wynter from destroying the world with a weather-changing machine
score: 19

result: search_document: Avengers: The Kang Dynasty . Action, Adventure, Sci-Fi . Plot under wraps
score: 20

result: search_document: Avengers: Secret Wars . Action, Adventure, Sci-Fi . Plot under wraps
score: 21

result: search_document: The Avengers . Action, Sci-Fi . Earth's mightiest heroes must come together and learn to fight as a team if they are going to stop the mischievous Loki and his alien army from enslaving humanity
score: 22

result: search_document: The Omega Man . Action, Drama, Sci-Fi . Biological war has decimated life on Earth Los Angeles is a windswept ghost town where Robert Neville tools his convertible through sunlit streets foraging for supplies
score: 23

result: search_document: Captain America: Civil War . Action, Sci-Fi . Political invol

In [24]:
def convert_query_to_float(query, dim=reranking_dim):
    """
    float embeddings 
    """
    query = [query]
    float_emb = truncate_embeddings(
        get_embeddings(query), dim)  # (1,reranking_dim)
    norm_float_emb = float_emb / \
        np.linalg.norm(float_emb)  # (1,reranking_dim)
    return float_emb, norm_float_emb


def convert_float_to_binary(float_emb, dim=shortlisting_dim):
    """
    packed binary embeddings
    """
    if (float_emb.ndim == 1):
        s = float_emb[:dim]
        s = s.reshape(1,-1)
    else:
        s = truncate_embeddings(
            float_emb, dim)  # (1,shortlisting_dim)
 
    s = s/np.linalg.norm(s)  # (1,shortlisting_dim)
    bin_emb = modified_signum(s)  # (1,shortlisting_dim)

    return bin_emb




In [43]:
def shortlisting(bin_emb, top_s):
    indexed_distance: list = list()
    for i, vec in enumerate(bin_dim_128_emb):

        distance = hamming_distance(bin_emb, vec)
        indexed_distance.append((distance, i))
    shortlisted_distance = heapq.nsmallest(
        top_s, indexed_distance, key=lambda x: x[0])
    shortlisting_result = [(dataset_arr[index], distance, index)
                           for (distance, index) in shortlisted_distance]

    return shortlisting_result


def reranking(shortlisting_result, norm_float_emb, top_r):

    indexed_score: list = list()

    text_arr: list = list()

    for i, (text, distance, index) in enumerate(shortlisting_result):

        bin_emb = np.unpackbits(
            pack_bin_dim_512_emb[index]
        ).reshape(1, -1)  # (1,reranking_dim)

        score = round(float(
            np.matmul(norm_float_emb_query, bin_emb.T).squeeze()
        ), 3)

        indexed_score.append((score, i))

        text_arr.append(text)

    top_r_result = heapq.nlargest(top_r, indexed_score, key=lambda x: x[0])

    reranking_result = [(score, text_arr[index])
                        for (score, index) in top_r_result]

    return reranking_result

In [44]:

float_emb_query, norm_float_emb_query = convert_query_to_float(
    "search_query: fastx")
bin_emb_query = convert_float_to_binary(float_emb_query)
shortlisting_result = shortlisting(
    bin_emb_query, 50
)
reranking_result = reranking(shortlisting_result, norm_float_emb_query, 10)

In [45]:
reranking_result

[(6.348,
  'search_document: Fast X . Action, Crime, Mystery . Dom Toretto and his family are targeted by the vengeful son of drug kingpin Hernan Reyes'),
 (6.254,
  "search_document: F9: The Fast Saga . Action, Crime, Thriller . Dom and the crew must take on an international terrorist who turns out to be Dom and Mia's estranged brother"),
 (5.711,
  "search_document: The Fast and the Furious . Action, Crime, Thriller . Los Angeles police officer Brian O'Conner must decide where his loyalty really lies when he becomes enamored with the street racing world he has been sent undercover to destroy"),
 (5.703,
  'search_document: The Fast and the Furious: Tokyo Drift . Action, Crime, Thriller . A teenager becomes a major competitor in the world of drift racing after moving in with his father in Tokyo to avoid a jail sentence in America'),
 (5.348,
  "search_document: Rush Hour 3 . Action, Comedy, Crime . After an attempted assassination on Ambassador Han, Lee and Carter head to Paris to pro

In [46]:
shortlisting_result

[("search_document: F9: The Fast Saga . Action, Crime, Thriller . Dom and the crew must take on an international terrorist who turns out to be Dom and Mia's estranged brother",
  27,
  48),
 ('search_document: Fast X . Action, Crime, Mystery . Dom Toretto and his family are targeted by the vengeful son of drug kingpin Hernan Reyes',
  28,
  4),
 ("search_document: Iron Mask . Action, Adventure, Fantasy . Early 1700: Cartographer Jonathan Green (Jason Flemyng) from Forbidden Empire (2014) is back to map the Russian Far East He's forced on to China, where he confronts the Dragon Master et al The iron masked Russian Czar escapes the Tower of London to a Russian ship",
  30,
  1456),
 ('search_document: Mission: Impossible - Dead Reckoning - Part Two . Action, Adventure, Thriller . The 8th entry in the long running Mission Impossible franchise',
  32,
  1580),
 ('search_document: Malikappuram . Action, Drama . An intense desire of Kallu an 8 year old girl from Panchalimedu village to visit