In [1]:
import tarfile
import numpy as np
import os
from llama_index import (
    VectorStoreIndex,
    get_response_synthesizer,
    ServiceContext,
    Document
)
from llama_index.retrievers import VectorIndexRetriever
from llama_index.query_engine import RetrieverQueryEngine
from llama_index.postprocessor import SimilarityPostprocessor
from llama_index import StorageContext, load_index_from_storage

# Load main model

In [2]:
# build index
service_context = ServiceContext.from_defaults(llm=None, embed_model="local:BAAI/bge-large-en-v1.5")

storage_context = StorageContext.from_defaults(persist_dir="film_summaries_index_dump")

# load index
index = load_index_from_storage(storage_context, service_context=service_context,
                                        show_progress = True)

# configure retriever
retriever = VectorIndexRetriever(
    index=index,
    similarity_top_k=100000,
)

# configure response synthesizer
response_synthesizer = get_response_synthesizer(service_context=service_context,
                                                response_mode="compact")

# assemble query engine
query_engine = RetrieverQueryEngine(
    retriever=retriever,
    response_synthesizer=response_synthesizer,
    node_postprocessors=[SimilarityPostprocessor(similarity_cutoff=0.0)],
)

LLM is explicitly disabled. Using MockLLM.


Demo:

In [3]:
query = (
    "A dystopian film where teenagers between the ages of 12 and 18 from twelve districts are selected by lottery to participate "
    "in a televised battle (games) to the death, set in a future, post-apocalyptic society. There was smth about mockingbird"
)
response = query_engine.retrieve(query)
matches = [(node.metadata, node.text) for node in response]

In [3]:
import numpy as np
import pandas as pd

from tqdm import tqdm

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import pairwise_distances

from lightfm.data import Dataset

from math import sqrt

import scipy.sparse as sp
from scipy.sparse.linalg import svds
from scipy.sparse import csr_matrix
import scipy 

movies_ratings = pd.read_csv('../metrics-recsys/long_ratings.csv')
movies_ratings



Unnamed: 0,Wikipedia_movie_ID,userId,rating
0,975900,251,2.0
1,975900,741,2.5
2,975900,1888,3.5
3,975900,1932,1.5
4,975900,2547,2.5
...,...,...,...
7225385,34980460,143508,3.5
7225386,34980460,154703,3.0
7225387,34980460,172224,3.5
7225388,34980460,201236,3.5


In [4]:
dataset = Dataset()
dataset.fit(users=movies_ratings['userId'].unique(), items=movies_ratings['Wikipedia_movie_ID'].unique())

In [5]:
lightfm_mapping = dataset.mapping()
lightfm_mapping = {
    "users_mapping": lightfm_mapping[0],
    "user_features_mapping": lightfm_mapping[1],
    "items_mapping": lightfm_mapping[2],
    "item_features_mapping": lightfm_mapping[3],
}
lightfm_mapping["users_inv_mapping"] = {v: k for k, v in lightfm_mapping["users_mapping"].items()}
lightfm_mapping["items_inv_mapping"] = {v: k for k, v in lightfm_mapping["items_mapping"].items()}

In [6]:
movies_ratings_train = pd.read_csv('../metrics-recsys/train_test/train_matrix.csv')
movies_ratings_test = pd.read_csv('../metrics-recsys/train_test/test_matrix.csv')

In [7]:
movies_ratings_train[movies_ratings_train['rating'] >= 4]

Unnamed: 0,Wikipedia_movie_ID,userId,rating
1,99463,147103,4.0
6,30327,239701,4.0
7,76361,201800,4.5
8,301574,124931,4.0
10,286893,162716,4.5
...,...,...,...
5780299,8514531,43052,4.0
5780301,25920477,125294,4.0
5780305,75264,249169,4.5
5780306,535971,9577,4.0


In [8]:
users_unique = movies_ratings_train.userId.unique()

In [9]:
def extract_k_best_ratings(users_unique, k):
    #matr_ratings_user = np.zeros([users_unique.shape[0], k], dtype='int')
    lst = []
    for index, user in enumerate(users_unique):
        dct = {}
        ratings_user = movies_ratings_train[movies_ratings_train['userId'] == user].sort_values(by='rating', ascending=False)['Wikipedia_movie_ID'].head(k)
        dct[user] = ratings_user.values
        lst.append(dct)
    return lst

In [10]:
index_best_ratings = extract_k_best_ratings(users_unique, k=4)

In [11]:
index_best_ratings

[{240727: array([5441262,   30006, 2406029,  817057], dtype=int64)},
 {147103: array([471352, 924771,  29782, 470185], dtype=int64)},
 {107720: array([101954,  43452,  99031, 142456], dtype=int64)},
 {47514: array([872386,  68485, 725459,  74830], dtype=int64)},
 {138516: array([ 598346,   92605,   54173, 1349086], dtype=int64)},
 {107487: array([1645598, 6322029,  146236,  261107], dtype=int64)},
 {239701: array([  44122,  689969,  297111, 1533664], dtype=int64)},
 {201800: array([ 239086,  133462, 1883157,   66870], dtype=int64)},
 {124931: array([4836350,   57585,   42856,  195246], dtype=int64)},
 {18206: array([  99031,  349941,  106328, 1653423], dtype=int64)},
 {162716: array([  30625,   89631,   81503, 4186631], dtype=int64)},
 {142984: array([ 167724,   10487, 9796711, 1150634], dtype=int64)},
 {157421: array([    4560,  2000074,    43566, 24319139], dtype=int64)},
 {171270: array([   8994, 2466773,   42159, 1586160], dtype=int64)},
 {67420: array([ 612052, 1458752,   57820,  

In [12]:
file_path = '../metrics-recsys/plot_summaries.txt'
movies = pd.read_csv(file_path, sep="\t", header=None)
movies.columns = ["Wikipedia movie ID", "movie_summary"]

In [13]:
movies

Unnamed: 0,Wikipedia movie ID,movie_summary
0,23890098,"Shlykov, a hard-working taxi driver and Lyosha..."
1,31186339,The nation of Panem consists of a wealthy Capi...
2,20663735,Poovalli Induchoodan is sentenced for six yea...
3,2231378,"The Lemon Drop Kid , a New York City swindler,..."
4,595909,Seventh-day Adventist Church pastor Michael Ch...
...,...,...
42298,34808485,"The story is about Reema , a young Muslim scho..."
42299,1096473,"In 1928 Hollywood, director Leo Andreyev look..."
42300,35102018,American Luthier focuses on Randy Parsons’ tra...
42301,8628195,"Abdur Rehman Khan , a middle-aged dry fruit se..."


In [14]:
def make_prompts(index_best_ratings):
    prompts_lst = []
    for value in index_best_ratings:
        user, indexes = list(value.items())[0]
        dct = {}
        mask = movies['Wikipedia movie ID'].isin(indexes)
        result = movies[mask]['movie_summary']
        if result.shape[0] < 1:
            raise ValueError
        summaries_lst = result.to_list()
        prompt = "Here are the summaries of some films i like:\n"+"\n".join(summaries_lst)
        dct[user] = prompt
        prompts_lst.append(dct)
    return prompts_lst
                 

In [15]:
prompts_lst = make_prompts(index_best_ratings)

## Getting final results

### 1 batch

In [227]:
from tqdm import tqdm

all_info = []
final_df = pd.DataFrame()

for prompt in tqdm(prompts_lst[:1000]):
    userid, prompt = list(prompt.items())[0]
    query = prompt
    response = query_engine.retrieve(query)
    matches = [(userid, int(node.metadata['FilmID']), node.score)  for node in response]
    matches_df = pd.DataFrame(matches, columns=['userId', 'Wikipedia_movie_ID', 'score']).drop_duplicates(keep='first', subset=['userId', 'Wikipedia_movie_ID'])
    merged_df = movies_ratings_test[movies_ratings_test['userId'] == userid].merge(matches_df, how='inner').drop('rating', axis=1)
    final_df = pd.concat([final_df, merged_df])

100%|████████████████████████████████████████████████████████████████████████████| 1000/1000 [2:03:28<00:00,  7.41s/it]


In [228]:
final_df.to_csv('../metrics-recsys/model_preds_batch1.csv', index=False)

In [18]:
lst = np.arange(1, 10, 1)

### 2 batch

In [22]:
from tqdm import tqdm

all_info = []
final_df = pd.DataFrame()

for prompt in tqdm(prompts_lst[1000:2000]):
    userid, prompt = list(prompt.items())[0]
    query = prompt
    response = query_engine.retrieve(query)
    matches = [(userid, int(node.metadata['FilmID']), node.score)  for node in response]
    matches_df = pd.DataFrame(matches, columns=['userId', 'Wikipedia_movie_ID', 'score']).drop_duplicates(keep='first', subset=['userId', 'Wikipedia_movie_ID'])
    merged_df = movies_ratings_test[movies_ratings_test['userId'] == userid].merge(matches_df, how='inner').drop('rating', axis=1)
    final_df = pd.concat([final_df, merged_df])

100%|████████████████████████████████████████████████████████████████████████████| 1000/1000 [2:33:56<00:00,  9.24s/it]


In [23]:
final_df.to_csv('../metrics-recsys/model_preds_batch2.csv', index=False)

### 3 batch

In [25]:
from tqdm import tqdm

all_info = []
final_df = pd.DataFrame()

for prompt in tqdm(prompts_lst[2000:3000]):
    userid, prompt = list(prompt.items())[0]
    query = prompt
    response = query_engine.retrieve(query)
    matches = [(userid, int(node.metadata['FilmID']), node.score)  for node in response]
    matches_df = pd.DataFrame(matches, columns=['userId', 'Wikipedia_movie_ID', 'score']).drop_duplicates(keep='first', subset=['userId', 'Wikipedia_movie_ID'])
    merged_df = movies_ratings_test[movies_ratings_test['userId'] == userid].merge(matches_df, how='inner').drop('rating', axis=1)
    final_df = pd.concat([final_df, merged_df])

100%|████████████████████████████████████████████████████████████████████████████| 1000/1000 [2:09:53<00:00,  7.79s/it]


In [26]:
final_df.to_csv('../metrics-recsys/model_preds_batch3.csv', index=False)

### 4 batch

In [27]:
all_info = []
final_df = pd.DataFrame()

for prompt in tqdm(prompts_lst[3000:4000]):
    userid, prompt = list(prompt.items())[0]
    query = prompt
    response = query_engine.retrieve(query)
    matches = [(userid, int(node.metadata['FilmID']), node.score)  for node in response]
    matches_df = pd.DataFrame(matches, columns=['userId', 'Wikipedia_movie_ID', 'score']).drop_duplicates(keep='first', subset=['userId', 'Wikipedia_movie_ID'])
    merged_df = movies_ratings_test[movies_ratings_test['userId'] == userid].merge(matches_df, how='inner').drop('rating', axis=1)
    final_df = pd.concat([final_df, merged_df])

final_df.to_csv('../metrics-recsys/model_preds_batch4.csv', index=False)

100%|████████████████████████████████████████████████████████████████████████████| 1000/1000 [2:04:52<00:00,  7.49s/it]


### 5 batch

In [28]:
all_info = []
final_df = pd.DataFrame()

for prompt in tqdm(prompts_lst[4000:5000]):
    userid, prompt = list(prompt.items())[0]
    query = prompt
    response = query_engine.retrieve(query)
    matches = [(userid, int(node.metadata['FilmID']), node.score)  for node in response]
    matches_df = pd.DataFrame(matches, columns=['userId', 'Wikipedia_movie_ID', 'score']).drop_duplicates(keep='first', subset=['userId', 'Wikipedia_movie_ID'])
    merged_df = movies_ratings_test[movies_ratings_test['userId'] == userid].merge(matches_df, how='inner').drop('rating', axis=1)
    final_df = pd.concat([final_df, merged_df])

final_df.to_csv('../metrics-recsys/model_preds_batch5.csv', index=False)

100%|████████████████████████████████████████████████████████████████████████████| 1000/1000 [2:05:58<00:00,  7.56s/it]


### 6 batch

In [16]:
all_info = []
final_df = pd.DataFrame()

for prompt in tqdm(prompts_lst[5000:6000]):
    userid, prompt = list(prompt.items())[0]
    query = prompt
    response = query_engine.retrieve(query)
    matches = [(userid, int(node.metadata['FilmID']), node.score)  for node in response]
    matches_df = pd.DataFrame(matches, columns=['userId', 'Wikipedia_movie_ID', 'score']).drop_duplicates(keep='first', subset=['userId', 'Wikipedia_movie_ID'])
    merged_df = movies_ratings_test[movies_ratings_test['userId'] == userid].merge(matches_df, how='inner').drop('rating', axis=1)
    final_df = pd.concat([final_df, merged_df])

final_df.to_csv('../metrics-recsys/model_preds_batch6.csv', index=False)

100%|████████████████████████████████████████████████████████████████████████████| 1000/1000 [2:08:14<00:00,  7.69s/it]


### 7 batch

In [18]:
all_info = []
final_df = pd.DataFrame()

for prompt in tqdm(prompts_lst[6000:7000]):
    userid, prompt = list(prompt.items())[0]
    query = prompt
    response = query_engine.retrieve(query)
    matches = [(userid, int(node.metadata['FilmID']), node.score)  for node in response]
    matches_df = pd.DataFrame(matches, columns=['userId', 'Wikipedia_movie_ID', 'score']).drop_duplicates(keep='first', subset=['userId', 'Wikipedia_movie_ID'])
    merged_df = movies_ratings_test[movies_ratings_test['userId'] == userid].merge(matches_df, how='inner').drop('rating', axis=1)
    final_df = pd.concat([final_df, merged_df])

final_df.to_csv('../metrics-recsys/model_preds_batch7.csv', index=False)

100%|████████████████████████████████████████████████████████████████████████████| 1000/1000 [2:08:38<00:00,  7.72s/it]


### 8 batch

In [17]:
all_info = []
final_df = pd.DataFrame()

for prompt in tqdm(prompts_lst[7000:8000]):
    userid, prompt = list(prompt.items())[0]
    query = prompt
    response = query_engine.retrieve(query)
    matches = [(userid, int(node.metadata['FilmID']), node.score)  for node in response]
    matches_df = pd.DataFrame(matches, columns=['userId', 'Wikipedia_movie_ID', 'score']).drop_duplicates(keep='first', subset=['userId', 'Wikipedia_movie_ID'])
    merged_df = movies_ratings_test[movies_ratings_test['userId'] == userid].merge(matches_df, how='inner').drop('rating', axis=1)
    final_df = pd.concat([final_df, merged_df])

final_df.to_csv('../metrics-recsys/model_preds_batch8.csv', index=False)

100%|████████████████████████████████████████████████████████████████████████████| 1000/1000 [2:04:09<00:00,  7.45s/it]


### 9 batch

In [18]:
all_info = []
final_df = pd.DataFrame()

for prompt in tqdm(prompts_lst[8000:9000]):
    userid, prompt = list(prompt.items())[0]
    query = prompt
    response = query_engine.retrieve(query)
    matches = [(userid, int(node.metadata['FilmID']), node.score)  for node in response]
    matches_df = pd.DataFrame(matches, columns=['userId', 'Wikipedia_movie_ID', 'score']).drop_duplicates(keep='first', subset=['userId', 'Wikipedia_movie_ID'])
    merged_df = movies_ratings_test[movies_ratings_test['userId'] == userid].merge(matches_df, how='inner').drop('rating', axis=1)
    final_df = pd.concat([final_df, merged_df])

final_df.to_csv('../metrics-recsys/model_preds_batch9.csv', index=False)

100%|████████████████████████████████████████████████████████████████████████████| 1000/1000 [2:05:07<00:00,  7.51s/it]


### 10 batch

In [19]:
all_info = []
final_df = pd.DataFrame()

for prompt in tqdm(prompts_lst[9000:10000]):
    userid, prompt = list(prompt.items())[0]
    query = prompt
    response = query_engine.retrieve(query)
    matches = [(userid, int(node.metadata['FilmID']), node.score)  for node in response]
    matches_df = pd.DataFrame(matches, columns=['userId', 'Wikipedia_movie_ID', 'score']).drop_duplicates(keep='first', subset=['userId', 'Wikipedia_movie_ID'])
    merged_df = movies_ratings_test[movies_ratings_test['userId'] == userid].merge(matches_df, how='inner').drop('rating', axis=1)
    final_df = pd.concat([final_df, merged_df])

final_df.to_csv('../metrics-recsys/model_preds_batch10.csv', index=False)

100%|████████████████████████████████████████████████████████████████████████████████| 452/452 [56:48<00:00,  7.54s/it]
