<a href="https://colab.research.google.com/github/ohmp/movie_recommedation/blob/main/collaborative-filtering/collaborative-filtering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -q qdrant-client python-dotenv

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/337.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━[0m [32m286.7/337.3 kB[0m [31m9.4 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m337.3/337.3 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [21]:
import os
import pandas as pd
import requests
from IPython.display import display, HTML
from qdrant_client import models,QdrantClient
from qdrant_client.http.models import PointStruct, SparseVector, NamedSparseVector
from collections import defaultdict
from dotenv import load_dotenv
from google.colab import userdata
load_dotenv()

# OMDB API Key
omdb_api_key = userdata.get("OMDB_API_KEY")

# Collection name
collection_name = "movies"

# Set Qdrant Client
qdrant_client = QdrantClient(
    userdata.get("QDRANT_HOST"),
    api_key=userdata.get("QDRANT_API_KEY")
)

In [22]:
# Function to get movie poster using OMDB API
def get_movie_poster(imdb_id, api_key):
    url = f"https://www.omdbapi.com/?i={imdb_id}&apikey={api_key}"
    response = requests.get(url)
    if response.status_code == 200:
        data = response.json()
        return data.get('Poster', 'No Poster Found'), data
    return 'No Poster Found'

## Preparing the data

For experimental purposes, the dataset used in this example was [Movielens](https://files.grouplens.org/datasets/movielens/ml-latest.zip), with approximately 33,000,000 ratings and 86,000 movies.

But you can reproduce it with a smaller dataset if you wish; below are two alternatives:
- [Movielens Small](https://files.grouplens.org/datasets/movielens/ml-latest-small.zip)
- [The Movies Dataset from Kaggle](https://www.kaggle.com/datasets/rounakbanik/the-movies-dataset/)

In [23]:
# Load CSV files
ratings_df = pd.read_csv('data/ratings.csv', low_memory=False)
ratings_df.head()

# Normalize ratings
#ratings_df['rating'] = (ratings_df['rating'] - ratings_df['rating'].mean()) / ratings_df['rating'].std()



Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [24]:
ratings_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  int64  
 1   movieId    100836 non-null  int64  
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB


In [56]:
ratings_df.describe()

Unnamed: 0,userId,rating,timestamp
count,100836.0,100836.0,100836.0
mean,326.127564,2.12523e-16,1205946000.0
std,182.618491,1.0,216261000.0
min,1.0,-2.879111,828124600.0
25%,177.0,-0.4810963,1019124000.0
50%,325.0,-0.001493468,1186087000.0
75%,477.0,0.4781094,1435994000.0
max,610.0,1.437315,1537799000.0


In [26]:
links = pd.read_csv('data/links.csv')
links.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [27]:
links.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   movieId  9742 non-null   int64  
 1   imdbId   9742 non-null   int64  
 2   tmdbId   9734 non-null   float64
dtypes: float64(1), int64(2)
memory usage: 228.5 KB


In [28]:
# Add step to convert imdbId to tt format with leading zeros
links['imdbId'] = 'tt' + links['imdbId'].astype(str).str.zfill(7)

In [29]:
links.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,tt0114709,862.0
1,2,tt0113497,8844.0
2,3,tt0113228,15602.0
3,4,tt0114885,31357.0
4,5,tt0113041,11862.0


In [30]:
# Load CSV files
movies_df = pd.read_csv('data/movies.csv', low_memory=False)

movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [31]:
movies_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  9742 non-null   int64 
 1   title    9742 non-null   object
 2   genres   9742 non-null   object
dtypes: int64(1), object(2)
memory usage: 228.5+ KB


In [32]:

# Convert movieId in ratings_df and movies_df to string
ratings_df['movieId'] = ratings_df['movieId'].astype(str)
movies_df['movieId'] = movies_df['movieId'].astype(str)

# Normalize ratings
ratings_df['rating'] = (ratings_df['rating'] - ratings_df['rating'].mean()) / ratings_df['rating'].std()

# Merge ratings with movie metadata to get movie titles
merged_df = ratings_df.merge(movies_df[['movieId', 'title']], left_on='movieId', right_on='movieId', how='inner')

# Aggregate ratings to handle duplicate (userId, title) pairs
ratings_agg_df = merged_df.groupby(['userId', 'movieId']).rating.mean().reset_index()

In [33]:
ratings_agg_df.count()

Unnamed: 0,0
userId,100836
movieId,100836
rating,100836


In [34]:
ratings_df.count()

Unnamed: 0,0
userId,100836
movieId,100836
rating,100836
timestamp,100836


In [17]:
movies_df.count()

Unnamed: 0,0
movieId,9742
title,9742
genres,9742


In [18]:
merged_df.head()

Unnamed: 0,userId,movieId,rating,timestamp,title
0,1,1,0.478109,964982703,Toy Story (1995)
1,1,3,0.478109,964981247,Grumpier Old Men (1995)
2,1,6,0.478109,964982224,Heat (1995)
3,1,47,1.437315,964983815,Seven (a.k.a. Se7en) (1995)
4,1,50,1.437315,964982931,"Usual Suspects, The (1995)"


In [19]:
ratings_agg_df.head()

Unnamed: 0,userId,movieId,rating
0,1,1,0.478109
1,1,1009,-0.481096
2,1,101,1.437315
3,1,1023,1.437315
4,1,1024,1.437315


## Create a new Qdrant collection and send the data

In [35]:
qdrant_client.get_collections()

CollectionsResponse(collections=[CollectionDescription(name='movies')])

In [47]:
# Create a new Qdrant collection
# Check if collection exists, delete if it does
if qdrant_client.collection_exists(collection_name):
    qdrant_client.delete_collection(collection_name=collection_name)

qdrant_client.create_collection(
    collection_name=collection_name,
    vectors_config={},
    sparse_vectors_config={
        "ratings": models.SparseVectorParams()
    }
)

True

In [49]:
# Convert ratings to sparse vectors
user_sparse_vectors = defaultdict(lambda: {"values": [], "indices": []})
#i = 0
for row in ratings_agg_df.itertuples():
    #if i > 5:
      #break
    #row_dict = row._asdict()
    #print(row)
    #print(row_dict)
    #i += 1
    user_sparse_vectors[row.userId]["values"].append(row.rating)
    user_sparse_vectors[row.userId]["indices"].append(int(row.movieId))

In [46]:
user_sparse_vectors

defaultdict(<function __main__.<lambda>()>,
            {1: {'values': [0.4781093879268121,
               -0.48109632308148476,
               1.4373150989351091,
               1.4373150989351091,
               1.4373150989351091,
               1.4373150989351091],
              'indices': [1, 1009, 101, 1023, 1024, 1025]}})

In [50]:


# Define a data generator
def data_generator():
    for user_id, sparse_vector in user_sparse_vectors.items():
        yield PointStruct(
            id=user_id,
            vector={"ratings": SparseVector(
                indices=sparse_vector["indices"],
                values=sparse_vector["values"]
            )},
            payload={"user_id": user_id, "movie_id": sparse_vector["indices"]}
        )

# Upload points using the data generator
qdrant_client.upload_points(
    collection_name=collection_name,
    points=data_generator()
)

## Making a recommendation

In [51]:
my_ratings = {
    603: 1,     # Matrix
    13475: 1,   # Star Trek
    11: 1,      # Star Wars
    1091: -1,   # The Thing
    862: 1,     # Toy Story
    597: -1,    # Titanic
    680: -1,    # Pulp Fiction
    13: 1,      # Forrest Gump
    120: 1,     # Lord of the Rings
    87: -1,     # Indiana Jones
    562: -1     # Die Hard
}

In [52]:
# Create sparse vector from my_ratings
def to_vector(ratings):
    vector = SparseVector(
        values=[],
        indices=[]
    )
    for movie_id, rating in ratings.items():
        vector.values.append(rating)
        vector.indices.append(movie_id)
    return vector

In [53]:
# Perform the search
results = qdrant_client.search(
    collection_name=collection_name,
    query_vector=NamedSparseVector(
        name="ratings",
        vector=to_vector(my_ratings)
    ),
    limit=20
)

# Convert results to scores and sort by score
def results_to_scores(results):
    movie_scores = defaultdict(lambda: 0)
    for result in results:
        for movie_id in result.payload["movie_id"]:
            movie_scores[movie_id] += result.score
    return movie_scores

# Convert results to scores and sort by score
movie_scores = results_to_scores(results)
top_movies = sorted(movie_scores.items(), key=lambda x: x[1], reverse=True)

  results = qdrant_client.search(


In [54]:
# Create HTML to display top 5 results
html_content = "<div class='movies-container'>"

for movie_id, score in top_movies[:5]:
    imdb_id_row = links.loc[links['movieId'] == int(movie_id), 'imdbId']
    if not imdb_id_row.empty:
        imdb_id = imdb_id_row.values[0]
        poster_url, movie_info = get_movie_poster(imdb_id, omdb_api_key)
        movie_title = movie_info.get('Title', 'Unknown Title')

        html_content += f"""
        <div class='movie-card'>
            <img src="{poster_url}" alt="Poster" class="movie-poster">
            <div class="movie-title">{movie_title}</div>
            <div class="movie-score">Score: {score}</div>
        </div>
        """
    else:
        continue  # Skip if imdb_id is not found

html_content += "</div>"

display(HTML(html_content))