In [1]:
import ast

import numpy as np
import pandas as pd

from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the sample catalogue; the path is relative to the notebook's directory.
df = pd.read_csv("../data/sample_titles.csv")

# The genres column is read from the CSV as text; literal_eval turns each
# cell into the Python object it spells out (a list of genre names in the
# preview below) without executing arbitrary code.
df["genres"] = df["genres"].apply(ast.literal_eval)

df.head()


Unnamed: 0,title,genres,popularity,overview
0,The Matrix,"[Action, Sci-Fi]",9.8,A computer hacker learns about the true nature...
1,Inception,"[Action, Sci-Fi, Thriller]",9.7,A skilled thief leads a team into people's dre...
2,Blade Runner,"[Sci-Fi, Drama]",8.9,A blade runner must pursue and terminate repli...
3,Arrival,"[Sci-Fi, Drama]",8.6,A linguist works with the military to communic...
4,Her,"[Romance, Sci-Fi, Drama]",8.4,A lonely writer develops an unlikely relations...


In [5]:
# Sentence-embedding model used to vectorise each title's overview text.
model = SentenceTransformer("all-MiniLM-L6-v2")

# Missing overviews are replaced by empty strings so every row is encoded.
overview_texts = df["overview"].fillna("").tolist()
embeddings = model.encode(overview_texts, show_progress_bar=True)

# list() splits the encoder output into one entry per row, so each cell of
# the new column holds that row's own vector.
df["embedding"] = list(embeddings)


Batches: 100%|██████████| 1/1 [00:00<00:00, 18.56it/s]


In [6]:
# Sanity check: shape of the first row's embedding vector.
np.shape(df["embedding"].iloc[0])


(384,)

In [7]:
# Sanity check: stacking the per-row vectors should give one matrix row per title.
np.vstack(df["embedding"].to_numpy()).shape


(6, 384)

In [8]:
def semantic_similarity(df, title):
    """Cosine similarity between one title's embedding and every row's embedding.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``title`` column and an ``embedding`` column holding
        one vector per row.
    title : str
        Title to use as the query. If several rows share it, the first one
        (in row order) is used.

    Returns
    -------
    numpy.ndarray
        1-D array with one similarity value per row of ``df``, in row order.
        The query row itself is included.

    Raises
    ------
    IndexError
        If no row of ``df`` has the requested title.
    """
    is_query = (df["title"] == title).to_numpy()
    if not is_query.any():
        raise IndexError(f"title {title!r} not found in df['title']")

    # Locate the query by position rather than by index label, so the lookup
    # still yields a single vector when index labels are repeated.
    query_pos = int(is_query.argmax())

    all_embeddings = np.vstack(df["embedding"].values)

    # A one-row slice keeps the 2-D shape that cosine_similarity requires.
    query_embedding = all_embeddings[query_pos:query_pos + 1]

    return cosine_similarity(query_embedding, all_embeddings)[0]


In [9]:
def genre_overlap(df, title):
    """Count how many genres each row shares with the given title.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``title`` column and a ``genres`` column whose cells
        are iterables of hashable genre names.
    title : str
        Title whose genres are the reference set. If several rows share it,
        the first one (in row order) is used.

    Returns
    -------
    pandas.Series
        Integer count of shared genres per row, carrying ``df``'s index.
        The query row itself is included (its count equals the number of
        distinct genres it has).

    Raises
    ------
    IndexError
        If no row of ``df`` has the requested title.
    """
    is_query = (df["title"] == title).to_numpy()
    if not is_query.any():
        raise IndexError(f"title {title!r} not found in df['title']")

    # Positional lookup: a label-based .loc would return several rows when
    # index labels are repeated, and the genre lists could not be put in a set.
    query_pos = int(is_query.argmax())
    target_genres = set(df["genres"].iloc[query_pos])

    return df["genres"].apply(
        lambda row_genres: len(target_genres.intersection(row_genres))
    )


In [10]:
def normalize(series):
    """Min-max scale ``series`` so its smallest value maps to 0.

    A small constant (1e-6) is added to the range so the division is defined
    even when every value is identical; as a consequence the largest value
    maps to slightly less than 1, and a constant series maps to all zeros.
    """
    lowest = series.min()
    span = series.max() - lowest + 1e-6
    shifted = series - lowest
    return shifted / span


In [11]:
def hybrid_recommender(df, title, top_n=5,
                       w_sim=0.6, w_genre=0.2, w_pop=0.2):
    """Recommend titles by blending semantic, genre, and popularity signals.

    Each signal is min-max scaled with ``normalize`` and combined as
    ``w_sim * similarity + w_genre * genre_overlap + w_pop * popularity``.

    Parameters
    ----------
    df : pandas.DataFrame
        Catalogue with ``title``, ``genres`` and ``popularity`` columns, plus
        whatever ``semantic_similarity`` needs to score the rows.
    title : str
        Query title. If several rows share it, the first one (in row order)
        is treated as the query.
    top_n : int, default 5
        Maximum number of recommendations to return.
    w_sim, w_genre, w_pop : float
        Weights of the semantic-similarity, genre-overlap and popularity
        terms. They are used as given and are not rescaled to sum to 1.

    Returns
    -------
    pandas.DataFrame
        Up to ``top_n`` rows ordered by descending ``hybrid_score``, with the
        columns ``title``, ``hybrid_score``, ``genres`` and ``popularity``.
        The query row is never included.

    Raises
    ------
    IndexError
        If no row of ``df`` has the requested title.

    Notes
    -----
    ``df`` is left untouched: the score column is added to a copy, so
    repeated calls with different weights do not overwrite each other's
    results in the caller's frame.
    """
    is_query = (df["title"] == title).to_numpy()
    if not is_query.any():
        raise IndexError(f"title {title!r} not found in df['title']")
    query_pos = int(is_query.argmax())

    # semantic_similarity returns a bare array in row order. Label it with
    # df's own index so the arithmetic below lines up row-for-row even when
    # df does not carry a default 0..n-1 index (e.g. after filtering).
    sim = pd.Series(semantic_similarity(df, title), index=df.index)
    genre = genre_overlap(df, title)
    pop = normalize(df["popularity"])

    score = (
        w_sim * normalize(sim) +
        w_genre * normalize(genre) +
        w_pop * pop
    )

    # assign() returns a new frame, so the caller's df is not mutated.
    scored = df.assign(hybrid_score=score)

    # Remove the query row explicitly, by position. Skipping "whatever ranks
    # first" is wrong whenever another title outscores the query, which can
    # happen when the popularity weight dominates.
    candidates = scored.iloc[np.arange(len(scored)) != query_pos]

    return (
        candidates.sort_values("hybrid_score", ascending=False)
                  .head(top_n)[
                      ["title", "hybrid_score", "genres", "popularity"]
                  ]
    )


In [12]:
# Recommendations for "The Matrix" using the default signal weights.
hybrid_recommender(df, title="The Matrix")


Unnamed: 0,title,hybrid_score,genres,popularity
1,Inception,0.501044,"[Action, Sci-Fi, Thriller]",9.7
2,Blade Runner,0.26488,"[Sci-Fi, Drama]",8.9
4,Her,0.242267,"[Romance, Sci-Fi, Drama]",8.4
5,The Godfather,0.2,"[Crime, Drama]",9.9
3,Arrival,0.145496,"[Sci-Fi, Drama]",8.6


In [13]:
# Same query, but weighting semantic similarity more heavily than genre
# overlap and popularity.
hybrid_recommender(
    df,
    title="The Matrix",
    w_sim=0.8,
    w_genre=0.1,
    w_pop=0.1,
)


Unnamed: 0,title,hybrid_score,genres,popularity
1,Inception,0.356948,"[Action, Sci-Fi, Thriller]",9.7
4,Her,0.239689,"[Romance, Sci-Fi, Drama]",8.4
2,Blade Runner,0.214285,"[Sci-Fi, Drama]",8.9
5,The Godfather,0.1,"[Crime, Drama]",9.9
3,Arrival,0.088439,"[Sci-Fi, Drama]",8.6
