In [1]:
import os
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Location of the MovieLens 100K item metadata file, relative to this notebook.
item_path = os.path.join(
    "../data/raw/ml-100k",
    "u.item",
)

In [4]:
# Read the item file as pipe-separated, latin-1 text with no header row and
# keep only the first two fields, naming them `item_id` and `title`.
items = pd.read_csv(
    item_path,
    sep="|",
    encoding="latin-1",
    header=None,
    names=["item_id", "title"],
    usecols=[0, 1],
)

In [5]:
# Preview the first rows to confirm both columns were parsed as expected.
items.head()

Unnamed: 0,item_id,title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)


In [7]:
# (rows, columns) currently held in `items`.
items.shape

(1682, 2)

In [6]:
# Drop movies without a title, then renumber the rows 0..n-1. Without the
# reset, a dropped row would leave a gap in the index labels, and those labels
# would stop matching the positional rows of the similarity matrix that is
# built from this frame.
items = items.dropna(subset=["title"]).reset_index(drop=True)

In [8]:
# Represent every title as a TF-IDF vector (English stop words removed,
# vocabulary capped at `max_features` terms).
max_features = 5000
tfidf = TfidfVectorizer(max_features=max_features, stop_words="english")
tfidf_matrix = tfidf.fit_transform(items["title"])

# Pairwise cosine similarity between all title vectors: entry [i][j] compares
# the i-th and j-th rows of `items`.
sim_matrix = cosine_similarity(tfidf_matrix)

In [None]:
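# Illustration only: a toy 4x4 similarity matrix showing how to read one row.
# Everything is kept as comments so that running this cell neither raises a
# syntax error nor overwrites the real `sim_matrix` computed above.
#
# sim_matrix = [
#     # Titanic, Avatar, Inception, Interstellar
#     [1.00, 0.85, 0.10, 0.20],  # Titanic
#     [0.85, 1.00, 0.15, 0.25],  # Avatar
#     [0.10, 0.15, 1.00, 0.90],  # Inception
#     [0.20, 0.25, 0.90, 1.00],  # Interstellar
# ]
#
# sim_matrix[2]
# [0.10, 0.15, 1.00, 0.90]
#
# This means that row 2 holds Inception's similarity to every movie. Sorted
# from most to least similar, the row reads:
#
# 0. Inception vs Inception: 1.00      (the movie itself)
# 1. Inception vs Interstellar: 0.90
# 2. Inception vs Avatar: 0.15
# 3. Inception vs Titanic: 0.10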







In [9]:
class MovieRecommenderModel:
    """Title-based movie recommender driven by a precomputed similarity matrix.

    Row ``i`` of ``sim_matrix`` is read as the similarity of the ``i``-th row
    (by position, not by index label) of ``items`` to every row, so the two
    must be in the same order.
    """

    # Number of recommendations returned when a request has no usable ``top_k``.
    DEFAULT_TOP_K = 5

    def __init__(self, items, sim_matrix):
        """Store the lookup table and the similarity matrix.

        Args:
            items: DataFrame with a ``title`` column, one row per movie.
            sim_matrix: Positionally indexable square collection (for example
                a NumPy array or a list of lists) where ``sim_matrix[i][j]``
                is the similarity between movies ``i`` and ``j``.
        """
        self.item_lookup = items
        self.sim_matrix = sim_matrix

    def predict(self, model_input: pd.DataFrame):
        """Return the most similar titles for each request row.

        Args:
            model_input: DataFrame with one request per row. ``title`` is a
                case-insensitive literal substring to look for; ``top_k``
                (optional, default 5) is how many recommendations to return.

        Returns:
            A list with one entry per input row, in input order. Each entry
            is a list of up to ``top_k`` titles, most similar first. The entry
            is empty when the query is blank or missing, matches no title, or
            ``top_k`` is not positive.
        """
        results = []
        for _, row in model_input.iterrows():
            results.append(
                self._recommend(
                    row.get("title", ""),
                    row.get("top_k", self.DEFAULT_TOP_K),
                )
            )
        return results

    @staticmethod
    def _is_missing(value):
        """Return True for None/NaN/NA scalars; anything else counts as present."""
        try:
            return bool(pd.isna(value))
        except (TypeError, ValueError):
            # Non-scalar values make the truth test ambiguous; treat as present.
            return False

    def _recommend(self, raw_title, raw_top_k):
        """Resolve a single (title, top_k) request to a list of recommended titles."""
        if self._is_missing(raw_title):
            return []
        query = str(raw_title)
        if not query.strip():
            # A blank query is a substring of (almost) every title; answering
            # it would silently recommend neighbours of the first movie.
            return []

        if self._is_missing(raw_top_k):
            top_k = self.DEFAULT_TOP_K
        else:
            # Clamp at zero: a negative stop in a slice would otherwise return
            # nearly the whole catalogue.
            top_k = max(int(raw_top_k), 0)

        titles = self.item_lookup["title"]
        # regex=False: titles contain characters such as "(" ")" "*" that must
        # be matched literally rather than interpreted as a pattern.
        hits = titles.str.contains(query, case=False, na=False, regex=False)
        hit_positions = hits.to_numpy().nonzero()[0]
        if len(hit_positions) == 0:
            return []

        # Position (not index label) of the first match, because the
        # similarity matrix is addressed positionally.
        anchor = int(hit_positions[0])

        scores = self.sim_matrix[anchor]
        # Exclude the query movie explicitly instead of assuming it sorts
        # first; ties with other titles would otherwise let it slip through.
        candidates = (pos for pos in range(len(scores)) if pos != anchor)
        # sorted() is stable, so equal scores keep ascending row order.
        ranked = sorted(candidates, key=lambda pos: scores[pos], reverse=True)

        return titles.iloc[ranked[:top_k]].tolist()
            
               
        
        

In [10]:
# Two sample requests: a title fragment to search for and how many
# recommendations to return for it.
input_df = pd.DataFrame(
    {
        "title": ["Philadelphia Story", "Toy Story"],
        "top_k": [5, 5],
    }
)

In [11]:
# Show the request rows that will be passed to the recommender.
input_df.head()

Unnamed: 0,title,top_k
0,Philadelphia Story,5
1,Toy Story,5


In [12]:
# Build the recommender from the item table and the title-similarity matrix.
model = MovieRecommenderModel(items=items, sim_matrix=sim_matrix)
# `predict` returns one list of recommended titles per row of `input_df`.
predictions = model.predict(input_df)
predictions

[['Philadelphia (1993)',
  'Pinocchio (1940)',
  'Fantasia (1940)',
  'Rebecca (1940)',
  'Toy Story (1995)'],
 ["Pyromaniac's Love Story, A (1995)",
  'Story of Xinghua, The (1993)',
  'Philadelphia Story, The (1940)',
  'NeverEnding Story III, The (1994)',
  'FairyTale: A True Story (1997)']]