In [1]:
import pandas as pd
import numpy as np

# ----------------------
# 1. Load Data
# ----------------------
# Read the dataset and keep the titles under a fresh 0..n-1 index, so a
# title's label in `titles` equals its row position.
df = pd.read_csv("cleaned_data.csv")
titles = df["title"].reset_index(drop=True)

# Lookup from title to row position. If a title occurs more than once, the
# last occurrence is the one that ends up in the mapping.
title_to_index = dict(zip(titles, range(len(titles))))


In [2]:
def pca_from_scratch(X, k):
    """Project ``X`` onto its top-``k`` principal components.

    The columns of ``X`` are mean-centred, the feature covariance matrix is
    eigendecomposed with ``np.linalg.eigh``, and the centred data is projected
    onto the eigenvectors that have the largest eigenvalues.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Numeric data, one row per sample. It is converted to ``float64`` up
        front so that object-dtype arrays (for example ``DataFrame.values``
        over mixed numeric/bool columns) are accepted.
    k : int
        Number of components to keep. If ``k`` exceeds ``n_features`` the
        result simply has ``n_features`` columns.

    Returns
    -------
    numpy.ndarray of shape (n_samples, min(k, n_features))
        The centred data expressed in the selected components, ordered by
        decreasing eigenvalue. The sign of each column is arbitrary because
        eigenvectors are only defined up to sign.

    Raises
    ------
    ValueError
        If ``X`` is not two-dimensional or ``k`` is negative.
    """
    X = np.asarray(X, dtype=float)
    if X.ndim != 2:
        raise ValueError(f"X must be 2-D (n_samples, n_features); got ndim={X.ndim}")
    if k < 0:
        # A negative k would silently slice from the end of the component list.
        raise ValueError(f"k must be non-negative; got {k}")

    X_meaned = X - np.mean(X, axis=0)

    # np.cov collapses to a 0-d array when there is a single feature; eigh
    # needs a square 2-D matrix, so restore the (1, 1) shape in that case.
    cov_mat = np.atleast_2d(np.cov(X_meaned, rowvar=False))

    # eigh is appropriate because a covariance matrix is symmetric.
    eig_vals, eig_vecs = np.linalg.eigh(cov_mat)

    # Largest eigenvalues first; eigenvectors are the columns of eig_vecs.
    sorted_indices = np.argsort(eig_vals)[::-1]
    top_k_eigvecs = eig_vecs[:, sorted_indices[:k]]

    X_reduced = np.dot(X_meaned, top_k_eigvecs)
    return X_reduced


In [3]:
def normalize_matrix(matrix):
    """Scale every row of ``matrix`` by its Euclidean norm.

    Rows whose norm is exactly zero are divided by 1 instead, so they come
    back unchanged rather than triggering a division by zero. The input is
    not modified; a new array is returned.
    """
    row_lengths = np.linalg.norm(matrix, axis=1, keepdims=True)
    # Swap zero lengths for 1 in place so the division below is always safe.
    np.putmask(row_lengths, row_lengths == 0, 1)
    return matrix / row_lengths

# ----------------------
# 4. Feature Preparation
# ----------------------
# Every column except the two identifier columns is used as a feature.
X = df.drop(["title", "imdb_id"], axis=1).values

# Project the features onto 100 principal components.
X_reduced = pca_from_scratch(X, k=100)

# After normalize_matrix has scaled the rows, the matrix of pairwise row dot
# products is used as the cosine-similarity matrix.
X_normalized = normalize_matrix(X_reduced)
cosine_sim = X_normalized @ X_normalized.T

In [4]:
def recommend_from_scratch(input_title, top_n=10):
    """Return the titles most similar to ``input_title``.

    Similarity is read from the row of the module-level ``cosine_sim`` matrix
    that ``title_to_index`` associates with ``input_title``.

    Parameters
    ----------
    input_title : str
        Title to look up; it must match a key of ``title_to_index`` exactly.
    top_n : int, default 10
        Maximum number of recommendations. Values below zero are treated as 0.

    Returns
    -------
    list of str
        Up to ``top_n`` titles in order of decreasing similarity, never
        including the queried row itself.
    str
        A "not found" message when ``input_title`` is not in the dataset.
        Callers must check for this case before iterating over the result.
    """
    if input_title not in title_to_index:
        return f"'{input_title}' not found in dataset."

    idx = title_to_index[input_title]
    sim_scores = cosine_sim[idx]

    # Rank all rows from most to least similar.
    ranked = np.argsort(sim_scores)[::-1]

    # Drop the queried row explicitly. It is not guaranteed to sort first:
    # another row can tie with it (identical feature vectors), and an all-zero
    # feature row has similarity 0 even with itself.
    ranked = ranked[ranked != idx]

    # Clamp so a negative top_n cannot turn into a slice from the end.
    similar_indices = ranked[:max(top_n, 0)]
    recommended_titles = titles.iloc[similar_indices].tolist()

    return recommended_titles

In [6]:
if __name__ == "__main__":
    # Interactive driver: ask for a title and print its recommendations.
    input_title = input("Enter a movie title: ")
    recommendations = recommend_from_scratch(input_title)

    if isinstance(recommendations, str):
        # recommend_from_scratch returns a message string for unknown titles.
        # Print it as-is instead of enumerating it character by character.
        print(recommendations)
    else:
        # Report the number actually returned, which may be fewer than 10.
        print(f"\nTop {len(recommendations)} movie recommendations:")
        for i, title in enumerate(recommendations, start=1):
            print(f"{i}. {title}")


Top 10 movie recommendations:
1. Moon
2. 20,000 Leagues Under the Sea
3. Finch
4. The Zero Theorem
5. Beneath the Planet of the Apes
6. Fahrenheit 451
7. Nineteen Eighty-Four
8. Ex Machina
9. Z for Zachariah
10. Anon
