### Steps:
1. Load and preprocess the dataset.
2. Extract features from movie genres.
3. Compute movie similarity using cosine similarity.
4. Build a function to recommend similar movies.

In [30]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors
import numpy as np

In [2]:
# movies = pd.read_csv("./ml-32m-preprocessed/movies.csv")
movies = pd.read_csv("./ml-32m/movies.csv")
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [11]:
# Preprocess genres for feature extraction
# Convert genres into a format suitable for the TF-IDF Vectorizer
movies['genres'] = movies['genres'].fillna('')

In [12]:
# Convert genres into a single string format
movies['genres'] = movies['genres'].str.replace('|', ' ')
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy
1,2,Jumanji (1995),Adventure Children Fantasy
2,3,Grumpier Old Men (1995),Comedy Romance
3,4,Waiting to Exhale (1995),Comedy Drama Romance
4,5,Father of the Bride Part II (1995),Comedy


### Compute TF-IDF Cosine Similarity

In [10]:
# Initialize the TF-IDF Vectorizer
tfidf = TfidfVectorizer(stop_words="english") # Remove common stop words

In [12]:
# Convert genres into numerical vectors
tfidf_matrix = tfidf.fit_transform(movies['genres'])

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 166510 stored elements and shape (87585, 23)>

In [19]:
feature_names = tfidf.get_feature_names_out()
tfidf_matrix_test = pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names)
tfidf_matrix_test

Unnamed: 0,action,adventure,animation,children,comedy,crime,documentary,drama,fantasy,fi,...,imax,listed,musical,mystery,noir,romance,sci,thriller,war,western
0,0.000000,0.458969,0.478003,0.480576,0.282693,0.0,0.000000,0.000000,0.499992,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
1,0.000000,0.551895,0.000000,0.577877,0.000000,0.0,0.000000,0.000000,0.601223,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
2,0.000000,0.000000,0.000000,0.000000,0.596953,0.0,0.000000,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.802276,0.0,0.0,0.0,0.0
3,0.000000,0.000000,0.000000,0.000000,0.534581,0.0,0.000000,0.445028,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.718452,0.0,0.0,0.0,0.0
4,0.000000,0.000000,0.000000,0.000000,1.000000,0.0,0.000000,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
87580,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,1.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
87581,0.000000,0.000000,0.000000,0.000000,0.768544,0.0,0.000000,0.639797,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
87582,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,1.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
87583,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,1.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0


In [29]:
# Memory error from this code
# cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
# cosine_sim

### Build a model
Note: USe KNN instead of cosine similarity (memory error)

In [32]:
# Initialize Nearest Neighbors model with cosine similarity
nn_model = NearestNeighbors(metric="cosine", algorithm="brute")
nn_model.fit(tfidf_matrix)

In [37]:
# Create a dictionary to map movie titles to their index
movie_indices = pd.Series(movies.index, index=movies["title"]).to_dict()