In [20]:
import pandas as pd 
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

rating_file = "/Users/ryanoliver/Projects/movie_reccomendation/ml-32m/ratings.csv"
movies_file = "/Users/ryanoliver/Projects/movie_reccomendation/ml-32m/movies.csv"

try:
    ratings = pd.read_csv(rating_file)
    movies = pd.read_csv(movies_file)
except FileNotFoundError as e :
    print(f"Error File not found:{e}. PLease check file path")
    exit()



print(f"Ratings data has been loaded:{ratings.shape}records.")
print(f"Movie data has been loaded:{movies.shape}records.")

Ratings data has been loaded:(32000204, 4)records.
Movie data has been loaded:(87585, 3)records.


In [21]:
df = pd.merge(ratings, movies, on="movieId")
df = df[['userId','title','rating']]

In [22]:
movie_count = df['title'].value_counts()
min_movie_ratings = 50
popular_movies = movie_count[movie_count >= min_movie_ratings].index
df_filtered = df[df['title'].isin(popular_movies)]

user_count = df['userId'].value_counts()
min_user_ratings = 50
active_users = user_count[user_count >= min_user_ratings].index
df_filtered = df_filtered[df_filtered['userId'].isin(active_users)]
print(f"Filtered dataframe shape: {df_filtered.shape}")

Filtered dataframe shape: (29211865, 3)


In [23]:
# Create sparse matrix directly to avoid memory crash
# Convert to categorical to get integer codes
df_filtered['title'] = df_filtered['title'].astype('category')
df_filtered['userId'] = df_filtered['userId'].astype('category')

# Create sparse matrix
# rows: title, columns: userId, values: rating
movie_sparse_matrix = csr_matrix(
    (df_filtered['rating'], 
     (df_filtered['title'].cat.codes, df_filtered['userId'].cat.codes))
)

print(f"User-Item Matrix shape: {movie_sparse_matrix.shape}")

# Keep track of titles for recommendation mapping later
movie_titles = df_filtered['title'].cat.categories

User-Item Matrix shape: (16034, 128344)


In [24]:
# Train KNN model
knn_model = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=20, n_jobs=-1)
knn_model.fit(movie_sparse_matrix)

0,1,2
,n_neighbors,20
,radius,1.0
,algorithm,'brute'
,leaf_size,30
,metric,'cosine'
,p,2
,metric_params,
,n_jobs,-1


In [None]:
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors # Assuming this is the source of knn_model

def recommend_movies(movie_name, movie_titles, matrix, model, k):
    
    try:
        
        if isinstance(movie_titles, list):
            movie_index = movie_titles.index(movie_name)
        else: 
            if isinstance(movie_titles, pd.Series):
                matching_indices = movie_titles[movie_titles == movie_name].index
                if not matching_indices.empty:
                    movie_index = movie_titles.index.get_loc(matching_indices[0])
                else:
                    raise ValueError(f"Movie '{movie_name}' not found in the movie_titles Series.")
            elif isinstance(movie_titles, pd.Index):
                movie_index = movie_titles.get_loc(movie_name)
            else:
                raise TypeError("movie_titles must be a list, pandas Series, or pandas Index.")

        
        if not isinstance(movie_index, int):
            if isinstance(movie_index, slice):
                movie_index = movie_index.start 
            elif isinstance(movie_index, (pd.Int64Index, pd.BooleanArray)): 
                if not movie_index.empty:
                    movie_index = movie_index[0] 
                else:
                    raise ValueError(f"Movie '{movie_name}' not found or ambiguously defined in the movie_titles Index.")
            else:
                raise ValueError(f"Unexpected index type for movie '{movie_name}'.")

       
        movie_vector = matrix[movie_index].reshape(1, -1)

        # Find the k-nearest neighbors. We ask for k+1 because the movie itself will be the closest.
        distances, indices = model.kneighbors(movie_vector, n_neighbors=k + 1)

        recommendations = []
        
        for i in range(1, len(distances.flatten())):
            neighbor_index = indices.flatten()[i]
            
            # Get the movie title using the integer position
            if isinstance(movie_titles, list):
                movie_title = movie_titles[neighbor_index]
            elif isinstance(movie_titles, pd.Series):
                movie_title = movie_titles.iloc[neighbor_index]
            elif isinstance(movie_titles, pd.Index):
                movie_title = movie_titles[neighbor_index]
            else:
                
                movie_title = "Unknown Title" 
            
            similarity_score = distances.flatten()[i]
            recommendations.append((movie_title, similarity_score))
        
        return recommendations

    except (ValueError, IndexError) as e:
        return f"Movie '{movie_name}' not found in the dataset or an error occurred: {e}"
    except Exception as e:
        return f"An unexpected error occurred: {e}"



In [None]:

test_movie = "Pulp Fiction (1994)"
top_n = 5 


results = recommend_movies(
    movie_name=test_movie, 
    movie_titles=movie_titles, 
    matrix=movie_sparse_matrix, 
    model=knn_model, 
    k=top_n
)

if isinstance(results, str):
    print(results)
else:
    print(f"\nIf you liked **{test_movie}**, you might also enjoy these {top_n} movies:")
    for title, score in results:
        print(f"- **{title}** (Similarity: {score:.4f})")



If you liked **Pulp Fiction (1994)**, you might also enjoy these 5 movies:
- **Silence of the Lambs, The (1991)** (Similarity: 0.2584)
- **Shawshank Redemption, The (1994)** (Similarity: 0.2764)
- **Usual Suspects, The (1995)** (Similarity: 0.2979)
- **Seven (a.k.a. Se7en) (1995)** (Similarity: 0.2994)
- **Forrest Gump (1994)** (Similarity: 0.3084)
