# Nearest Neighbors approach

In [1]:
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import OneHotEncoder, StandardScaler

RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

In [2]:
users_path = '../data/raw/ml-100k/u.user'
ratings_path = '../data/raw/ml-100k/u.data'
movies_path = '../data/raw/ml-100k/u.item'

# Load users data
users_columns = ['user_id', 'age', 'gender', 'occupation', 'zip_code']
users = pd.read_csv(users_path, sep='|', names=users_columns, encoding='latin-1')

# Load ratings data
ratings_columns = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_csv(ratings_path, sep='\t', names=ratings_columns, encoding='latin-1')

# Load movies data
movies_columns = ['movie_id', 'title', 'release_date', 'video_release_date', 'IMDb_URL'] + ['genre_' + str(i) for i in range(19)]
movies = pd.read_csv(movies_path, sep='|', names=movies_columns, encoding='latin-1', usecols=range(24))


In [4]:
# Combine datasets
data = pd.merge(pd.merge(ratings, users), movies)

# Ensure that the features include 'user_id'
features = data[['user_id', 'age', 'gender', 'occupation', 'zip_code', 'movie_id', 'rating']]
labels = data[['user_id', 'movie_id', 'rating']]

# Apply preprocessing to the appropriate columns (excluding 'user_id' and 'movie_id')
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['age']),
        ('cat', OneHotEncoder(handle_unknown='ignore'), ['gender', 'occupation', 'zip_code'])
    ])

# Split the dataset
X = preprocessor.fit_transform(features.drop(columns=['user_id', 'movie_id', 'rating']))
y = labels
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [5]:
model = NearestNeighbors(n_neighbors=10, algorithm='brute')
model.fit(X_train)


In [6]:
def make_recommendation(user_id, n_recommendations=5):
    # Ensure 'user_id' is included in the features DataFrame
    user_data = data[data['user_id'] == user_id][['age', 'gender', 'occupation', 'zip_code', 'movie_id', 'rating']]

    # Preprocess the user's data
    user_data_processed = preprocessor.transform(user_data)

    # Find similar users/items (using more neighbors for further exlude already seen movies)
    distances, indices = model.kneighbors(user_data_processed, n_neighbors=n_recommendations * 3)
    recommended_movies = set(y.iloc[indices.flatten()]['movie_id'])
    
    # Remove movies that the user has already seen and return the final recommendations
    user_movies = set(data[data['user_id'] == user_id]['movie_id'])
    recommended_movies = recommended_movies - user_movies

    # Choose only the first 'n_recommendations' recommendations
    recommended_movies = list(recommended_movies)[:n_recommendations]

    # Return recommended movies
    recommendations = data[data['movie_id'].isin(recommended_movies)][['movie_id', 'title']].drop_duplicates().set_index('movie_id')
    
    return recommendations

# Make recommendations for a specific user
user_id = 1
print(make_recommendation(user_id, 10))


                                                      title
movie_id                                                   
306             Mrs. Brown (Her Majesty, Mrs. Brown) (1997)
663                                      Being There (1979)
580       Englishman Who Went Up a Hill, But Came Down a...
1118                                     Up in Smoke (1978)
411                             Nutty Professor, The (1996)
845                               That Thing You Do! (1996)
977                                  Substitute, The (1996)
333                                        Game, The (1997)
591                                      Primal Fear (1996)
742                                           Ransom (1996)


In [7]:
def precision_recall_at_k(model, X, y, k=5, rating_threshold=4.0):
    precisions = []
    recalls = []

    for user_id in np.unique(y['user_id']):
        # Get the indices and ratings for the user
        user_indices = np.where(y['user_id'].values == user_id)[0]
        user_ratings = y.iloc[user_indices]

        # Find movies rated above the threshold
        relevant_movies = set(user_ratings[user_ratings['rating'] >= rating_threshold]['movie_id'])

        # Skip users with fewer than k relevant movies
        if len(relevant_movies) < k:
            continue

        # Get recommendations
        user_data = X[user_indices, :]
        distances, indices = model.kneighbors(user_data, n_neighbors=k)
        recommended_movies = set(y.iloc[indices.flatten()]['movie_id'])

        # Calculate precision and recall
        n_relevant_and_recommended = len(relevant_movies.intersection(recommended_movies))
        precision = n_relevant_and_recommended / k
        recall = n_relevant_and_recommended / len(relevant_movies)

        precisions.append(precision)
        recalls.append(recall)

    # Calculate average precision and recall across all users
    avg_precision = np.mean(precisions) if precisions else 0
    avg_recall = np.mean(recalls) if recalls else 0

    return avg_precision, avg_recall


In [12]:
# Evaluate the model using P@k and R@k with rating threshold 3
precision, recall = precision_recall_at_k(model, X_train, y_train, k=10, rating_threshold=3.0)
print(f"Precision@10: {precision}")
print(f"Recall@10: {recall}")

Precision@10: 0.8376609442060087
Recall@10: 0.24107467352204673


In [9]:
# Evaluate the model using P@k and R@k with rating threshold 4
precision, recall = precision_recall_at_k(model, X_train, y_train, k=10, rating_threshold=4.0)
print(f"Precision@10: {precision}")
print(f"Recall@10: {recall}")

Precision@10: 0.6051703877790834
Recall@10: 0.21476492671392222


In [10]:
# save model
import pickle
pickle.dump(model, open('../models/nearest_neighbors.pkl', 'wb'))
pickle.dump(preprocessor, open('../models/nearest_neighbors_preprocessor.pkl', 'wb'))