<a href="https://colab.research.google.com/github/mongoem/Climate-Change-Belief-analysis/blob/main/movie_recommender.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# prompt: unzip /content/genome_scores.zip

!unzip /content/genome_scores.zip


Archive:  /content/genome_scores.zip
replace genome_scores.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: genome_scores.csv       


In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split

# Read the three input tables from the working directory
genome_scores = pd.read_csv('genome_scores.csv')
movies = pd.read_csv('movies.csv')
train = pd.read_csv('train.csv')

# Hold out a quarter of train.csv for evaluation; the fixed seed makes the
# split repeatable across runs
train_data, test_data = train_test_split(
    train,
    test_size=0.25,
    random_state=42,
)

# Movie-by-tag relevance matrix: one row per movieId, one column per tagId,
# with missing (movie, tag) pairs filled with 0
movie_feature_matrix = (
    genome_scores
    .pivot(index='movieId', columns='tagId', values='relevance')
    .fillna(0)
)

# Row position of every movieId inside movie_feature_matrix (and therefore
# inside the similarity matrix built from it)
movie_index = dict(
    zip(movie_feature_matrix.index, range(len(movie_feature_matrix.index)))
)

# Pairwise cosine similarity between the movie rows
similarity_matrix = cosine_similarity(movie_feature_matrix)

# Adjust predict_rating to use the movie_index for locating the correct movie in the similarity matrix
def predict_rating(user_id, movie_id, k=10):
    """Predict a user's rating for a movie from their ratings of similar movies.

    The ``k`` movies most similar to ``movie_id`` (according to the
    module-level ``similarity_matrix``) are looked up, and the ratings the
    user gave to those neighbours in ``train_data`` are averaged, weighted by
    each neighbour's similarity to the target movie.

    Parameters
    ----------
    user_id
        Value matched against ``train_data['userId']``.
    movie_id
        Key into ``movie_index``.
    k : int, default 10
        Number of most-similar movies to consider.

    Returns
    -------
    float
        The similarity-weighted mean rating, or ``np.nan`` when ``movie_id``
        is not in ``movie_index`` or the user has rated none of the
        neighbouring movies.
    """
    if movie_id not in movie_index:
        return np.nan  # Movie has no row in the similarity matrix

    idx = movie_index[movie_id]
    similarities = np.asarray(similarity_matrix[idx], dtype=float)

    # Rank all movies by descending similarity (stable, so ties keep their
    # row order). The target movie is removed by position instead of assuming
    # it always ranks first.
    ranked = np.argsort(-similarities, kind="stable")
    neighbours = ranked[ranked != idx][:max(int(k), 0)]
    weight_by_movie = {
        movie_feature_matrix.index[i]: similarities[i] for i in neighbours
    }

    user_ratings = train_data[train_data['userId'] == user_id]
    rated = user_ratings[user_ratings['movieId'].isin(list(weight_by_movie))]
    if rated.empty:
        return np.nan  # The user hasn't rated any of the similar movies

    ratings = rated['rating'].to_numpy(dtype=float)
    # Weight each rating by the similarity of the rated movie to the target
    # (not by the movieId value, which carries no meaning as a weight).
    weights = np.array(
        [weight_by_movie[m] for m in rated['movieId']], dtype=float
    )
    total_weight = weights.sum()
    if total_weight <= 0:
        # No usable similarity signal: fall back to the plain mean
        return float(ratings.mean())
    return float(np.dot(ratings, weights) / total_weight)

# Function to predict ratings for the test set
def predict_ratings_for_test_set(test_set):
    """Collect actual and predicted ratings for the rows that can be scored.

    Calls ``predict_rating(userId, movieId)`` for every row of ``test_set``
    and keeps only the rows whose prediction is not NaN.

    Parameters
    ----------
    test_set : pandas.DataFrame
        Must contain the columns ``userId``, ``movieId`` and ``rating``.

    Returns
    -------
    tuple of (list, list)
        ``(actuals, predictions)``: two equally long lists, aligned by
        position. Rows with a NaN prediction appear in neither list.
    """
    actuals = []
    predictions = []

    # itertuples keeps each column's own dtype and is faster than iterrows,
    # which builds one Series per row and can upcast integer ids to float
    # when another column (such as the rating) is float.
    rows = test_set[['userId', 'movieId', 'rating']].itertuples(index=False, name=None)
    for user_id, movie_id, actual_rating in rows:
        predicted_rating = predict_rating(user_id, movie_id)

        if not np.isnan(predicted_rating):
            predictions.append(predicted_rating)
            actuals.append(actual_rating)

    return actuals, predictions

# Evaluate the model
actuals, predictions = predict_ratings_for_test_set(test_data)

# Test rows without a prediction are left out of the returned lists, so the
# metrics below only describe the scored subset. Report how large it is.
n_scored = len(predictions)
print(f"Predictions made for {n_scored} of {len(test_data)} test rows")

# Calculate MAE and RMSE
if n_scored:
    mae = mean_absolute_error(actuals, predictions)
    rmse = np.sqrt(mean_squared_error(actuals, predictions))
else:
    # The sklearn metric functions reject empty input; keep the names defined
    # so the rest of the notebook can still run.
    mae = rmse = np.nan

print(f"Mean Absolute Error (MAE): {mae}")
print(f"Root Mean Squared Error (RMSE): {rmse}")
