In [None]:
# Load the MovieLens movie table, discard the genres column and rename the
# two remaining columns to movie_id / movie_title
ml_movies = (
    pd.read_csv("ml-latest/movies.csv")
    .drop(columns=["genres"])
    .rename(columns={"movieId": "movie_id", "title": "movie_title"})
)

# Preview the first rows
ml_movies.head()

In [None]:
# (rows, columns) of the movie table; the original run reported 58,098 movies
ml_movies.shape

In [None]:
# Build a "movie_title" column in plot_df formatted as "<Title> (<Release Year>)"
# so it can be matched against the movie_title column of ml_movies
title_text = plot_df["Title"].map(str)
year_text = plot_df["Release Year"].map(str)
plot_df["movie_title"] = title_text + " (" + year_text + ")"

# Preview the first rows
plot_df.head()

In [None]:
# Merge plot_df with ml_movies to create movie_titles: only movies present in
# both datasets are kept (merge defaults to an inner join), so the model metric
# is computed on titles that exist on the MovieLens side
movie_titles = (
    plot_df.merge(ml_movies, on=["movie_title"])
    .loc[:, ["movie_id", "movie_title"]]
    # A title that occurs more than once in plot_df produces repeated
    # (movie_id, movie_title) rows; left in, they would duplicate every rating
    # of that movie when the ratings are joined on movie_id
    .drop_duplicates()
    .sort_values(by=["movie_id"], ascending=True)
    .reset_index(drop=True)
)

# Peek
movie_titles.head()

In [None]:
# Load the ratings (the timestamp column is discarded), attach each rating's
# movie title through movie_id, then order the rows by user
ratings_df = (
    pd.read_csv("ml-latest/ratings.csv")
    .drop(columns=["timestamp"])
    .rename(columns={"userId": "user_id", "movieId": "movie_id"})
    .merge(movie_titles, on=["movie_id"])
    .drop(columns=["movie_id"])  # only needed as the join key
    .sort_values(by=["user_id"])
    .reset_index(drop=True)
    .loc[:, ["movie_title", "user_id", "rating"]]
)

# Preview the first rows
ratings_df.head()

In [None]:
# One row per user holding the list of every movie title that user rated
user_movies_df = (
    ratings_df.groupby(["user_id"])
    .agg({"movie_title": list})
    .reset_index()
)

# Parallel lists: users[k] rated the titles stored in movies[k]
users = user_movies_df["user_id"].tolist()
movies = user_movies_df["movie_title"].tolist()

# Lookup: user_id -> list of movie titles rated by that user
user_movie_dict = dict(zip(users, movies))

In [None]:
# One row per movie title holding the list of every user that rated it
movie_users = (
    ratings_df.groupby(["movie_title"])
    .agg({"user_id": list})
    .reset_index()
)

# Parallel lists: movies[k] was rated by the user ids stored in users[k]
movies = movie_users["movie_title"].tolist()
users = movie_users["user_id"].tolist()

# Lookup: movie_title -> list of user ids that rated it
movie_user_dict = dict(zip(movies, users))

In [None]:
def evaluate_model(movie_input, movie_rec, movie_users=None, user_movies=None):
    '''
    Prints whether a recommendation is supported by co-viewing data.

    Every movie in the history of each user who has movie_input in their
    history is counted. The recommendation is "good" when movie_rec falls
    in the more frequent half of those co-viewed movies. Histories contain
    every rated movie regardless of the rating value, so "watched" here
    does not imply "liked".

    Parameters
    ----------
    movie_input : title of the movie the recommendation was generated for.
    movie_rec : title of the recommended movie.
    movie_users : optional dict of title -> list of user ids; defaults to
        the notebook-level movie_user_dict.
    user_movies : optional dict of user id -> list of titles; defaults to
        the notebook-level user_movie_dict.

    Returns
    -------
    None. Prints "Good Recommendation" or "Bad Recommendation".

    Raises
    ------
    KeyError if movie_input is not a key of movie_users, or one of its
    users is not a key of user_movies.
    '''
    if movie_users is None:
        movie_users = movie_user_dict
    if user_movies is None:
        user_movies = user_movie_dict

    # Count how often each movie appears across the histories of the users
    # that have the input movie in their history
    co_watch_counts = Counter()
    for user in movie_users[movie_input]:
        co_watch_counts.update(user_movies[user])

    # Remove the input movie by name. Dropping "whatever sorted last" is not
    # safe: a movie tied with the input movie for the top count could be
    # dropped in its place, leaving the input movie inside the ranking.
    co_watch_counts.pop(movie_input, None)

    # Titles from most to least frequent (ties keep first-seen order)
    also_liked = [title for title, _ in co_watch_counts.most_common()]

    # Only the more frequent half counts, to make the evaluation stricter
    halfway = len(also_liked) // 2

    if movie_rec in also_liked[:halfway]:
        print("Good Recommendation")
    else:
        print("Bad Recommendation")

In [None]:
# Spot-check one pair: prints the verdict for recommending "Space Cowboys (2000)"
# given "Apollo 13 (1995)" as the input movie
evaluate_model("Apollo 13 (1995)","Space Cowboys (2000)")

### Collaborative Filtering

In [None]:
# Lookup from each index label of movie_rating_df to its "Movie Title" value
# (dict() over a Series pairs every index label with the value stored under it)
# NOTE(review): presumably the index is a 0..n-1 positional movie id — confirm
id_to_title = dict(movie_rating_df["Movie Title"])
# Spot-check the title stored under key 0
id_to_title[0]

In [None]:
# Plain list of the "Movie Title" values, in movie_rating_df row order
# NOTE: this rebinds movie_titles, which an earlier cell created as a DataFrame
movie_titles = movie_rating_df["Movie Title"].tolist()

In [None]:
# Expand each row's list of user ids into one 0/1 indicator per distinct user
from sklearn.preprocessing import MultiLabelBinarizer
split = movie_rating_coo['user_id']

# Binarizer that learns the set of distinct user ids
mlb = MultiLabelBinarizer()

# fit_transform yields one row per entry of `split` and one column per user id.
# The array is transposed up front so the user ids become the rows, and the
# columns are labelled with the movie titles.
colab_df = pd.DataFrame(
    mlb.fit_transform(split).T,
    index=mlb.classes_,
    columns=movie_titles,
)
colab_df.head()

In [None]:
# Column labels of colab_df as a plain list; a title's position in this list
# is the position of its column in colab_df
colab_titles = colab_df.columns.tolist()

In [None]:
from scipy.sparse import coo_matrix

# Store the user x movie indicator table in COO (coordinate) sparse format
coo = coo_matrix(colab_df)

# NMF with 5 components; the fixed random_state makes the random
# initialisation repeatable
nmf_model = NMF(
    n_components=5,
    init='random',
    random_state=42,
)

# One 5-component vector per row of the input (the users)
user_vec = nmf_model.fit_transform(coo)

# components_ has one column per movie; transposed so every movie is a row
movie_vec = nmf_model.components_.T

# Cosine distance between every user vector and every movie vector
user_dist = pairwise_distances(user_vec, movie_vec, metric='cosine')

def collaborative_recommender(user_input, n_recommendations=10):
    '''
    Recommends movies for a new user from the titles they already watched.

    The user is encoded as a 0/1 vector over the columns of colab_df,
    projected into the component space of the already-fitted nmf_model,
    and compared with every movie vector by cosine distance.

    Parameters
    ----------
    user_input : iterable of movie titles, spelled exactly as in colab_titles.
    n_recommendations : maximum number of titles to return (non-negative,
        default 10).

    Returns
    -------
    List of at most n_recommendations titles, closest first, never
    containing a title from user_input.

    Raises
    ------
    ValueError if a title in user_input is not in colab_titles.
    '''

    # Start from a user that has not watched anything
    new_user_input = np.zeros(colab_df.shape[1])

    already_watched = set()
    for movie in user_input:
        # Column position of the watched title
        index = colab_titles.index(movie)

        # Flag the movie as watched and remember it so it is never recommended
        new_user_input[index] = 1
        already_watched.add(index)

    # Single-row sparse matrix for the new user
    new_coo = coo_matrix(new_user_input)

    # transform only: the model is already fitted
    new_user = nmf_model.transform(new_coo)

    # Shape (1, n_movies): distances from this one user to every movie
    new_user_dist = pairwise_distances(new_user, movie_vec, metric='cosine')

    # Movie positions ordered from closest to farthest
    ranked = new_user_dist[0].argsort()

    # Drop watched movies BEFORE truncating; truncating first returns fewer
    # results whenever a watched movie ranks among the closest ones
    unseen = [movie for movie in ranked if movie not in already_watched]

    # NOTE(review): assumes id_to_title keys line up with the column positions
    # of colab_df — confirm against how movie_rating_df is indexed
    return [id_to_title[movie] for movie in unseen[:n_recommendations]]





In [None]:
users = []
user_avg_movies = []

for user, watched_titles in user_movie_dict.items():
    print(f"{user} / {len(user_movie_dict.keys())}")

    # Running sum of the 50-component vectors of every movie in the history
    movie_sum = np.zeros(50)
    for movie_title in watched_titles:
        # movie_to_id is keyed by the title without its last 7 characters
        # (presumably the " (YYYY)" suffix added when movie_title was built)
        movie_sum += doc_topic[movie_to_id[movie_title[:-7]]]

    # Mean vector over the user's history
    total_movies = len(watched_titles)
    average_movies = movie_sum / total_movies

    # Record the user together with their averaged vector
    users.append(user)
    user_avg_movies.append(average_movies)