# Determine Movie Genre by Neighboring Movies

Using the k-nearest neighbors method, predict a target movie's genre from the genres of the top-k movies most similar to it.

Use Jaccard similarity based on actors in each movie to rank movies and select the top-k most similar movies.

In [1]:
import json

import pandas as pd
import numpy as np

from scipy.sparse import lil_matrix

import matplotlib.pyplot as plt


In [3]:
# Lookup tables filled in while streaming through the movie file
actor_name_map = {}    # actor id -> actor name
movie_actor_map = {}   # IMDb movie id -> summary record for that movie
actor_genre_map = {}   # actor id -> {genre: how many of the actor's movies carry it}


with open("movies.json", "r") as movies_file:
    for raw_line in movies_file:

        # Each line holds one JSON-encoded movie.
        # (`this_movie` keeps its name because a later cell inspects it.)
        this_movie = json.loads(raw_line)

        # Movies with an empty rating entry are left out entirely
        if len(this_movie["rating"]) == 0:
            continue

        cast = this_movie['actors']

        # Remember the name of every cast member
        actor_name_map.update(
            (cast_id, cast_name) for cast_id, cast_name in cast
        )

        # Credit each cast member with one more appearance in each of this movie's genres
        for cast_id, _ in cast:
            genre_counts = actor_genre_map.setdefault(cast_id, {})
            for genre in this_movie["genres"]:
                genre_counts[genre] = genre_counts.get(genre, 0) + 1

        # Store the condensed record for this movie
        movie_actor_map[this_movie["imdb_id"]] = {
            "movie": this_movie["title"],
            "actors": {cast_id for cast_id, _ in cast},
            "genres": this_movie["genres"],
            "rating": this_movie["rating"]["avg"],
        }

In [4]:
# Sanity check: how many distinct actors and movies were loaded
for label, table in (("Known Actors:", actor_name_map),
                     ("Known Movies:", movie_actor_map)):
    print(label, len(table))

Known Actors: 29499
Known Movies: 18841


In [5]:
# Give every known actor a 0-based position, in the order the actors
# were added to actor_name_map
actor_id_to_index = {
    known_actor_id: position
    for position, known_actor_id in enumerate(actor_name_map)
}


In [6]:
# IMDb id of the movie whose nearest neighbors we want to find.
# To analyse a different film, swap which of the assignments below is active.
target_movie_id = "tt0317705" # The Incredibles
# target_movie_id = "tt0816692" # Interstellar
# target_movie_id = "tt0332280" # The Notebook

In [7]:
# Look up the target's record (title, actor-id set, genres, average rating).
# Raises KeyError if the id was not loaded, e.g. because the movie had no ratings.
target_movie_object = movie_actor_map[target_movie_id]

In [8]:
# Display the target movie's record to confirm the right film was selected
target_movie_object

{'movie': 'The Incredibles',
 'actors': {'nm0000168', 'nm0000456', 'nm0005134', 'nm0005266'},
 'genres': ['Action', 'Adventure', 'Animation'],
 'rating': 8.0}

## Find the Most Similar Movies by Jaccard Similarity in Actors

In [31]:
# One {"movie": id, "jaccard": score} record per candidate movie;
# starts empty every time this cell runs so re-running does not duplicate rows
movie_similarities = []


def jaccard_similarity(movie1, movie2):
    """Return the Jaccard similarity of two collections of items.

    The similarity is the size of the intersection divided by the size of
    the union.

    Args:
        movie1: A set (here: the actor ids of one movie).
        movie2: A set, or any iterable of hashable items, to compare against
            (here: the actor ids of the other movie).

    Returns:
        A float between 0.0 (nothing shared) and 1.0 (identical contents).
        When both inputs are empty the union is empty and 0.0 is returned.
    """
    union_size = len(movie1.union(movie2))

    # An empty union means both inputs are empty; return 0.0 instead of
    # dividing by zero, and keep the return type a float in every case.
    if union_size == 0:
        return 0.0

    return len(movie1.intersection(movie2)) / union_size

# The target's cast is the same for every comparison, so fetch it once
# instead of rebuilding it on each iteration. The "actors" entries are
# already sets, so no copy is needed.
target_actors = target_movie_object["actors"]

for this_movie_id, this_movie_obj in movie_actor_map.items():
    # Skip the target movie
    if this_movie_id == target_movie_id:
        continue

    # Add this movie and its Jaccard similarity to the list, so we can rank at the end
    movie_similarities.append({
        "movie": this_movie_id,
        "jaccard": jaccard_similarity(this_movie_obj["actors"], target_actors),
    })

In [32]:
# Tabulate the per-movie scores, then rank candidates from most to least similar
similarity_df = pd.DataFrame(data=movie_similarities, columns=["movie", "jaccard"])
df_sorted_desc = similarity_df.sort_values(by="jaccard", ascending=False)

df_sorted_desc

Unnamed: 0,movie,jaccard
13851,tt3606756,0.333333
11215,tt2120176,0.250000
12484,tt2635824,0.250000
16138,tt5804038,0.250000
1390,tt0290831,0.250000
...,...,...
6307,tt11364112,0.000000
6306,tt11362842,0.000000
6305,tt1135989,0.000000
6304,tt1135985,0.000000


In [46]:
# Take the five highest-ranked rows of the sorted similarity table
first_5_rows = df_sorted_desc.iloc[:5]

# Read the movie ids straight from the column rather than looping over rows
ids = first_5_rows["movie"].tolist()

# Report the matches by title, best match first. The loop variable is not
# called `id`, so the builtin id() stays usable in later cells.
for rank, matched_movie_id in enumerate(ids, start=1):
    print(f"Match #{rank}: {movie_actor_map[matched_movie_id]['movie']}")


Match #1: Incredibles 2
Match #2: Young Jeezy: A Hustlerz Ambition
Match #3: We Ride: The Story of Snowboarding
Match #4: I Am Not Your Negro
Match #5: Rock That Uke


## Use the top-k similar movies to infer genre

In [None]:
# Number of most-similar movies (nearest neighbors) to take from the ranking
k_nn = 1

In [None]:
# Rank every candidate by similarity and keep only the k_nn best as neighbors
ranked_by_similarity = similarity_df.sort_values(by="jaccard", ascending=False)
ranked_by_similarity.iloc[:k_nn]

In [None]:
# TODO: count the genres of the top k movies, printing out the most common
#       genres as the predicted genre for the target movie

In [None]:
# NOTE(review): `this_movie` is the loop variable left over from the loading
# cell, so this shows whichever movie was parsed last from movies.json, not
# the target movie or its neighbors -- confirm whether this cell is still needed.
this_movie

## Use the top-k similar movies to infer rating

In [None]:
# TODO: Take the average rating from these top-k movies