In [11]:
import pandas as pd
import json
import matplotlib.pyplot as plt
%matplotlib inline

# 1. Actor-based similarity: films ranked by Jaccard similarity of their cast to "Gladiator"

In [12]:
# Build two lookup tables from the IMDB dump (one JSON document per line):
#   actor_name_map:  actor id -> actor name
#   movie_actor_map: title id -> {"movie": title, "actors": set of actor ids, "genres": genres}
actor_name_map = {}
movie_actor_map = {}

# Pin the encoding: without it, decoding of non-ASCII names depends on the
# platform's default locale encoding.
with open("imdb_recent_movies.json", "r", encoding="utf-8") as in_file:
    for line in in_file:

        # tolerate blank lines (e.g. a stray empty line at the end of the file),
        # which json.loads would reject
        if not line.strip():
            continue

        this_movie = json.loads(line)

        # ids and names are walked in parallel -- assumes the two lists are
        # aligned position by position; TODO confirm against the data source
        for actor_id, actor_name in zip(this_movie['actor_ids'], this_movie['actor_names']):
            actor_name_map[actor_id] = actor_name

        # finished with this film; actors are stored as a set so that casts
        # can be intersected/unioned later
        movie_actor_map[this_movie["title_id"]] = {
            "movie": this_movie["title_name"],
            "actors": set(this_movie['actor_ids']),
            "genres": this_movie["title_genre"],
        }

In [13]:
# One row per film, indexed by IMDB title id
movie_records = list(movie_actor_map.values())
movie_index = list(movie_actor_map.keys())
df = pd.DataFrame(movie_records, index=movie_index)

# Look the film up by title to discover its title id
df.loc[df["movie"] == "Gladiator"]

Unnamed: 0,movie,actors,genres
tt0172495,Gladiator,"{nm0000128, nm0001657, nm0001618}","[Action, Adventure, Drama]"


In [14]:
# IMDB title id of the query film; the title lookup above shows this id is "Gladiator"
target_movie_id = "tt0172495"

In [15]:
# Fetch the record (title, actor-id set, genres) stored for the query film
target_movie = movie_actor_map[target_movie_id]
# Echo it as the cell output for a visual check
target_movie

{'movie': 'Gladiator',
 'actors': {'nm0000128', 'nm0001618', 'nm0001657'},
 'genres': ['Action', 'Adventure', 'Drama']}

In [16]:
# Score every film (the target included) against the target film by the
# Jaccard similarity of their actor sets: |A & B| / |A | B|.
# NOTE: despite its name, `distances` holds *similarities* -- higher means
# closer -- which is why the ranking cell sorts it in descending order.
distances = []

target_actors = target_movie["actors"]
for movie in movie_actor_map.values():
    these_actors = movie["actors"]

    numer = len(target_actors & these_actors)
    denom = len(target_actors | these_actors)

    # Two empty casts have an empty union; score that pair as 0.0 instead of
    # letting the division raise ZeroDivisionError.
    jaccard_sim = numer / denom if denom else 0.0

    distances.append({
        "movie": movie,
        "similarity": jaccard_sim
    })

In [17]:
# Rank by similarity, best first, and report the ten highest-scoring films
ranked_by_similarity = sorted(distances, key=lambda entry: entry["similarity"], reverse=True)
for similar_movie in ranked_by_similarity[:10]:
    film = similar_movie["movie"]
    print(film["movie"], similar_movie["similarity"])
    # list the cast by name, one actor per indented line
    for actor in film["actors"]:
        print("\t", actor_name_map[actor])

Gladiator 1.0
	 Russell Crowe
	 Oliver Reed
	 Joaquin Phoenix
Earthlings 0.3333333333333333
	 Joaquin Phoenix
Two Lovers 0.3333333333333333
	 Joaquin Phoenix
Her 0.3333333333333333
	 Joaquin Phoenix
Red Obsession 0.3333333333333333
	 Russell Crowe
Dominion 0.3333333333333333
	 Joaquin Phoenix
Turtle Odyssey 0.3333333333333333
	 Russell Crowe
Proof of Life 0.25
	 David Morse
	 Russell Crowe
Walk the Line 0.25
	 Robert Patrick
	 Joaquin Phoenix
The Village 0.25
	 William Hurt
	 Joaquin Phoenix


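# 2. Rating-based similarity: films ranked by cosine distance of their user-rating vectors to "Gladiator"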

In [18]:
from scipy.sparse import lil_matrix # needed for building the matrix of user ratings
import scipy.spatial.distance # needed for calculating pairwise distances

In [19]:
# Read the ratings dump (one JSON document per line) into three structures:
#   known_movies: set of every rated title id (used for fast membership tests)
#   movie_ids:    the same title ids as a list, in first-seen order; a title's
#                 position here later becomes its column index in the matrix
#   user_ratings: user id -> list of (title id, rating) tuples
known_movies = set()

user_ratings = {}
movie_ids = []

# Pin the encoding so decoding does not depend on the platform's locale default.
with open("user_ratings.json", "r", encoding="utf-8") as in_file:
    for line in in_file:

        # tolerate blank lines, which json.loads would reject
        if not line.strip():
            continue

        this_rating = json.loads(line)
        title_id = this_rating["title_id"]

        # First time we see this movie: record it in both containers.
        # Membership is tested on the set (constant time) rather than by
        # scanning the list for every rating line.
        if title_id not in known_movies:
            known_movies.add(title_id)
            movie_ids.append(title_id)

        # Append to this user's list of (movie id, rating) tuples, creating
        # the list the first time the user is seen.
        user_ratings.setdefault(this_rating["userId"], []).append(
            (title_id, this_rating["rating"])
        )

In [20]:
# IMDB title id -> position of that id in movie_ids (its numeric column index)
movie_id_to_index = dict(zip(movie_ids, range(len(movie_ids))))

In [21]:
# Size check: (number of distinct rated movies, number of distinct users)
len(known_movies), len(user_ratings)

(4465, 2244)

In [22]:
# Start from an all-zero sparse matrix with one row per user and one column per movie
n_users = len(user_ratings)
n_movies = len(known_movies)
matrix_sparse = lil_matrix((n_users, n_movies), dtype=float)

# Fill the matrix one user (row) at a time; rows follow the iteration order of
# user_ratings, and each rating lands in the column assigned to its movie
for user_row, rated_pairs in enumerate(user_ratings.values()):
    for rated_movie_id, score in rated_pairs:
        matrix_sparse[user_row, movie_id_to_index[rated_movie_id]] = score

In [23]:
# Label the sparse matrix: rows are user ids, columns are IMDB title ids
users_by_movies = pd.DataFrame.sparse.from_spmatrix(
    matrix_sparse,
    index=list(user_ratings),
    columns=movie_ids,
)

# Transpose so each row is one movie's vector of user ratings
df = users_by_movies.T
df

Unnamed: 0,10,37,51,126,152,263,284,448,626,706,...,162002,162073,162207,162257,162363,162420,162434,162464,162499,162537
tt0274309,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
tt0298203,2.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
tt0315733,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
tt0337563,1.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,4.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
tt0463854,4.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
tt4241904,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
tt1666800,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0
tt6806448,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0
tt0844671,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0


In [24]:
# Rating vector (one entry per user) of the query film
target_movie_ratings = df.loc[target_movie_id]

# Cosine distance from every film's rating vector to the query's; the result
# has a single column (one query), so keep just that column
pairwise = scipy.spatial.distance.cdist(df, [target_movie_ratings], metric="cosine")
distances = pairwise[:, 0]

# Pair each distance with its title id, following the row order of df
query_distances = [(title_id, dist) for title_id, dist in zip(df.index, distances)]

In [38]:
# These are distances, so smaller means more similar: take the ten smallest
nearest_movies = sorted(query_distances, key=lambda pair: pair[1])[:10]

for similar_movie_id, similar_movie_score in nearest_movies:
    similar_movie = movie_actor_map[similar_movie_id]
    print(similar_movie["movie"], similar_movie_score)

    # list the cast by name, one actor per indented line
    for actor in similar_movie["actors"]:
        print("\t", actor_name_map[actor])

Gladiator 0.0
	 Russell Crowe
	 Oliver Reed
	 Joaquin Phoenix
X-Men 0.43202624709295856
	 Ian McKellen
	 Patrick Stewart
	 Hugh Jackman
Shrek 0.4401238227206511
	 Eddie Murphy
	 Mike Myers
	 John Lithgow
Batman Begins 0.45839113448840574
	 Christian Bale
	 Ken Watanabe
	 Michael Caine
	 Liam Neeson
Ocean's Eleven 0.4588399504816394
	 George Clooney
	 Brad Pitt
	 Matt Damon
Minority Report 0.47656616190270473
	 Colin Farrell
	 Max von Sydow
	 Tom Cruise
Spider-Man 0.48087647221357543
	 Willem Dafoe
	 Tobey Maguire
	 James Franco
Kill Bill: Vol. 1 0.49479133060082814
	 Michael Madsen
	 David Carradine
Cast Away 0.4955520050304495
	 Tom Hanks
	 Paul Sanchez
Memento 0.5004634183376183
	 Mark Boone Junior
	 Guy Pearce
	 Joe Pantoliano


# 3. Actor-based similarity again: films ranked by Jaccard similarity of their cast to "Her"

In [26]:
# Rebuild the two lookup tables from the same IMDB dump read in section 1
# (one JSON document per line):
#   actor_name_map:  actor id -> actor name
#   movie_actor_map: title id -> {"movie": title, "actors": set of actor ids, "genres": genres}
actor_name_map = {}
movie_actor_map = {}

# Pin the encoding: without it, decoding of non-ASCII names depends on the
# platform's default locale encoding.
with open("imdb_recent_movies.json", "r", encoding="utf-8") as in_file:
    for line in in_file:

        # tolerate blank lines (e.g. a stray empty line at the end of the file),
        # which json.loads would reject
        if not line.strip():
            continue

        this_movie = json.loads(line)

        # ids and names are walked in parallel -- assumes the two lists are
        # aligned position by position; TODO confirm against the data source
        for actor_id, actor_name in zip(this_movie['actor_ids'], this_movie['actor_names']):
            actor_name_map[actor_id] = actor_name

        # finished with this film; actors are stored as a set so that casts
        # can be intersected/unioned later
        movie_actor_map[this_movie["title_id"]] = {
            "movie": this_movie["title_name"],
            "actors": set(this_movie['actor_ids']),
            "genres": this_movie["title_genre"],
        }

In [33]:
# One row per film, indexed by IMDB title id
movie_records = list(movie_actor_map.values())
movie_index = list(movie_actor_map.keys())
df = pd.DataFrame(movie_records, index=movie_index)

# Titles are not unique, so list every film called "Her" to pick the right id
df.loc[df["movie"] == "Her"]

Unnamed: 0,movie,actors,genres
tt1347522,Her,"{nm3264298, nm0019174, nm3263461}",[Drama]
tt1798709,Her,{nm0001618},"[Drama, Romance, Sci-Fi]"


In [34]:
# Two films are titled "Her"; this id is the one the lookup above lists with
# genres Drama/Romance/Sci-Fi and a single credited actor
target_movie_id = "tt1798709"

In [35]:
# Fetch the record (title, actor-id set, genres) stored for the query film
target_movie = movie_actor_map[target_movie_id]
# Echo it as the cell output for a visual check
target_movie

{'movie': 'Her',
 'actors': {'nm0001618'},
 'genres': ['Drama', 'Romance', 'Sci-Fi']}

In [36]:
# Score every film (the target included) against the target film by the
# Jaccard similarity of their actor sets: |A & B| / |A | B|.
# NOTE: despite its name, `distances` holds *similarities* -- higher means
# closer -- which is why the ranking cell sorts it in descending order.
distances = []

target_actors = target_movie["actors"]
for movie in movie_actor_map.values():
    these_actors = movie["actors"]

    shared = target_actors & these_actors
    combined = target_actors | these_actors

    # Two empty casts have an empty union; score that pair as 0.0 instead of
    # letting the division raise ZeroDivisionError.
    if combined:
        jaccard_sim = len(shared) / len(combined)
    else:
        jaccard_sim = 0.0

    distances.append({
        "movie": movie,
        "similarity": jaccard_sim
    })

In [37]:
# Rank by similarity, best first, and report the ten highest-scoring films
ranked_by_similarity = sorted(distances, key=lambda entry: entry["similarity"], reverse=True)
for similar_movie in ranked_by_similarity[:10]:
    film = similar_movie["movie"]
    print(film["movie"], similar_movie["similarity"])
    # list the cast by name, one actor per indented line
    for actor in film["actors"]:
        print("\t", actor_name_map[actor])

Earthlings 1.0
	 Joaquin Phoenix
Two Lovers 1.0
	 Joaquin Phoenix
Her 1.0
	 Joaquin Phoenix
Dominion 1.0
	 Joaquin Phoenix
Walk the Line 0.5
	 Robert Patrick
	 Joaquin Phoenix
The Village 0.5
	 William Hurt
	 Joaquin Phoenix
Reservation Road 0.5
	 Mark Ruffalo
	 Joaquin Phoenix
The Immigrant 0.5
	 Jeremy Renner
	 Joaquin Phoenix
Irrational Man 0.5
	 Joe Stapleton
	 Joaquin Phoenix
You Were Never Really Here 0.5
	 John Doman
	 Joaquin Phoenix
