In [3]:
import numpy as np
import pandas as pd
import json
import math
from sklearn.metrics import DistanceMetric

In [4]:
# Lookup tables filled from movies.json, which is read as one JSON object per line:
#   actor_name_map  : actor id -> actor name
#   movie_actor_map : imdb id  -> {"movie": title, "actors": set of actor ids, "genres": genre list}
#   actor_genre_map : actor id -> {genre: number of that actor's movies carrying the genre}
actor_name_map = {}
movie_actor_map = {}
actor_genre_map = {}


# JSON text is UTF-8; be explicit so non-ASCII actor names do not depend on
# the platform's default encoding.
with open("movies.json", "r", encoding="utf-8") as in_file:
    for line in in_file:
        # Tolerate blank lines (e.g. a trailing newline at end of file).
        if not line.strip():
            continue

        this_movie = json.loads(line)

        # Each cast entry unpacks as an (actor_id, actor_name) pair.
        for actor_id, actor_name in this_movie['actors']:
            actor_name_map[actor_id] = actor_name

            # Bump this actor's tally once for every genre of the movie.
            this_actors_genres = actor_genre_map.setdefault(actor_id, {})
            for g in this_movie["genres"]:
                this_actors_genres[g] = this_actors_genres.get(g, 0) + 1

        movie_actor_map[this_movie["imdb_id"]] = {
            "movie": this_movie["title"],
            "actors": {item[0] for item in this_movie['actors']},
            "genres": this_movie["genres"],
        }
                  

In [6]:
# Build the actor x genre count matrix: one row per actor id, one column per
# genre key found in the per-actor tallies.
index = actor_genre_map.keys()
rows = list(map(actor_genre_map.get, index))

# A genre missing from an actor's tally shows up as NaN; store it as 0 instead.
df = pd.DataFrame(rows, index=index).fillna(0)

df

Unnamed: 0,Comedy,Fantasy,Romance,Drama,Mystery,Thriller,Action,Biography,Crime,War,...,Horror,Documentary,Sport,News,Family,Music,Unnamed: 18,Western,Short,Reality-TV
nm0000212,7.0,1.0,6.0,6.0,1.0,2.0,1.0,1.0,2.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
nm0413168,7.0,3.0,5.0,12.0,5.0,2.0,14.0,4.0,6.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
nm0000630,8.0,2.0,6.0,14.0,2.0,3.0,4.0,5.0,1.0,1.0,...,3.0,7.0,3.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
nm0005227,10.0,1.0,2.0,2.0,0.0,1.0,1.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0
nm0864851,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
nm9504284,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
nm10592896,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
nm7216750,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
nm0936300,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
from sklearn.cluster import KMeans


In [8]:
# Cluster actors by their genre-count rows.
# random_state is fixed so cluster ids and sizes are the same on every
# Restart & Run All; n_init is pinned because its default differs between
# scikit-learn releases.
kmeans = KMeans(n_clusters=8, n_init=10, random_state=42)

kmeans.fit(df)



In [12]:
# Ask the fitted model for a cluster id per row of df, then keep the ids in a
# one-column frame that shares df's index (the actor ids).
clusters = kmeans.predict(df)
clustered_df = pd.DataFrame({"cluster": clusters}, index=df.index)

clustered_df

Unnamed: 0,cluster
nm0000212,6
nm0413168,1
nm0000630,7
nm0005227,2
nm0864851,0
...,...
nm9504284,0
nm10592896,0
nm7216750,0
nm0936300,0


In [14]:
# Number of actors assigned to each cluster id.
cluster_sizes = clustered_df["cluster"].value_counts()
cluster_sizes

0    29283
3     2583
6      920
2      316
7      259
1      122
5      120
4        6
Name: cluster, dtype: int64

In [16]:
# Print each cluster's size and up to five randomly chosen member names as a
# quick look at what the clusters contain.
for cluster, actors in clustered_df.groupby("cluster"):
    print("Cluster:", cluster, "Size:", actors.shape[0])

    # DataFrame.sample raises ValueError when asked for more rows than exist
    # (without replacement), so cap the request at the cluster size.
    n_shown = min(5, actors.shape[0])

    for a_id in actors.sample(n_shown).index:
        print("\t", a_id, actor_name_map[a_id])


Cluster: 0 Size: 29283
	 nm5801319 Paul Amoreno
	 nm10921122 Obada Adnan
	 nm7447307 Demetrius Shipp Jr.
	 nm1479022 Nick E. Tarabay
	 nm1382836 Leah Harrison
Cluster: 1 Size: 122
	 nm0199939 Gary Daniels
	 nm0000169 Tommy Lee Jones
	 nm0621937 Nassar
	 nm0000463 Famke Janssen
	 nm0920460 Vernon Wells
Cluster: 2 Size: 316
	 nm0519043 Justin Long
	 nm0005049 Allison Janney
	 nm1428821 Joey King
	 nm0732497 Craig Robinson
	 nm0001459 Denis Leary
Cluster: 3 Size: 2583
	 nm0949424 Craig Robert Young
	 nm1039877 Tim Draxl
	 nm6845062 Jitendra Kumar
	 nm3194762 Karl Glusman
	 nm0777788 Matthias Schweighöfer
Cluster: 4 Size: 6
	 nm0000514 Michael Madsen
	 nm0001803 Danny Trejo
	 nm0000616 Eric Roberts
	 nm0000246 Bruce Willis
	 nm0000115 Nicolas Cage
Cluster: 5 Size: 120
	 nm4068733 Mel Heflin
	 nm0004760 Jennifer Blanc-Biehn
	 nm0001643 Linnea Quigley
	 nm3748536 Ellie Church
	 nm6271518 Jerry Burkhead
Cluster: 6 Size: 920
	 nm0000456 Holly Hunter
	 nm0550371 Eddie Marsan
	 nm0005318 Carly P