# Exercise 1. Finding the most likely cluster given a particular actor 

This exercise is slightly different from last week. Here you are trying to find the most likely cluster but now it is based on the movie starring a particular actor, instead f being in a certain genre

Download movie_to_cluster.csv Download movie_to_cluster.csv. This file contains all of the movies in the database and the cluster they were sorted into.
Load the csv into Python as a dataframe
Set your target actor as Nicolas Cage (actor ID: nm0000115)
Use a for loop to iterate through the clusters. For each one, calculate the probability that a new movie will fit in that cluster given that it features Nicolas Cage. In other words calculate P(Cluster | nm0000115)
You should get the following output:
Pr[Cluster 00 | nm0000115]: 	 0.0819672131147541

Pr[Cluster 01 | nm0000115]: 	 0.13114754098360656

Pr[Cluster 02 | nm0000115]: 	 0.1639344262295082

Pr[Cluster 03 | nm0000115]: 	 0.09836065573770492

Pr[Cluster 04 | nm0000115]: 	 0.01639344262295082

Pr[Cluster 05 | nm0000115]: 	 0.0

Pr[Cluster 06 | nm0000115]: 	 0.03278688524590164

Pr[Cluster 07 | nm0000115]: 	 0.0

Pr[Cluster 08 | nm0000115]: 	 0.0

Pr[Cluster 09 | nm0000115]: 	 0.04918032786885246

Pr[Cluster 10 | nm0000115]: 	 0.0

Pr[Cluster 11 | nm0000115]: 	 0.03278688524590164

Pr[Cluster 12 | nm0000115]: 	 0.03278688524590164

Pr[Cluster 13 | nm0000115]: 	 0.06557377049180328

Pr[Cluster 14 | nm0000115]: 	 0.01639344262295082

Pr[Cluster 15 | nm0000115]: 	 0.27868852459016397

Repeat steps 3 and 4 for the actors Keanu Reeves (actor ID: nm0000206) and Tom Hiddleston (actor ID: nm1089991)

In [1]:
import json

import pandas as pd
import numpy as np

cluster_df = pd.read_csv("movie_to_cluster.csv")
actor_name_map = {}
movie_actor_map = {}
actor_genre_map = {}


with open("imdb_movies_2000to2022.prolific.json", "r") as in_file:
    for line in in_file:
        
        # Read the movie on this line and parse its json
        this_movie = json.loads(line)
                    
        # Add all actors to the id->name map
        for actor_id,actor_name in this_movie['actors']:
            actor_name_map[actor_id] = actor_name
            
        # For each actor, add this movie's genres to that actor's list
        for actor_id,actor_name in this_movie['actors']:
            this_actors_genres = actor_genre_map.get(actor_id, {})
            
            # Increment the count of genres for this actor
            for g in this_movie["genres"]:
                this_actors_genres[g] = this_actors_genres.get(g, 0) + 1
                
            # Update the map
            actor_genre_map[actor_id] = this_actors_genres
            
        # Finished with this film
        movie_actor_map[this_movie["imdb_id"]] = ({
            "movie": this_movie["title"],
            "actors": set([item[0] for item in this_movie['actors']]),
            "genres": this_movie["genres"]
        })


# Setting the actor we will be comparing to
# target_actor_id = 'nm1165110' # Chris Hemsworth
# target_actor_id = 'nm0413168' # Hugh Jackman
# target_actor_id = 'nm0005351' # Ryan Reynolds
target_actor_id2 = "nm0000206" # Keanu Reeves
target_actor_id = 'nm0000115' # Nic Cage
target_actor_id3 = "nm1089991" # Tom Hiddleston

# For each actor, count the number of movies
actor_counts = {}

# For each movie, get its genres and update the genre count
for movie_id in movie_actor_map.keys():
    for actor in movie_actor_map[movie_id]["actors"]:
        actor_counts[actor] = actor_counts.get(actor, 0) + 1
        
actor_prs = []
for actor,a_count in actor_counts.items():
    actor_prs.append((actor, a_count/len(movie_actor_map)))
    
actor_prs_df = pd.DataFrame(actor_prs, columns=["actor", "probability"])
actor_pr_map = {row["actor"]:row["probability"] for idx,row in actor_prs_df.iterrows()}

actor_prs_df.sort_values(by="probability", ascending=False)

per_cluster_prs = []
for cluster_id,group in cluster_df.groupby("cluster"):

    this_cluster_actor_count = sum([
        1 if target_actor_id in movie_actor_map[m]["actors"] else 0 
        for m in group["movie_id"]
    ])
    
    # Calculate conditional probability of the target actor given this cluster
    ## Pr(X=Actor | Y=Cluster <ID>)
    pr_actor_given_cluster = this_cluster_actor_count / group.shape[0]

    # Then add this calculated probability to `per_cluster_prs`,
    ## so we can track this conditional probability
    joint_pr_actor_cluster = pr_actor_given_cluster * group.shape[0] / cluster_df.shape[0]
   
    per_cluster_prs.append(joint_pr_actor_cluster)

    # For each cluster ID, calculate the posterior probability given the target actor
pr_target_actor = sum(per_cluster_prs)
for cluster_id,cluster_actor_pr in enumerate(per_cluster_prs):

    # combine this cluster_actor_pr, cluster_pr_df, and actor_prs_df
    ## to calculate Pr(Cluster|Actor)
    pr_cluster_given_actor = cluster_actor_pr / pr_target_actor

    print("\nPr[Cluster %02d | %s]:" % (cluster_id, target_actor_id), "\t", pr_cluster_given_actor)
    
    
    print("\nPr[Cluster %02d | %s]:" % (cluster_id, target_actor_id2), "\t", pr_cluster_given_actor)
    
    print("\nPr[Cluster %02d | %s]:" % (cluster_id, target_actor_id3), "\t", pr_cluster_given_actor)
    


Pr[Cluster 00 | nm0000115]: 	 0.0819672131147541

Pr[Cluster 00 | nm0000206]: 	 0.0819672131147541

Pr[Cluster 00 | nm1089991]: 	 0.0819672131147541

Pr[Cluster 01 | nm0000115]: 	 0.13114754098360656

Pr[Cluster 01 | nm0000206]: 	 0.13114754098360656

Pr[Cluster 01 | nm1089991]: 	 0.13114754098360656

Pr[Cluster 02 | nm0000115]: 	 0.1639344262295082

Pr[Cluster 02 | nm0000206]: 	 0.1639344262295082

Pr[Cluster 02 | nm1089991]: 	 0.1639344262295082

Pr[Cluster 03 | nm0000115]: 	 0.09836065573770492

Pr[Cluster 03 | nm0000206]: 	 0.09836065573770492

Pr[Cluster 03 | nm1089991]: 	 0.09836065573770492

Pr[Cluster 04 | nm0000115]: 	 0.01639344262295082

Pr[Cluster 04 | nm0000206]: 	 0.01639344262295082

Pr[Cluster 04 | nm1089991]: 	 0.01639344262295082

Pr[Cluster 05 | nm0000115]: 	 0.0

Pr[Cluster 05 | nm0000206]: 	 0.0

Pr[Cluster 05 | nm1089991]: 	 0.0

Pr[Cluster 06 | nm0000115]: 	 0.03278688524590164

Pr[Cluster 06 | nm0000206]: 	 0.03278688524590164

Pr[Cluster 06 | nm1089991]: 	 0.0

# Additional Exercise: Finding the Most Likely Actor Given a Particular Cluster (P(Actor | Cluster)) (Optional).

1. Download movie_to_cluster.csv Download movie_to_cluster.csv. This file contains all of the movies in the database and the cluster they were sorted into.

2. Load the csv into Python as a dataframe.

3. Set your target cluster as Cluster0

4. Use a for loop to iterate through the actors(nm0000115, nm0000206, nm1089991 nm0000212, nm0413168) . For each one, calculate P(Actor | Cluster0)

5. Repeat steps 3 and 4 for the Cluster1 and Cluster2. 

In [2]:
per_cluster_prs = []
target_actors = ["nm0000115","nm0000206", "nm1089991", "nm0000212", "nm0413168"]
for cluster_id,group in cluster_df.groupby("cluster"):

    this_cluster_actor_count = sum([
        1 if target_actor_id in movie_actor_map[m]["actors"] else 0 
        for m in group["movie_id"]
    ])
    
    # Calculate conditional probability of the target actor given this cluster
    ## Pr(X=Actor | Y=Cluster <ID>)
    pr_actor_given_cluster = this_cluster_actor_count / group.shape[0]
    print("Pr[%s| Cluster %02d]:" % (target_actor_id, cluster_id), "\t", pr_actor_given_cluster)
    
    # Then add this calculated probability to `per_cluster_prs`,
    ## so we can track this conditional probability
    joint_pr_actor_cluster = pr_actor_given_cluster * group.shape[0] / cluster_df.shape[0]
    print("Pr[%s, Cluster %02d]:" % (target_actor_id, cluster_id), "\t", joint_pr_actor_cluster)
    per_cluster_prs.append(joint_pr_actor_cluster)

Pr[nm0000115| Cluster 00]: 	 0.0016144656118824668
Pr[nm0000115, Cluster 00]: 	 0.00024248302618816683
Pr[nm0000115| Cluster 01]: 	 0.005813953488372093
Pr[nm0000115, Cluster 01]: 	 0.0003879728419010669
Pr[nm0000115| Cluster 02]: 	 0.00665335994677312
Pr[nm0000115, Cluster 02]: 	 0.00048496605237633366
Pr[nm0000115| Cluster 03]: 	 0.004838709677419355
Pr[nm0000115, Cluster 03]: 	 0.0002909796314258002
Pr[nm0000115| Cluster 04]: 	 0.0015267175572519084
Pr[nm0000115, Cluster 04]: 	 4.8496605237633365e-05
Pr[nm0000115| Cluster 05]: 	 0.0
Pr[nm0000115, Cluster 05]: 	 0.0
Pr[nm0000115| Cluster 06]: 	 0.0006295247088448222
Pr[nm0000115, Cluster 06]: 	 9.699321047526673e-05
Pr[nm0000115| Cluster 07]: 	 0.0
Pr[nm0000115, Cluster 07]: 	 0.0
Pr[nm0000115| Cluster 08]: 	 0.0
Pr[nm0000115, Cluster 08]: 	 0.0
Pr[nm0000115| Cluster 09]: 	 0.0078125
Pr[nm0000115, Cluster 09]: 	 0.0001454898157129001
Pr[nm0000115| Cluster 10]: 	 0.0
Pr[nm0000115, Cluster 10]: 	 0.0
Pr[nm0000115| Cluster 11]: 	 0.0031