In [86]:
import numpy as np
import pandas as pd
import json
import math
from sklearn.metrics import DistanceMetric

In [85]:
with open('movies.json') as f:
    
    actorsDict = {}
    actorsList = []
    
    
    for line in f:
        #loads each row
        row = (json.loads(line))
        
        #itterates through actors list in each row
        for name in row['actors']:
            
            #adds name to dict and list if not already in it
            if name[0] not in actorsDict:
                actorsDict[name[0]] = {
                    'actor_id': name[0],
                    'name': name[1]  
                }  
                actorsList.append(actorsDict.get(name[0]))

            #itterates through each genre in the row and a genre key or adds to an existing key
            for genre in row['genres']:
                if genre not in actorsDict[name[0]]:
                    actorsDict[name[0]][genre] = 1
                else:
                    actorsDict[name[0]][genre] += 1
                    
df = pd.DataFrame(actorsList) 

df = df.fillna(0)         
                
df.head()
                  
        

Unnamed: 0,actor_id,name,Comedy,Fantasy,Romance,Drama,Mystery,Thriller,Action,Biography,...,Horror,Documentary,Sport,News,Family,Music,Unnamed: 18,Western,Short,Reality-TV
0,nm0000212,Meg Ryan,7.0,1.0,6.0,6.0,1.0,2.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,nm0413168,Hugh Jackman,7.0,3.0,5.0,12.0,5.0,2.0,14.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,nm0000630,Liev Schreiber,8.0,2.0,6.0,14.0,2.0,3.0,4.0,5.0,...,3.0,7.0,3.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,nm0005227,Breckin Meyer,10.0,1.0,2.0,2.0,0.0,1.0,1.0,0.0,...,1.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0
4,nm0864851,Kenneth Tobey,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [93]:
df2 = df.iloc[:, 2:].div(df.iloc[:, 3:27].sum(axis=1), axis=0)

df_normalized = pd.concat((df.iloc[:, :2], df2), axis=1)

df_normalized.replace([np.inf, -np.inf], 1, inplace=True)

df_normalized.head()

Unnamed: 0,actor_id,name,Comedy,Fantasy,Romance,Drama,Mystery,Thriller,Action,Biography,...,Horror,Documentary,Sport,News,Family,Music,Unnamed: 18,Western,Short,Reality-TV
0,nm0000212,Meg Ryan,0.333333,0.047619,0.285714,0.285714,0.047619,0.095238,0.047619,0.047619,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,nm0413168,Hugh Jackman,0.088608,0.037975,0.063291,0.151899,0.063291,0.025316,0.177215,0.050633,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,nm0000630,Liev Schreiber,0.126984,0.031746,0.095238,0.222222,0.031746,0.047619,0.063492,0.079365,...,0.047619,0.111111,0.047619,0.015873,0.0,0.0,0.0,0.0,0.0,0.0
3,nm0005227,Breckin Meyer,0.666667,0.066667,0.133333,0.133333,0.0,0.066667,0.066667,0.0,...,0.066667,0.0,0.066667,0.0,0.133333,0.0,0.0,0.0,0.0,0.0
4,nm0864851,Kenneth Tobey,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [46]:
df_normalized.loc[df_normalized['name'] == 'Leonardo DiCaprio']

Unnamed: 0,actor_id,name,Comedy,Fantasy,Romance,Drama,Mystery,Thriller,Action,Biography,...,Horror,Documentary,Sport,News,Family,Music,Unnamed: 18,Western,Short,Reality-TV
47,nm0000138,Leonardo DiCaprio,0.093023,0.0,0.093023,0.348837,0.023256,0.093023,0.069767,0.093023,...,0.0,0.023256,0.0,0.0,0.0,0.0,0.0,0.023256,0.0,0.0


In [95]:
dist = DistanceMetric.get_metric('euclidean')

dicaprio = 'nm0000138'
dicaprioList = df_normalized.loc[df_normalized['actor_id'] == dicaprio].iloc[:, 2:].values.tolist()

distanceDict = {}

for line in df_normalized.iterrows():
    actorId = line[1]['actor_id'] 
    
    actorsListRaw = df_normalized.loc[df_normalized['actor_id'] == actorId].iloc[:, 2:].values.tolist()
    actorsList = []
        
    for value in actorsListRaw[0]:
        if math.isnan(value):
            actorsList.append(0)
        else:
            actorsList.append(value)

    
    
    X = [dicaprioList[0], actorsList]

    ans = dist.pairwise(X)

    value = ans[0][1]
    
    distanceDict[actorId] = value
    

In [107]:
sorted_list = sorted(distanceDict.items(), key=lambda x: x[1])

top_10 = sorted_list[:11]

final_df_list = []

for id in top_10:
    row = df_normalized.loc[df_normalized['actor_id'] == id[0]].values.tolist()[0]
    final_df_list.append(row)

final_df = pd.DataFrame(final_df_list)

final_df    



Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,17,18,19,20,21,22,23,24,25,26
0,nm0000138,Leonardo DiCaprio,0.093023,0.0,0.093023,0.348837,0.023256,0.093023,0.069767,0.093023,...,0.0,0.023256,0.0,0.0,0.0,0.0,0.0,0.023256,0.0,0.0
1,nm0290556,James Franco,0.100775,0.031008,0.046512,0.348837,0.023256,0.062016,0.108527,0.085271,...,0.015504,0.007752,0.007752,0.0,0.015504,0.007752,0.0,0.007752,0.0,0.0
2,nm0330687,Joseph Gordon-Levitt,0.096154,0.019231,0.057692,0.307692,0.019231,0.096154,0.115385,0.057692,...,0.0,0.0,0.0,0.0,0.019231,0.0,0.0,0.0,0.0,0.0
3,nm0001618,Joaquin Phoenix,0.09375,0.0,0.078125,0.375,0.03125,0.0625,0.046875,0.078125,...,0.015625,0.03125,0.0,0.0,0.0,0.03125,0.0,0.0,0.0,0.0
4,nm0004778,Adrien Brody,0.114754,0.032787,0.081967,0.311475,0.04918,0.065574,0.081967,0.04918,...,0.032787,0.0,0.0,0.0,0.0,0.032787,0.0,0.0,0.0,0.0
5,nm0000849,Javier Bardem,0.043478,0.021739,0.130435,0.304348,0.021739,0.086957,0.065217,0.086957,...,0.021739,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,nm0829576,Kristen Stewart,0.078947,0.052632,0.092105,0.368421,0.026316,0.039474,0.078947,0.065789,...,0.039474,0.0,0.0,0.0,0.0,0.013158,0.0,0.0,0.0,0.0
7,nm0266824,Dakota Fanning,0.08,0.0,0.08,0.34,0.04,0.06,0.06,0.06,...,0.02,0.04,0.02,0.0,0.06,0.04,0.0,0.0,0.0,0.0
8,nm0000368,Laura Dern,0.111111,0.022222,0.088889,0.4,0.044444,0.044444,0.044444,0.133333,...,0.0,0.022222,0.0,0.0,0.022222,0.0,0.0,0.0,0.0,0.0
9,nm0001838,Rachel Weisz,0.094595,0.054054,0.121622,0.297297,0.067568,0.081081,0.081081,0.067568,...,0.013514,0.0,0.0,0.0,0.013514,0.013514,0.0,0.0,0.0,0.0
