In [1]:
from collections import deque
from queue import PriorityQueue

import networkx as nx
import numpy as np
import pandas as pd
from pyvis.network import Network

In [2]:
# Load the three source tables in one pass: movies, cast members, and the
# person <-> movie link table.
movies_df, cast_df, cast_and_movies_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/movies_data.csv',
        'data/cast_data.csv',
        'data/cast_and_movies_data.csv',
    )
)

In [3]:
# Preview the first rows of the movies table (columns: id, title, year).
movies_df.head()

Unnamed: 0,id,title,year
0,15724,Dama de noche,1993
1,23331,Pesn o geroyakh,1983
2,31458,El huésped del sevillano,1970
3,35423,Kate & Leopold,2001
4,36606,"Another Time, Another Place",1983


In [4]:
# Preview the first rows of the cast table (columns: id, name, birth).
cast_df.head()

Unnamed: 0,id,name,birth
0,1,Fred Astaire,1899.0
1,2,Lauren Bacall,1924.0
2,3,Brigitte Bardot,1934.0
3,4,John Belushi,1949.0
4,5,Ingmar Bergman,1918.0


In [5]:
# Preview the link table: one row per (person_id, movie_id) credit.
cast_and_movies_df.head()

Unnamed: 0,person_id,movie_id
0,844752,15724
1,869732,15724
2,194720,15724
3,650495,15724
4,8738,31458


In [6]:
# Every credit as a (person_id, movie_id) tuple.
# NOTE: `graph` is rebound to the adjacency list loaded from graph.csv further down.
graph = list(map(tuple, cast_and_movies_df.values))

In [7]:
# One row per (person, movie) credit with the person's name and the movie's
# title/year attached. Both joins are left joins on the running frame, so the
# original cast_df index is kept and people without a credit keep NaN columns.
cast_movie_by_id = (
    cast_df
    .rename(columns={'id': 'person_id'})
    .drop(columns='birth')
    .join(cast_and_movies_df.set_index('person_id'), on='person_id')
    .join(
        movies_df.rename(columns={'id': 'movie_id'}).set_index('movie_id'),
        on='movie_id',
    )
)

In [8]:
# Display the joined cast/movie table (the rendered output elides the middle rows).
# People with no matching credit show NaN for movie_id, title and year.
cast_movie_by_id

Unnamed: 0,person_id,name,movie_id,title,year
0,1,Fred Astaire,72272.0,That's Entertainment!,1974.0
0,1,Fred Astaire,74130.0,The Amazing Dobermans,1976.0
0,1,Fred Astaire,75323.0,"That's Entertainment, Part II",1976.0
0,1,Fred Astaire,76851.0,The Purple Taxi,1977.0
0,1,Fred Astaire,82449.0,Ghost Story,1981.0
...,...,...,...,...,...
1044494,11077515,Rosemary Sever,8717064.0,Chained the Movie,2018.0
1044495,11077516,Sheerice,8717064.0,Chained the Movie,2018.0
1044496,11077517,Lexie Jose,8717064.0,Chained the Movie,2018.0
1044497,11077518,Sheerice Martinez,,,


# Graph construction

In [9]:
import queue

In [10]:
def minEdgeBFS(edges, u, v, n):
    """Return the minimum number of edges on a path from node ``u`` to node ``v``.

    Runs a breadth-first search over an unweighted graph stored as an
    adjacency list.

    Parameters
    ----------
    edges : sequence of sequences of int
        Adjacency list; ``edges[x]`` holds the neighbours of node ``x``.
    u : int
        Index of the start node.
    v : int
        Index of the target node.
    n : int
        Number of nodes; every node index must be smaller than ``n``.

    Returns
    -------
    int
        Number of edges on a shortest path from ``u`` to ``v``: ``0`` when
        ``u == v`` and ``-1`` when ``v`` cannot be reached from ``u``.
        (Previously an unreachable target was reported as ``0``, which was
        indistinguishable from ``u == v``.)
    """
    # -1 means "not reached yet", so this list doubles as the visited marker.
    distance = [-1] * n
    distance[u] = 0
    if u == v:
        return 0

    pending = deque([u])
    while pending:
        x = pending.popleft()
        for neighbour in edges[x]:
            if distance[neighbour] != -1:
                continue

            distance[neighbour] = distance[x] + 1
            # BFS reaches nodes in order of increasing distance, so the first
            # time the target is seen its distance is already minimal.
            if neighbour == v:
                return distance[neighbour]
            pending.append(neighbour)

    # Target never reached: still -1.
    return distance[v]

def addEdge(edges, u, v):
    """Add an undirected edge between nodes ``u`` and ``v``.

    ``edges`` is an adjacency list that is modified in place: ``v`` is
    appended to ``edges[u]`` and then ``u`` is appended to ``edges[v]``.
    """
    for source, target in ((u, v), (v, u)):
        edges[source].append(target)

In [11]:
# Identifier matched against cast_and_movies_df['person_id'] to select Kevin Bacon's credits.
# NOTE(review): presumably the same value as his cast_df['id'] — confirm against the data.
kevin_bacon_id = 102

In [184]:
# Set Kevin Bacon's own credits aside so he is always part of the sample.
kevin_bacon_df = cast_and_movies_df[cast_and_movies_df['person_id'] == kevin_bacon_id]
cast_and_movies_df_without_bacon = cast_and_movies_df[cast_and_movies_df['person_id'] != kevin_bacon_id]

# Draw 30% of the remaining people. The fixed random_state makes the sample,
# and therefore every node index derived from it, identical on each run.
sample_ids = (
    pd.Series(cast_and_movies_df_without_bacon['person_id'].unique())
    .sample(frac=0.3, random_state=42)
    .unique()
)

# Keep every credit of the sampled people, then add Kevin Bacon's credits back.
sampled_cast_movies_df = cast_and_movies_df[cast_and_movies_df['person_id'].isin(sample_ids)]
sampled_cast_movies_df = pd.concat([sampled_cast_movies_df, kevin_bacon_df])

In [185]:
# Display the sampled credits; Kevin Bacon's rows were concatenated last, so they appear at the end.
sampled_cast_movies_df

Unnamed: 0,person_id,movie_id
0,844752,15724
2,194720,15724
3,650495,15724
5,19177,31458
27,735911,57461
...,...,...
864010,102,3696140
873851,102,3813310
895310,102,4128724
920303,102,4519562


In [186]:
# Map a dense node index (0, 1, 2, ...) to each distinct person_id, in ascending id order.
person_nodes = dict(
    enumerate(sampled_cast_movies_df['person_id'].sort_values().unique())
)

In [187]:
# Number of distinct people in the sample.
len(person_nodes)

162210

In [188]:
# Map a dense index (0, 1, 2, ...) to each distinct movie_id, in ascending id order.
movie_edges = dict(
    enumerate(sampled_cast_movies_df['movie_id'].sort_values().unique())
)

In [189]:
# Number of distinct movies in the sample.
len(movie_edges)

209053

In [190]:
# Build the person <-> movie adjacency list.
#
# Node numbering:
#   * person k (key k of person_nodes)  -> node k
#   * movie  k (key k of movie_edges)   -> node len(person_nodes) + k
# People and movies are both numbered from 0, so without the offset person k
# and movie k would share one adjacency list and be treated as the same node.
# Person node indices are unchanged, so lookups through person_nodes stay valid.
#
# Every path between two people alternates person -> movie -> person, so two
# people who share a movie are 2 hops apart in this graph.

# Invert the index -> id dictionaries once; each lookup is then O(1) instead
# of rebuilding two lists and scanning them linearly for every row.
person_index_by_id = {person_id: index for index, person_id in person_nodes.items()}
movie_index_by_id = {movie_id: index for index, movie_id in movie_edges.items()}

movie_offset = len(person_nodes)
n = len(person_nodes) + len(movie_edges)
edges = [[] for _ in range(n)]

for p_node_value, m_edge_value in sampled_cast_movies_df[['person_id', 'movie_id']].values:
    p_node = person_index_by_id[p_node_value]
    m_edge = movie_offset + movie_index_by_id[m_edge_value]

    addEdge(edges, m_edge, p_node)

In [280]:
# G = nx.Graph()

# net = Network(notebook=True)

# net.from_nx(G)

# net.show('graph.html')

In [191]:
# pd.DataFrame(edges).to_csv('graph.csv')

In [399]:
# sampled_cast_movies_df.to_csv('data/sampled_cast_and_movies_data.csv', index=False)

# Executing Breadth-First Search (BFS)

In [401]:
# Reload the sampled credits from disk, replacing the in-memory sample.
# The only code in this notebook that writes this file is the commented-out
# to_csv call above, so the file must already exist before this cell runs.
sampled_cast_movies_df = pd.read_csv('data/sampled_cast_and_movies_data.csv')

In [192]:
# Rebuild the adjacency list from graph.csv: one CSV row per node, with its
# neighbours in the columns. Shorter rows are padded with NaN, so the padding
# is dropped and the remaining values are cast back to int.
graph = [
    neighbours.dropna().astype(int).to_list()
    for _, neighbours in pd.read_csv('graph.csv', sep=',').drop(columns='Unnamed: 0').iterrows()
]

In [391]:
# Pick one random credit row and take its first column (person_id).
example_id = sampled_cast_movies_df.sample(1).iloc[0, 0]
# Name of that person according to the cast table.
example_name = cast_df.loc[cast_df['id'] == example_id, 'name'].iloc[0]
# Node index whose person_nodes value is this person_id.
example_id_graph_node = list(person_nodes)[list(person_nodes.values()).index(example_id)]

In [392]:
# Node index whose person_nodes value is Kevin Bacon's person_id.
kevin_bacon_id_graph_node = list(person_nodes)[
    list(person_nodes.values()).index(kevin_bacon_id)
]

In [393]:
# Show the cast_df row of the randomly chosen person (example_id).
cast_df[cast_df['id'] == example_id]

Unnamed: 0,id,name,birth
178867,736622,Seth Rogen,1982.0


In [394]:
# Shortest-path search from the example person's node to Kevin Bacon's node.
# NOTE(review): `result` is the number of graph edges on that path; whether one
# edge corresponds to one movie depends on how graph.csv was built — confirm.
n = len(graph)
result = minEdgeBFS(
    edges=graph,
    u=example_id_graph_node,
    v=kevin_bacon_id_graph_node,
    n=n,
)
print(f"{example_name} is {result} movies away from Kevin Bacon.")

Seth Rogen is 4 movies away from Kevin Bacon.
