# HW3 - Social Network Analysis

## Import modules

In [1]:
import math
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
from networkx.drawing.nx_agraph import graphviz_layout
from tqdm import tqdm

## Useful functions

In [98]:
k=10

def print_dict(d, k=10):
    for item in list(d.items())[:k]:
        print(item)
        
def lists_overlap(l1, l2):
    return bool(set(l1) & set(l2))

## Load the data

In [2]:
casts = pd.read_csv('./../data/casts.csv', error_bad_lines = False, sep=';')

## Convert „casts“ data to a graph

In [3]:
G = nx.Graph()

# Print casts CSV column headers
print(casts.columns.tolist())

# Group movies by actors
movies_by_actor = casts.groupby('actor')['movie']
# movies_by_actor = casts[casts['actor_type'] < 'AA90'].groupby('actor')['movie']
movies_by_actor_dict = movies_by_actor.apply(list).to_dict()
# print(len(movies_by_actor_dict))
# print(movies_by_actor_dict['A.E. Matthews'])
# print(movies_by_actor_dict)

# Filter the supporting actor
if 's a' in movies_by_actor_dict:
    del movies_by_actor_dict['s a']

# Filter numeric actor values
# Using dictionary comprehension to find list
# Get numeric actors
delete = [key for key in movies_by_actor_dict if key.isnumeric()]

# Delete the key
for key in delete: del movies_by_actor_dict[key]
        
# Create the graph
for actor in movies_by_actor_dict.keys():
    G.add_node(actor)
        
# Test if two lists overlaps
# print(bool(set(movies_by_actor_dict['A.E. Matthews']) & set(movies_by_actor_dict['David Tree'])))
# print(bool(set(movies_by_actor_dict['A.E. Matthews']) & set(movies_by_actor_dict['Athene Seyler'])))

for actor1, movies1 in tqdm(movies_by_actor_dict.items(), total=len(G.nodes())):
    for actor2, movies2 in movies_by_actor_dict.items():
        if actor1 != actor2 and lists_overlap(movies1, movies2):
            G.add_edge(actor1, actor2)

['actor_type', 'movie', 'actor', 'role_type', 'role']


100%|██████████| 16610/16610 [02:55<00:00, 94.64it/s] 


## Dataset general statistics

In [121]:
def general_statistics(graph, visualize_components=False):
    n = len(graph.nodes())
    e = len(graph.edges())
    print('Number of nodes: ', n)
    print('Number of edges: ', e)
    print('Density: ', e / (n*(n-1)/2))
    components = list(nx.connected_components(graph))
    if visualize_components:
        pos = graphviz_layout(graph)
        nx.draw(graph, pos, with_labels=False, node_size=10)
    print('Number of components: ', len(components))
    
general_statistics(G)

Number of nodes:  16610
Number of edges:  152251
Density:  0.0011037660504019404
Number of components:  748


## Centralities

In [139]:
def compute_centralities(graph, centralities, k):
    for centrality in tqdm(centralities, total=len(centralities)):
        print('Top {} players by {}:'.format(k, centrality.__name__))
        c_dict = centrality(graph)
        c_dict = dict(sorted(c_dict.items(), key=lambda item: item[1], reverse=True))
        for item in list(c_dict.items()):
            graph.nodes[item[0]][centrality.__name__] = item[1]
        print_dict(c_dict)
        print('\n')
    return graph

G = compute_centralities(G, [nx.degree_centrality, nx.closeness_centrality, nx.betweenness_centrality, nx.eigenvector_centrality], k)
# G = compute_centralities(G, [nx.degree_centrality], k)
# print(G.nodes['Humphrey Bogart'])

## Communities

In [71]:
communities = {node:cid+1 for cid,community in enumerate(nx.algorithms.community.k_clique_communities(G,59)) for node in community}
print('Largest community has size of {} nodes:'.format(len(communities)))
print_dict(communities, len(communities))
print('\n')

communities = {node:cid+1 for cid,community in enumerate(nx.algorithms.community.k_clique_communities(G,35)) for node in community}
print('Communities with at least 35 nodes:')
print_dict(communities, len(communities))

Largest community has size of 59 nodes:
('Beverly Bane', 1)
('Ralph Forbes', 1)
('Laurence Harvey', 1)
('Gulio Garbinetti', 1)
('Edna May Oliver', 1)
('John McEnery', 1)
('John Gielgud', 1)
('Paul Hardwick', 1)
('Robert Warwick', 1)
('Milo OShea', 1)
('Paul Panzer', 1)
('Susan Shentall', 1)
('Mario Caserini', 1)
('Rosemarie Dexter', 1)
('C.Aubrey Smith', 1)
('Gustav Serena', 1)
('Virginia Hammond', 1)
('Henry Kolker', 1)
('Francis X. Bushman', 1)
('Leslie Howard', 1)
('Enzo Fiermonte', 1)
('Basil Rathbone', 1)
('Francesca Bertini', 1)
('Olivia Hussey', 1)
('Theda Bara', 1)
('Sir Godrey Teale', 1)
('Antionio Pierfrederici', 1)
('Lela Mourad', 1)
('Norman Wooland', 1)
('Violet KembleCooper', 1)
('Julia M. Taylor', 1)
('Ibrahim Hamouda', 1)
('Norma Shearer', 1)
('td> Claire Danes<', 1)
('Laurence Olivier', 1)
('Natasha Peryy', 1)
('Sebastian Cabot', 1)
('Maria Gasperini', 1)
('Flora Robson', 1)
('Lydia Sherwood', 1)
('Nietta Zocchi', 1)
('Mary Malone', 1)
('Esmeralda Ruspoli', 1)
('Aldo Z

## Kevin Bacon numbers

In [133]:
def kevin_bacon_numbers(graph):
    for a in graph.nodes():
        try:
            path = nx.shortest_path(graph,source=a,target='Kevin Bacon')
            graph.nodes[a]['kevin_bacon_number'] = int(len(path)/2)
        except nx.NetworkXNoPath:
            graph.nodes[a]['kevin_bacon_number'] = len(graph.nodes())
#         print('{0}: {1}'.format(a, graph.nodes[a]))
    return graph

print('Top {} actors with Kevin Bacon number:'.format(k))
G = kevin_bacon_numbers(G)
# print(G.nodes['Humphrey Bogart'])


#=== Top k actors by Kevin Bacon number (including infinite KB number - actors wign non-existing path to KB)
# Sort actors by Kevin Bacon number
kevin_bacon_desc = dict(sorted(dict(G.nodes(data=True)).items(), key=lambda item: item[1]['kevin_bacon_number'], reverse=True))
print_dict(kevin_bacon_desc)
print('\n')


#=== Top k actors by Kevin Bacon number (finite only)
kevin_bacon_decs_fin = {}

# Filter actors by finite Kevin Bacon number
to_add = [key for key in kevin_bacon_desc if kevin_bacon_desc[key]['kevin_bacon_number'] is not math.inf]
for key in to_add:
    kevin_bacon_decs_fin[key] = kevin_bacon_desc[key]
    
print('Top {} actors with finite Kevin Bacon number:'.format(k))
print_dict(kevin_bacon_decs_fin)
print('\n')


#=== Bottom k actors by Kevin Bacon number
print('Bottom {} actors with finite Kevin Bacon number:'.format(k))
kevin_bacon_asc_fin = dict(sorted(kevin_bacon_decs_fin.items(), key=lambda item: item[1]['kevin_bacon_number']))
print_dict(kevin_bacon_asc_fin)
print('\n')

#=== Average Kevin Bacon number (finite numbers only)
sum = 0
for item in kevin_bacon_asc_fin.items():
    sum += item[1]['kevin_bacon_number']
kv_avg = sum/len(kevin_bacon_asc_fin)
print('Average Kevin Bacon number: {}'.format(kv_avg))

Top 10 actors with Kevin Bacon number:
('Abel Gance', {'degree_centrality': 0.00018062496236979952, 'kevin_bacon_number': 16610, 'closeness_centrality': 0.00018062496236979952, 'betweenness_centrality': 0.0, 'eigenvector_centrality': 3.674385679978072e-21})
('Abel Salazar', {'degree_centrality': 6.020832078993317e-05, 'kevin_bacon_number': 16610, 'closeness_centrality': 6.020832078993317e-05, 'betweenness_centrality': 0.0, 'eigenvector_centrality': 2.2426670410022412e-25})
('Abishek Kapoor', {'degree_centrality': 6.020832078993317e-05, 'kevin_bacon_number': 16610, 'closeness_centrality': 6.020832078993317e-05, 'betweenness_centrality': 0.0, 'eigenvector_centrality': 2.2426670410022412e-25})
('Adolfas Mekas', {'degree_centrality': 0.00024083328315973267, 'kevin_bacon_number': 16610, 'closeness_centrality': 0.00024083328315973267, 'betweenness_centrality': 0.0, 'eigenvector_centrality': 8.354585770525938e-20})
('Adolph Gance', {'degree_centrality': 0.0, 'kevin_bacon_number': 16610, 'clos

## Save into GEXF format

In [135]:
nx.write_gexf(G, './../results/actors_casts.gexf')

## Dataset reduction

In [161]:
movies_casts_cnt = casts['movie'].value_counts().to_dict()
print(len(movies_casts_cnt))
# print(movies_casts_cnt)

movies_reduced = list(k for (k, v) in movies_casts_cnt.items() if (v < 5))
print(len(movies_reduced))
# print(movies_reduced)

movies_by_actor_reduced_dict = {}

for key, val in movies_by_actor_dict.items():
    if lists_overlap(val, movies_reduced):
        movies_by_actor_reduced_dict[key] = val

print(len(movies_by_actor_reduced_dict))
# print(movies_by_actor_reduced_dict)

8632
4356
4853


## Construct reduced graph

In [162]:
# Create the graph
G_reduced = nx.Graph()

for actor in movies_by_actor_reduced_dict.keys():
    G_reduced.add_node(actor)

for actor1, movies1 in tqdm(movies_by_actor_reduced_dict.items(), total=len(G_reduced.nodes())):
    for actor2, movies2 in movies_by_actor_reduced_dict.items():
        if actor1 != actor2 and lists_overlap(movies1, movies2):
            G_reduced.add_edge(actor1, actor2)

100%|██████████| 4853/4853 [00:23<00:00, 202.93it/s]


## Reduced graph general statistics

In [164]:
general_statistics(G_reduced)

Number of nodes:  4853
Number of edges:  43510
Density:  0.003695625843322112
Number of components:  15


## Reduced Centralities

In [166]:
G_reduced = compute_centralities(G_reduced, [nx.degree_centrality, nx.closeness_centrality, nx.betweenness_centrality, nx.eigenvector_centrality], k)

  0%|          | 0/4 [00:00<?, ?it/s]

Top 10 players by degree_centrality:
('Charlton Heston', 0.041220115416323165)
('Burt Lancaster', 0.03977741137675186)
('John Carradine', 0.03977741137675186)
('Henry Fonda', 0.03957131079967024)
('James Stewart', 0.03957131079967024)
('John Gielgud', 0.038953009068425394)
('Gary Cooper', 0.03812860676009893)
('Humphrey Bogart', 0.036892003297609235)
('Gene Hackman', 0.036273701566364384)
('C.Aubrey Smith', 0.03503709810387469)


Top 10 players by closeness_centrality:


 50%|█████     | 2/4 [00:37<00:37, 18.79s/it]

('Burt Lancaster', 0.38989386465864234)
('Charlton Heston', 0.38828273298649923)
('John Gielgud', 0.38721602218159124)
('Robert Mitchum', 0.381725783855295)
('Henry Fonda', 0.3814523195510127)
('Roddy McDowall', 0.37930865050810303)
('John Carradine', 0.3781115267886518)
('Gene Hackman', 0.3772780263359607)
('Martin Balsam', 0.3768626526045434)
('Laurence Olivier', 0.37683301804823255)


Top 10 players by betweenness_centrality:


 75%|███████▌  | 3/4 [03:40<01:26, 86.99s/it]

('Burt Lancaster', 0.01782220759938391)
('John Gielgud', 0.01593126725978241)
('Robert deNiro', 0.015505589377468884)
('Gene Hackman', 0.015041411402960446)
('Charlton Heston', 0.014130679273452252)
('Jack Nicholson', 0.012773269113655084)
('Ingrid Bergman', 0.012571431495683127)
('John Carradine', 0.012239197378249623)
('Max vonSydow', 0.011584440028305467)
('Sean Connery', 0.01148359435247888)


Top 10 players by eigenvector_centrality:


100%|██████████| 4/4 [03:40<00:00, 55.13s/it]

('John Carradine', 0.11467320567694804)
('Henry Fonda', 0.10901593912198232)
('James Stewart', 0.10433269974281009)
('C.Aubrey Smith', 0.10339504438273409)
('David Niven', 0.10139095210817221)
('Charlton Heston', 0.09939158317938407)
('Gary Cooper', 0.09491106954410376)
('John Gielgud', 0.09016101051755372)
('Peter Lorre', 0.08822401953823208)
('Humphrey Bogart', 0.0855589280071451)







## Reduced communities

In [197]:
communities = {node:cid+1 for cid,community in enumerate(nx.algorithms.community.k_clique_communities(G_reduced,20)) for node in community}
print('Communities with at least 20 nodes:')
print_dict(communities, len(communities))

for key, val in G_reduced.nodes().items():
    if key in communities:
        G_reduced.nodes[key]['community'] = communities[key]
    else:
        G_reduced.nodes[key]['community'] = 0

Communities with at least 20 nodes:
('Fred Ward', 1)
('Whoopie Goldberg', 1)
('Anjelica Huston', 1)
('Jack Lemmon', 1)
('Lyle Lovett', 1)
('Dina Merrill', 1)
('Bruce Willis', 1)
('Dustin Hoffman', 1)
('Buck Henry', 1)
('Greta Scacchi', 1)
('Burt Reynolds', 1)
('Dean Stockwell', 1)
('Susan Sarandon', 1)
('Peter Gallagher', 1)
('Cher', 1)
('Vincent dOnofrio', 1)
('Andie McDowell', 1)
('Nick Nolte', 1)
('Tim Robbins', 1)
('Julia Roberts', 1)
('Robert Morley', 3)
('John Mills', 2)
('Evelyn Keyes', 2)
('Tim McCoy', 2)
('Frank Sinatra', 2)
('Joe E. Brown', 2)
('John Gielgud', 3)
('John Carradine', 2)
('Fernandel', 2)
('Basil Sidney', 2)
('Glynis Johns', 2)
('Robert Newton', 2)
('Buster Keaton', 2)
('Charles Boyer', 2)
('Reginald Denny', 2)
('Ronald Colman', 2)
('Marlene Dietrich', 2)
('Trevor Howard', 2)
('Gilbert Roland', 2)
('David Niven', 2)
('Cesar Romero', 2)
('Victor McLaglen', 2)
('Peter Lorre', 2)
('George Raft', 2)
('Red Skelton', 2)
('R.G. Armstrong', 3)
('Robert Hardy', 3)
('David

## Reduced Kevin Bacon numbers

In [198]:
G_reduced = kevin_bacon_numbers(G_reduced)
print_dict(G_reduced.nodes())

('58 Plymouth Fury', {'kevin_bacon_number': 2, 'degree_centrality': 0.00020610057708161583, 'closeness_centrality': 0.2248691047120911, 'betweenness_centrality': 0.0, 'eigenvector_centrality': 6.383975775530849e-05, 'community': 0})
('Aaron Schwartz', {'kevin_bacon_number': 2, 'degree_centrality': 0.00041220115416323167, 'closeness_centrality': 0.24083754601062282, 'betweenness_centrality': 0.0, 'eigenvector_centrality': 0.00011137954845097152, 'community': 0})
('Abraham Sofaer', {'kevin_bacon_number': 2, 'degree_centrality': 0.00494641384995878, 'closeness_centrality': 0.3126221860864618, 'betweenness_centrality': 0.0008786192072790573, 'eigenvector_centrality': 0.012217450671431878, 'community': 0})
('Acquanetta', {'kevin_bacon_number': 2, 'degree_centrality': 0.0016488046166529267, 'closeness_centrality': 0.2839477093393004, 'betweenness_centrality': 2.0438530390791752e-05, 'eigenvector_centrality': 0.003986695691544215, 'community': 0})
('Adam Ant', {'kevin_bacon_number': 2, 'degre

## Save into GEFX format

In [199]:
nx.write_gexf(G_reduced, './../results/actors_casts_reduced.gexf')

## Testing attributes

In [200]:
print(G_reduced.nodes['Matthew Settle'])

{'kevin_bacon_number': 4853, 'degree_centrality': 0.0006183017312448475, 'closeness_centrality': 0.0006183017312448475, 'betweenness_centrality': 0.0, 'eigenvector_centrality': 1.974204310143018e-21, 'community': 0}
