# HW3 - Social Network Analysis

## Import modules

In [1]:
import itertools
import math

import matplotlib.pyplot as plt
import networkx as nx
import pandas as pd
from networkx.drawing.nx_agraph import graphviz_layout
from tqdm import tqdm

## Useful functions

In [2]:
# Default number of entries shown in the "top k" listings of this notebook
k = 10


def print_dict(d, k=10):
    """Print the first ``k`` (key, value) pairs of ``d``, one tuple per line.

    ``d`` only needs to provide ``items()``; pairs are taken in the mapping's
    own iteration order.
    """
    leading_pairs = list(d.items())[:k]
    for pair in leading_pairs:
        print(pair)
        
def lists_overlap(l1, l2):
    """Return True if ``l1`` and ``l2`` share at least one element.

    Only ``l1`` is turned into a set; ``l2`` is scanned lazily and the scan
    stops at the first common element, so no second set or intersection set
    is built. Elements must be hashable.
    """
    return not set(l1).isdisjoint(l2)

## Load the data

In [3]:
# Load the semicolon-separated cast list. Malformed rows are skipped with a
# warning; `on_bad_lines='warn'` is the replacement for the removed
# `error_bad_lines=False` option (pandas >= 1.3).
casts = pd.read_csv('./../data/casts.csv', on_bad_lines='warn', sep=';')

## Convert „casts“ data to a graph

In [None]:
G = nx.Graph()

# Print casts CSV column headers
print(casts.columns.tolist())

# Map every actor to the list of movies they appear in
movies_by_actor = casts.groupby('actor')['movie']
movies_by_actor_dict = movies_by_actor.apply(list).to_dict()

# Drop the 's a' entry (per the original note this is the "supporting actor"
# placeholder rather than a real name)
movies_by_actor_dict.pop('s a', None)

# Drop entries whose actor name is purely numeric
movies_by_actor_dict = {
    actor: movies
    for actor, movies in movies_by_actor_dict.items()
    if not actor.isnumeric()
}

# One node per remaining actor, so actors without any co-star stay in the graph
for actor in movies_by_actor_dict:
    G.add_node(actor)

# Invert the mapping (movie -> cast) so that only actors who really share a
# movie are paired, instead of testing every actor against every other actor.
# dict.fromkeys() removes repeated movies per actor while keeping their order.
actors_by_movie = {}
for actor, movies in movies_by_actor_dict.items():
    for movie in dict.fromkeys(movies):
        actors_by_movie.setdefault(movie, []).append(actor)

# Connect every pair of actors that appear in the same movie
for cast in tqdm(actors_by_movie.values(), total=len(actors_by_movie)):
    for actor1, actor2 in itertools.combinations(cast, 2):
        G.add_edge(actor1, actor2)

['actor_type', 'movie', 'actor', 'role_type', 'role']


 61%|██████▏   | 10203/16610 [03:16<03:16, 32.64it/s]

## Dataset general statistics

In [None]:
def general_statistics(graph, visualize_components=False):
    """Print basic statistics of an undirected graph.

    Prints the number of nodes, the number of edges, the density
    (edges divided by the n*(n-1)/2 possible edges) and the number of
    connected components.

    Parameters:
        graph: networkx graph to describe.
        visualize_components: when True, also draw the graph using the
            Graphviz layout.
    """
    n = len(graph.nodes())
    e = len(graph.edges())
    print('Number of nodes: ', n)
    print('Number of edges: ', e)
    # A graph with fewer than two nodes has no possible edges; report a
    # density of 0.0 instead of dividing by zero.
    possible_edges = n * (n - 1) / 2
    density = e / possible_edges if possible_edges else 0.0
    print('Density: ', density)
    components = list(nx.connected_components(graph))
    if visualize_components:
        pos = graphviz_layout(graph)
        nx.draw(graph, pos, with_labels=False, node_size=10)
    print('Number of components: ', len(components))


general_statistics(G)

## Centralities

In [None]:
def compute_centralities(graph, centralities, k):
    """Compute several centralities, store them on the nodes and print the top k.

    Parameters:
        graph: networkx graph whose nodes are annotated in place.
        centralities: iterable of functions that take the graph and return a
            ``{node: score}`` mapping (e.g. ``nx.degree_centrality``).
        k: number of top-scoring nodes to print per centrality.

    Returns:
        The same graph object; every node gets one attribute per centrality,
        named after the centrality function (``centrality.__name__``).
    """
    for centrality in tqdm(centralities, total=len(centralities)):
        name = centrality.__name__
        print('Top {} players by {}:'.format(k, name))
        scores = centrality(graph)
        # Highest score first
        ranked = dict(sorted(scores.items(), key=lambda item: item[1], reverse=True))
        for node, score in ranked.items():
            graph.nodes[node][name] = score
        # Honour the requested k instead of print_dict's default limit
        print_dict(ranked, k)
        print('\n')
    return graph


G = compute_centralities(G, [nx.degree_centrality, nx.closeness_centrality, nx.betweenness_centrality, nx.eigenvector_centrality], k)
# G = compute_centralities(G, [nx.degree_centrality], k)
# print(G.nodes['Humphrey Bogart'])

## Communities

In [None]:
def _k_clique_membership(graph, clique_size):
    """Return ``{node: community id}`` for the k-clique communities of ``graph``.

    Community ids start at 1 and follow the order in which the communities are
    produced; a node that belongs to several communities keeps the id of the
    last one.
    """
    membership = {}
    found = nx.algorithms.community.k_clique_communities(graph, clique_size)
    for community_id, members in enumerate(found, start=1):
        for node in members:
            membership[node] = community_id
    return membership


# Communities built from cliques of 59 nodes
communities = _k_clique_membership(G, 59)
print('Largest community has size of {} nodes:'.format(len(communities)))
print_dict(communities, len(communities))
print('\n')

# Communities built from cliques of 35 nodes
communities = _k_clique_membership(G, 35)
print('Communities with at least 35 nodes:')
print_dict(communities, len(communities))

## Kevin Bacon numbers

In [None]:
def kevin_bacon_numbers(graph, target='Kevin Bacon'):
    """Store each node's distance to ``target`` as 'kevin_bacon_number'.

    The number is the count of edges on a shortest path to ``target``
    (0 for the target itself, 1 for direct neighbours, ...). Nodes with no
    path to ``target`` -- including every node when ``target`` is not in the
    graph -- get ``len(graph.nodes())`` as a sentinel, which is larger than
    any real distance in the graph.

    Assumes an undirected graph, so the distance from ``target`` equals the
    distance to it.

    Parameters:
        graph: networkx graph, annotated in place.
        target: node the distances are measured to.

    Returns:
        The same graph object.
    """
    unreachable = len(graph.nodes())
    if target in graph:
        # One breadth-first search from the target gives all distances at once
        distances = dict(nx.single_source_shortest_path_length(graph, target))
    else:
        distances = {}
    for a in graph.nodes():
        graph.nodes[a]['kevin_bacon_number'] = distances.get(a, unreachable)
    return graph

print('Top {} actors with Kevin Bacon number:'.format(k))
G = kevin_bacon_numbers(G)

# kevin_bacon_numbers() marks actors without a path to Kevin Bacon with the
# node count; every real distance is strictly smaller than that.
unreachable_kb_number = len(G.nodes())


#=== Top k actors by Kevin Bacon number (including "infinite" KB numbers - actors with no path to KB)
# Sort actors by Kevin Bacon number, largest first
kevin_bacon_desc = dict(sorted(dict(G.nodes(data=True)).items(), key=lambda item: item[1]['kevin_bacon_number'], reverse=True))
print_dict(kevin_bacon_desc, k)
print('\n')


#=== Top k actors by Kevin Bacon number (finite only)
# Keep only actors that are actually connected to Kevin Bacon
kevin_bacon_decs_fin = {
    actor: attrs
    for actor, attrs in kevin_bacon_desc.items()
    if attrs['kevin_bacon_number'] < unreachable_kb_number
}

print('Top {} actors with finite Kevin Bacon number:'.format(k))
print_dict(kevin_bacon_decs_fin, k)
print('\n')


#=== Bottom k actors by Kevin Bacon number
print('Bottom {} actors with finite Kevin Bacon number:'.format(k))
kevin_bacon_asc_fin = dict(sorted(kevin_bacon_decs_fin.items(), key=lambda item: item[1]['kevin_bacon_number']))
print_dict(kevin_bacon_asc_fin, k)
print('\n')

#=== Average Kevin Bacon number (finite numbers only)
kb_total = sum(attrs['kevin_bacon_number'] for attrs in kevin_bacon_asc_fin.values())
# No connected actor at all -> the average is undefined
kv_avg = kb_total / len(kevin_bacon_asc_fin) if kevin_bacon_asc_fin else math.nan
print('Average Kevin Bacon number: {}'.format(kv_avg))

## Save into GEXF format

In [None]:
# Write the full actor graph G to a GEXF file in the results directory
nx.write_gexf(G, './../results/actors_casts.gexf')

## Dataset reduction

In [None]:
# Number of cast rows per movie
movies_casts_cnt = casts['movie'].value_counts().to_dict()
print(len(movies_casts_cnt))

# Movies with fewer than 5 cast rows
movies_reduced = [movie for movie, cast_rows in movies_casts_cnt.items() if cast_rows < 5]
print(len(movies_reduced))

# Build the lookup set once instead of once per actor
movies_reduced_set = set(movies_reduced)

# Keep the actors that appear in at least one of those movies; each kept
# actor retains the full movie list
movies_by_actor_reduced_dict = {
    actor: movies
    for actor, movies in movies_by_actor_dict.items()
    if not movies_reduced_set.isdisjoint(movies)
}

print(len(movies_by_actor_reduced_dict))

## Construct reduced graph

In [None]:
# Create the reduced graph: one node per actor kept by the dataset reduction
G_reduced = nx.Graph()

for actor in movies_by_actor_reduced_dict:
    G_reduced.add_node(actor)

# Invert the mapping (movie -> cast) so that only actors who really share a
# movie are paired, instead of testing every actor against every other actor.
# dict.fromkeys() removes repeated movies per actor while keeping their order.
reduced_actors_by_movie = {}
for actor, movies in movies_by_actor_reduced_dict.items():
    for movie in dict.fromkeys(movies):
        reduced_actors_by_movie.setdefault(movie, []).append(actor)

# Connect every pair of kept actors that appear in the same movie
for cast in tqdm(reduced_actors_by_movie.values(), total=len(reduced_actors_by_movie)):
    for actor1, actor2 in itertools.combinations(cast, 2):
        G_reduced.add_edge(actor1, actor2)

## Reduced graph general statistics

In [None]:
# Print node/edge counts, density and component count of the reduced graph
general_statistics(G_reduced)

## Reduced Centralities

In [None]:
# Centrality measures to compute and store on the reduced graph's nodes
reduced_centrality_measures = [
    nx.degree_centrality,
    nx.closeness_centrality,
    nx.betweenness_centrality,
    nx.eigenvector_centrality,
]
G_reduced = compute_centralities(G_reduced, reduced_centrality_measures, k)

## Reduced communities

In [None]:
# Map each node to the 1-based id of its k-clique community (cliques of 20
# nodes); a node in several communities keeps the id of the last one
communities = {}
reduced_cliques = nx.algorithms.community.k_clique_communities(G_reduced, 20)
for community_id, members in enumerate(reduced_cliques, start=1):
    for node in members:
        communities[node] = community_id
print('Communities with at least 20 nodes:')
print_dict(communities, len(communities))

# Store the community id on every node; 0 marks nodes outside any community
for actor in G_reduced.nodes():
    G_reduced.nodes[actor]['community'] = communities.get(actor, 0)

## Reduced Kevin Bacon numbers

In [None]:
# Annotate every node of the reduced graph with its 'kevin_bacon_number'
G_reduced = kevin_bacon_numbers(G_reduced)
# Show the first 10 (node, attributes) entries (print_dict's default limit)
print_dict(G_reduced.nodes())

## Save into GEXF format

In [None]:
# Write the reduced actor graph to a GEXF file in the results directory
nx.write_gexf(G_reduced, './../results/actors_casts_reduced.gexf')

## Testing attributes

In [None]:
# Spot-check: print the attribute dict stored on one node of the reduced graph
# NOTE(review): this lookup fails if that actor is not in G_reduced
print(G_reduced.nodes['Matthew Settle'])