# HW3 - Social Network Analysis

## Import modules

In [50]:
import pandas as pd
import nltk
import networkx as nx
import matplotlib.pyplot as plt
from networkx.drawing.nx_agraph import graphviz_layout
from tqdm import tqdm

## Load the data

In [2]:
casts = pd.read_csv('./../data/casts.csv', error_bad_lines = False, sep=';')

## Convert „casts“ data to a graph

In [45]:
G = nx.Graph()

# Print casts CSV column headers
print(casts.columns.tolist())

# Group movies by actors
# movies_by_actor = casts[casts['actor_type'] < 'AA90'].groupby('actor')['movie']
movies_by_actor = casts.groupby('actor')['movie']
movies_by_actor_dict = movies_by_actor.apply(list).to_dict()
# print(len(movies_by_actor_dict))
# print(movies_by_actor_dict['A.E. Matthews'])
# print(movies_by_actor_dict)

del movies_by_actor_dict['s a']

# Create the graph
for actor in movies_by_actor_dict.keys():
    G.add_node(actor)

# Test if two lists overlaps
# print(bool(set(movies_by_actor_dict['A.E. Matthews']) & set(movies_by_actor_dict['David Tree'])))
# print(bool(set(movies_by_actor_dict['A.E. Matthews']) & set(movies_by_actor_dict['Athene Seyler'])))

for actor1, movies1 in tqdm(movies_by_actor_dict.items(), total=len(movies_by_actor_dict)):
    for actor2, movies2 in movies_by_actor_dict.items():
        if actor1 != actor2 and bool(set(movies1) & set(movies2)):
            G.add_edge(actor1, actor2)

nx.write_gexf(G, './../results/actors_casts.gexf')

['actor_type', 'movie', 'actor', 'role_type', 'role']


100%|██████████| 16613/16613 [02:57<00:00, 93.40it/s] 


## Dataset general statistics

In [46]:
n = len(G.nodes())
e = len(G.edges())

print('Number of nodes: ', n)
print('Number of edges: ', e)
print('Density: ', e / (n*(n-1)/2))

components = list(nx.connected_components(G))

print('Number of components: ', len(components))

Number of nodes:  16613
Number of edges:  152266
Density:  0.0011034761404392502
Number of components:  749


## Centralities

In [48]:
centralities = [nx.degree_centrality, nx.closeness_centrality,
 nx.betweenness_centrality, nx.eigenvector_centrality]

k = 10

for centrality in tqdm(centralities, total=len(centralities)):
    print('Top {} players by {}:'.format(k, centrality.__name__))
    c_dict = centrality(G)
    c_dict = dict(sorted(c_dict.items(), key=lambda item: item[1], reverse=True))
    for item in list(c_dict.items())[:k]:
        print(item)
    print('\n')

  0%|          | 0/4 [00:00<?, ?it/s]

Top 10 players by degree_centrality:
('Humphrey Bogart', 0.025824705032506622)
('James Stewart', 0.02245364796532627)
('Gary Cooper', 0.022212858174813388)
('John Gielgud', 0.022212858174813388)
('John Carradine', 0.022032265831928726)
('Peter Lorre', 0.02161088369853118)
('C.Aubrey Smith', 0.020226342403082107)
('Henry Fonda', 0.019443775583915242)
('Burt Lancaster', 0.018721406212376595)
('Basil Rathbone', 0.018601011317120154)


Top 10 players by closeness_centrality:


 50%|█████     | 2/4 [06:00<06:00, 180.04s/it]

('Charlton Heston', 0.3448314740754543)
('John Gielgud', 0.34392751970400837)
('Burt Lancaster', 0.3432436777371951)
('Henry Fonda', 0.34034977415556733)
('John Carradine', 0.3398032399416941)
('James Stewart', 0.33871542040968683)
('David Niven', 0.3382002705165573)
('Robert Mitchum', 0.3363793322090303)
('Humphrey Bogart', 0.33579389045964547)
('Laurence Olivier', 0.33534757288931855)


Top 10 players by betweenness_centrality:


 75%|███████▌  | 3/4 [40:19<16:02, 962.91s/it]

('Vincent Price', 0.010911758431554602)
('Burt Lancaster', 0.010772380644409017)
('John Carradine', 0.010626361313336918)
('Robert deNiro', 0.010466733936323008)
('Humphrey Bogart', 0.010386116460623892)
('Gene Hackman', 0.010042626763144201)
('John Gielgud', 0.009595726372221362)
('Jack Nicholson', 0.009222947872330209)
('Charlton Heston', 0.009194069187636999)
('James Stewart', 0.008190657113060142)


Top 10 players by eigenvector_centrality:


100%|██████████| 4/4 [40:20<00:00, 605.04s/it]

('C.Aubrey Smith', 0.10527677021679366)
('John Carradine', 0.09890125927888967)
('James Stewart', 0.09305085440923555)
('Peter Lorre', 0.09255638847118301)
('John Gielgud', 0.091583074734001)
('Basil Rathbone', 0.08981915239405143)
('Gary Cooper', 0.08938950751216483)
('David Niven', 0.08746691901018518)
('Andy Devine', 0.08728714563105724)
('Humphrey Bogart', 0.08503242693045049)







## Communities

In [80]:
communities = {node:cid+1 for cid,community in enumerate(nx.algorithms.community.k_clique_communities(G,40)) for node in community}
print(communities)

{'Lela Mourad': 1, 'Mervyn Johns': 1, 'Leonardo DiCaprio': 1, 'Laurence Olivier': 1, 'Basil Rathbone': 1, 'Norman Wooland': 1, 'Beverly Bane': 1, 'Nietta Zocchi': 1, 'John Gielgud': 3, 'Laurence Harvey': 1, 'Antionio Pierfrederici': 1, 'Roberto Bisacco': 1, 'Harry Hilliard': 1, 'John Barrymore': 1, 'Norma Shearer': 1, 'Natasha Peryy': 1, 'Gustav Serena': 1, 'Sir Godrey Teale': 1, 'C.Aubrey Smith': 1, 'Leslie Howard': 1, 'Conway Tearle': 1, 'Francesca Bertini': 1, 'Mario Caserini': 1, 'Enzo Fiermonte': 1, 'Bill Travers': 1, 'Esmeralda Ruspoli': 1, 'Flora Robson': 1, 'Ibrahim Hamouda': 1, 'Violet KembleCooper': 1, 'td> Claire Danes<': 1, 'Julia M. Taylor': 1, 'George A. Lessey': 1, 'Michael York': 1, 'Maria Gasperini': 1, 'Reginald Denny': 2, 'Francis X. Bushman': 1, 'Paul Panzer': 1, 'Robert Warwick': 1, 'Ralph Forbes': 1, 'Sebastian Cabot': 1, 'Rosemarie Dexter': 1, 'Paul Hardwick': 1, 'Henry Kolker': 1, 'Milo OShea': 1, 'Virginia Hammond': 1, 'John McEnery': 1, 'Meynier': 1, 'Aldo Zol

## Dataset reduction

In [6]:
# Filter the data

# print(casts[casts['actor_type'] < 'AA37']['movie'].value_counts().to_dict())
# print(casts['movie'].value_counts().to_dict())

movies_casts_cnt = casts['movie'].value_counts().to_dict()
print(len(movies_casts_cnt))
# print(movies_casts_cnt)

movies_filtered = list(k for (k, v) in movies_casts_cnt.items() if v < 6)
print(len(movies_filtered))
# print(movies_filtered)

8632
5232
