In [1]:
from collections import defaultdict

import networkx as nx
from bs4 import BeautifulSoup
import pandas as pd

In [2]:
# Parse the HTML file listing every performer and the shows they appear in.

with open('performers2017.html', 'r') as html_doc:
    data = html_doc.read()

soup = BeautifulSoup(data, 'html.parser')

# Maps show title -> list of performer names, in the order rows are read.
all_shows = defaultdict(list)
rows = soup.find(id='performer_table').find_all('tr')
for row in rows:
    cells = row.find_all('td')
    # Skip rows that cannot describe a performer (no link to take the name
    # from, or no second cell holding the shows) instead of crashing on them.
    if row.a is None or len(cells) < 2:
        continue

    # The performer's name is the text of the first link in the row.
    name = row.a.text.replace('  ', ' ')
    # The second cell holds one show per line; tabs are removed before splitting.
    shows = cells[1].text.strip().replace('\t', '').split('\n')
    # Drop blank entries: an empty cell or blank lines between shows would
    # otherwise register a bogus '' show that links unrelated performers.
    shows = [show.strip() for show in shows if show.strip()]

    for show in shows:
        all_shows[show].append(name)

In [3]:
# Person-show graph: one node per show and per performer, with an edge for
# every appearance of a performer in a show.
show_graph = nx.Graph()
for show, cast in all_shows.items():
    # Add the show before its cast so node insertion order stays the same.
    show_graph.add_node(show)
    # add_edges_from creates any performer node that does not exist yet.
    show_graph.add_edges_from((person, show) for person in cast)

In [4]:
# Person-person graph: two performers are linked when they share a show.
people_graph = nx.Graph()
for cast in all_shows.values():
    # Every cast member becomes a node, even when they have no co-performers.
    people_graph.add_nodes_from(cast)
    # Visit each unordered pair once; the graph is undirected, so the reverse
    # pairing would only re-add an edge that already exists.
    for idx, first in enumerate(cast):
        for second in cast[idx + 1:]:
            # A name listed twice in one show must not produce a self-loop.
            if first != second:
                people_graph.add_edge(first, second)

In [5]:
# Number of performer-show edges (appearances) in the person-show graph.
show_graph.number_of_edges()

4810

In [6]:
# Number of distinct performer pairs that share at least one show.
people_graph.number_of_edges()

17458

In [7]:
# Degree of each node in the person-person graph, i.e. how many distinct
# co-performers a person has.
# dict(...).items() works whether nx.degree returns a plain dict (networkx 1.x)
# or a DegreeView (networkx 2+), and on both Python 2 and 3; .iteritems() only
# exists on Python 2 dicts.
degrees = pd.DataFrame(
    list(dict(nx.degree(people_graph)).items()),
    columns=['name', 'degree'],
)
# Leave out entries whose name contains 'June' (presumably show-like labels,
# not people -- TODO confirm), then list the ten highest degrees.
degrees[~degrees.name.str.contains('June')].sort_values(by='degree', ascending=False).head(10)

Unnamed: 0,name,degree
1134,Alexandra Song,124
291,Caroline Cotter,103
825,"Glorilis ""Glo"" Tavarez",100
674,Lui Vega,98
884,Patrick Keene,94
2204,Kelsey Bailey,87
192,Molly Gaebe,85
75,Andy Bustillos,85
944,Corin C Wells,84
641,Rachel Taenzler,84


In [8]:
# Betweenness centrality of every node in the person-person graph.
# .items() replaces the Python 2-only .iteritems() so the cell runs on both
# Python 2 and Python 3.
people_betweenness = pd.DataFrame(
    list(nx.betweenness_centrality(people_graph).items()),
    columns=['name', 'betweenness'],
)
# Ten nodes with the highest betweenness.
people_betweenness.sort_values(by='betweenness', ascending=False).head(10)

Unnamed: 0,name,betweenness
152,Shenovia Large,0.05135
601,Bob Vulfov,0.039067
97,Devin Ritchie,0.029745
307,Cecilia De Robertis,0.027676
1763,Dan Lee,0.027628
1521,JJ Jackson,0.023753
371,Richie Khanh,0.022864
418,Will Choi,0.019552
1239,Keiko Agena,0.019378
247,Adrian Frimpong,0.019132


In [9]:
# Betweenness centrality of every node (shows and performers alike) in the
# person-show graph.
# .items() replaces the Python 2-only .iteritems() so the cell runs on both
# Python 2 and Python 3.
show_betweenness = pd.DataFrame(
    list(nx.betweenness_centrality(show_graph).items()),
    columns=['name', 'betweenness'],
)
# Keep only nodes whose name contains 'June' (the show labels in the output
# carry a June date), then list the ten highest betweenness values.
show_betweenness[show_betweenness.name.str.contains('June')].sort_values(by="betweenness", ascending=False).head(10)

Unnamed: 0,name,betweenness
769,Worldwide WOC Mash-Up Show Saturday June 24 3:...,0.060349
3036,Scarlett Johansson Presents Friday June 23 5:3...,0.055486
1075,ImmigrantProv Mash-Up Show Sunday June 25 1:30...,0.052333
3240,FUSION: An Asian Mash-Up Show Friday June 23 1...,0.048387
538,Lost In Translation Mash-Up Show Saturday June...,0.034886
2361,Older Than Most Mash-Up Show Saturday June 24 ...,0.030835
2284,"Queer, Far, Wherever You Are Mash Up Show Satu...",0.030332
765,Tribes Saturday June 24 10:00pm TNC - Cino,0.027879
1303,Getting Out Of Jury Duty with Judge Richard FI...,0.026374
1857,Better With With Some Color Saturday June 24 2...,0.023092


In [10]:
# Export both graphs as GraphML so they can be explored in Gephi.
graphml_exports = (
    (people_graph, 'dcm2017.graphml'),
    (show_graph, 'shows_dcm2017.graphml'),
)
for graph, filename in graphml_exports:
    nx.write_graphml(graph, filename)