# Data Scraping and Processing, Main Graph Construction

In [1]:
import networkx as nx
import csv
import matplotlib.pyplot as plot
from bs4 import BeautifulSoup
import urllib2

## Scraping Wikipedia

In [None]:
# Fetch the Wikipedia list of Academy-Award-nominated actors and keep the
# table rows that hold one actor each.
page0 = urllib2.urlopen("https://en.wikipedia.org/wiki/List_of_actors_with_Academy_Award_nominations")
try:
    # BeautifulSoup consumes the whole response while building the tree, so
    # the connection can be released as soon as parsing has finished.
    soup0 = BeautifulSoup(page0, 'html.parser')
finally:
    page0.close()
# Discard the first 11 and the last 11 <tr> elements of the page.
# NOTE(review): presumably these are header/navigation rows; the offsets are
# tied to the page layout at scrape time - confirm before re-running.
actors0 = soup0.find_all('tr')[11:-11]

In [None]:
# Names of the nominated actors: the text of the first link in each table row.
academylist = [actor.find('a').text for actor in actors0]

## Scraping BoxOfficeMojo

In [None]:
# BoxOfficeMojo actor listing, first page.
page1 = urllib2.urlopen("https://www.boxofficemojo.com/people/?view=Actor&p=.htm")
try:
    # The response is fully read during parsing; close it right afterwards
    # instead of leaving the connection open for the rest of the session.
    soup1 = BeautifulSoup(page1, 'html.parser')
finally:
    page1.close()
# Rows nested inside the third <tr> of the page, minus the first nested row.
# NOTE(review): presumably the skipped row is the column header - confirm
# against the page layout.
actors1 = soup1.find_all('tr')[2].find_all('tr')[1:]

In [None]:
# BoxOfficeMojo actor listing, second page (pagenum=2).
page2 = urllib2.urlopen("https://www.boxofficemojo.com/people/?view=Actor&pagenum=2&sort=person&order=ASC&p=.htm")
try:
    # The response is fully read during parsing; close it right afterwards
    # instead of leaving the connection open for the rest of the session.
    soup2 = BeautifulSoup(page2, 'html.parser')
finally:
    page2.close()
# Rows nested inside the third <tr> of the page, minus the first nested row.
# NOTE(review): presumably the skipped row is the column header - confirm
# against the page layout.
actors2 = soup2.find_all('tr')[2].find_all('tr')[1:]

In [None]:
# BoxOfficeMojo actor listing, third page (pagenum=3).
page3 = urllib2.urlopen("https://www.boxofficemojo.com/people/?view=Actor&pagenum=3&sort=person&order=ASC&p=.htm")
try:
    # The response is fully read during parsing; close it right afterwards
    # instead of leaving the connection open for the rest of the session.
    soup3 = BeautifulSoup(page3, 'html.parser')
finally:
    page3.close()
# Rows nested inside the third <tr> of the page, minus the first nested row.
# NOTE(review): presumably the skipped row is the column header - confirm
# against the page layout.
actors3 = soup3.find_all('tr')[2].find_all('tr')[1:]

In [None]:
# Maps a performer's name (commas removed) to a box-office figure as a float.
boxofficedict = {}


def addtodict(actors):
    """Record the name and box-office figure of every row in *actors*.

    Each row must offer ``find`` the way a BeautifulSoup tag does: the name
    is the text of the <b> nested inside the row's first <b>, and the figure
    is the text of the first right-aligned <td>.  Results are written into
    the module-level ``boxofficedict``; nothing is returned.
    """
    for row in actors:
        person = row.find('b').find('b').text.replace(",", "")
        gross_cell = row.find('td', attrs={'align': 'right'})
        # Drop the leading character and the thousands separators.
        # NOTE(review): presumably the leading character is a currency sign.
        gross_text = str(gross_cell.text[1:]).replace(",", "")
        if "k" in gross_text:
            # Figures carrying a "k" are scaled down by 1000 so that they
            # share the unit of the unsuffixed figures.
            gross = float(gross_text.replace("k", "")) / 1000
        else:
            gross = float(gross_text)
        boxofficedict[person] = gross

In [None]:
# Feed the rows of all three scraped pages into boxofficedict, in page order.
for page_rows in (actors1, actors2, actors3):
    addtodict(page_rows)

## Processing IMDb Data

In [None]:
# Peek at the first 200 rows of title.basics.tsv to inspect the column layout.
with open('title.basics.tsv') as tsvfile:
    reader = csv.reader(tsvfile, delimiter='\t')
    for i, row in enumerate(reader):
        if i >= 200:
            break
        print(row)

In [None]:
# Map the first column of title.basics.tsv to its second column.
# NOTE(review): in the IMDb dump these columns are the title id (tconst) and
# the title type, not the title text - confirm this is the intended value.
movies = {}
with open('title.basics.tsv') as tsvfile:
    reader = csv.reader(tsvfile, delimiter='\t')
    # Skip the column-header line so it does not end up as a bogus entry,
    # in line with how the name.basics cell ignores its first row.
    next(reader, None)
    for row in reader:
        movies[row[0]] = row[1]

In [None]:
# Per-person lookups keyed by the first column of name.basics.tsv:
#   dict      -> second column (the person's name)
#   birthdict -> third column, stored verbatim (may be the missing marker)
#   deathdict -> fourth column, only when it is not the missing marker
# NOTE: `dict` shadows the builtin; the name is kept because later cells
# look people up through it.
dict = {}
birthdict = {}
deathdict = {}
with open('name.basics.tsv') as tsvfile:
    reader = csv.reader(tsvfile, delimiter='\t')
    for i, row in enumerate(reader):
        if i < 200:
            # Debug aid: show the first 200 rows, header included.
            print(row)
        if i == 0:
            # The first line is the column header, not a person.
            continue
        dict[row[0]] = row[1]
        birthdict[row[0]] = row[2]
        # The missing-value marker is a backslash followed by 'N'.  It is
        # written with an escaped backslash because a bare '\N' is a syntax
        # error on Python 3 and only works on Python 2 by accident.
        if row[3] != '\\N':
            deathdict[row[0]] = row[3]

## Graph Constructions

In [None]:
# Year that birth years are subtracted from to obtain an age.
REFERENCE_YEAR = 2019
# Missing-value marker used in the TSV files: a backslash followed by 'N'.
# Spelled with an escaped backslash because a bare '\N' is a syntax error on
# Python 3.
IMDB_MISSING = '\\N'

G = nx.Graph()          # every retained performer, unweighted co-star edges
Gw = nx.Graph()         # performers credited with category 'actress'
Gm = nx.Graph()         # performers credited with category 'actor'
Gweighted = nx.Graph()  # same as G, edge 'weight' = number of shared titles
moviecountdict = {}     # name -> number of qualifying credit rows
agedict = {}            # name -> REFERENCE_YEAR - birth year, 0 if unknown


def _register_performer(nconst, name, gender_graph):
    """Add *name* to the graphs and update its credit count and age."""
    gender_graph.add_node(name)
    G.add_node(name)
    Gweighted.add_node(name)
    # Incremented once per credit row, so a performer listed twice for the
    # same title is counted twice.
    moviecountdict[name] = moviecountdict.get(name, 0) + 1
    birth_year = birthdict[nconst]
    if birth_year == IMDB_MISSING:
        agedict[name] = 0  # placeholder for an unknown birth year
    else:
        agedict[name] = REFERENCE_YEAR - int(birth_year)


def _link_costars(cast):
    """Connect every pair of distinct performers in *cast* (one title)."""
    for idx, a1 in enumerate(cast):
        for a2 in cast[idx + 1:]:
            if a1 == a2:
                # Same performer credited twice for the title: no self-loop.
                continue
            G.add_edge(a1, a2)
            if Gweighted.has_edge(a1, a2):
                Gweighted[a1][a2]['weight'] += 1
            else:
                Gweighted.add_edge(a1, a2, weight=1)
            # Same-gender edges only; mixed pairs appear in G/Gweighted alone.
            if a1 in Gw and a2 in Gw:
                Gw.add_edge(a1, a2)
            elif a1 in Gm and a2 in Gm:
                Gm.add_edge(a1, a2)


with open('title.principals.tsv') as tsvfile:
    reader = csv.reader(tsvfile, delimiter='\t')
    prev = ""    # title id (first column) of the last qualifying row
    actors = []  # qualifying performers collected for the title `prev`
    for row in reader:
        nconst = row[2]
        category = row[3]
        if category == 'actor':
            gender_graph = Gm
        elif category == 'actress':
            gender_graph = Gw
        else:
            continue
        # Keep only people present in both the name table and the scraped
        # box-office table.
        if nconst not in dict or dict[nconst] not in boxofficedict:
            continue
        name = dict[nconst]
        _register_performer(nconst, name, gender_graph)
        # Grouping assumes all rows of one title are contiguous in the file.
        if row[0] != prev:
            _link_costars(actors)
            actors = [name]
        else:
            actors.append(name)
        prev = row[0]
    # Flush the final title: without this its co-star edges were never added,
    # because edges were only emitted when the *next* title started.
    _link_costars(actors)

In [None]:
# Sorted node lists of the actress graph and the actor graph.
actorw = list(Gw.nodes)
actorw.sort()
actorm = list(Gm.nodes)
actorm.sort()

In [None]:
# All performers: the sorted actresses followed by the sorted actors.
actorlist = actorw + actorm

In [None]:
# Print "<name> <age>" for every performer that made it into the graphs.
# Membership is tested against a set built once, not by scanning the list on
# every iteration.
listed = set(actorlist)
for a in agedict:
    if a in listed:
        # Same output as `print a, agedict[a]`, but valid on Python 2 and 3.
        print("%s %s" % (a, agedict[a]))

In [None]:
# Performers whose age is still the 0 placeholder, in agedict iteration order.
missing = [a for a in agedict if agedict[a] == 0]

In [None]:
# Hand-collected values for the performers in `missing`; they are subtracted
# from 2019 in the next cell, so despite the name they are birth years.
# NOTE(review): the pairing is purely positional and therefore depends on the
# iteration order of agedict at the time `missing` was built - verify the
# name/year pairs before trusting them on a fresh run.
ages = [
    1973, 1964, 1968, 1966, 1997, 1972, 1984, 1954, 1951, 1965,
    1958, 1988, 1989, 1931, 1968, 1990, 1966, 1928, 1972, 1969,
    1974, 1969, 1976, 1922, 1997, 1995,
]
# Turn `missing` into a list of (name, birth year) pairs.
missing = list(zip(missing, ages))

In [None]:
# Replace the 0 placeholders with ages derived from the looked-up birth years.
for performer, birth_year in missing:
    agedict[performer] = 2019 - birth_year

In [None]:
# Split G into two graphs: Gtop holds performers whose name appears in
# academylist, Glow holds everyone else.  An edge is kept only when both of
# its endpoints fall on the same side; edges across the split are dropped.
Gtop = nx.Graph()
Glow = nx.Graph()
# Build the lookup set once: scanning the academylist list for every node and
# every edge endpoint is needlessly slow.
nominated = set(academylist)
for a in G.nodes():
    if a in nominated:
        Gtop.add_node(a)
    else:
        Glow.add_node(a)
for (u, v) in G.edges():
    u_nominated = u in nominated
    v_nominated = v in nominated
    if u_nominated and v_nominated:
        Gtop.add_edge(u, v)
    elif not u_nominated and not v_nominated:
        Glow.add_edge(u, v)