In [4]:
from pathlib import Path

import networkx as nx
import numpy as np
import pandas as pd
import seaborn as sb

In [5]:
# All input CSVs live in one directory; change DATA_DIR to relocate the data.
DATA_DIR = Path("../data")

# movie_industry.csv is read with the Windows-1252 codec; the remaining files
# are read with pandas' default encoding.
movie_industry = pd.read_csv(DATA_DIR / "movie_industry.csv", encoding="windows-1252")
oscar = pd.read_csv(DATA_DIR / "the_oscar_award.csv")
genome_scores = pd.read_csv(DATA_DIR / "genome-scores.csv")
genome_tags = pd.read_csv(DATA_DIR / "genome-tags.csv")
ratings = pd.read_csv(DATA_DIR / "ratings.csv")
tags = pd.read_csv(DATA_DIR / "tags.csv")
movies = pd.read_csv(DATA_DIR / "movies.csv")

In [20]:
class Graph:
    """Build a director/actor collaboration graph from a table of films.

    The networkx graph is stored on ``self.G``. Every node is a
    ``(name, is_director)`` tuple: ``(name, True)`` for a director and
    ``(name, False)`` for an actor. A person who appears in both columns
    therefore gets two distinct nodes, one on each side of the graph.

    Parameters
    ----------
    directors_to_actors_relation : pandas.DataFrame
        One row per collaboration. Must contain ``director_column``,
        ``actor_column`` and every column listed in ``weight_func_args``.
    weight_func : callable
        Called as ``weight_func(*row_values, directors_to_actors_relation)``,
        where ``row_values`` are the row's entries for the columns in
        ``weight_func_args`` (in that order). Its return value is stored as
        the edge's ``weight`` attribute.
    weight_func_args : sequence of str
        Column names whose row values are handed to ``weight_func``.
    director_column : str, default "director"
        Column holding the director name.
    actor_column : str, default "star"
        Column holding the actor name.
    bipartite : bool, default True
        When true, each node also carries networkx's conventional
        ``bipartite`` node attribute (0 for directors, 1 for actors).
        When false, nodes are added without attributes.

    Notes
    -----
    * Missing names (NaN/None) never become nodes, and rows missing either
      name contribute no edge.
    * ``weight_func`` is called once per distinct (director, actor) pair,
      with the values taken from the last row of that pair.
    * Rows are read by position, so a non-unique DataFrame index is fine.
    """
    def __init__(self,
                 directors_to_actors_relation,
                 weight_func,
                 weight_func_args,
                 director_column="director",
                 actor_column="star",
                 bipartite=True):
        self.G = nx.Graph()

        relation = directors_to_actors_relation
        director_attrs = {"bipartite": 0} if bipartite else {}
        actor_attrs = {"bipartite": 1} if bipartite else {}

        # Directors first, then actors; the boolean in the tuple keeps the two
        # roles of the same person apart.
        for director in relation[director_column].dropna().unique():
            self.G.add_node((director, True), **director_attrs)
        for actor in relation[actor_column].dropna().unique():
            self.G.add_node((actor, False), **actor_attrs)

        # One row per distinct pair. keep="last" selects the row whose weight
        # would have survived if every row had overwritten the edge in turn.
        pair_columns = [director_column, actor_column]
        pairs = (relation
                 .dropna(subset=pair_columns)
                 .drop_duplicates(subset=pair_columns, keep="last"))

        director_names = pairs[director_column]
        actor_names = pairs[actor_column]
        argument_columns = [pairs[column] for column in weight_func_args]

        # Positional access (.iat) always yields scalars, even when the index
        # contains duplicate labels.
        for position in range(len(pairs)):
            arguments = [column.iat[position] for column in argument_columns]
            self.G.add_edge((director_names.iat[position], True),
                            (actor_names.iat[position], False),
                            weight=weight_func(*arguments, relation))

In [21]:
def example_weight_func(director, star, df):
    """Edge weight = number of collaborations between a director and a star.

    Counts the rows of ``df`` whose "director" column equals ``director`` and
    whose "star" column equals ``star``.
    """
    same_director = df["director"] == director
    same_star = df["star"] == star
    shared_rows = df.loc[same_director & same_star]
    return len(shared_rows)

In [22]:
# Graph whose edge weights come from example_weight_func.
storage = Graph(
    directors_to_actors_relation=movie_industry,
    weight_func=example_weight_func,
    weight_func_args=["director", "star"],
)

In [23]:
# Edges incident to the director-side node for Clint Eastwood
storage.G.edges(nbunch=[("Clint Eastwood", True)])

EdgeDataView([(('Clint Eastwood', True), ('Clint Eastwood', False)), (('Clint Eastwood', True), ('Forest Whitaker', False)), (('Clint Eastwood', True), ('Kevin Costner', False)), (('Clint Eastwood', True), ('John Cusack', False)), (('Clint Eastwood', True), ('Sean Penn', False)), (('Clint Eastwood', True), ('Hilary Swank', False)), (('Clint Eastwood', True), ('Ken Watanabe', False)), (('Clint Eastwood', True), ('Ryan Phillippe', False)), (('Clint Eastwood', True), ('Angelina Jolie', False)), (('Clint Eastwood', True), ('Morgan Freeman', False)), (('Clint Eastwood', True), ('Matt Damon', False)), (('Clint Eastwood', True), ('Leonardo DiCaprio', False)), (('Clint Eastwood', True), ('Bradley Cooper', False)), (('Clint Eastwood', True), ('John Lloyd Young', False)), (('Clint Eastwood', True), ('Tom Hanks', False))])

In [24]:
# Edges incident to the actor-side node for Clint Eastwood
storage.G.edges(nbunch=[("Clint Eastwood", False)])

EdgeDataView([(('Clint Eastwood', False), ('Wolfgang Petersen', True)), (('Clint Eastwood', False), ('Clint Eastwood', True)), (('Clint Eastwood', False), ('Buddy Van Horn', True)), (('Clint Eastwood', False), ('Robert Lorenz', True))])

In [25]:
# Weight of the edge joining Clint Eastwood the director to Clint Eastwood the actor
storage.G.get_edge_data(
    u=("Clint Eastwood", True),
    v=("Clint Eastwood", False),
)

{'weight': 10}

In [46]:
# Rows in which Clint Eastwood is both the director and the star
movie_industry.loc[
    movie_industry["director"].eq("Clint Eastwood")
    & movie_industry["star"].eq("Clint Eastwood")
]

Unnamed: 0,budget,company,country,director,genre,gross,name,rating,released,runtime,score,star,votes,writer,year
23,15000000.0,Jay Weston Productions,USA,Clint Eastwood,Action,42724017.0,Heartbreak Ridge,R,1986-12-05,130,6.8,Clint Eastwood,32954,James Carabatsos,1986
943,10000000.0,Warner Bros.,USA,Clint Eastwood,Action,21633874.0,The Rookie,R,1990-12-07,120,5.8,Clint Eastwood,21622,Boaz Yakin,1990
1002,24000000.0,Malpaso Productions,USA,Clint Eastwood,Adventure,2319124.0,White Hunter Black Heart,PG,1990-09-14,112,6.7,Clint Eastwood,9279,Peter Viertel,1990
1326,14400000.0,Warner Bros.,USA,Clint Eastwood,Drama,101157447.0,Unforgiven,R,1992-08-07,131,8.2,Clint Eastwood,304536,David Webb Peoples,1992
2043,24000000.0,Warner Bros.,USA,Clint Eastwood,Drama,71516617.0,The Bridges of Madison County,PG-13,1995-06-02,135,7.6,Clint Eastwood,58975,Richard LaGravenese,1995
2480,50000000.0,Castle Rock Entertainment,USA,Clint Eastwood,Action,50007168.0,Absolute Power,R,1997-02-14,121,6.7,Clint Eastwood,42417,David Baldacci,1997
2971,55000000.0,Warner Bros.,USA,Clint Eastwood,Crime,16635339.0,True Crime,R,1999-03-19,127,6.6,Clint Eastwood,27145,Andrew Klavan,1999
3158,65000000.0,Clipsal Films,USA,Clint Eastwood,Action,90464773.0,Space Cowboys,PG-13,2000-08-04,130,6.4,Clint Eastwood,63698,Ken Kaufman,2000
3643,50000000.0,Malpaso Productions,USA,Clint Eastwood,Action,26199517.0,Blood Work,R,2002-08-09,110,6.4,Clint Eastwood,36540,Michael Connelly,2002
4860,33000000.0,Matten Productions,Germany,Clint Eastwood,Drama,148095302.0,Gran Torino,R,2009-01-09,116,8.2,Clint Eastwood,609483,Nick Schenk,2008


In [44]:
# This edge does not exist, so the lookup returns None and the cell shows no output
storage.G.get_edge_data(
    u=("Clint Eastwood", True),
    v=("Wil Wheaton", False),
)

In [32]:
def mean_gross_weight_func(director, star, df):
    """Edge weight = mean gross of the films a director and a star share.

    Averages the "gross" values of the rows of ``df`` whose "director" column
    equals ``director`` and whose "star" column equals ``star``.
    """
    pair_mask = (df["director"] == director) & (df["star"] == star)
    pair_gross = df.loc[pair_mask, "gross"].values
    return np.mean(pair_gross)

# Same graph construction, but with edge weights from mean_gross_weight_func.
storage2 = Graph(
    directors_to_actors_relation=movie_industry,
    weight_func=mean_gross_weight_func,
    weight_func_args=["director", "star"],
)

In [40]:
# Weight stored on the Eastwood-director / Eastwood-actor edge in storage2
storage2.G.get_edge_data(
    u=("Clint Eastwood", True),
    v=("Clint Eastwood", False),
)

{'weight': 57075317.8}

In [34]:
# Cross-check: compute the same mean gross directly from the DataFrame
np.mean(
    movie_industry.loc[
        movie_industry["director"].eq("Clint Eastwood")
        & movie_industry["star"].eq("Clint Eastwood"),
        "gross",
    ].values
)

57075317.8