this file takes a set of movie.json files and turns them into a graphml files that you can then 
import into neo4j desktop

to see how to actually import a graphml file into neo4j. install the apoc plugin in neo4j-desktop, and look at this stackoverflow post :

https://stackoverflow.com/questions/52210619/how-to-import-a-networkx-graph-to-neo4j#52571797


In [36]:
import pandas as pd
import os
import json
import pprint
import networkx as nx



In [37]:
main_data = pd.read_csv("../imdb_data.csv")

In [38]:
movie_dir = "movies"

In [39]:
columns = ['id', 'primaryTitle', 'runtimeMinutes', 'genres', 'averageRating', 'numVotes', 'budget', 'gross', 'release_date']

def load_and_prep(main_data : pd.DataFrame, movie_dir : str, columns : list, number_of_movies=100):
    '''
    take the data from a movie cast folder, pairs it with datafrom the imdb movie data.
    if a partiular value is missing for whatever reason it's marked with a -42
    '''
    movie_list = os.listdir(movie_dir)
    cast_and_movie = []
    for movie in movie_list[:number_of_movies]:
        relation_dict = {}
        with open(f"{movie_dir}/{movie}", "r") as in_file:
            movie_dict = json.load(in_file)
            for column in columns:
                record = main_data[main_data["primaryTitle"] == movie_dict["name"]]
                if not record.empty:
                    r_data = record[column]
                    r_data.fillna(-42, inplace=True)
                    relation_dict[column] = r_data.values[0]
        movie_dict['imdb_data'] = relation_dict
        cast_and_movie.append(movie_dict)

    return cast_and_movie

cast_and_movie = load_and_prep(main_data, movie_dir, columns, number_of_movies=2500)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  r_data.fillna(-42, inplace=True)
  r_data.fillna(-42, inplace=True)


In [40]:
cast_and_movie[0]['imdb_data']

{'id': 'tt1646987',
 'primaryTitle': 'Wrath of the Titans',
 'runtimeMinutes': 99,
 'genres': 'Action,Adventure,Fantasy',
 'averageRating': 5.7,
 'numVotes': 193787,
 'budget': 150000000,
 'gross': 301970083.0,
 'release_date': 'March 28, 2012'}

In [41]:
# what specific departments from a movie that one wishes to include
# for now I just added the important looking ones --Nico
role_set = ['director', 'writer', 'producer', 'composer', 'cinematographer', 'editor', 'cast']

def add_relationships(g : nx.MultiGraph, people_set : set, imdb_data : dict):
    # add edges between all the nodes
    # each edge represents a movie that the two people worked on
    # includes all data about that movie from imdb.csv except for 
    edge_set = set({})
    while len(people_set) != 0:
        person_1 = people_set.pop()
        for person_2 in people_set:
            edge_set.add((person_1, person_2))

    for edge in edge_set:
        g.add_edge(*edge, **imdb_data)
    return g

def add_people_nodes(g : nx.MultiGraph, set_of_people : list, new_role : str):
    # adds a set of nodes which represnts people to the graph
    # they are labled with the set of jobs they played durring their carrier 
    # represented by a simple comma seperated list
    for person in set_of_people:
        if g.has_node(person):
            g.nodes[person]["roles"].add(new_role)
        else:
            g.add_node(person, name=person, roles=set({new_role}))

def fix_lables(g : nx.MultiGraph):
    # a last minute addition since it seems graph ml doesn't like lists 
    new_node_lables = {}
    for node in g.nodes:
        new_node_lables[node] = ",".join(g.nodes[node]["roles"])
    nx.set_node_attributes(g, new_node_lables, name="roles")

def load_data(cast_and_movie : list, role_set : list, number_of_cast=10):
    # basicly takes in a list of movies (each represented by json records) 
    # and builds a social graph where each node is a person and each edge is a movie 
    # that two people worked on togther 
    pos = 0
    graph_g = nx.MultiGraph()
    for movie in cast_and_movie:
        # people are in a set to cut down on duplicate nodes
        # a single person can have multiple roles after all
        people_set = set()
        for role_name in role_set:       
            # extra check since cast has a lot of 
            # not super relevent people     
            if role_name != 'cast':
                # get all the people based in what job
                # they had in the movie
                people = movie[role_name]
                add_people_nodes(graph_g, people, role_name)
            else:
                people = movie[role_name][:number_of_cast]
                add_people_nodes(graph_g, people, role_name)
            people_set.update(people)
        add_relationships(graph_g, people_set, movie['imdb_data'])
        
    # turns out graphml doesn't like collections
    fix_lables(graph_g)

    return graph_g

graph_g = load_data(cast_and_movie, role_set, 8)
print(graph_g)

MultiGraph with 22748 nodes and 591832 edges


In [42]:
print(graph_g)


MultiGraph with 22748 nodes and 591832 edges


In [43]:
nx.write_graphml(graph_g, './main.graphml')
