In [1]:
import csv
import os
import math
import numpy as np
import pandas as pd

import neo4j
import psycopg2

In [2]:
# The data directory is expected to be a sibling of the directory the notebook runs from
dir_code = os.getcwd()
dir_data = os.path.join(os.path.dirname(dir_code), "data")

data_filename = "cites_elephant_ivory_trades_clean.csv"
data_filepath = os.path.join(dir_data, data_filename)

# Connection settings can be overridden through environment variables so credentials
# do not have to be edited into the notebook; the defaults are the original values.
neo4j_uri = os.environ.get("NEO4J_URI", "neo4j://neo4j:7687")
neo4j_user = os.environ.get("NEO4J_USER", "neo4j")
neo4j_password = os.environ.get("NEO4J_PASSWORD", "ucb_mids_w205")
neo4j_database = os.environ.get("NEO4J_DATABASE", "neo4j")

# Module-level driver and session shared by every helper function below
driver = neo4j.GraphDatabase.driver(uri=neo4j_uri, auth=(neo4j_user, neo4j_password))
session = driver.session(database=neo4j_database)

In [3]:
# FUNCTION COPIED FROM LABS
def my_neo4j_wipe_out_database():
    """Wipe out the database by deleting all nodes and relationships.

    Uses the module-level ``session``. Prints a confirmation message once the
    delete has completed.
    """

    # DETACH DELETE removes each node together with the relationships attached to it,
    # so a separate relationship-deleting pass is not needed.
    query = "match (node) detach delete node"

    # consume() waits for the server to finish, so a failure is raised here
    # instead of after the success message has been printed.
    session.run(query).consume()

    print("Cleared neo4j database!")
    
# FUNCTION COPIED FROM LABS   
def my_neo4j_run_query_pandas(query, **kwargs):
    """Execute a query on the shared session and return its records as a pandas DataFrame.

    Any keyword arguments are forwarded unchanged to ``session.run``.
    The DataFrame has one row per returned record and one column per result key.
    """

    query_result = session.run(query, **kwargs)

    # Collect the values of every record first, then read the column names
    rows = []
    for record in query_result:
        rows.append(record.values())
    column_names = query_result.keys()

    return pd.DataFrame(rows, columns=column_names)

# FUNCTION COPIED FROM LABS
def my_neo4j_number_nodes_relationships():
    """Print the number of nodes and relationships currently in the database.

    The counting is done by the database with ``count(...)``, so only a single
    row per query is transferred instead of every node and relationship.
    """

    # An aggregate without grouping always yields exactly one row (0 when empty)
    node_counts = my_neo4j_run_query_pandas("match (n) return count(n) as total")
    number_nodes = int(node_counts["total"].iloc[0])

    # Each relationship is directed, so this pattern matches it exactly once
    relationship_counts = my_neo4j_run_query_pandas(
        "match ()-[r]->() return count(r) as total"
    )
    number_relationships = int(relationship_counts["total"].iloc[0])

    print("-------------------------")
    print("  Nodes:", number_nodes)
    print("  Relationships:", number_relationships)
    print("-------------------------")

# NEW FUNCTION, WRITTEN BY ANUSHKA
def my_neo4j_load_data(query, df, structure_type, desc):
    """Run a query to load graph components from a dataframe.

    Args:
        query: Cypher query that reads its rows from the ``$data`` parameter.
        df: DataFrame whose rows are passed to the query, one dict per row.
        structure_type: Kind of component being loaded (e.g. "node"); used only
            in the printed message.
        desc: Short description of what is loaded; upper-cased in the printed message.

    The printed count is the number of rows in ``df``, not a count reported by
    the database.
    """

    # Bind the rows to $data as a list of dicts keyed by column name
    records = df.to_dict(orient="records")

    # consume() waits for the query to finish, so a server-side failure is raised
    # here rather than after the success message below has been printed.
    session.run(query, data=records).consume()

    print(f"\nLoaded {len(df)} {desc.upper()} {structure_type}s.")


In [4]:
# Read in clean data
df_elephant_ivory_trade = pd.read_csv(data_filepath)
# Print column names, dtypes and non-null counts as a quick check of the load
df_elephant_ivory_trade.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 194592 entries, 0 to 194591
Data columns (total 10 columns):
 #   Column    Non-Null Count   Dtype  
---  ------    --------------   -----  
 0   id        194592 non-null  int64  
 1   year      194592 non-null  int64  
 2   taxon     194592 non-null  object 
 3   family    194592 non-null  object 
 4   term      194592 non-null  object 
 5   quantity  194592 non-null  float64
 6   unit      194592 non-null  object 
 7   importer  194592 non-null  object 
 8   exporter  194592 non-null  object 
 9   origin    36958 non-null   object 
dtypes: float64(1), int64(2), object(7)
memory usage: 14.8+ MB


In [5]:
def create_graph_exporters(df):
    """
    Create elephant ivory trade exporters graph.

    Countries involved in ivory trades are represented as nodes. The graph includes the following relationships:
        - EXPORTED_TO (country "A" (exporter) EXPORTED_TO country "B" (importer))
        - EXPORTS_ORIGINATED_FROM (country "A" (exporter) EXPORTS_ORIGINATED_FROM country "C" (origin))
            - Exporters are not always exporting ivory that was originally sourced from their country
            - origin not populated for all trades, totals will not match

    Each relationship carries two properties:
        - weight: sum of "quantity" over the trades between the two countries
        - n_trades: number of trade rows between the two countries

    The database is wiped before loading, so any existing graph is replaced.

    Args:
        df: trade DataFrame with "importer", "exporter", "origin" and "quantity" columns.
    """

    my_neo4j_wipe_out_database()

    # ----- CREATE NODES: COUNTRIES -----

    # Create df with unique list of countries (a country may appear in any of the three roles)
    countries_stack = pd.concat([df["importer"], df["exporter"], df["origin"]])
    countries = countries_stack.dropna().drop_duplicates().to_frame(name="country")

    # Define query for loading country nodes
    query_nodes_countries = """
        UNWIND $data as row
        
        CREATE (:Country {
            name: row.country
        })
    """

    # Run query to create graph nodes
    my_neo4j_load_data(query_nodes_countries, countries, "node", "country")

    # Define query to create an index for the country nodes
    query_country_index = """
        CREATE INDEX country_index IF NOT EXISTS FOR (country:Country) ON (country.name);
    """

    # Run query to index the country nodes; consume() surfaces any failure before the message
    session.run(query_country_index).consume()
    print("Created index for country nodes.")

    # ----- CREATE RELATIONSHIPS: EXPORTED TO -----

    # Create aggregated df to get exporter relationships
    # (rows missing either endpoint cannot form a relationship and are dropped explicitly)
    exporters_summ = (
        df.dropna(subset=["exporter", "importer"])
        .groupby(["exporter", "importer"])
        .agg(
            specimens_ivory=("quantity", "sum"),
            n_trades=("quantity", "size"),
        )
        .reset_index()
    )

    # Define query for loading exporter relationships
    query_relationship_exporters = """
        UNWIND $data as row
        
        MATCH (importer:Country {name: row.importer}), 
              (exporter:Country {name: row.exporter})
        CREATE (exporter)-[:EXPORTED_TO {weight: row.specimens_ivory, n_trades: row.n_trades}]->(importer)
    """

    # Run query to create exporter relationships
    my_neo4j_load_data(query_relationship_exporters, exporters_summ, "relationship", "EXPORTED_TO")

    # ----- CREATE RELATIONSHIPS: EXPORTS ORIGINATED FROM -----

    # Create aggregated df to get export originating relationships
    # (trades without a recorded origin are excluded, so totals differ from EXPORTED_TO)
    export_origins_summ = (
        df.dropna(subset=["exporter", "origin"])
        .groupby(["exporter", "origin"])
        .agg(
            specimens_ivory=("quantity", "sum"),
            n_trades=("quantity", "size"),
        )
        .reset_index()
    )

    # Define query for loading export originating relationships
    query_relationship_export_origins = """
        UNWIND $data as row
        
        MATCH (origin:Country {name: row.origin}), 
              (exporter:Country {name: row.exporter})
        CREATE (exporter)-[:EXPORTS_ORIGINATED_FROM {weight: row.specimens_ivory, n_trades: row.n_trades}]->(origin)
    """

    # Run query to create export originating relationships
    my_neo4j_load_data(query_relationship_export_origins, export_origins_summ, "relationship", "EXPORTS_ORIGINATED_FROM")

    print("\nDone loading graph!")
    

In [6]:
# Build the exporters graph in neo4j from the trade data loaded above
create_graph_exporters(df_elephant_ivory_trade)

Cleared neo4j database!

Loaded 223 COUNTRY nodes.
Created index for country nodes.

Loaded 2885 EXPORTED_TO relationships.

Loaded 1129 EXPORTS_ORIGINATED_FROM relationships.

Done loading graph!


In [7]:
# Sanity check: report how many nodes and relationships are now in the database
my_neo4j_number_nodes_relationships()

-------------------------
  Nodes: 223
  Relationships: 4014
-------------------------


In [8]:
# Clean-up: delete every node and relationship from the database again
my_neo4j_wipe_out_database()

Cleared neo4j database!
