In [1]:
import csv
import os
import math
import numpy as np
import pandas as pd

import neo4j
import psycopg2

In [2]:
dir_code = os.getcwd()
dir_data = os.path.join(os.path.dirname(dir_code), "data")

data_filename = "cites_elephant_ivory_trades_clean.csv"
data_filepath = os.path.join(dir_data, data_filename)

driver = neo4j.GraphDatabase.driver(uri="neo4j://neo4j:7687", auth=("neo4j","ucb_mids_w205"))
session = driver.session(database="neo4j")

In [3]:
# FUNCTION COPIED FROM LABS
def my_neo4j_wipe_out_database():
    """wipe out database by deleting all nodes and relationships"""
    
    query = "match (node)-[relationship]->() delete node, relationship"
    session.run(query)
    
    query = "match (node) delete node"
    session.run(query)
    
    print("Cleared neo4j database!")
    
# FUNCTION COPIED FROM LABS   
def my_neo4j_run_query_pandas(query, **kwargs):
    "run a query and return the results in a pandas dataframe"
    
    result = session.run(query, **kwargs)
    
    df = pd.DataFrame([r.values() for r in result], columns=result.keys())
    
    return df

# FUNCTION COPIED FROM LABS
def my_neo4j_number_nodes_relationships():
    """print the number of nodes and relationships"""
   
    query = """
        match (n) 
        return n.name as node_name, labels(n) as labels
        order by n.name
    """
    
    df = my_neo4j_run_query_pandas(query)
    
    number_nodes = df.shape[0]
    
    
    query = """
        match (n1)-[r]->(n2) 
        return n1.name as node_name_1, labels(n1) as node_1_labels, 
            type(r) as relationship_type, n2.name as node_name_2, labels(n2) as node_2_labels
        order by node_name_1, node_name_2
    """
    
    df = my_neo4j_run_query_pandas(query)
    
    number_relationships = df.shape[0]
    
    print("-------------------------")
    print("  Nodes:", number_nodes)
    print("  Relationships:", number_relationships)
    print("-------------------------")

# NEW FUNCTION, WRITTEN BY ANUSHKA
def my_neo4j_load_data(query, df, structure_type, desc):
    """run a query to load graph components from a dataframe"""
    
    # Convert df to dictionary dictionary (neo4j requires dictionaries to run queries on)
    df_as_dict = df.to_dict(orient="records")

    # Run query
    session.run(query, data=df_as_dict)
    
    print(f"\nLoaded {len(df)} {desc.upper()} {structure_type}s.")


In [4]:
# Read in clean data
df_elephant_ivory_trade = pd.read_csv(data_filepath)
df_elephant_ivory_trade.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 194592 entries, 0 to 194591
Data columns (total 10 columns):
 #   Column    Non-Null Count   Dtype  
---  ------    --------------   -----  
 0   id        194592 non-null  int64  
 1   year      194592 non-null  int64  
 2   taxon     194592 non-null  object 
 3   family    194592 non-null  object 
 4   term      194592 non-null  object 
 5   quantity  194592 non-null  float64
 6   unit      194592 non-null  object 
 7   importer  194592 non-null  object 
 8   exporter  194592 non-null  object 
 9   origin    36958 non-null   object 
dtypes: float64(1), int64(2), object(7)
memory usage: 14.8+ MB


In [5]:
def create_graph_exporters(df):
    """
    Create elephant ivory trade exporters graph.
    
    Countries involved in ivory trades are represented as nodes. The graph includes the following relationships:
        - EXPORTED TO (country "A" (exporter) EXPORTED TO country "B" (importer))
        - EXPORTS ORIGINATED FROM (country "A" (exporter) EXPORTS ORIGINATED FROM country "C" (origin))
            - Exporters are not always exporting ivory that was originally sourced from their country
            - origin not populated for all trades, totals will not match
    """

    my_neo4j_wipe_out_database()
    
    # ----- CREATE NODES: COUNTRIES -----
    
    # Create df with unique list of countries
    countries_stack = pd.concat([df["importer"], df["exporter"], df["origin"]])
    countries = countries_stack.dropna().drop_duplicates().to_frame(name="country")
    
    # Define query for loading country nodes
    query_nodes_countries = """
        UNWIND $data as row
        
        CREATE (:Country {
            name: row.country
        })
    """ 
    
    # Run query to create graph nodes
    my_neo4j_load_data(query_nodes_countries, countries, "node", "country")
    
    # Define query to create an index for the country nodes
    query_country_index = """
        CREATE INDEX country_index IF NOT EXISTS FOR (country:Country) ON (country.name);
    """
    
    # Run query to index the country nodes
    session.run(query_country_index)
    print(f"Created index for country nodes.")
    
    # ----- CREATE RELATIONSHIPS: EXPORTED TO -----
    
    # Create aggregated df to get exporter relationships
    exporters_summ = df[df["exporter"].notna()].groupby(["exporter", "importer"]).agg(
        specimens_ivory = ("quantity", "sum"),
        n_trades = ("quantity", "size")
    ).reset_index()
    
    # Define query for loading exporter relationships
    query_relationship_exporters = """
        UNWIND $data as row
        
        MATCH (importer:Country {name: row.importer}), 
              (exporter:Country {name: row.exporter})
        CREATE (exporter)-[:EXPORTED_TO {weight: row.specimens_ivory}]->(importer)
    """
    
    # Run query to create exporter relationships
    my_neo4j_load_data(query_relationship_exporters, exporters_summ, "relationship", "EXPORTED_TO")
    
    # ----- CREATE RELATIONSHIPS: EXPORTS ORIGINATED FROM -----
    
    # Create aggregated df to get export originating relationships
    export_origins_summ = df[df["exporter"].notna()].groupby(["exporter", "origin"]).agg(
        specimens_ivory = ("quantity", "sum"),
        n_trades = ("quantity", "size")
    ).reset_index()
    
    # Define query for loading export originating relationships
    query_relationship_export_origins = """
        UNWIND $data as row
        
        MATCH (origin:Country {name: row.origin}), 
              (exporter:Country {name: row.exporter})
        CREATE (exporter)-[:EXPORTS_ORIGINATED_FROM {weight: row.specimens_ivory}]->(origin)
    """
    
    # Run query to create export originating relationships
    my_neo4j_load_data(query_relationship_export_origins, export_origins_summ, "relationship", "EXPORTS_ORIGINATED_FROM")
    
    print("\nDone loading graph!")
    

In [6]:
create_graph_exporters(df_elephant_ivory_trade)

Cleared neo4j database!

Loaded 223 COUNTRY nodes.
Created index for country nodes.

Loaded 2885 EXPORTED_TO relationships.

Loaded 1129 EXPORTS_ORIGINATED_FROM relationships.

Done loading graph!


In [7]:
my_neo4j_number_nodes_relationships()

-------------------------
  Nodes: 223
  Relationships: 4014
-------------------------


# ANALYTICS

In [8]:
# Project the exporter graph into GDS
#    (similar pattern to 'ds_graph' in lab 9)

# Drop existing GDS graph 
query = "CALL gds.graph.drop('ivory_export_graph', false) YIELD graphName"
try:
    session.run(query)
    print("Dropped existing GDS graph 'ivory_export_graph'")
except:
    print("No existing 'ivory_export_graph' to drop.")

# Project :Country nodes and :EXPORTED_TO relationships (with weight)
query = """
CALL gds.graph.project(
  'ivory_export_graph',
  'Country',
  {
    EXPORTED_TO: {
      properties: 'weight'
    }
  }
)
YIELD graphName, nodeCount, relationshipCount
"""
df_proj = my_neo4j_run_query_pandas(query)
print("Projected ivory_export_graph into GDS:")
display(df_proj)



# PageRank: most influential exporting countries
#    (on EXPORTED_TO relationships)

query = """
CALL gds.pageRank.stream('ivory_export_graph',
                         { maxIterations: $max_iterations,
                           dampingFactor: $damping_factor,
                           relationshipWeightProperty: 'weight'}
                         )
YIELD nodeId, score
RETURN gds.util.asNode(nodeId).name AS country, score AS page_rank
ORDER BY page_rank DESC, country ASC
"""

max_iterations = 20
damping_factor = 0.85

df_exporter_pagerank = my_neo4j_run_query_pandas(
    query,
    max_iterations=max_iterations,
    damping_factor=damping_factor
)

print("\nTop exporters by PageRank (ivory_export_graph):")
display(df_exporter_pagerank.head(10))


Dropped existing GDS graph 'ivory_export_graph'
Projected ivory_export_graph into GDS:


Unnamed: 0,graphName,nodeCount,relationshipCount
0,ivory_export_graph,223,2885



Top exporters by PageRank (ivory_export_graph):


Unnamed: 0,country,page_rank
0,US,44.767792
1,CA,20.787474
2,HK,15.900936
3,DE,15.153764
4,JP,13.770176
5,GB,13.564888
6,CH,6.997398
7,FR,6.665679
8,IT,3.10004
9,DK,2.704293


In [9]:
# Betweenness Centrality: intermediate countries acting as middlemen


query = """
CALL gds.betweenness.stream('ivory_export_graph',
                            {relationshipWeightProperty: 'weight'})
YIELD nodeId, score
RETURN gds.util.asNode(nodeId).name AS country, score AS betweenness
ORDER BY betweenness DESC, country ASC
"""

df_exporter_betweenness = my_neo4j_run_query_pandas(query)

print("\nCountries with highest betweenness (middlemen in exporter graph):")
display(df_exporter_betweenness.head(10))


Countries with highest betweenness (middlemen in exporter graph):


Unnamed: 0,country,betweenness
0,US,10045.473303
1,CH,7563.802943
2,DE,6032.622351
3,ZW,5611.924859
4,CA,5569.256746
5,NZ,5055.615127
6,GB,4000.526717
7,FR,3625.533889
8,ES,3244.432863
9,NL,3179.266789


In [10]:
# Degree Centrality: countries with the most direct imports/exports

query = """
CALL gds.degree.stream('ivory_export_graph')
YIELD nodeId, score
RETURN gds.util.asNode(nodeId).name AS country, score AS degree
ORDER BY degree DESC, country ASC
"""

df_exporter_degree = my_neo4j_run_query_pandas(query)

print("\nCountries with highest degree (most direct EXPORTED_TO connections):")
display(df_exporter_degree.head(10))


Countries with highest degree (most direct EXPORTED_TO connections):


Unnamed: 0,country,degree
0,ZW,150.0
1,ZA,112.0
2,GB,100.0
3,US,98.0
4,FR,85.0
5,DE,75.0
6,CH,74.0
7,CN,67.0
8,TZ,65.0
9,BW,63.0


In [11]:
# For countries exporting ivory, what countries do they want their exports to originate from?
# (Using :EXPORTS_ORIGINATED_FROM relationships)


def exporter_origin_preferences(top_n_per_exporter=3):
    """
    For each exporter, summarize which origin countries their exports
    are reported to originate from, using EXPORTS_ORIGINATED_FROM relationships.

    Returns:
      - full dataframe of (exporter, origin, total_ivory, n_trades)
      - top_n_per_exporter origins per exporter (by total_ivory)
    """

    query = """
    MATCH (exporter:Country)-[r:EXPORTS_ORIGINATED_FROM]->(origin:Country)
    RETURN
      exporter.name AS exporter,
      origin.name   AS origin,
      sum(r.weight) AS total_ivory,
      count(r)      AS n_trades
    ORDER BY exporter, total_ivory DESC
    """

    df = my_neo4j_run_query_pandas(query)

    # Sort and keep top N origins per exporter by total_ivory
    df_sorted = df.sort_values(["exporter", "total_ivory"], ascending=[True, False])
    df_top = df_sorted.groupby("exporter").head(top_n_per_exporter).reset_index(drop=True)

    return df, df_top


# analysis
df_exporter_origin_all, df_exporter_origin_top = exporter_origin_preferences(top_n_per_exporter=3)

# for each exporting country,which origin countries are the source of the ivory and how much ivory came from each origin.

print("All exporter → origin pairs (aggregated):")
display(df_exporter_origin_all.head(200))


print("\nTop 3 origin countries per exporter (by total ivory volume):")
display(df_exporter_origin_top.head(200))


All exporter → origin pairs (aggregated):


Unnamed: 0,exporter,origin,total_ivory,n_trades
0,AE,TZ,4669.000000,1
1,AE,CF,987.000000,1
2,AE,ZM,637.000000,1
3,AE,CG,402.000000,1
4,AE,ZW,315.000000,1
...,...,...,...,...
195,CH,SD,1668.000000,1
196,CH,MO,724.166667,1
197,CH,HK,724.000000,1
198,CH,ET,717.000000,1



Top 3 origin countries per exporter (by total ivory volume):


Unnamed: 0,exporter,origin,total_ivory,n_trades
0,AE,TZ,4669.0,1
1,AE,CF,987.0,1
2,AE,ZM,637.0,1
3,AF,KE,2.0,1
4,AM,SD,14.0,1
...,...,...,...,...
195,NL,TZ,938.0,1
196,NL,KE,876.0,1
197,NL,CG,255.0,1
198,NO,DE,620.0,1


In [12]:
# Top 10 exporter → destination pairs by highest total ivory volume


def top10_exporter_destination():
    """
    Returns the top 10 exporter → destination ivory flows
    based purely on the highest total ivory volume globally.
    
    This answers:
      "For countries exporting ivory, what countries do they
       send the most ivory to?"
    """

    query = """
    MATCH (exporter:Country)-[r:EXPORTED_TO]->(destination:Country)
    RETURN
      exporter.name    AS exporter,
      destination.name AS destination,
      sum(r.weight)    AS total_ivory
    ORDER BY total_ivory DESC
    """

    df = my_neo4j_run_query_pandas(query)

    # Top 10 flows globally
    df_top10 = df.head(10)

    return df_top10


# Run
df_exporter_dest_top10 = top10_exporter_destination()

print("Top 10 exporter → destination pairs by total ivory volume:")
display(df_exporter_dest_top10)


Top 10 exporter → destination pairs by total ivory volume:


Unnamed: 0,exporter,destination,total_ivory
0,CN,HK,115895839.0
1,HK,US,35130640.0
2,HK,JP,31578157.0
3,HK,DE,3140050.0
4,HK,IT,1871110.0
5,GB,DE,1783435.0
6,IN,US,1401065.0
7,TW,JP,1014246.0
8,CN,TH,1000018.0
9,IN,DE,966226.0


In [67]:
my_neo4j_wipe_out_database()

Cleared neo4j database!
