In [16]:
import csv
import os
import math
import numpy as np
import pandas as pd

import neo4j
import psycopg2

In [17]:
# Resolve the input file relative to the working directory:
# <parent of cwd>/data/<data_filename>.
dir_code = os.getcwd()
dir_data = os.path.join(os.path.dirname(dir_code), "data")

data_filename = "cites_elephant_ivory_trades_clean.csv"
data_filepath = os.path.join(dir_data, data_filename)

# Connection settings can be overridden through environment variables so that
# credentials never have to be edited into the notebook. The fallbacks are the
# values this notebook has always used, so it still runs unchanged without them.
neo4j_uri = os.environ.get("NEO4J_URI", "neo4j://neo4j:7687")
neo4j_user = os.environ.get("NEO4J_USER", "neo4j")
neo4j_password = os.environ.get("NEO4J_PASSWORD", "ucb_mids_w205")

# One driver and one session are shared by every helper defined below.
driver = neo4j.GraphDatabase.driver(uri=neo4j_uri, auth=(neo4j_user, neo4j_password))
session = driver.session(database="neo4j")

In [18]:
# FUNCTION COPIED FROM LABS
def my_neo4j_wipe_out_database():
    """Empty the database by removing every relationship and every node."""

    # First pass removes relationships together with their start nodes;
    # second pass removes whatever nodes are left.
    cleanup_statements = (
        "match (node)-[relationship]->() delete node, relationship",
        "match (node) delete node",
    )

    for statement in cleanup_statements:
        session.run(statement)

    print("Cleared neo4j database!")
    
# FUNCTION COPIED FROM LABS   
def my_neo4j_run_query_pandas(query, **kwargs):
    """Run a query on the shared session and return its rows as a pandas DataFrame.

    Any keyword arguments are forwarded unchanged to ``session.run``.
    The DataFrame's column names are taken from the result's keys.
    """

    cursor = session.run(query, **kwargs)

    # One entry per returned record, holding that record's values.
    rows = [record.values() for record in cursor]
    column_names = cursor.keys()

    return pd.DataFrame(rows, columns=column_names)

# FUNCTION COPIED FROM LABS
def my_neo4j_number_nodes_relationships():
    """Print the number of nodes and relationships in the database.

    The counting is done by the database with ``count(...)``, so only a single
    row per query is transferred instead of one row per node / relationship.
    """

    node_counts = my_neo4j_run_query_pandas(
        "match (n) return count(n) as number_nodes"
    )
    number_nodes = int(node_counts["number_nodes"].iloc[0])

    # The pattern is directed, so every relationship is matched exactly once.
    relationship_counts = my_neo4j_run_query_pandas(
        "match ()-[r]->() return count(r) as number_relationships"
    )
    number_relationships = int(relationship_counts["number_relationships"].iloc[0])

    print("-------------------------")
    print("  Nodes:", number_nodes)
    print("  Relationships:", number_relationships)
    print("-------------------------")

# NEW FUNCTION, WRITTEN BY ANUSHKA
def my_neo4j_load_data(query, df, structure_type, desc):
    """Run a query to load graph components from a dataframe.

    Args:
        query: Statement to run; it receives the rows through the ``data`` parameter.
        df: pandas DataFrame whose rows are passed, one dictionary per row, as ``data``.
        structure_type: Singular noun for what is being loaded (e.g. "node");
            an "s" is appended in the printed message.
        desc: Short label for the loaded items; upper-cased in the printed message.
    """

    # Convert df to a list of dictionaries (one per row) to pass as the query parameter
    records = df.to_dict(orient="records")

    # Consume the result before reporting, so that the statement has finished
    # and any error it produced is raised here rather than on a later call.
    session.run(query, data=records).consume()

    # Note: the count reported is the number of dataframe rows sent.
    print(f"\nLoaded {len(df)} {desc.upper()} {structure_type}s.")


In [19]:
# Read the cleaned CITES elephant ivory trade records from data_filepath,
# then print a per-column summary (dtype and non-null count) of the loaded frame.
df_elephant_ivory_trade = pd.read_csv(data_filepath)
df_elephant_ivory_trade.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 194592 entries, 0 to 194591
Data columns (total 10 columns):
 #   Column    Non-Null Count   Dtype  
---  ------    --------------   -----  
 0   id        194592 non-null  int64  
 1   year      194592 non-null  int64  
 2   taxon     194592 non-null  object 
 3   family    194592 non-null  object 
 4   term      194592 non-null  object 
 5   quantity  194592 non-null  float64
 6   unit      194592 non-null  object 
 7   importer  194592 non-null  object 
 8   exporter  194592 non-null  object 
 9   origin    36958 non-null   object 
dtypes: float64(1), int64(2), object(7)
memory usage: 14.8+ MB


In [20]:
def create_graph_importers(df):
    """
    Create elephant ivory trade importers graph.

    Countries involved in ivory trades are represented as nodes. The graph includes the following relationships:
        - IMPORTED FROM (country "A" (importer) IMPORTED FROM country "B" (exporter))
        - IMPORTS ORIGINATED FROM (country "A" (importer) IMPORTS ORIGINATED FROM country "C" (origin))
            - Exporters are not always exporting ivory that was originally sourced from their country
            - origin not populated for all trades, totals will not match

    Each relationship carries two properties:
        - weight: summed "quantity" of all trades between the pair
        - n_trades: number of trade rows between the pair

    The database is wiped before loading, so any existing graph is replaced.

    Args:
        df: trade dataframe with "importer", "exporter", "origin" and "quantity" columns.
    """

    my_neo4j_wipe_out_database()

    # ----- CREATE NODES: COUNTRIES -----

    # Create df with unique list of countries (missing values are dropped)
    countries_stack = pd.concat([df["importer"], df["exporter"], df["origin"]])
    countries = countries_stack.dropna().drop_duplicates().to_frame(name="country")

    # Define query for loading country nodes
    query_nodes_countries = """
        UNWIND $data as row
        
        CREATE (:Country {
            name: row.country
        })
    """

    # Run query to create graph nodes
    my_neo4j_load_data(query_nodes_countries, countries, "node", "country")

    # Define query to create an index for the country nodes
    query_country_index = """
        CREATE INDEX country_index IF NOT EXISTS FOR (country:Country) ON (country.name);
    """

    # Run query to index the country nodes; consume the result so a failure
    # is raised here, before the success message is printed.
    session.run(query_country_index).consume()
    print("Created index for country nodes.")

    # ----- CREATE RELATIONSHIPS: IMPORTED FROM -----

    # Create aggregated df to get importer relationships
    importers_summ = _summarize_trade_pairs(df, "exporter")

    # Define query for loading importer relationships.
    # n_trades is stored alongside weight so the trade count survives the aggregation.
    query_relationship_importers = """
        UNWIND $data as row
        
        MATCH (exporter:Country {name: row.exporter}), 
              (importer:Country {name: row.importer})
        CREATE (importer)-[:IMPORTED_FROM {weight: row.specimens_ivory, n_trades: row.n_trades}]->(exporter)
    """

    # Run query to create importer relationships
    my_neo4j_load_data(query_relationship_importers, importers_summ, "relationship", "IMPORTED_FROM")

    # ----- CREATE RELATIONSHIPS: IMPORTS ORIGINATED FROM -----

    # Create aggregated df to get import originating relationships
    # (rows without an origin fall out of the groupby)
    import_origins_summ = _summarize_trade_pairs(df, "origin")

    # Define query for loading import originating relationships
    query_relationship_import_origins = """
        UNWIND $data as row
        
        MATCH (origin:Country {name: row.origin}), 
              (importer:Country {name: row.importer})
        CREATE (importer)-[:IMPORTS_ORIGINATED_FROM {weight: row.specimens_ivory, n_trades: row.n_trades}]->(origin)
    """

    # Run query to create import originating relationships
    my_neo4j_load_data(query_relationship_import_origins, import_origins_summ, "relationship", "IMPORTS_ORIGINATED_FROM")

    print("\nDone loading graph!")


def _summarize_trade_pairs(df, partner_column):
    """Aggregate trades per (importer, partner_column) pair: summed quantity and number of rows."""
    return df[df["importer"].notna()].groupby(["importer", partner_column]).agg(
        specimens_ivory = ("quantity", "sum"),
        n_trades = ("quantity", "size")
    ).reset_index()
    

In [21]:
# Rebuild the importer graph from the trade data. This wipes the database first,
# then loads the Country nodes and both relationship types.
create_graph_importers(df_elephant_ivory_trade)

Cleared neo4j database!

Loaded 223 COUNTRY nodes.
Created index for country nodes.

Loaded 2885 IMPORTED_FROM relationships.

Loaded 1199 IMPORTS_ORIGINATED_FROM relationships.

Done loading graph!


In [22]:
# Sanity check: print how many nodes and relationships the database now holds.
my_neo4j_number_nodes_relationships()

-------------------------
  Nodes: 223
  Relationships: 4084
-------------------------


# ANALYTICS

In [23]:
# Project the importer graph into GDS (similar to 'ds_graph' in lab 9)
# Drop any existing GDS graph of the same name first so the projection can reuse it.

# The second argument (false) asks GDS not to fail when the graph is missing;
# in that case the call is expected to return no rows.
query = "CALL gds.graph.drop('ivory_import_graph', false) YIELD graphName"
try:
    # Materialise the rows inside the try block so a server-side failure is raised here.
    dropped_graphs = list(session.run(query))
except Exception as error:
    # Best-effort clean-up: report the failure instead of hiding it, then carry on.
    print(f"Could not drop GDS graph 'ivory_import_graph': {error}")
else:
    if dropped_graphs:
        print("Dropped existing GDS graph 'ivory_import_graph'")
    else:
        print("No existing 'ivory_import_graph' to drop.")

# Project :Country nodes and :IMPORTED_FROM relationships (with weight)
query = """
CALL gds.graph.project(
  'ivory_import_graph',
  'Country',
  {
    IMPORTED_FROM: {
      properties: 'weight'
    }
  }
)
YIELD graphName, nodeCount, relationshipCount
"""
df_import_proj = my_neo4j_run_query_pandas(query)
print("Projected ivory_import_graph into GDS:")
display(df_import_proj)


# PageRank: most influential importing countries
#    (on IMPORTED_FROM relationships)
# NOTE(review): IMPORTED_FROM points importer -> exporter, so rank presumably
# accumulates on the countries that are imported *from*; confirm that the
# "Top importers" label below matches the intended reading.

query = """
CALL gds.pageRank.stream('ivory_import_graph',
                         {
                           maxIterations: $max_iterations,
                           dampingFactor: $damping_factor,
                           relationshipWeightProperty: 'weight'
                         }
                        )
YIELD nodeId, score
RETURN gds.util.asNode(nodeId).name AS country, score AS page_rank
ORDER BY page_rank DESC, country ASC
"""

# Passed to the query above as the $max_iterations / $damping_factor parameters.
max_iterations = 20
damping_factor = 0.85

df_importer_pagerank = my_neo4j_run_query_pandas(
    query,
    max_iterations=max_iterations,
    damping_factor=damping_factor
)

print("\nTop importers by PageRank (ivory_import_graph):")
display(df_importer_pagerank.head(10))


Dropped existing GDS graph 'ivory_import_graph'
Projected ivory_import_graph into GDS:


Unnamed: 0,graphName,nodeCount,relationshipCount
0,ivory_import_graph,223,2885



Top importers by PageRank (ivory_import_graph):


Unnamed: 0,country,page_rank
0,CN,26.515825
1,HK,25.562085
2,ZW,18.045
3,BW,17.11916
4,GB,14.813849
5,ZM,6.814509
6,IN,6.460679
7,US,6.339017
8,ZA,5.748359
9,KE,5.255449


In [24]:
# Betweenness Centrality:
#    intermediate countries acting as middlemen in import flows
#    (computed on the projected 'ivory_import_graph', using the 'weight'
#    relationship property, i.e. the summed trade quantity per country pair)
# NOTE(review): weighted shortest-path measures usually treat the weight as a
# cost, which would make high-volume routes count as "longer" - confirm this
# is the intended interpretation.


query = """
CALL gds.betweenness.stream('ivory_import_graph',
                            {relationshipWeightProperty: 'weight'})
YIELD nodeId, score
RETURN gds.util.asNode(nodeId).name AS country, score AS betweenness
ORDER BY betweenness DESC, country ASC
"""

df_importer_betweenness = my_neo4j_run_query_pandas(query)

# Show only the ten highest-scoring countries.
print("\nCountries with highest betweenness (middlemen in importer graph):")
display(df_importer_betweenness.head(10))




Countries with highest betweenness (middlemen in importer graph):


Unnamed: 0,country,betweenness
0,US,10045.473303
1,CH,7563.802943
2,DE,6032.622351
3,ZW,5611.924859
4,CA,5569.256746
5,NZ,5055.615127
6,GB,4000.526717
7,FR,3625.533889
8,ES,3244.432863
9,NL,3179.266789


In [25]:
# Degree Centrality:
#    countries with the most direct import/export links
#    (via IMPORTED_FROM)
# No configuration is passed, so GDS defaults apply (no weight property is used).
# NOTE(review): if the default orientation only counts outgoing relationships,
# the score is the number of countries each country imported from, not
# import + export links - confirm.

query = """
CALL gds.degree.stream('ivory_import_graph')
YIELD nodeId, score
RETURN gds.util.asNode(nodeId).name AS country, score AS degree
ORDER BY degree DESC, country ASC
"""

df_importer_degree = my_neo4j_run_query_pandas(query)

# Show only the ten highest-scoring countries.
print("\nCountries with highest degree (most direct IMPORTED_FROM connections):")
display(df_importer_degree.head(10))


Countries with highest degree (most direct IMPORTED_FROM connections):


Unnamed: 0,country,degree
0,US,168.0
1,GB,126.0
2,CA,104.0
3,CH,94.0
4,DE,94.0
5,FR,88.0
6,IT,74.0
7,AU,70.0
8,JP,68.0
9,ES,63.0


In [26]:
# Harmonic Centrality:
#    intermediate countries that are close to many others
#    (importer graph – unweighted harmonic centrality)
# No configuration is passed to the procedure, so the 'weight' property
# is not referenced by this query.


query = """
CALL gds.closeness.harmonic.stream('ivory_import_graph')
YIELD nodeId, score
RETURN gds.util.asNode(nodeId).name AS country,
       score AS harmonic_centrality
ORDER BY harmonic_centrality DESC, country ASC
"""

df_importer_harmonic = my_neo4j_run_query_pandas(query)

# Show only the ten highest-scoring countries.
print("\nCountries with highest harmonic centrality (importer graph):")
display(df_importer_harmonic.head(10))



Countries with highest harmonic centrality (importer graph):


Unnamed: 0,country,harmonic_centrality
0,ZW,0.804054
1,ZA,0.716967
2,GB,0.688438
3,US,0.684685
4,FR,0.64039
5,DE,0.633634
6,CH,0.62988
7,MW,0.608108
8,BW,0.605856
9,TZ,0.596847


In [27]:
# For countries importing ivory, what countries do they want their ivory to originate from?
# (Using :IMPORTS_ORIGINATED_FROM relationships)

def importer_origin_preferences(top_n_per_importer=3):
    """
    For each importer, summarize which origin countries their imports
    are reported to originate from, using IMPORTS_ORIGINATED_FROM relationships.

    Args:
      top_n_per_importer: how many origins to keep per importer in the
        second returned dataframe (ranked by total_ivory, highest first).

    Returns:
      - full dataframe of (importer, origin, total_ivory, n_trades)
      - top_n_per_importer origins per importer (by total_ivory)

    n_trades is the sum of the relationships' n_trades property. A relationship
    without that property contributes 1, which equals a plain relationship count.
    """

    # count(r) would only count relationships between the pair, not trades,
    # so the per-relationship trade count is summed instead.
    query = """
    MATCH (importer:Country)-[r:IMPORTS_ORIGINATED_FROM]->(origin:Country)
    RETURN
      importer.name AS importer,
      origin.name   AS origin,
      sum(r.weight) AS total_ivory,
      sum(coalesce(r.n_trades, 1)) AS n_trades
    ORDER BY importer, total_ivory DESC
    """

    df = my_neo4j_run_query_pandas(query)

    # Sort and keep top N origins per importer by total_ivory
    df_sorted = df.sort_values(["importer", "total_ivory"], ascending=[True, False])
    df_top = df_sorted.groupby("importer").head(top_n_per_importer).reset_index(drop=True)

    return df, df_top


# Analysis: fetch every importer -> origin pair, plus the top 3 origins per importer.
df_importer_origin_all, df_importer_origin_top = importer_origin_preferences(top_n_per_importer=3)

# For each importing country: which origin countries the ivory came from,
# and how much ivory came from each origin.
# Only the first 200 rows of each frame are displayed.

print("All importer → origin pairs (aggregated):")
display(df_importer_origin_all.head(200))

print("\nTop 3 origin countries per importer (by total ivory volume):")
display(df_importer_origin_top.head(200))


All importer → origin pairs (aggregated):


Unnamed: 0,importer,origin,total_ivory,n_trades
0,AD,KE,415.0,1
1,AD,SD,127.0,1
2,AD,CF,92.0,1
3,AD,ZM,22.0,1
4,AD,CG,18.0,1
...,...,...,...,...
195,CA,AO,8.0,1
196,CA,MW,7.0,1
197,CA,GN,6.0,1
198,CA,BI,5.0,1



Top 3 origin countries per importer (by total ivory volume):


Unnamed: 0,importer,origin,total_ivory,n_trades
0,AD,KE,415.0,1
1,AD,SD,127.0,1
2,AD,CF,92.0,1
3,AE,TZ,407.0,1
4,AE,KE,59.0,1
...,...,...,...,...
195,MT,ZW,4.0,1
196,MU,KE,2.0,1
197,MU,ZW,1.0,1
198,MW,SG,100.0,1


In [28]:
# Top 10 importer → origin pairs by highest total ivory volume

def top10_importer_origin(top_n=10):
    """
    Returns the top importer→origin ivory flows globally,
    based on the highest total ivory volumes.

    For each importer, we first select ONLY its single
    highest-volume origin country, then globally rank the top ``top_n``.

    Args:
      top_n: number of importer→origin pairs to return (default 10).

    Returns:
      dataframe with columns (importer, origin, total_ivory), sorted by
      total_ivory descending, holding at most ``top_n`` rows. The index is
      each row's position in the per-importer table, not a 0..n-1 rank.
    """

    query = """
    MATCH (importer:Country)-[r:IMPORTS_ORIGINATED_FROM]->(origin:Country)
    RETURN
      importer.name AS importer,
      origin.name   AS origin,
      sum(r.weight) AS total_ivory
    ORDER BY importer, total_ivory DESC
    """

    df = my_neo4j_run_query_pandas(query)

    # Select top 1 origin per importer
    df_top1 = (
        df.sort_values(["importer", "total_ivory"], ascending=[True, False])
          .groupby("importer")
          .head(1)
          .reset_index(drop=True)
    )

    # Globally rank and keep the requested number of rows
    df_top = df_top1.sort_values("total_ivory", ascending=False).head(top_n)

    return df_top


# Importer analysis: each importer's single largest origin, ranked globally (top 10).
df_importer_top10 = top10_importer_origin()

# The displayed index is each row's position in the per-importer table, not a rank.
print("Top 10 importer → origin pairs by total ivory volume:")
display(df_importer_top10)


Top 10 importer → origin pairs by total ivory volume:


Unnamed: 0,importer,origin,total_ivory
53,HK,CF,43461560.0
129,US,CD,13717330.0
69,JP,CG,2266172.0
122,TH,ZW,1000000.0
107,RO,KE,721977.0
34,DE,SD,469645.0
35,DK,SD,425082.0
45,FR,SD,93076.33
46,GB,SD,81936.0
19,CA,CF,59066.0


In [29]:
# Clean up: delete every node and relationship so the database is left empty.
my_neo4j_wipe_out_database()

Cleared neo4j database!
