In [1]:
import sys

# Directory holding the project's helper modules (database, hive_wrapper,
# utils), given relative to this notebook.
MODULES_PATH = "../../modules"
# Register it at most once so re-running the cell never piles up duplicates.
if sys.path.count(MODULES_PATH) == 0:
    sys.path.append(MODULES_PATH)
    
from database import *
from hive_wrapper import *
from utils import *
import pandas as pd

%reload_ext autoreload
%autoreload 2

In [2]:
# Name used below both as the Hive database and as the /user/<name> HDFS folder.
username = "boukil"

# Connection parameters are supplied by get_env_vars() and forwarded
# positionally to HiveConnection.
connection_args = get_env_vars()
hive_connection = HiveConnection(*connection_args)

In [3]:
# (Re)register the `all_edges` table on top of the ORC files stored under
# /user/<username>/network_data/all_edges.
# The statements run strictly in order: drop any stale definition first, then
# create the table again. For an EXTERNAL table Hive normally drops only the
# table definition, not the underlying files (see the Hive DDL docs).
ddl_statements = (
    f"""
    DROP TABLE IF EXISTS {username}.all_edges
""",
    f"""
    CREATE EXTERNAL TABLE {username}.all_edges(
        trip_id string,
        trip_headsign string,
        route_id string,
        route_name string,
        route_type string,
        transport_type string,
        transport_subtype string,
        stop_id string,
        stop_name string,
        arrival_time string,
        departure_time string,
        stop_seqnum integer,
        parent_station string,
        next_stop_id string,
        next_stop_name string,
        next_arrival_time string,
        next_parent_station string,
        next_departure_time string,
        is_walkable boolean,
        duration_s double,
        waiting_time_s integer,
        node_id integer,
        next_node_id integer
    )
    STORED AS ORC
    LOCATION '/user/{username}/network_data/all_edges'
""",
)

for query in ddl_statements:
    hive_connection.exec(query)

In [4]:
# Pull the complete all_edges table into a pandas DataFrame.
# NOTE(review): the output printed under this cell shows pandas_df assigning
# pd.read_sql(query, self.conn) to self.cache[query], so results appear to be
# cached per exact query string - confirm in hive_wrapper before relying on it.
query = f"""
    SELECT *
    FROM {username}.all_edges
"""
edges_df = hive_connection.pandas_df(query)

  self.cache[query] = pd.read_sql(query, self.conn)


In [5]:
# Number of rows (one per edge) fetched from Hive.
edges_df.shape[0]

377021

In [6]:
# Preview the first rows. On a fresh run the column names still carry the
# "all_edges." table qualifier at this point (see the rendered header below).
edges_df.head()

Unnamed: 0,all_edges.trip_id,all_edges.trip_headsign,all_edges.route_id,all_edges.route_name,all_edges.route_type,all_edges.transport_type,all_edges.transport_subtype,all_edges.stop_id,all_edges.stop_name,all_edges.arrival_time,...,all_edges.next_stop_id,all_edges.next_stop_name,all_edges.next_arrival_time,all_edges.next_parent_station,all_edges.next_departure_time,all_edges.is_walkable,all_edges.duration_s,all_edges.waiting_time_s,all_edges.node_id,all_edges.next_node_id
0,1.TA.91-4-A-j23-1.1.H,Zürich HB SZU,91-4-A-j23-1,s4,109,train,s,8503097:0:2,langnau-gattikon,17:40:00,...,8503096:0:3,adliswil,17:45:00,Parent8503096,17:45:00,False,300.0,0.0,221,222
1,1.TA.91-4-A-j23-1.1.H,Zürich HB SZU,91-4-A-j23-1,s4,109,train,s,8503096:0:3,adliswil,17:45:00,...,8503095:0:1,sood-oberleimbach,17:46:00,,17:46:00,False,60.0,0.0,222,1610
2,1.TA.91-4-A-j23-1.1.H,Zürich HB SZU,91-4-A-j23-1,s4,109,train,s,8503095:0:1,sood-oberleimbach,17:46:00,...,8503094:0:1,zürich leimbach,17:49:00,Parent8503094,17:49:00,False,180.0,0.0,1610,1401
3,1.TA.91-4-A-j23-1.1.H,Zürich HB SZU,91-4-A-j23-1,s4,109,train,s,8503094:0:1,zürich leimbach,17:49:00,...,8503093:0:1,zürich manegg,17:50:00,Parent8503093,17:50:00,False,60.0,0.0,1401,881
4,1.TA.91-4-A-j23-1.1.H,Zürich HB SZU,91-4-A-j23-1,s4,109,train,s,8503093:0:1,zürich manegg,17:50:00,...,8503086:0:1,zürich brunau,17:52:00,Parent8503086,17:52:00,False,120.0,0.0,881,1011


In [7]:
# The Hive result set labels every column "<table>.<column>"
# (e.g. "all_edges.trip_id"). Remove that qualifier by name instead of slicing
# a fixed number of characters: the previous `c[10:]` silently truncated the
# already-clean names whenever this cell was executed a second time.
TABLE_PREFIX = "all_edges."
edges_df.columns = [
    col[len(TABLE_PREFIX):] if col.startswith(TABLE_PREFIX) else col
    for col in edges_df.columns
]
edges_df.columns

Index(['trip_id', 'trip_headsign', 'route_id', 'route_name', 'route_type',
       'transport_type', 'transport_subtype', 'stop_id', 'stop_name',
       'arrival_time', 'departure_time', 'stop_seqnum', 'parent_station',
       'next_stop_id', 'next_stop_name', 'next_arrival_time',
       'next_parent_station', 'next_departure_time', 'is_walkable',
       'duration_s', 'waiting_time_s', 'node_id', 'next_node_id'],
      dtype='object')

In [8]:
# Columns of edges_df that are attached to each graph edge as attributes
# (passed as `edge_attr` when the graph is built). Order follows the table.
attributes = [
    # trip and route identification
    'trip_id', 'trip_headsign', 'route_name',
    # kind of vehicle
    'transport_type', 'transport_subtype',
    # stop on the `stop_id` side of the edge and its times
    'stop_id', 'stop_name', 'arrival_time', 'departure_time',
    # stop on the `next_stop_id` side of the edge and its times
    'next_stop_id', 'next_stop_name', 'next_arrival_time', 'next_departure_time',
    # edge flags and costs
    'is_walkable', 'duration_s', 'waiting_time_s',
]

In [9]:
# Spot-check the two columns that serve as graph endpoints.
edges_df[["stop_id", "next_stop_id"]].head()  # swap .head() for .isna().any() to look for missing endpoints

Unnamed: 0,stop_id,next_stop_id
0,8503097:0:2,8503096:0:3
1,8503096:0:3,8503095:0:1
2,8503095:0:1,8503094:0:1
3,8503094:0:1,8503093:0:1
4,8503093:0:1,8503086:0:1


In [10]:
!pip install networkx



In [11]:
import networkx as nx

# Build a directed multigraph from the edge table; MultiDiGraph keeps parallel
# edges between the same pair of stops as separate edges.
# NOTE(review): source is `next_stop_id` and target is `stop_id`, so every edge
# points against the direction of travel seen in the data - presumably on
# purpose (backward search from the destination); confirm before changing.
edgelist_options = dict(
    source='next_stop_id',
    target='stop_id',
    edge_attr=attributes,
    create_using=nx.MultiDiGraph,
)
graph = nx.from_pandas_edgelist(edges_df, **edgelist_options)

# Displayed as (edge count, node count).
(graph.number_of_edges(), graph.number_of_nodes())

(377021, 2229)

In [13]:
import pickle
# Serialize the in-memory graph to disk with pickle (binary mode is required).
with open("../../data/transport_network.pkl", mode="wb") as pickle_file:
    pickle.dump(graph, pickle_file)

In [14]:
import networkx as nx
import pickle
import datetime

# Restore the graph from the pickle file on disk.
# Unpickling executes arbitrary code from the file: only load files that this
# project produced itself.
with open("../../data/transport_network.pkl", mode="rb") as pickle_file:
    graph = pickle.load(pickle_file)