In [15]:
import pandas as pd
from faker import Faker

In [16]:
# Load the raw c2k export. dtype=str keeps every column as text, so the '?'
# placeholder seen in the data stays a plain string that can be compared against.
C2K_CSV_URL = 'https://s-cube-network.eu/c2k-files/c2k_data_comma.csv'
df = pd.read_csv(C2K_CSV_URL, dtype=str)
df

Unnamed: 0,nr,i1_legid,i1_rcs_p,i1_rcs_e,i1_dep_1_p,i1_dep_1_e,i1_dep_1_place,i1_rcf_1_p,i1_rcf_1_e,i1_rcf_1_place,...,o_dep_3_p,o_dep_3_e,o_dep_3_place,o_rcf_3_p,o_rcf_3_e,o_rcf_3_place,o_dlv_p,o_dlv_e,o_hops,legs
0,0,5182,199,218,210,215,609,935,736,256,...,?,?,?,?,?,?,780,434,1,2
1,1,6523,844,584,90,297,700,1935,1415,431,...,?,?,?,?,?,?,3870,445,1,2
2,2,5878,4380,4119,90,280,456,905,547,700,...,?,?,?,?,?,?,550,1520,1,1
3,3,1275,759,169,240,777,173,340,577,349,...,?,?,?,?,?,?,3780,159,1,1
4,4,8117,1597,1485,150,241,411,585,612,128,...,?,?,?,?,?,?,4140,4797,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3938,3939,4225,122,19,240,278,815,830,761,174,...,?,?,?,?,?,?,1665,1300,2,3
3939,3940,14017,2028,413,270,1825,605,2610,2535,349,...,?,?,?,?,?,?,3780,807,2,1
3940,3941,4660,1356,178,240,1359,815,760,716,609,...,?,?,?,?,?,?,5100,4381,2,1
3941,3942,6472,2692,1856,90,867,700,1060,1049,113,...,?,?,?,?,?,?,3780,945,2,2


In [17]:
# Building blocks of the wide column names (e.g. 'i1_rcf_2_place', 'o_dlv_p').
LEGS = list(range(1, 4))      # leg numbers used in the 'i{leg}_...' columns
SEGMENTS = list(range(1, 4))  # segment numbers used in '..._dep_{s}_...' / '..._rcf_{s}_...'
GOINGS = list('io')           # column prefixes: 'i' and 'o'
SERVICES = 'rcs dlv'.split()  # service tokens that appear in column names

In [18]:
# Helper functions
def get_last_i1_rcf_place(row):
    """Return the place of the last recorded rcf segment of inbound leg 1.

    Segments are checked from 3 down to 1 and the first usable
    ``i1_rcf_{s}_place`` value is returned. A value is unusable when it is the
    ``'?'`` placeholder or a missing value (NaN/None).

    Parameters
    ----------
    row : mapping-like
        Anything indexable by column name (for example the row handed over by
        ``df.apply(..., axis=1)``) that provides ``i1_rcf_1_place``,
        ``i1_rcf_2_place`` and ``i1_rcf_3_place``.

    Returns
    -------
    The place value of the highest-numbered segment that is usable.

    Raises
    ------
    ValueError
        If none of the three segments carries a usable place.
    """
    for s in (3, 2, 1):
        place = row[f'i1_rcf_{s}_place']
        # A missing value compares unequal to '?', so it has to be rejected
        # explicitly; otherwise NaN would be reported as the last place.
        if place != '?' and not pd.isna(place):
            return place
    raise ValueError("cannot find last rcf place")

def get_last_outbound_rcf_place(row):
    """Return the place of the last recorded rcf segment of the outbound leg.

    Segments are checked from 3 down to 1 and the first usable
    ``o_rcf_{s}_place`` value is returned. A value is unusable when it is the
    ``'?'`` placeholder or a missing value (NaN/None).

    Parameters
    ----------
    row : mapping-like
        Anything indexable by column name (for example the row handed over by
        ``df.apply(..., axis=1)``) that provides ``o_rcf_1_place``,
        ``o_rcf_2_place`` and ``o_rcf_3_place``.

    Returns
    -------
    The place value of the highest-numbered segment that is usable.

    Raises
    ------
    ValueError
        If none of the three segments carries a usable place.
    """
    for s in (3, 2, 1):
        place = row[f'o_rcf_{s}_place']
        # A missing value compares unequal to '?', so it has to be rejected
        # explicitly; otherwise NaN would be reported as the last place.
        if place != '?' and not pd.isna(place):
            return place
    raise ValueError("cannot find last rcf place")

In [19]:
# Add one column per direction holding the last rcf place found in each row.
# The outbound column is created first, then the inbound one.
for new_column, pick_last_place in (
    ('last_o_rcf_place', get_last_outbound_rcf_place),
    ('last_i_rcf_place', get_last_i1_rcf_place),
):
    df[new_column] = df.apply(pick_last_place, axis=1)
df

Unnamed: 0,nr,i1_legid,i1_rcs_p,i1_rcs_e,i1_dep_1_p,i1_dep_1_e,i1_dep_1_place,i1_rcf_1_p,i1_rcf_1_e,i1_rcf_1_place,...,o_dep_3_place,o_rcf_3_p,o_rcf_3_e,o_rcf_3_place,o_dlv_p,o_dlv_e,o_hops,legs,last_o_rcf_place,last_i_rcf_place
0,0,5182,199,218,210,215,609,935,736,256,...,?,?,?,?,780,434,1,2,411,256
1,1,6523,844,584,90,297,700,1935,1415,431,...,?,?,?,?,3870,445,1,2,256,431
2,2,5878,4380,4119,90,280,456,905,547,700,...,?,?,?,?,550,1520,1,1,349,700
3,3,1275,759,169,240,777,173,340,577,349,...,?,?,?,?,3780,159,1,1,700,671
4,4,8117,1597,1485,150,241,411,585,612,128,...,?,?,?,?,4140,4797,2,1,411,166
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3938,3939,4225,122,19,240,278,815,830,761,174,...,?,?,?,?,1665,1300,2,3,293,174
3939,3940,14017,2028,413,270,1825,605,2610,2535,349,...,?,?,?,?,3780,807,2,1,113,597
3940,3941,4660,1356,178,240,1359,815,760,716,609,...,?,?,?,?,5100,4381,2,1,737,609
3941,3942,6472,2692,1856,90,867,700,1060,1049,113,...,?,?,?,?,3780,945,2,2,635,113


In [20]:
# Gather the distinct airport ids found in every column whose name contains
# 'place', leaving out the '?' placeholder and missing values.
cols = df.columns.tolist()

airport_ids = set()
for place_col in (name for name in cols if 'place' in name):
    column_values = df[place_col]
    is_known = (column_values != '?') & (column_values.notna())
    known_places = df.loc[is_known, place_col].unique().tolist()
    airport_ids.update(int(raw_id) for raw_id in known_places)

# Creating names for the airports will help with human readability.
# The names are synthetic: they come from Faker, not from the dataset.
# A fixed seed is set before the generator is created, presumably so that the
# same names are produced on every run — confirm against the Faker docs.
Faker.seed(0)
fake = Faker()
def single_name_city():
    """Return a generated city name that contains no space character.

    Names are drawn from the module-level ``fake`` generator until one without
    a space comes up (multi-word names are a bit of a mouthful).
    """
    while True:
        candidate = fake.city()
        if ' ' not in candidate:
            return candidate

# Pair every collected airport id with a freshly generated single-word name,
# then persist the lookup table and display it.
airport_id_list = list(airport_ids)
airport_df = pd.DataFrame({
    'id': airport_id_list,
    'name': [single_name_city() for _ in airport_id_list],
})
airport_df.to_csv('airports.csv', index=False)
airport_df

Unnamed: 0,id,name
0,514,Changchester
1,515,Hullport
2,520,Howardborough
3,524,Ramoshaven
4,527,Bryanside
...,...,...
232,500,Dennisville
233,502,Gabrielchester
234,504,Fordland
235,508,Gardnerside


In [21]:
# One row per distinct shipment number, exported with the column named 'id'.
shipment_df = (
    df.loc[:, ['nr']]
    .rename(columns={'nr': 'id'})
    .drop_duplicates()
)
shipment_df.to_csv('shipments.csv', index=False)
shipment_df

Unnamed: 0,id
0,0
1,1
2,2
3,3
4,4
...,...
3938,3939
3939,3940
3940,3941
3941,3942


In [22]:
# (:DeparturePoint)-[r:TRANSPORT]-(:ArrivalWarehouse) inbound
# One record is emitted per (row, inbound leg, segment) whose rcf place is not
# the '?' placeholder and whose leg id is present. Record ids are consecutive
# integers in emission order (leg by leg, segment by segment).
transport_records = []
seq_id = 0
for leg in LEGS:
    print(f'== LEG {leg} ======================')
    legid_col = f'i{leg}_legid'
    for seg in SEGMENTS:
        print(f'-- SEGMENT {seg} ----------------------')
        planned_col = f'i{leg}_rcf_{seg}_p'
        effective_col = f'i{leg}_rcf_{seg}_e'
        origin_col = f'i{leg}_dep_{seg}_place'
        dest_col = f'i{leg}_rcf_{seg}_place'

        # Rows for which this leg/segment actually has an arrival place.
        keep = (df[dest_col] != '?') & (df[legid_col].notna())
        wanted = ['nr', legid_col, planned_col, effective_col, origin_col, dest_col]

        for rec in df.loc[keep, wanted].to_dict('records'):
            transport_records.append(dict(
                id=seq_id,
                shipment_id=rec['nr'],
                leg_id=rec[legid_col],
                leg_number=leg,
                segment_number=seg,
                planned_minutes=rec[planned_col],
                effective_minutes=rec[effective_col],
                origin_airport_id=rec[origin_col],
                dest_airport_id=rec[dest_col],
            ))
            seq_id += 1
transport_df = pd.DataFrame(transport_records)
transport_df.to_csv('transports.csv', index=False)
transport_df

-- SEGMENT 1 ----------------------
-- SEGMENT 2 ----------------------
-- SEGMENT 3 ----------------------
-- SEGMENT 1 ----------------------
-- SEGMENT 2 ----------------------
-- SEGMENT 3 ----------------------
-- SEGMENT 1 ----------------------
-- SEGMENT 2 ----------------------
-- SEGMENT 3 ----------------------


Unnamed: 0,id,shipment_id,leg_id,leg_number,segment_number,planned_minutes,effective_minutes,origin_airport_id,dest_airport_id
0,0,0,5182,1,1,935,736,609,256
1,1,1,6523,1,1,1935,1415,700,431
2,2,2,5878,1,1,905,547,456,700
3,3,3,1275,1,1,340,577,173,349
4,4,4,8117,1,1,585,612,411,128
...,...,...,...,...,...,...,...,...,...
10349,10349,1313,11850,3,3,870,833,610,727
10350,10350,1801,7245,3,3,1206,1026,144,555
10351,10351,2985,568,3,3,2250,943,431,356
10352,10352,3259,9425,3,3,1200,1023,798,809
