In [1]:
import os
import pandas as pd
pd.set_option("display.max_columns", 50)
import matplotlib.pyplot as plt
%matplotlib inline
import plotly.express as px
import plotly.graph_objects as go
import warnings
warnings.simplefilter(action='ignore', category=UserWarning)

# Reading the user from os.environ['RENKU_USERNAME'] is disabled; a fixed name is used.
username = "eric"

# HIVE_SERVER2 is split on ':' into exactly two parts: host and port.
hiveaddr = os.environ['HIVE_SERVER2']
hivehost, hiveport = hiveaddr.split(':')

print("Operating as: {0}".format(username))

Operating as: eric


In [2]:
from pyhive import hive

# Open the Hive connection with the host, port and user set up in the first cell.
conn = hive.connect(host=hivehost, port=hiveport, username=username)

# Cursor used by the retrieve_df_* helpers below for DDL statements.
cur = conn.cursor()

In [3]:
# Create the per-user database (named after `username`) unless it already exists.
create_db_template = """
CREATE DATABASE IF NOT EXISTS {0} LOCATION '/group/five-guys/hive'
"""
query = create_db_template.format(username)
cur.execute(query)

## Connections

In [23]:
def retrieve_df_connections(table_name):
    """Recreate the external Hive connections table and return its rows as tuples.

    The table ``table_name`` is dropped if it exists, declared again as an
    external ','-delimited text table located at ``/group/five-guys/<table_name>``
    (first line skipped as header), and read back in full.

    Relies on the module-level Hive handles ``cur`` (cursor) and ``conn``
    (connection).

    Parameters
    ----------
    table_name : str
        Hive table name; also the last component of the table location.

    Returns
    -------
    list of tuple
        One tuple per row, in column order: departure_id, arrival_id,
        departure_time, arrival_time, trip_id, monday, tuesday, wednesday,
        thursday, friday. The two time fields are converted to integer POSIX
        seconds; weekday flags '0'/'1' are replaced by False/True.
    """
    weekday_cols = ['monday', 'tuesday', 'wednesday', 'thursday', 'friday']

    # Drop any previous definition so the schema declared below always applies.
    query = """
    DROP TABLE IF EXISTS {0}
    """.format(table_name)
    cur.execute(query)

    # Declare the external table over the delimited text files.
    query = """
    CREATE EXTERNAL TABLE {0}(
        departure_id string,
        arrival_id string,
        departure_time string,
        arrival_time string,
        trip_id string,
        monday string,
        tuesday string,
        wednesday string,
        thursday string,
        friday string
    )
    ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ','
    STORED AS TEXTFILE
    location '/group/five-guys/{0}'
    tblproperties ("skip.header.line.count"="1")
    """.format(table_name)
    cur.execute(query)

    # Read from the table created above. The previous `{0}.csv` did not name that
    # table (in Hive a dotted name is parsed as <database>.<table>).
    query = """
    SELECT *
    FROM {0}
    """.format(table_name)
    df = pd.read_sql(query, conn)

    # Labels are expected as "<table>.<column>"; keeping the last dotted part
    # also accepts labels that arrive without a prefix.
    df.columns = [column.split('.')[-1] for column in df.columns]

    for time_col in ("departure_time", "arrival_time"):
        df[time_col] = df[time_col].apply(lambda x: int(pd.Timestamp(x).timestamp()))

    df[weekday_cols] = df[weekday_cols].replace({'0': False, '1': True})

    return list(df.itertuples(index=False, name=None))



In [None]:
# Recreate the Hive table "conn_table" and load all connections as a list of row tuples.
conn_table = retrieve_df_connections("conn_table")

In [None]:
import pickle
path_data = "../../data/"
# Write through a context manager so the file is flushed and closed even if
# pickling fails; the bare open() used before never closed its handle.
with open(path_data+"connections_data.pickle", "wb") as pickle_out:
    pickle.dump(conn_table, pickle_out)

## Trips

In [70]:
def retrieve_df_trips(table_name):
    """Recreate the external Hive trips table and return it as a DataFrame.

    The table ``table_name`` is dropped if it exists, declared again as an
    external ';'-delimited text table located at ``/group/five-guys/<table_name>``
    (first line skipped as header), and read back in full.

    Relies on the module-level Hive handles ``cur`` (cursor) and ``conn``
    (connection).

    Parameters
    ----------
    table_name : str
        Hive table name; also the last component of the table location.

    Returns
    -------
    pandas.DataFrame
        All rows, with any "<table>." prefix removed from the column labels
        (declared columns: route_id, service_id, trip_id, trip_headsign,
        trip_short_name, direction_id; all strings).
    """
    # Drop any previous definition so the schema declared below always applies.
    query = """
    DROP TABLE IF EXISTS {0}
    """.format(table_name)
    cur.execute(query)

    # Declare the external table over the delimited text files.
    query = """
    CREATE EXTERNAL TABLE {0}(
        route_id string,
        service_id string,
        trip_id string,
        trip_headsign string,
        trip_short_name string,
        direction_id string
    )
    ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ';'
    STORED AS TEXTFILE
    location '/group/five-guys/{0}'
    tblproperties ("skip.header.line.count"="1")
    """.format(table_name)
    cur.execute(query)

    query = """
    SELECT *
    FROM {0}
    """.format(table_name)
    df = pd.read_sql(query, conn)

    # Labels are expected as "<table>.<column>"; keeping the last dotted part
    # also accepts unprefixed labels, where indexing [1] raised IndexError.
    df.columns = [column.split('.')[-1] for column in df.columns]

    return df




In [51]:
def process_trips(df):
    """Map each trip_id to a dict holding the attributes of its first row.

    Rows are grouped by ``trip_id``; for every group only the first row is
    used. The result has the form
    ``{trip_id: {"route_id": ..., "service_id": ..., "trip_headsign": ...,
    "trip_short_name": ..., "direction_id": ...}}``.
    """
    names_cols = ["route_id", "service_id", "trip_headsign", "trip_short_name", "direction_id"]

    # trip_id -> list of rows, each row a list ordered like names_cols
    rows_by_trip = (
        df.groupby('trip_id')[names_cols]
        .apply(lambda g: g.values.tolist())
        .to_dict()
    )

    return {
        trip_id: dict(zip(names_cols, rows[0]))
        for trip_id, rows in rows_by_trip.items()
    }

In [71]:
# Recreate the Hive table "trips" and load it as a DataFrame.
trips_table = retrieve_df_trips("trips")

In [72]:
# Display the loaded trips DataFrame.
trips_table

Unnamed: 0,route_id,service_id,trip_id,trip_headsign,trip_short_name,direction_id
0,1-217-j19-1,TA+b0001,9.TA.1-217-j19-1.1.H,"Affoltern a. A., Bahnhof",21719,0
1,1-217-j19-1,TA+b0001,15.TA.1-217-j19-1.1.H,"Affoltern a. A., Bahnhof",21731,0
2,1-217-j19-1,TA+b0001,17.TA.1-217-j19-1.1.H,"Affoltern a. A., Bahnhof",21735,0
3,1-217-j19-1,TA+b0001,19.TA.1-217-j19-1.1.H,"Affoltern a. A., Bahnhof",21739,0
4,1-217-j19-1,TA+b0001,21.TA.1-217-j19-1.1.H,"Affoltern a. A., Bahnhof",21743,0
...,...,...,...,...,...,...
40391,80-53-Y-j19-1,TA+b0tui,50.TA.80-53-Y-j19-1.5.H,Chur,3479,0
40392,80-53-Y-j19-1,TA+b0tuk,54.TA.80-53-Y-j19-1.16.H,Chur,3481,0
40393,80-55-Y-j19-1,TA+b0tvm,25.TA.80-55-Y-j19-1.12.H,Zürich HB,3464,0
40394,80-55-Y-j19-1,TA+b0tvn,33.TA.80-55-Y-j19-1.12.H,Zürich HB,3468,0


In [73]:
# Collapse the trips DataFrame into {trip_id: {column: value}}.
trips = process_trips(trips_table)

In [75]:
import pickle
path_data = "../../data/"
# Write through a context manager so the file is flushed and closed even if
# pickling fails; the bare open() used before never closed its handle.
with open(path_data+"trips.pickle", "wb") as pickle_out:
    pickle.dump(trips, pickle_out)

## Stops

In [4]:
def retrieve_df_stops(table_name):
    """Recreate the external Hive stops table and return it as a DataFrame.

    The table ``table_name`` is dropped if it exists, declared again as an
    external ';'-delimited text table located at ``/group/five-guys/<table_name>``
    (first line skipped as header), and read back in full.

    Relies on the module-level Hive handles ``cur`` (cursor) and ``conn``
    (connection).

    Parameters
    ----------
    table_name : str
        Hive table name; also the last component of the table location.

    Returns
    -------
    pandas.DataFrame
        All rows, with any "<table>." prefix removed from the column labels
        (declared columns: stop_id, stop_name, stop_lat, stop_lon,
        parent_station; all strings).
    """
    # Drop any previous definition so the schema declared below always applies.
    query = """
    DROP TABLE IF EXISTS {0}
    """.format(table_name)
    cur.execute(query)

    # Declare the external table over the delimited text files.
    query = """
    CREATE EXTERNAL TABLE {0}(
        stop_id string,
        stop_name string,
        stop_lat string,
        stop_lon string,
        parent_station string
    )
    ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ';'
    STORED AS TEXTFILE
    location '/group/five-guys/{0}'
    tblproperties ("skip.header.line.count"="1")
    """.format(table_name)
    cur.execute(query)

    query = """
    SELECT *
    FROM {0}
    """.format(table_name)
    df = pd.read_sql(query, conn)

    # Labels are expected as "<table>.<column>"; keeping the last dotted part
    # also accepts unprefixed labels, where indexing [1] raised IndexError.
    df.columns = [column.split('.')[-1] for column in df.columns]

    return df

In [5]:
# Recreate the Hive table "stops_table", load it as a DataFrame and display it.
stops = retrieve_df_stops("stops_table")
stops

Unnamed: 0,stop_id,stop_name,stop_lat,stop_lon,parent_station
0,8506895,"Lufingen, Dorf",47.4886612950571,8.59365231835008,
1,8591190,"Zürich, Heuried",47.3694098744442,8.50635403902719,
2,8573729,"Bonstetten, Isenbach",47.3151613074637,8.47171500166672,
3,8502508,"Spreitenbach, Raiacker",47.4163939893986,8.37617917118731,
4,8589111,"Horgen, Gumelenstrasse",47.2608813777694,8.59208026660265,
...,...,...,...,...,...
1954,8573261,"Niederglatt ZH, Nöschikon",47.4953866910793,8.49499933383434,
1955,8588311,"Effretikon, Brüttenerstrasse",47.4319589521938,8.68603506218179,
1956,8590772,"Rüschlikon, Belvoir",47.3070423608591,8.54865570576227,
1957,8573718:0:D,"Birmensdorf ZH, Bahnhof",47.3569422650859,8.438800729652,Parent8573718


In [6]:
# Module-level copy of the per-stop grouping: stop_id -> list of rows, each row
# ordered like names_cols. process_stops() rebuilds the same mapping from its
# own argument, so nothing in this notebook reads these two names.
names_cols = ['stop_id', 'stop_name', 'stop_lat', 'stop_lon', 'parent_station']
close_stops_map = (
    stops.groupby('stop_id')[names_cols]
    .apply(lambda g: g.values.tolist())
    .to_dict()
)

In [7]:
def process_stops(df):
    """Map each stop_id to a dict holding the attributes of its first row.

    Rows are grouped by ``stop_id`` and only the first row of each group is
    used. The result has the form
    ``{stop_id: {"stop_id": ..., "stop_name": ..., "stop_lat": ...,
    "stop_lon": ..., "parent_station": ...}}``.

    A non-empty string ``parent_station`` has every 'Parent' and then every
    'P' removed. The cleaned value is printed when it differs from
    ``str(int(value))``; ``int`` raises ValueError if it is not numeric.
    """
    names_cols = ['stop_id', 'stop_name', 'stop_lat', 'stop_lon', 'parent_station']

    # stop_id -> list of rows, each row a list ordered like names_cols
    rows_by_stop = (
        df.groupby('stop_id')[names_cols]
        .apply(lambda g: g.values.tolist())
        .to_dict()
    )

    stops_final = {}
    for stop_id, rows in rows_by_stop.items():
        record = dict(zip(names_cols, rows[0]))
        parent = record["parent_station"]
        if isinstance(parent, str) and parent != "":
            parent = parent.replace('Parent', '').replace('P', '')
            record["parent_station"] = parent
            # Report ids that do not round-trip through int (e.g. leading zeros).
            if parent != str(int(parent)):
                print(parent)
        stops_final[stop_id] = record

    return stops_final

In [8]:
# Collapse the stops DataFrame into {stop_id: {column: value}}.
stops_dict = process_stops(stops)

In [18]:
# NOTE: rebinds `stops` from the DataFrame loaded above to the set of stop ids;
# the membership checks further down rely on the set.
stops = set(stops_dict)

In [9]:
import pickle
path_data = "../../data/"
# Write through a context manager so the file is flushed and closed even if
# pickling fails; the bare open() used before never closed its handle.
with open(path_data+"stops.pickle", "wb") as pickle_out:
    pickle.dump(stops_dict, pickle_out)

## Footpath

In [4]:
def retrieve_df_footpath(table_name):
    """Recreate the external Hive footpath table and return it as a nested dict.

    The table ``table_name`` is dropped if it exists, declared again as an
    external ';'-delimited text table located at ``/group/five-guys/<table_name>``
    (first line skipped as header), and read back in full.

    Relies on the module-level Hive handles ``cur`` (cursor) and ``conn``
    (connection).

    Parameters
    ----------
    table_name : str
        Hive table name; also the last component of the table location.

    Returns
    -------
    dict
        ``{arrival_id: {departure_id: time}}``. If an (arrival_id,
        departure_id) pair occurs more than once, the last row wins.
    """
    # Drop any previous definition so the schema declared below always applies.
    query = """
    DROP TABLE IF EXISTS {0}
    """.format(table_name)
    cur.execute(query)

    # Declare the external table over the delimited text files.
    query = """
    CREATE EXTERNAL TABLE {0}(
        departure_id string,
        arrival_id string,
        `time` int
    )
    ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ';'
    STORED AS TEXTFILE
    location '/group/five-guys/{0}'
    tblproperties ("skip.header.line.count"="1")
    """.format(table_name)
    cur.execute(query)

    query = """
    SELECT *
    FROM {0}
    """.format(table_name)
    df = pd.read_sql(query, conn)

    # Labels are expected as "<table>.<column>"; keeping the last dotted part
    # also accepts unprefixed labels, where indexing [1] raised IndexError.
    df.columns = [column.split('.')[-1] for column in df.columns]

    # arrival_id -> [[departure_id, time], ...], then each pair list -> dict
    footpath_map = df.groupby('arrival_id')[['departure_id', 'time']].apply(lambda g: g.values.tolist()).to_dict()
    footpath_map = {k: dict(v) for k, v in footpath_map.items()}

    return footpath_map






In [None]:
# Recreate the Hive table "footpath" and load it as {arrival_id: {departure_id: time}}.
footpath = retrieve_df_footpath("footpath")

In [None]:
# `pickle` and `path_data` come from the earlier export cells.
# Write through a context manager so the file is flushed and closed even if
# pickling fails; the bare open() used before never closed its handle.
with open(path_data+"footpath.pickle", "wb") as pickle_out:
    pickle.dump(footpath, pickle_out)

In [76]:

# Reload the trips pickle written earlier in this notebook. The context manager
# closes the file even if unpickling raises, which the manual open()/close() did not.
# Only unpickle files you produced yourself: pickle.load can execute arbitrary code.
with open(path_data+"trips.pickle",'rb') as file:
    object_file = pickle.load(file)

In [10]:
def retrieve_df_routes(table_name):
    """Recreate the external Hive routes table and return it as a DataFrame.

    The table ``table_name`` is dropped if it exists, declared again as an
    external ';'-delimited text table located at ``/group/five-guys/<table_name>``
    (first line skipped as header), and read back in full.

    Relies on the module-level Hive handles ``cur`` (cursor) and ``conn``
    (connection).

    Parameters
    ----------
    table_name : str
        Hive table name; also the last component of the table location.

    Returns
    -------
    pandas.DataFrame
        All rows, with any "<table>." prefix removed from the column labels
        (declared columns: trip_id, routes_desc, route_id; all strings).
    """
    # Drop any previous definition so the schema declared below always applies.
    query = """
    DROP TABLE IF EXISTS {0}
    """.format(table_name)
    cur.execute(query)

    # Declare the external table over the delimited text files.
    query = """
    CREATE EXTERNAL TABLE {0}(
        trip_id string,
        routes_desc string,
        route_id string
    )
    ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ';'
    STORED AS TEXTFILE
    location '/group/five-guys/{0}'
    tblproperties ("skip.header.line.count"="1")
    """.format(table_name)
    cur.execute(query)

    query = """
    SELECT *
    FROM {0}
    """.format(table_name)
    df = pd.read_sql(query, conn)

    # Labels are expected as "<table>.<column>"; keeping the last dotted part
    # also accepts unprefixed labels, where indexing [1] raised IndexError.
    df.columns = [column.split('.')[-1] for column in df.columns]

    return df







In [11]:
# Recreate the Hive table "routes", load it as a DataFrame and preview the first rows.
routes = retrieve_df_routes("routes")
routes.head()

Unnamed: 0,trip_id,routes_desc,route_id
0,9.TA.1-217-j19-1.1.H,Bus,1-217-j19-1
1,15.TA.1-217-j19-1.1.H,Bus,1-217-j19-1
2,17.TA.1-217-j19-1.1.H,Bus,1-217-j19-1
3,19.TA.1-217-j19-1.1.H,Bus,1-217-j19-1
4,21.TA.1-217-j19-1.1.H,Bus,1-217-j19-1


In [12]:
import pickle
path_data = "../../data/"
# Write through a context manager so the file is flushed and closed even if
# pickling fails; the bare open() used before never closed its handle.
with open(path_data+"routes.pickle", "wb") as pickle_out:
    pickle.dump(routes, pickle_out)

## Confidence

In [13]:
def retrieve_df_confidence(table_name):
    """Recreate the external Hive confidence table and return its rows as tuples.

    The table ``table_name`` is dropped if it exists, declared again as an
    external ';'-delimited text table located at ``/group/five-guys/<table_name>``
    (first line skipped as header), and its five columns are read back.

    Relies on the module-level Hive handles ``cur`` (cursor) and ``conn``
    (connection).

    Returns a list with one ``(DEPARTURE_ID, ARRIVAL_ID, DAY_OF_WEEK,
    MAX_ARRIVAL_DELAY, CUMULATIVE)`` tuple per row.
    """
    # Start from a clean slate: remove any previous table definition.
    drop_stmt = """
    DROP TABLE IF EXISTS {0}
    """.format(table_name)
    cur.execute(drop_stmt)

    # Declare the external table over the delimited text files.
    create_stmt = """
    CREATE EXTERNAL TABLE {0}(
       DEPARTURE_ID string,
       ARRIVAL_ID string,
       DAY_OF_WEEK int,
       MAX_ARRIVAL_DELAY float,
       CUMULATIVE float
    )
    ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ';'
    STORED AS TEXTFILE
    location '/group/five-guys/{0}'
    tblproperties ("skip.header.line.count"="1")
    """.format(table_name)
    cur.execute(create_stmt)

    # Columns are listed explicitly; rows are returned positionally, so the
    # column labels of the frame are never used.
    select_stmt = """
    SELECT DEPARTURE_ID,
           ARRIVAL_ID,
           DAY_OF_WEEK,
           MAX_ARRIVAL_DELAY,
           CUMULATIVE
    FROM {0}
    """.format(table_name)
    rows_df = pd.read_sql(select_stmt, conn)

    return list(rows_df.itertuples(index=False, name=None))

In [14]:
# Recreate the Hive table "confidence" and load its rows as a list of 5-tuples.
confidence = retrieve_df_confidence("confidence")

In [22]:
# Confidence rows whose departure (el[0]) or arrival (el[1]) id is missing from the
# `stops` set; requires the cell that rebinds `stops` to a set to have run.
[el for el in confidence if (el[0] not in stops) or (el[1] not in stops)]

[]

In [55]:
# Pair each departure id with the remaining fields of its row (rows are plain tuples).
tmp1 = [(row[0], row[1:]) for row in confidence]

In [25]:
from collections import defaultdict

# First level: departure_id -> list of (arrival_id, day_of_week, max_delay, cumulative).
main_dict = defaultdict(list)
for departure_id, *remaining in confidence:
    main_dict[departure_id].append(tuple(remaining))

# Second level: departure_id -> {arrival_id: [(day_of_week, max_delay, cumulative), ...]}.
main_dict_final = dict()
for departure_id, entries in main_dict.items():
    by_arrival = defaultdict(list)
    for entry in entries:
        by_arrival[entry[0]].append(entry[1:])
    main_dict_final[departure_id] = dict(by_arrival)

# Spot check of one departure/arrival pair (raises KeyError if either id is absent).
main_dict_final['8503202']['8502209']

# Total number of entries across all pairs.
lenght = sum(
    len(rows)
    for per_arrival in main_dict_final.values()
    for rows in per_arrival.values()
)

print(lenght)

332647


In [37]:
# Is stop id '8503124' present in the `stops` set?
'8503124' in stops

False

In [36]:
# Is '8503124' present as a departure id in the confidence mapping?
'8503124' in main_dict_final.keys()

False

In [30]:
# Departure ids of the confidence mapping that are missing from the `stops` set.
[el for el in list(main_dict_final.keys()) if el not in stops]

[]

In [28]:
import pickle
path_data = "../../data/"
# Write through a context manager so the file is flushed and closed even if
# pickling fails; the bare open() used before never closed its handle.
with open(path_data+"confidence.pickle", "wb") as pickle_out:
    pickle.dump(main_dict_final, pickle_out)

In [29]:
# Read the confidence pickle back from disk.
with open('../../data/confidence.pickle', 'rb') as pickle_in:
    confidences = pickle.load(pickle_in)
    