In [1]:
import os
import pandas as pd
pd.set_option("display.max_columns", 50)
import matplotlib.pyplot as plt
%matplotlib inline
import plotly.express as px
import plotly.graph_objects as go
import warnings
warnings.simplefilter(action='ignore', category=UserWarning)

# Who we are and where HiveServer2 lives both come from the environment;
# a missing variable raises KeyError right here instead of failing later.
username = os.environ['RENKU_USERNAME']
hiveaddr = os.environ['HIVE_SERVER2']
# HIVE_SERVER2 is expected to look like "host:port".
hivehost, hiveport = hiveaddr.split(':')
print(f"Operating as: {username}")

Operating as: ceraolo


In [2]:
from pyhive import hive

# Open a HiveServer2 session as the current user, using the host/port parsed
# from the environment in the setup cell.
conn = hive.connect(
    host=hivehost,
    port=hiveport,
    username=username,
)

# Cursor used for every DDL statement issued further down.
cur = conn.cursor()

In [3]:
# Make sure a Hive database named after the current user exists, stored under
# the group's HDFS directory. The statement is a no-op when it already exists.
query = f"""
CREATE DATABASE IF NOT EXISTS {username} LOCATION '/group/five-guys/hive'
"""
cur.execute(query)

In [4]:
def retrieve_df_connections(table_name):
    """Rebuild the external Hive table ``table_name`` and return its rows as tuples.

    The table is dropped if present and recreated over the comma-delimited
    text files stored at ``/group/five-guys/<table_name>`` (first line of each
    file skipped as a header), then read back in full through pandas.

    Post-processing applied before returning:
      * ``departure_time`` / ``arrival_time`` are parsed with ``pd.Timestamp``
        and converted to integer seconds via ``Timestamp.timestamp()``;
      * the five weekday columns have the strings ``'0'`` / ``'1'`` replaced
        by ``False`` / ``True``.

    Relies on the notebook-level ``cur`` (cursor) and ``conn`` (connection).

    Args:
        table_name: Hive table name, also used as the HDFS sub-directory name.

    Returns:
        list of plain tuples, one per row, in the table's column order:
        (departure_id, arrival_id, departure_time, arrival_time, trip_id,
        monday, tuesday, wednesday, thursday, friday).
    """
    weekday_cols = ['monday', 'tuesday', 'wednesday', 'thursday', 'friday']

    def to_epoch_seconds(value):
        # Parse the textual time and truncate to whole seconds.
        return int(pd.Timestamp(value).timestamp())

    # Start from a clean slate so the table definition below always applies.
    cur.execute(f"""
    DROP TABLE IF EXISTS {table_name}
    """)

    # External table: Hive only maps the schema onto the existing CSV files.
    create_sql = f"""
    CREATE EXTERNAL TABLE {table_name}(
        departure_id string,
        arrival_id string,
        departure_time string,
        arrival_time string,
        trip_id string,
        monday string,
        tuesday string,
        wednesday string,
        thursday string,
        friday string
    )
    ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ','
    STORED AS TEXTFILE
    location '/group/five-guys/{table_name}'
    tblproperties ("skip.header.line.count"="1")
    """
    cur.execute(create_sql)

    select_sql = f"""
    SELECT *
    FROM {table_name}
    """
    frame = pd.read_sql(select_sql, conn)

    # Result columns arrive as "<table>.<column>"; keep the part after the dot.
    frame.columns = [qualified.split('.')[1] for qualified in frame.columns]

    for time_col in ("departure_time", "arrival_time"):
        frame[time_col] = frame[time_col].apply(to_epoch_seconds)

    frame[weekday_cols] = frame[weekday_cols].replace({'0': False, '1': True})

    return list(frame.itertuples(index=False, name=None))



In [5]:
# Load every connection as a list of tuples. Note that the call drops and
# recreates the Hive table "conn_table" before reading it.
conn_table = retrieve_df_connections("conn_table")

In [None]:
import pickle

# Directory (relative to the notebook) where the pickled artefacts are written.
path_data = "../data/"

# Use a context manager so the file is flushed and closed deterministically;
# a bare open() passed to pickle.dump leaves the handle open until it is
# garbage-collected.
with open(path_data + "connections_data.pickle", "wb") as pickle_file:
    pickle.dump(conn_table, pickle_file)

In [None]:
def retrieve_df_trips(
    table_name,
    columns=(
        "route_id",
        "service_id",
        "trip_id",
        "trip_headsign",
        "trip_short_name",
        "direction_id",
    ),
):
    """Rebuild the external Hive table ``table_name`` and return a per-trip lookup.

    The table is dropped if present and recreated over the comma-delimited
    text files at ``/group/five-guys/<table_name>`` (first line of each file
    skipped as a header), then read back in full through pandas.

    The previous version declared the *connections* schema here
    (departure_id, arrival_id, ...), so the ``route_id`` / ``service_id`` / ...
    columns selected below did not exist and the function raised ``KeyError``.

    Relies on the notebook-level ``cur`` (cursor) and ``conn`` (connection).

    Args:
        table_name: Hive table name, also used as the HDFS sub-directory name.
        columns: Ordered names of the CSV columns; every column is declared
            as ``string``.
            NOTE(review): the default assumes the file follows the GTFS
            ``trips.txt`` column order -- confirm against the CSV header.

    Returns:
        dict mapping each ``trip_id`` to a dict with keys ``route_id``,
        ``service_id``, ``trip_headsign``, ``trip_short_name`` and
        ``direction_id``. When a ``trip_id`` occurs on several rows, the
        first row of its group is used.

    Raises:
        ValueError: if ``columns`` lacks ``trip_id`` or any attribute column.
    """
    attribute_cols = ["route_id", "service_id", "trip_headsign", "trip_short_name", "direction_id"]

    missing = ({"trip_id"} | set(attribute_cols)) - set(columns)
    if missing:
        raise ValueError("columns is missing required names: {0}".format(sorted(missing)))

    # Drop any previous definition so the schema below always takes effect.
    query = """
    DROP TABLE IF EXISTS {0}
    """.format(table_name)
    cur.execute(query)

    # External table: Hive only maps the schema onto the existing CSV files.
    column_ddl = ",\n        ".join("{0} string".format(name) for name in columns)
    query = """
    CREATE EXTERNAL TABLE {0}(
        {1}
    )
    ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ','
    STORED AS TEXTFILE
    location '/group/five-guys/{0}'
    tblproperties ("skip.header.line.count"="1")
    """.format(table_name, column_ddl)
    cur.execute(query)

    query = """
    SELECT *
    FROM {0}
    """.format(table_name)
    df = pd.read_sql(query, conn)

    # Result columns arrive as "<table>.<column>"; taking the last dot-separated
    # piece also works if the server returns unqualified names.
    df.columns = [column.split('.')[-1] for column in df.columns]

    # One list of rows per trip_id; only the first row of each group is kept.
    rows_by_trip = (
        df.groupby('trip_id')[attribute_cols]
        .apply(lambda g: g.values.tolist())
        .to_dict()
    )

    return {
        trip_id: dict(zip(attribute_cols, rows[0]))
        for trip_id, rows in rows_by_trip.items()
    }




In [None]:
# Build the trip_id -> attributes lookup. Note that the call drops and
# recreates the Hive table "trips" before reading it.
trips = retrieve_df_trips("trips")

In [None]:
# Context manager guarantees the file is flushed and closed once the dump
# finishes (a bare open() leaves the handle open until garbage collection).
with open(path_data + "trips.pickle", "wb") as pickle_file:
    pickle.dump(trips, pickle_file)

In [None]:
def _strip_parent_markers(value):
    """Remove a leading "Parent" and a trailing "P" from a string; pass other types through."""
    if not isinstance(value, str):
        return value
    if value.startswith("Parent"):
        value = value[len("Parent"):]
    # endswith() is safe on an empty string, unlike indexing with [-1].
    if value.endswith("P"):
        value = value[:-1]
    return value


def _is_canonical_int(text):
    """Return True when ``text`` is exactly the decimal rendering of an integer."""
    try:
        return text == str(int(text))
    except ValueError:
        return False


def retrieve_df_stops(
    table_name,
    columns=(
        "stop_id",
        "stop_name",
        "stop_lat",
        "stop_lon",
        "location_type",
        "parent_station",
    ),
):
    """Rebuild the external Hive table ``table_name`` and return a per-stop lookup.

    The table is dropped if present and recreated over the comma-delimited
    text files at ``/group/five-guys/<table_name>`` (first line of each file
    skipped as a header), then read back in full through pandas.

    Fixes relative to the previous version:
      * the table was declared with the *connections* schema, so the
        ``stop_*`` columns selected below did not exist (``KeyError``);
      * an empty ``parent_station`` raised ``IndexError`` on ``element[-1]``;
      * a non-numeric ``parent_station`` raised ``ValueError`` in ``int()``
        instead of being printed by the diagnostic check.

    Relies on the notebook-level ``cur`` (cursor) and ``conn`` (connection).

    Args:
        table_name: Hive table name, also used as the HDFS sub-directory name.
        columns: Ordered names of the CSV columns; every column is declared
            as ``string``.
            NOTE(review): the default assumes the file follows the GTFS
            ``stops.txt`` column order (including ``location_type``) --
            confirm against the CSV header and pass the real order otherwise.

    Returns:
        dict mapping each ``stop_id`` to a dict with keys ``stop_name``,
        ``stop_lat``, ``stop_lon`` and ``parent_station``. String
        ``parent_station`` values have a leading "Parent" and a trailing "P"
        removed. When a ``stop_id`` occurs on several rows, the first row of
        its group is used.

    Raises:
        ValueError: if ``columns`` lacks ``stop_id`` or any attribute column.
    """
    attribute_cols = ['stop_name', 'stop_lat', 'stop_lon', 'parent_station']

    missing = ({"stop_id"} | set(attribute_cols)) - set(columns)
    if missing:
        raise ValueError("columns is missing required names: {0}".format(sorted(missing)))

    # Drop any previous definition so the schema below always takes effect.
    query = """
    DROP TABLE IF EXISTS {0}
    """.format(table_name)
    cur.execute(query)

    # External table: Hive only maps the schema onto the existing CSV files.
    column_ddl = ",\n        ".join("{0} string".format(name) for name in columns)
    query = """
    CREATE EXTERNAL TABLE {0}(
        {1}
    )
    ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ','
    STORED AS TEXTFILE
    location '/group/five-guys/{0}'
    tblproperties ("skip.header.line.count"="1")
    """.format(table_name, column_ddl)
    cur.execute(query)

    query = """
    SELECT *
    FROM {0}
    """.format(table_name)
    df = pd.read_sql(query, conn)

    # Result columns arrive as "<table>.<column>"; taking the last dot-separated
    # piece also works if the server returns unqualified names.
    df.columns = [column.split('.')[-1] for column in df.columns]

    # One list of rows per stop_id; only the first row of each group is kept.
    rows_by_stop = (
        df.groupby('stop_id')[attribute_cols]
        .apply(lambda g: g.values.tolist())
        .to_dict()
    )

    stops_final = {}
    for stop_id, rows in rows_by_stop.items():
        record = dict(zip(attribute_cols, rows[0]))

        parent = _strip_parent_markers(record["parent_station"])
        record["parent_station"] = parent

        # Diagnostic only: surface non-empty parent ids that are still not a
        # plain integer after cleaning, so they can be inspected by hand.
        if isinstance(parent, str) and parent and not _is_canonical_int(parent):
            print(parent)

        stops_final[stop_id] = record

    return stops_final





In [None]:
# Build the stop_id -> attributes lookup. Note that the call drops and
# recreates the Hive table "stops" before reading it.
stops = retrieve_df_stops("stops")

In [None]:
# Persist the stops lookup. The previous version dumped `trips` into
# stops.pickle (copy-paste slip), silently duplicating the trips data.
# The context manager also guarantees the file is flushed and closed.
with open(path_data + "stops.pickle", "wb") as pickle_file:
    pickle.dump(stops, pickle_file)