<div class="alert alert-block alert-warning">
    <b>WARNING!</b> This notebook reads the source files stored in HDFS (through Hive) and creates pickle files from them. Since we have already generated these pickles for you, there is no need to run this notebook unless you want to verify our solution from scratch. <br>
    So that the notebook can be run without the risk of overwriting the existing pickles, the <code>overwrite</code> flag is set to <code>False</code>. Set it to <code>True</code> if you want to recreate the pickles.
</div>

# Hive extraction

In [1]:
# Safety switch for the pickle-writing cells below: they only call pickle.dump
# when this is True; otherwise they just print a notice.
overwrite = False

In [2]:
import os
import pandas as pd
pd.set_option("display.max_columns", 50)
import matplotlib.pyplot as plt
%matplotlib inline
import plotly.express as px
import plotly.graph_objects as go
import warnings
warnings.simplefilter(action='ignore', category=UserWarning)

# The Hive user is pinned to a fixed name here; the environment-driven
# alternative would be os.environ['RENKU_USERNAME'].
username = "eric"

# HIVE_SERVER2 is read as "host:port" and split into its two parts.
hiveaddr = os.environ['HIVE_SERVER2']
hivehost, hiveport = hiveaddr.split(':')

print("Operating as: {0}".format(username))

Operating as: eric


In [3]:
from pyhive import hive

# Open the Hive session with the host, port and user configured above
conn = hive.connect(host=hivehost, port=hiveport, username=username)

# Cursor used for the statements executed directly (DROP / CREATE)
cur = conn.cursor()

In [4]:
### Make sure the per-user database exists (nothing happens if it already does)
create_db_template = """
CREATE DATABASE IF NOT EXISTS {0} LOCATION '/group/five-guys/hive'
"""
query = create_db_template.format(username)
cur.execute(query)

## Connections

In [5]:
def retrieve_df_connections(table_name):
    """Rebuild the external Hive table of connections and load it as tuples.

    The table is dropped if it exists, recreated as an EXTERNAL table over the
    comma-separated text files under ``/group/five-guys/<table_name>`` (the
    first line of each file is skipped as a header), and then read in full
    through the module-level ``cur`` / ``conn`` Hive handles.

    Parameters
    ----------
    table_name : str
        Name of the Hive table; also used as the last component of the
        HDFS location.

    Returns
    -------
    list of tuple
        One plain tuple per row, columns in the order returned by
        ``SELECT *``. ``departure_time`` and ``arrival_time`` are converted
        to integer POSIX timestamps, and weekday flags stored as
        ``'0'`` / ``'1'`` become ``False`` / ``True``.
    """
    # Drop any previous definition so the DDL below always takes effect.
    query = """
    DROP TABLE IF EXISTS {0}
    """.format(table_name)
    cur.execute(query)

    ### Creation of the table
    query = """
    CREATE EXTERNAL TABLE {0}(
        departure_id string,
        arrival_id string,
        departure_time string,
        arrival_time string,
        trip_id string,
        monday string,
        tuesday string,
        wednesday string,
        thursday string,
        friday string
    )
    ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ','
    STORED AS TEXTFILE
    location '/group/five-guys/{0}'
    tblproperties ("skip.header.line.count"="1")
    """.format(table_name)
    cur.execute(query)

    query = """
    SELECT *
    FROM {0}
    """.format(table_name)
    df = pd.read_sql(query, conn)
    # Column names may come back as "<table>.<column>"; keep the part after
    # the last dot so that unqualified names are accepted too (indexing [1]
    # on the split would raise IndexError for them).
    df.columns = [column.rsplit('.', 1)[-1] for column in df.columns]

    def to_epoch_seconds(value):
        # Parse the timestamp string and truncate to whole seconds.
        return int(pd.Timestamp(value).timestamp())

    for time_column in ("departure_time", "arrival_time"):
        df[time_column] = df[time_column].apply(to_epoch_seconds)

    weekdays = ['monday', 'tuesday', 'wednesday', 'thursday', 'friday']
    df[weekdays] = df[weekdays].replace({'0': False, '1': True})

    return list(df.itertuples(index=False, name=None))

In [6]:
# Rebuild the Hive table "conn_table" and pull all of its rows into memory as a list of tuples.
conn_table = retrieve_df_connections("conn_table")

In [7]:
import pickle
path_data = "../../data/"

if overwrite:
    # Context manager guarantees the pickle file is flushed and closed,
    # even if pickle.dump raises part-way through.
    with open(path_data + "connections_data.pickle", "wb") as pickle_file:
        pickle.dump(conn_table, pickle_file)
else:
    print("No overwrite, flag set to false")

No overwrite, flag set to false


## Trips

In [8]:
def retrieve_df_trips(table_name):
    """Rebuild the external Hive table of trips and load it as a DataFrame.

    The table is dropped if it exists, recreated as an EXTERNAL table over the
    ';'-separated text files under ``/group/five-guys/<table_name>`` (first
    line of each file skipped as a header), and read in full through the
    module-level ``cur`` / ``conn`` Hive handles.

    Parameters
    ----------
    table_name : str
        Name of the Hive table; also the last component of the HDFS location.

    Returns
    -------
    pandas.DataFrame
        All rows of the table, with any ``<table>.`` prefix removed from the
        column names.
    """
    # Drop any previous definition so the DDL below always takes effect.
    query = """
    DROP TABLE IF EXISTS {0}
    """.format(table_name)
    cur.execute(query)

    ### Creation of the table
    query = """
    CREATE EXTERNAL TABLE {0}(
        route_id string,
        service_id string,
        trip_id string,
        trip_headsign string,
        trip_short_name string,
        direction_id string
    )
    ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ';'
    STORED AS TEXTFILE
    location '/group/five-guys/{0}'
    tblproperties ("skip.header.line.count"="1")
    """.format(table_name)
    cur.execute(query)

    query = """
    SELECT *
    FROM {0}
    """.format(table_name)
    df = pd.read_sql(query, conn)
    # Keep the part after the last dot so that both "<table>.<column>" and
    # unqualified names work (indexing [1] on the split would raise
    # IndexError for the latter).
    df.columns = [column.rsplit('.', 1)[-1] for column in df.columns]

    return df

In [9]:
def process_trips(df):
    """Index trip attributes by ``trip_id``.

    Rows of ``df`` are grouped by ``trip_id``; for each trip the first row of
    its group supplies the values.

    Returns a dict ``{trip_id: {"route_id": ..., "service_id": ...,
    "trip_headsign": ..., "trip_short_name": ..., "direction_id": ...}}``.
    """
    fields = ['route_id', 'service_id', 'trip_headsign', 'trip_short_name', 'direction_id']

    # trip_id -> list of value rows (one inner list per matching DataFrame row)
    rows_by_trip = df.groupby('trip_id')[fields].apply(lambda g: g.values.tolist()).to_dict()

    # Only the first row of each group is kept, labelled with the field names.
    return {
        trip_id: dict(zip(fields, rows[0]))
        for trip_id, rows in rows_by_trip.items()
    }

In [10]:
# Rebuild the Hive table "trips" and load all of its rows as a DataFrame.
trips_table = retrieve_df_trips("trips")

In [11]:
# Collapse the trips DataFrame into a dict keyed by trip_id (first row per trip wins).
trips = process_trips(trips_table)

In [12]:
import pickle
path_data = "../../data/"

if overwrite:
    # Context manager guarantees the pickle file is flushed and closed,
    # even if pickle.dump raises part-way through.
    with open(path_data + "trips.pickle", "wb") as pickle_file:
        pickle.dump(trips, pickle_file)
else:
    print("No overwrite, flag set to false")

No overwrite, flag set to false


## Stops

In [13]:
def retrieve_df_stops(table_name):
    """Rebuild the external Hive table of stops and load it as a DataFrame.

    The table is dropped if it exists, recreated as an EXTERNAL table over the
    ';'-separated text files under ``/group/five-guys/<table_name>`` (first
    line of each file skipped as a header), and read in full through the
    module-level ``cur`` / ``conn`` Hive handles.

    Parameters
    ----------
    table_name : str
        Name of the Hive table; also the last component of the HDFS location.

    Returns
    -------
    pandas.DataFrame
        All rows of the table, with any ``<table>.`` prefix removed from the
        column names. Every column is declared as ``string`` in the DDL.
    """
    # Drop any previous definition so the DDL below always takes effect.
    query = """
    DROP TABLE IF EXISTS {0}
    """.format(table_name)
    cur.execute(query)

    ### Creation of the table
    query = """
    CREATE EXTERNAL TABLE {0}(
        stop_id string,
        stop_name string,
        stop_lat string,
        stop_lon string,
        parent_station string
    )
    ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ';'
    STORED AS TEXTFILE
    location '/group/five-guys/{0}'
    tblproperties ("skip.header.line.count"="1")
    """.format(table_name)
    cur.execute(query)

    query = """
    SELECT *
    FROM {0}
    """.format(table_name)
    df = pd.read_sql(query, conn)
    # Keep the part after the last dot so that both "<table>.<column>" and
    # unqualified names work (indexing [1] on the split would raise
    # IndexError for the latter).
    df.columns = [column.rsplit('.', 1)[-1] for column in df.columns]

    return df

In [14]:
# Rebuild the Hive table "stops_table" and load all of its rows as a DataFrame.
stops = retrieve_df_stops("stops_table")

In [15]:
def process_stops(df):
    """Index stop attributes by ``stop_id`` and normalise ``parent_station``.

    Rows of ``df`` are grouped by ``stop_id``; the first row of each group
    supplies the values. A non-empty string ``parent_station`` has every
    ``'Parent'`` and ``'P'`` substring removed. If what remains is not the
    canonical decimal form of an integer, it is printed as a diagnostic and
    kept unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``stop_id``, ``stop_name``, ``stop_lat``,
        ``stop_lon`` and ``parent_station``.

    Returns
    -------
    dict
        ``{stop_id: {"stop_id": ..., "stop_name": ..., "stop_lat": ...,
        "stop_lon": ..., "parent_station": ...}}``.
    """
    names_cols = ['stop_id', 'stop_name', 'stop_lat', 'stop_lon', 'parent_station']

    # stop_id -> list of value rows (one inner list per matching DataFrame row)
    rows_by_stop = df.groupby('stop_id')[names_cols].apply(lambda g: g.values.tolist()).to_dict()

    stops_final = {}
    for stop_id, rows in rows_by_stop.items():
        record = dict(zip(names_cols, rows[0]))

        parent = record['parent_station']
        if isinstance(parent, str) and parent != "":
            parent = parent.replace('Parent', '').replace('P', '')
            record['parent_station'] = parent

            # Report ids that are not plain integers. A non-numeric id must be
            # reported like any other oddity, not abort the whole run with a
            # ValueError from int().
            try:
                is_plain_integer = parent == str(int(parent))
            except ValueError:
                is_plain_integer = False
            if not is_plain_integer:
                print(parent)

        stops_final[stop_id] = record

    return stops_final

In [16]:
# Collapse the stops DataFrame into a dict keyed by stop_id, with parent_station ids cleaned up.
stops_dict = process_stops(stops)

In [17]:
import pickle
path_data = "../../data/"

if overwrite:
    # Context manager guarantees the pickle file is flushed and closed,
    # even if pickle.dump raises part-way through.
    with open(path_data + "stops.pickle", "wb") as pickle_file:
        pickle.dump(stops_dict, pickle_file)
else:
    print("No overwrite, flag set to false")

No overwrite, flag set to false


## Footpath

In [18]:
def retrieve_df_footpath(table_name):
    """Rebuild the external Hive table of footpaths and load it as nested dicts.

    The table is dropped if it exists, recreated as an EXTERNAL table over the
    ';'-separated text files under ``/group/five-guys/<table_name>`` (first
    line of each file skipped as a header), and read in full through the
    module-level ``cur`` / ``conn`` Hive handles.

    Parameters
    ----------
    table_name : str
        Name of the Hive table; also the last component of the HDFS location.

    Returns
    -------
    dict
        ``{arrival_id: {departure_id: time}}``. If the same
        (arrival_id, departure_id) pair occurs more than once, the last
        occurrence wins.
    """
    # Drop any previous definition so the DDL below always takes effect.
    query = """
    DROP TABLE IF EXISTS {0}
    """.format(table_name)
    cur.execute(query)

    ### Creation of the table
    query = """
    CREATE EXTERNAL TABLE {0}(
        departure_id string,
        arrival_id string,
        `time` int
    )
    ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ';'
    STORED AS TEXTFILE
    location '/group/five-guys/{0}'
    tblproperties ("skip.header.line.count"="1")
    """.format(table_name)
    cur.execute(query)

    query = """
    SELECT *
    FROM {0}
    """.format(table_name)
    df = pd.read_sql(query, conn)
    # Keep the part after the last dot so that both "<table>.<column>" and
    # unqualified names work (indexing [1] on the split would raise
    # IndexError for the latter).
    df.columns = [column.rsplit('.', 1)[-1] for column in df.columns]

    # arrival_id -> list of [departure_id, time] pairs ...
    pairs_by_arrival = df.groupby('arrival_id')[['departure_id', 'time']].apply(lambda g: g.values.tolist()).to_dict()
    # ... turned into arrival_id -> {departure_id: time}
    return {arrival_id: dict(pairs) for arrival_id, pairs in pairs_by_arrival.items()}






In [19]:
# Rebuild the Hive table "footpath" and load it as {arrival_id: {departure_id: time}}.
footpath = retrieve_df_footpath("footpath")

In [20]:
import pickle
path_data = "../../data/"

if overwrite:
    # Context manager guarantees the pickle file is flushed and closed,
    # even if pickle.dump raises part-way through.
    with open(path_data + "footpath.pickle", "wb") as pickle_file:
        pickle.dump(footpath, pickle_file)
else:
    print("No overwrite, flag set to false")

No overwrite, flag set to false


## Routes

In [21]:
def retrieve_df_routes(table_name):
    """Rebuild the external Hive table of routes and load it as a DataFrame.

    The table is dropped if it exists, recreated as an EXTERNAL table over the
    ';'-separated text files under ``/group/five-guys/<table_name>`` (first
    line of each file skipped as a header), and read in full through the
    module-level ``cur`` / ``conn`` Hive handles.

    Parameters
    ----------
    table_name : str
        Name of the Hive table; also the last component of the HDFS location.

    Returns
    -------
    pandas.DataFrame
        All rows of the table (``trip_id``, ``routes_desc``, ``route_id``),
        with any ``<table>.`` prefix removed from the column names.
    """
    # Drop any previous definition so the DDL below always takes effect.
    query = """
    DROP TABLE IF EXISTS {0}
    """.format(table_name)
    cur.execute(query)

    ### Creation of the table
    query = """
    CREATE EXTERNAL TABLE {0}(
        trip_id string,
        routes_desc string,
        route_id string
    )
    ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ';'
    STORED AS TEXTFILE
    location '/group/five-guys/{0}'
    tblproperties ("skip.header.line.count"="1")
    """.format(table_name)
    cur.execute(query)

    query = """
    SELECT *
    FROM {0}
    """.format(table_name)
    df = pd.read_sql(query, conn)
    # Keep the part after the last dot so that both "<table>.<column>" and
    # unqualified names work (indexing [1] on the split would raise
    # IndexError for the latter).
    df.columns = [column.rsplit('.', 1)[-1] for column in df.columns]

    return df

In [22]:
# Rebuild the Hive table "routes" and load all of its rows as a DataFrame.
routes = retrieve_df_routes("routes")

In [23]:
import pickle
path_data = "../../data/"

if overwrite:
    # Context manager guarantees the pickle file is flushed and closed,
    # even if pickle.dump raises part-way through.
    with open(path_data + "routes.pickle", "wb") as pickle_file:
        pickle.dump(routes, pickle_file)
else:
    print("No overwrite, flag set to false")

No overwrite, flag set to false


## Confidence

In [24]:
def retrieve_df_confidence(table_name):
    """Rebuild the external Hive table of delay confidences and load it as tuples.

    The table is dropped if it exists, recreated as an EXTERNAL table over the
    ';'-separated text files under ``/group/five-guys/<table_name>`` (first
    line of each file skipped as a header), and queried through the
    module-level ``cur`` / ``conn`` Hive handles.

    Returns a list with one plain tuple per row, in the column order of the
    SELECT: ``(DEPARTURE_ID, ARRIVAL_ID, DAY_OF_WEEK, MAX_ARRIVAL_DELAY,
    CUMULATIVE)``.
    """
    # Drop any previous definition so the DDL below always takes effect.
    drop_stmt = """
    DROP TABLE IF EXISTS {0}
    """
    cur.execute(drop_stmt.format(table_name))

    # Recreate the external table over the files in HDFS.
    create_stmt = """
    CREATE EXTERNAL TABLE {0}(
       DEPARTURE_ID string,
       ARRIVAL_ID string,
       DAY_OF_WEEK int,
       MAX_ARRIVAL_DELAY float,
       CUMULATIVE float
    )
    ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ';'
    STORED AS TEXTFILE
    location '/group/five-guys/{0}'
    tblproperties ("skip.header.line.count"="1")
    """
    cur.execute(create_stmt.format(table_name))

    # Columns are listed explicitly and the returned column names are left
    # as they arrive (no prefix stripping is applied here).
    select_stmt = """
    SELECT DEPARTURE_ID,
           ARRIVAL_ID,
           DAY_OF_WEEK,
           MAX_ARRIVAL_DELAY,
           CUMULATIVE
    FROM {0}
    """
    frame = pd.read_sql(select_stmt.format(table_name), conn)

    return list(frame.itertuples(index=False, name=None))

In [25]:
# Rebuild the Hive table "confidence" and load its rows as a list of
# (departure_id, arrival_id, day_of_week, max_arrival_delay, cumulative) tuples.
confidence = retrieve_df_confidence("confidence")

In [26]:
from collections import defaultdict
# Re-index the confidence rows as
#   departure_id -> arrival_id -> [(day_of_week, max_arrival_delay, cumulative), ...]

# Pass 1: bucket the remaining fields of every row by departure_id.
main_dict = defaultdict(list)
for c in confidence:
    main_dict[c[0]].append(c[1:])

# Pass 2: inside each departure bucket, bucket again by arrival_id.
main_dict_final = dict()
for departure_id, rows in main_dict.items():
    by_arrival = defaultdict(list)
    for row in rows:
        by_arrival[row[0]].append(row[1:])
    # Store a plain dict so a lookup of a missing arrival raises KeyError
    # instead of silently creating an empty list.
    main_dict_final[departure_id] = dict(by_arrival)

# Count the leaf entries after regrouping. (The hard-coded spot check
# main_dict_final['8503202']['8502209'] was removed: its value was never
# shown, and it could only fail with KeyError on other data.)
lenght = sum(
    len(entries)
    for by_arrival in main_dict_final.values()
    for entries in by_arrival.values()
)

print(lenght)

453366


In [27]:
import pickle
path_data = "../../data/"

if overwrite:
    # Context manager guarantees the pickle file is flushed and closed,
    # even if pickle.dump raises part-way through.
    with open(path_data + "confidence.pickle", "wb") as pickle_file:
        pickle.dump(main_dict_final, pickle_file)
else:
    print("No overwrite, flag set to false")

No overwrite, flag set to false
