<table border="0" style="width:100%">
 <tr>
    <td>
        <img src="https://static-frm.ie.edu/university/wp-content/uploads/sites/6/2022/06/IE-University-logo.png" width=150>
     </td>
    <td><div style="font-family:'Courier New'">
            <div style="font-size:25px">
                <div style="text-align: right"> 
                    <b> MASTER IN BIG DATA</b>
                    <br>
                    Python for Data Analysis II
                    <br><br>
                    <em> Daniel Sierra Ramos </em>
                </div>
            </div>
        </div>
    </td>
 </tr>
</table>

In [8]:
import json
import zipfile
import geopandas

import numpy as np
import pandas as pd

import plotly.express as px

In [7]:
!pip install geopandas


Collecting geopandas
  Downloading geopandas-0.12.2-py3-none-any.whl (1.1 MB)
Collecting shapely>=1.7
  Downloading shapely-2.0.1-cp39-cp39-win_amd64.whl (1.4 MB)
Collecting pyproj>=2.6.1.post1
  Downloading pyproj-3.4.1-cp39-cp39-win_amd64.whl (4.8 MB)
Collecting fiona>=1.8
  Downloading Fiona-1.9.1-cp39-cp39-win_amd64.whl (22.0 MB)
Collecting cligj>=0.5
  Downloading cligj-0.7.2-py3-none-any.whl (7.1 kB)
Collecting munch>=2.3.2
  Downloading munch-2.5.0-py2.py3-none-any.whl (10 kB)
Collecting click-plugins>=1.0
  Downloading click_plugins-1.1.1-py2.py3-none-any.whl (7.5 kB)
Installing collected packages: munch, cligj, click-plugins, shapely, pyproj, fiona, geopandas
Successfully installed click-plugins-1.1.1 cligj-0.7.2 fiona-1.9.1 geopandas-0.12.2 munch-2.5.0 pyproj-3.4.1 shapely-2.0.1


## **Streamlit** App with Uber data

Build a Streamlit app that displays several charts based on public Uber data. The data contains the average travel time of Uber rides between all neighborhoods in the city of Madrid.

The app must have the following characteristics:
 - Main frame must show 3 charts given a `SOURCE` and `DESTINATION`:
    1. Time series of the average travel time
    2. Bar chart of the average travel time for every weekday, and period of the day
    3. **(Optional)** Choropleth map of Madrid city, with the source and destination highlighted
 - A sidebar containing two select boxes for choosing the `SOURCE` and `DESTINATION`
 - Every time a source or a destination is selected, the charts of the main frame should be updated accordingly.

 NOTE: I recommend that you build the charts here in the notebook first, and then copy the code to the Streamlit app.

### Load data

Use the following function to load the data. For the first two figures you will need just the `data` variable. For the choropleth map you will need the `codes` variable and the `geopandas` library to work with maps (this is optional). 

In [9]:
def read_and_preprocess_data():
    """Load the Uber travel-time table and the neighbourhood lookup, and join them.

    Both inputs are read from ``data/uber-data.zip``: the CSV member with
    ``pd.read_csv`` and ``madrid_barrios.json`` with ``geopandas.read_file``.

    Every travel-time row gains the code and display name of its source and
    destination (``src_neigh_code``/``src_neigh_name`` and
    ``dst_neigh_code``/``dst_neigh_name``) through left merges on ``MOVEMENT_ID``,
    plus the derived columns ``year``, ``date``, ``day_period``, ``day_of_week``
    and ``day_of_week_str``.

    Returns
    -------
    tuple
        ``(data, codes)``: the enriched travel-time DataFrame and the frame
        returned by ``geopandas.read_file``, with ``GEOCODIGO``/``MOVEMENT_ID``
        cast to int and the first token removed from ``DISPLAY_NAME``.
    """
    # `archive` / `csv_file` avoid shadowing the built-in `zip` and the stdlib module name `csv`.
    with zipfile.ZipFile('data/uber-data.zip') as archive:
        with archive.open('madrid-barrios-2020-1-All-DatesByHourBucketsAggregate.csv') as csv_file:
            data = pd.read_csv(csv_file)
        with archive.open('madrid_barrios.json') as geojson:
            codes = geopandas.read_file(geojson, encoding="utf-8")

    # change data types in codes because they are not the same as in data
    codes['GEOCODIGO'] = codes['GEOCODIGO'].astype(int)
    codes['MOVEMENT_ID'] = codes['MOVEMENT_ID'].astype(int)

    # Drop the first whitespace-separated token of every display name
    # (presumably a numeric prefix -- confirm against the GeoJSON).
    codes["DISPLAY_NAME"] = codes["DISPLAY_NAME"].str.split().str[1:].str.join(" ")

    # Attach code and name for the source first, then for the destination.
    lookup = codes[["GEOCODIGO", "MOVEMENT_ID", "DISPLAY_NAME"]]
    for id_column, prefix in (("sourceid", "src"), ("dstid", "dst")):
        data = (
            data
            .merge(lookup, left_on=id_column, right_on="MOVEMENT_ID", how="left")
            .rename(columns={"GEOCODIGO": f"{prefix}_neigh_code", "DISPLAY_NAME": f"{prefix}_neigh_name"})
            .drop(columns=["MOVEMENT_ID"])
        )

    # Create a new date column
    data["year"] = "2020"  # kept as a string column, as callers may already rely on it
    # Assemble the timestamp from numeric components instead of concatenating unpadded
    # day/month digits: under "%d%m%Y" a string such as "1112020" is both 1 November
    # and 11 January, so two-digit months could be parsed as the wrong date.
    data["date"] = pd.to_datetime(
        data[["month", "day"]].assign(year=2020, hour=data["start_hour"])
    )

    # Create a new day_period column
    data["day_period"] = data.start_hour.astype(str) + "-" + data.end_hour.astype(str)
    data["day_of_week"] = data.date.dt.weekday
    data["day_of_week_str"] = data.date.dt.day_name()

    return data, codes

### Travel by Time and Day

In [10]:
# Load the enriched travel-time table (`data`) and the neighbourhood frame (`codes`).
data, codes = read_and_preprocess_data()

In [11]:
# Display the merged frame; the notebook rendering is truncated to the first and last rows.
data

Unnamed: 0,sourceid,dstid,month,day,start_hour,end_hour,mean_travel_time,standard_deviation_travel_time,geometric_mean_travel_time,geometric_standard_deviation_travel_time,src_neigh_code,src_neigh_name,dst_neigh_code,dst_neigh_name,year,date,day_period,day_of_week,day_of_week_str
0,75,12,1,25,10,16,838.25,324.30,776.22,1.49,79123,San Fermín,79026,Palos de Moguer,2020,2020-01-25 10:00:00,10-16,5,Saturday
1,20,19,1,23,7,10,567.20,278.07,504.47,1.63,79041,Recoletos,79036,Niño Jesús,2020,2020-01-23 07:00:00,7-10,3,Thursday
2,52,9,2,8,0,7,470.88,268.89,412.66,1.64,79091,Casa de Campo,79023,La Chopera,2020,2020-02-08 00:00:00,0-7,5,Saturday
3,20,19,2,13,7,10,658.88,569.66,524.05,1.82,79041,Recoletos,79036,Niño Jesús,2020,2020-02-13 07:00:00,7-10,3,Thursday
4,14,25,1,26,10,16,519.17,240.26,478.95,1.46,79031,Pacífico,79046,Castellana,2020,2020-01-26 10:00:00,10-16,6,Sunday
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2285180,67,77,3,19,10,16,363.96,340.48,290.16,1.82,79112,Opañel,79125,Moscardó,2020,2020-03-19 10:00:00,10-16,3,Thursday
2285181,124,14,2,20,10,16,749.20,221.83,721.61,1.30,79206,Rejas,79031,Pacífico,2020,2020-02-20 10:00:00,10-16,3,Thursday
2285182,102,97,2,23,10,16,107.33,101.04,82.18,1.93,79162,Piovera,79156,San Juan Bautista,2020,2020-02-23 10:00:00,10-16,6,Sunday
2285183,106,57,3,13,10,16,920.80,258.01,886.86,1.31,79166,Valdefuentes,79096,El Plantío,2020,2020-03-13 10:00:00,10-16,4,Friday


In [12]:
# Sorted lists of the distinct source and destination neighbourhood names.
SOURCES = sorted(pd.unique(data["src_neigh_name"]))
DESTINATIONS = sorted(pd.unique(data["dst_neigh_name"]))

In [13]:
# Example source/destination pair used by the filtering cell below.
selected_source, selected_dest = "Adelfas", "Amposta"

In [14]:
# Keep only the rows for the chosen source/destination pair.
pair_mask = (data.src_neigh_name == selected_source) & (data.dst_neigh_name == selected_dest)
# Order chronologically with DataFrame.sort_values: the built-in sorted() iterates a
# DataFrame's column labels, so it would replace the frame with a list of column names.
selected_data = data[pair_mask].sort_values("date")

In [None]:
import zipfile
import geopandas
import pandas as pd
import plotly.express as px
import streamlit as st


@st.cache_data  # Streamlit re-runs the script on every widget change; cache the load.
def read_and_preprocess_data():
    """Load the Uber travel-time table and the neighbourhood lookup, and join them.

    Both inputs are read from ``data/uber-data.zip``: the CSV member with
    ``pd.read_csv`` and ``madrid_barrios.json`` with ``geopandas.read_file``.

    Every travel-time row gains the code and display name of its source and
    destination (``src_neigh_code``/``src_neigh_name`` and
    ``dst_neigh_code``/``dst_neigh_name``) through left merges on ``MOVEMENT_ID``,
    plus the derived columns ``year``, ``date``, ``day_period``, ``day_of_week``
    and ``day_of_week_str``.

    Returns
    -------
    tuple
        ``(data, codes)``: the enriched travel-time DataFrame and the frame
        returned by ``geopandas.read_file``, with ``GEOCODIGO``/``MOVEMENT_ID``
        cast to int and the first token removed from ``DISPLAY_NAME``.
    """
    # `archive` / `csv_file` avoid shadowing the built-in `zip` and the stdlib module name `csv`.
    with zipfile.ZipFile('data/uber-data.zip') as archive:
        with archive.open('madrid-barrios-2020-1-All-DatesByHourBucketsAggregate.csv') as csv_file:
            data = pd.read_csv(csv_file)
        with archive.open('madrid_barrios.json') as geojson:
            codes = geopandas.read_file(geojson, encoding="utf-8")

    # change data types in codes because they are not the same as in data
    codes['GEOCODIGO'] = codes['GEOCODIGO'].astype(int)
    codes['MOVEMENT_ID'] = codes['MOVEMENT_ID'].astype(int)

    # Drop the first whitespace-separated token of every display name
    # (presumably a numeric prefix -- confirm against the GeoJSON).
    codes["DISPLAY_NAME"] = codes["DISPLAY_NAME"].str.split().str[1:].str.join(" ")

    # Attach code and name for the source first, then for the destination.
    lookup = codes[["GEOCODIGO", "MOVEMENT_ID", "DISPLAY_NAME"]]
    for id_column, prefix in (("sourceid", "src"), ("dstid", "dst")):
        data = (
            data
            .merge(lookup, left_on=id_column, right_on="MOVEMENT_ID", how="left")
            .rename(columns={"GEOCODIGO": f"{prefix}_neigh_code", "DISPLAY_NAME": f"{prefix}_neigh_name"})
            .drop(columns=["MOVEMENT_ID"])
        )

    # Create a new date column
    data["year"] = "2020"  # kept as a string column, as callers may already rely on it
    # Assemble the timestamp from numeric components instead of concatenating unpadded
    # day/month digits: under "%d%m%Y" a string such as "1112020" is both 1 November
    # and 11 January, so two-digit months could be parsed as the wrong date.
    data["date"] = pd.to_datetime(
        data[["month", "day"]].assign(year=2020, hour=data["start_hour"])
    )

    # Create a new day_period column
    data["day_period"] = data.start_hour.astype(str) + "-" + data.end_hour.astype(str)
    data["day_of_week"] = data.date.dt.weekday
    data["day_of_week_str"] = data.date.dt.day_name()

    return data, codes


# Load the travel-time table (`data`, used by the charts) and the neighbourhood frame (`codes`, used by the map).
data, codes = read_and_preprocess_data()

# --- Page header ---
st.title("Uber Travel Data in Madrid")
for intro_line in (
    "This Streamlit app shows the average travel time between two neighborhoods in Madrid.",
    "Further it shows a map of the chosen destination and origin",
):
    st.text(intro_line)

# --- Sidebar: origin / destination pickers ---
sidebar = st.sidebar
sidebar.title("Filters")
sidebar.header("Select Origin and Destination")
sidebar.text("Please set the origin and destination to see the results")

# Alphabetically sorted, de-duplicated neighbourhood names for each selector.
origin_options = data["src_neigh_name"].sort_values().unique().tolist()
destination_options = data["dst_neigh_name"].sort_values().unique().tolist()

add_selectbox = sidebar.selectbox("Choose origin", origin_options)
add_selectbox2 = sidebar.selectbox("Choose destination", destination_options)

with st.spinner('Loading Data...'):
    # Filter the data
    data_filtered = data[(data.src_neigh_name == add_selectbox) & (data.dst_neigh_name == add_selectbox2)]

    if data_filtered.empty:
        # The two selectors are filled independently, so a chosen pair may have no rows.
        st.warning("No trips found between the selected origin and destination.")
    else:
        # group by date
        data_grouped = data_filtered.groupby("date").agg({"mean_travel_time": "mean"}).reset_index()

        # Seconds -> minutes. No int cast first: it would truncate the fractional
        # seconds before the division and raises on missing values.
        data_grouped["mean_travel_time"] = data_grouped["mean_travel_time"] / 60

        # plot
        fig1 = px.line(data_grouped, x="date", y="mean_travel_time", title="Average Travel Time per Day",
                       labels={"mean_travel_time": "Mean travel time (minutes)", "date": "Date"})
        st.plotly_chart(fig1, use_container_width=True)

with st.spinner("Loading Data..."):
    # Aggregate only the trips of the selected origin/destination pair: grouping the
    # full `data` frame would draw the same bars whatever the sidebar selection is.
    data_grouped = (
        data_filtered
        .groupby(["day_of_week_str", "day_period"])
        .agg({"mean_travel_time": "mean"})
        .reset_index()
    )
    # Seconds -> minutes, keeping the fractional part.
    data_grouped["mean_travel_time"] = data_grouped["mean_travel_time"] / 60

    # groupby sorts its keys alphabetically; show weekdays in calendar order and
    # day periods by their starting hour instead.
    weekday_order = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]
    period_order = data_filtered.sort_values("start_hour")["day_period"].unique().tolist()

    # A pair with no recorded trips yields an empty frame; skip the chart in that case.
    if not data_grouped.empty:
        fig2 = px.bar(data_grouped, x="day_of_week_str", y="mean_travel_time", color="day_period", facet_col="day_period",
                      category_orders={"day_of_week_str": weekday_order, "day_period": period_order},
                      title="Average Travel Time per Day Period and Day of week",
                      labels={"mean_travel_time": "Mean travel time (minutes)",
                              "day_period": "Day Period", "day_of_week_str": "Day of Week"})
        st.plotly_chart(fig2, use_container_width=True)

with st.spinner("Loading Data..."):
    # Tag every neighbourhood with its role so the map can colour by it. Without a
    # `color` argument all polygons share one colour and nothing is highlighted.
    # Origin is assigned last, so it wins when origin and destination coincide.
    role = pd.Series("Other", index=codes.index)
    role[codes.DISPLAY_NAME == add_selectbox2] = "Destination"
    role[codes.DISPLAY_NAME == add_selectbox] = "Origin"
    map_data = codes.assign(ROLE=role)

    # The two selected polygons are only used to centre the view.
    gdf = map_data[map_data.ROLE != "Other"]

    fig3 = px.choropleth_mapbox(map_data, geojson=map_data.geometry, locations=map_data.index,
                                color="ROLE", hover_data=['DISPLAY_NAME'],
                                color_discrete_map={"Origin": "#d62728", "Destination": "#1f77b4",
                                                    "Other": "#c7c7c7"},
                                opacity=0.6, mapbox_style='open-street-map', zoom=10,
                                center={'lat': gdf.geometry.centroid.y.mean(), 'lon': gdf.geometry.centroid.x.mean()})

    # Show the map
    st.header("Map of Madrid")
    st.plotly_chart(fig3, use_container_width=True)

In [None]:
def main():
    """Render the page: title, sidebar source/destination selectors and the travel-time line chart."""
    st.title("Welcome to my uber app")

    trips, _codes = utils.read_and_preprocess_data()

    # Sorted, de-duplicated neighbourhood names feed the two sidebar selectors.
    source_names = sorted(trips["src_neigh_name"].unique())
    destination_names = sorted(trips["dst_neigh_name"].unique())
    selected_source = st.sidebar.selectbox("Select the source", source_names)
    selected_dest = st.sidebar.selectbox("Select the destination", destination_names)

    # Rows for the chosen pair, in chronological order.
    is_selected_pair = (trips["src_neigh_name"] == selected_source) & (trips["dst_neigh_name"] == selected_dest)
    pair_trips = trips.loc[is_selected_pair].sort_values("date")

    chart_title = "Travel time from {} to {}".format(selected_source, selected_dest)
    fig1 = px.line(
        pair_trips,
        x="date",
        y="mean_travel_time",
        text="day_period",
        error_y="standard_deviation_travel_time",
        title=chart_title,
        template="none",
    )

    fig1.update_xaxes(title="Date")
    fig1.update_yaxes(title="Avg. travel time (seconds)")
    fig1.update_traces(
        mode="lines+markers",
        marker_size=10,
        line_width=3,
        error_y_color="gray",
        error_y_thickness=1,
        error_y_width=10,
    )

    st.plotly_chart(fig1, use_container_width=True)
    

# Build the page only when this code runs as the main module, not when it is imported.
if __name__ == "__main__":
    main()


### Travel by Time and Day Period

### Map between source and destination