<table border="0" style="width:100%">
 <tr>
    <td>
        <img src="https://static-frm.ie.edu/university/wp-content/uploads/sites/6/2022/06/IE-University-logo.png" width=150>
     </td>
    <td><div style="font-family:'Courier New'">
            <div style="font-size:25px">
                <div style="text-align: right"> 
                    <b> MASTER IN BIG DATA</b>
                    <br>
                    Python for Data Analysis II
                    <br><br>
                    <em> Daniel Sierra Ramos </em>
                </div>
            </div>
        </div>
    </td>
 </tr>
</table>

In [22]:
import json
import zipfile
import geopandas

import numpy as np
import pandas as pd

import plotly.express as px

## **Streamlit** App with Uber data

Build a Streamlit app that displays several charts based on public Uber data. The data contains the average travel time of Uber rides between all neighborhoods in the city of Madrid.

The app must have the following characteristics
 - Main frame must show 3 charts given a `SOURCE` and `DESTINATION`:
    1. Time series of the average travel time
    2. Bar chart of the average travel time for every weekday and period of the day
    3. **(Optional)** Choropleth map of Madrid city, with the source and destination highlighted
 - A sidebar containing two select boxes to choose the `SOURCE` and `DESTINATION`
 - Every time a source or a destination is selected, the charts of the main frame should be updated accordingly.

 NOTE: I recommend building the charts here in the notebook first, and then copying the code to the Streamlit app.

### Load data

Use the following function to load the data. For the first two figures you will need just the `data` variable. For the choropleth map you will need the `codes` variable and the `geopandas` library to work with maps (this is optional). 

In [23]:
def read_and_preprocess_data(zip_path='uber-data.zip'):
    """Load the Uber travel-time table and the Madrid neighbourhood shapes.

    Two members are read from the zip archive: the hour-bucket travel-time
    CSV and the neighbourhood GeoJSON. The travel-time rows are then enriched
    with the neighbourhood code and name of both trip ends, a timestamp and a
    few calendar helper columns.

    Parameters
    ----------
    zip_path : str or path-like, default 'uber-data.zip'
        Location of the archive containing both input files.

    Returns
    -------
    data : pandas.DataFrame
        The CSV rows plus ``src_neigh_code``, ``src_neigh_name``,
        ``dst_neigh_code``, ``dst_neigh_name``, ``year``, ``date``,
        ``day_period``, ``day_of_week`` and ``day_of_week_str``.
    codes : object returned by ``geopandas.read_file``
        One row per neighbourhood, with ``GEOCODIGO`` and ``MOVEMENT_ID``
        cast to ``int`` and the first whitespace-separated token removed
        from ``DISPLAY_NAME``.
    """
    # The member files are the 2020 extract, so the year is fixed here.
    year = 2020

    # Named `archive` / `csv_file` so the `zip` builtin is not shadowed.
    with zipfile.ZipFile(zip_path) as archive:
        with archive.open('madrid-barrios-2020-1-All-DatesByHourBucketsAggregate.csv') as csv_file:
            data = pd.read_csv(csv_file)
        with archive.open('madrid_barrios.json') as geojson:
            codes = geopandas.read_file(geojson, encoding="utf-8")

    # change data types in codes because they are not the same as in data
    codes['GEOCODIGO'] = codes['GEOCODIGO'].astype(int)
    codes['MOVEMENT_ID'] = codes['MOVEMENT_ID'].astype(int)

    # Drop the first token of the display name and keep the rest.
    codes["DISPLAY_NAME"] = codes["DISPLAY_NAME"].str.split().str[1:].str.join(" ")

    # Attach neighbourhood code and name for both trip ends. Source first,
    # then destination, so the resulting column order is unchanged.
    lookup = codes[["GEOCODIGO", "MOVEMENT_ID", "DISPLAY_NAME"]]
    for id_column, prefix in (("sourceid", "src"), ("dstid", "dst")):
        data = (
            data
            .merge(lookup, left_on=id_column, right_on="MOVEMENT_ID", how="left")
            .rename(columns={
                "GEOCODIGO": f"{prefix}_neigh_code",
                "DISPLAY_NAME": f"{prefix}_neigh_name",
            })
            .drop(columns=["MOVEMENT_ID"])
        )

    # Create a new date column.
    # Built from numeric components rather than by concatenating un-padded
    # day/month strings: "1" + "11" and "11" + "1" give the same text, so the
    # string form cannot be parsed unambiguously in general.
    data["year"] = str(year)
    data["date"] = pd.to_datetime(
        pd.DataFrame({
            "year": year,
            "month": data["month"],
            "day": data["day"],
            "hour": data["start_hour"],
        })
    )

    # Create a new day_period column, e.g. "7-10", plus weekday helpers.
    data["day_period"] = data["start_hour"].astype(str) + "-" + data["end_hour"].astype(str)
    data["day_of_week"] = data["date"].dt.weekday
    data["day_of_week_str"] = data["date"].dt.day_name()

    return data, codes

In [24]:
# Load both tables once; the cells below reuse `data` and `codes`.
data, codes = read_and_preprocess_data()

In [25]:
# Preview the merged travel-time table (pandas truncates the middle rows).
data

Unnamed: 0,sourceid,dstid,month,day,start_hour,end_hour,mean_travel_time,standard_deviation_travel_time,geometric_mean_travel_time,geometric_standard_deviation_travel_time,src_neigh_code,src_neigh_name,dst_neigh_code,dst_neigh_name,year,date,day_period,day_of_week,day_of_week_str
0,75,12,1,25,10,16,838.25,324.30,776.22,1.49,79123,San Fermín,79026,Palos de Moguer,2020,2020-01-25 10:00:00,10-16,5,Saturday
1,20,19,1,23,7,10,567.20,278.07,504.47,1.63,79041,Recoletos,79036,Niño Jesús,2020,2020-01-23 07:00:00,7-10,3,Thursday
2,52,9,2,8,0,7,470.88,268.89,412.66,1.64,79091,Casa de Campo,79023,La Chopera,2020,2020-02-08 00:00:00,0-7,5,Saturday
3,20,19,2,13,7,10,658.88,569.66,524.05,1.82,79041,Recoletos,79036,Niño Jesús,2020,2020-02-13 07:00:00,7-10,3,Thursday
4,14,25,1,26,10,16,519.17,240.26,478.95,1.46,79031,Pacífico,79046,Castellana,2020,2020-01-26 10:00:00,10-16,6,Sunday
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2285180,67,77,3,19,10,16,363.96,340.48,290.16,1.82,79112,Opañel,79125,Moscardó,2020,2020-03-19 10:00:00,10-16,3,Thursday
2285181,124,14,2,20,10,16,749.20,221.83,721.61,1.30,79206,Rejas,79031,Pacífico,2020,2020-02-20 10:00:00,10-16,3,Thursday
2285182,102,97,2,23,10,16,107.33,101.04,82.18,1.93,79162,Piovera,79156,San Juan Bautista,2020,2020-02-23 10:00:00,10-16,6,Sunday
2285183,106,57,3,13,10,16,920.80,258.01,886.86,1.31,79166,Valdefuentes,79096,El Plantío,2020,2020-03-13 10:00:00,10-16,4,Friday


In [26]:
# Number of rows per timestamp, most frequent first.
data.date.value_counts()

2020-01-01 00:00:00    11797
2020-03-08 00:00:00     9730
2020-03-01 00:00:00     9660
2020-02-23 00:00:00     9328
2020-02-29 00:00:00     9209
                       ...  
2020-03-25 00:00:00      794
2020-03-23 00:00:00      732
2020-03-27 00:00:00      723
2020-03-28 07:00:00      717
2020-03-24 00:00:00      598
Name: date, Length: 455, dtype: int64

In [27]:
# Neighbourhood lookup table: codes, display names and polygon geometry.
codes

Unnamed: 0,CODBDT,GEOCODIGO,MOVEMENT_ID,DISPLAY_NAME,geometry
0,860645,79011,1,Palacio,"POLYGON ((-3.70939 40.42246, -3.70924 40.42237..."
1,860646,79012,2,Embajadores,"POLYGON ((-3.70284 40.41392, -3.70271 40.41388..."
2,860647,79013,3,Cortes,"POLYGON ((-3.69696 40.41895, -3.69652 40.41872..."
3,860648,79014,4,Justicia,"POLYGON ((-3.69649 40.42787, -3.69644 40.42787..."
4,860649,79015,5,Universidad,"POLYGON ((-3.71224 40.43023, -3.71214 40.43022..."
...,...,...,...,...,...
126,860771,79211,127,Alameda de Osuna,"POLYGON ((-3.58142 40.46274, -3.58097 40.46232..."
127,860772,79212,128,Aeropuerto,"POLYGON ((-3.54210 40.49437, -3.54197 40.49428..."
128,860773,79213,129,Casco Histórico de Barajas,"POLYGON ((-3.57854 40.47920, -3.57794 40.47851..."
129,860774,79214,130,Timón,"POLYGON ((-3.56175 40.51116, -3.56167 40.51069..."


### Travel by Time and Day

In [31]:
# Example route for prototyping; in the Streamlit app these two values come
# from the sidebar select boxes.
SOURCE = "Recoletos"
DESTINATION = "Niño Jesús"

# Keep `data` in chronological order for the cells below. Rebinding the name
# (instead of `inplace=True`) makes the cell a plain, re-runnable assignment.
data = data.sort_values(by="date")

# Restrict to a single source -> destination pair before plotting. Drawing
# every route at once puts thousands of y-values on each timestamp, which
# gives an unreadable zig-zag and a very heavy figure.
route = data[
    (data["src_neigh_name"] == SOURCE) & (data["dst_neigh_name"] == DESTINATION)
]

fig = px.line(
    route,
    x="date",
    y="mean_travel_time",
    title=f"Average travel time: {SOURCE} → {DESTINATION}",
    labels={"date": "Date", "mean_travel_time": "Mean travel time"},
)

In [41]:
# First 50 rows of the table after sorting by date.
data.head(50)

Unnamed: 0,sourceid,dstid,month,day,start_hour,end_hour,mean_travel_time,standard_deviation_travel_time,geometric_mean_travel_time,geometric_standard_deviation_travel_time,src_neigh_code,src_neigh_name,dst_neigh_code,dst_neigh_name,year,date,day_period,day_of_week,day_of_week_str
339644,71,13,1,1,0,7,1094.62,490.03,986.01,1.59,79116,Buenavista,79027,Atocha,2020,2020-01-01,0-7,2,Wednesday
1872398,54,37,1,1,0,7,561.83,372.27,448.52,2.01,79093,Ciudad Universitaria,79066,Berruguete,2020,2020-01-01,0-7,2,Wednesday
352054,100,10,1,1,0,7,977.88,543.51,858.77,1.64,79159,Costillares,79024,Legazpi,2020,2020-01-01,0-7,2,Wednesday
1841841,32,26,1,1,0,7,530.04,348.95,418.65,2.16,79061,Bellas Vistas,79051,El Viso,2020,2020-01-01,0-7,2,Wednesday
1841842,30,46,1,1,0,7,696.1,409.41,594.05,1.77,79055,Nueva España,79083,Peña Grande,2020,2020-01-01,0-7,2,Wednesday
741590,24,6,1,1,0,7,1061.35,440.38,972.47,1.53,79045,Lista,79016,Sol,2020,2020-01-01,0-7,2,Wednesday
1841843,31,36,1,1,0,7,507.46,393.56,387.61,2.11,79056,Castilla,79065,Valdeacederas,2020,2020-01-01,0-7,2,Wednesday
1352305,4,70,1,1,0,7,1169.32,586.21,1074.46,1.46,79014,Justicia,79115,Puerta Bonita,2020,2020-01-01,0-7,2,Wednesday
1339326,4,65,1,1,0,7,1555.4,228.12,1538.58,1.16,79014,Justicia,79107,Las Águilas,2020,2020-01-01,0-7,2,Wednesday
1841848,33,16,1,1,0,7,1136.29,656.4,998.47,1.65,79062,Cuatro Caminos,79033,Estrella,2020,2020-01-01,0-7,2,Wednesday


### Bar chart of the average travel time for every weekday, and period of the day


In [39]:
# Average travel time per weekday and period of the day.
# px.bar draws one mark per input row and stacks them, so feeding it raw rows
# shows sums that depend on how many rows land in each bucket. Aggregate to a
# single mean per (weekday, period) first. This averages over every route in
# `data`; filter `data` by source/destination beforehand for a per-route chart.
weekday_period_avg = (
    data
    # `day_of_week` and `start_hour` are grouping keys only so the result is
    # ordered Monday..Sunday and by starting hour (groupby sorts its keys).
    .groupby(
        ["day_of_week", "day_of_week_str", "start_hour", "day_period"],
        as_index=False,
    )["mean_travel_time"]
    .mean()
)

fig2 = px.bar(
    weekday_period_avg,
    x="day_of_week_str",
    y="mean_travel_time",
    color="day_period",
    barmode="group",
    title="Average travel time by weekday and period of the day",
    labels={
        "day_of_week_str": "Weekday",
        "day_period": "Period of the day",
        "mean_travel_time": "Mean travel time",
    },
)

In [37]:
# Render the bar chart built in the previous cell.
fig2.show()