In [1]:
from lib.data import raw

import pandas as pd
import numpy as np

import requests


In [2]:
import xmltodict
import datetime

def read_xml_from_url(xml_url: str, timeout: float = 30.0) -> pd.DataFrame:
    """Download a traffic-measurement XML feed and return it as a DataFrame.

    The document is expected to have a ``pms`` root holding a ``fecha_hora``
    timestamp (``dd/mm/YYYY HH:MM:SS``) and zero or more ``pm`` elements.
    Only ``pm`` elements whose ``error`` field equals ``'N'`` are kept.

    Args:
        xml_url: Address of the XML document.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A frame with the columns ``id``, ``intensity``, ``occupation``,
        ``congestion`` and ``datetime``. Every row carries the single
        ``fecha_hora`` timestamp of the feed. The frame is empty (but keeps
        its columns) when no ``pm`` element passes the filter.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    response = requests.get(xml_url, timeout=timeout)
    # Fail on an error status here instead of handing an error page to the XML parser.
    response.raise_for_status()

    xml_data = xmltodict.parse(response.content)

    columns = ['id', 'intensity', 'occupation', 'congestion', 'datetime']

    datet = datetime.datetime.strptime(xml_data['pms']['fecha_hora'], '%d/%m/%Y %H:%M:%S')

    # xmltodict yields a dict (not a list) when an element occurs exactly once
    # and omits the key when it never occurs; normalise both cases to a list.
    points = xml_data['pms'].get('pm') or []
    if isinstance(points, dict):
        points = [points]

    data = [
        [int(x['idelem']), int(x['intensidad']), int(x['ocupacion']), int(x['carga']), datet]
        for x in points
        if x['error'] == 'N'
    ]

    return pd.DataFrame(data, columns=columns)

# Address of the traffic-intensity XML feed passed to read_xml_from_url.
TRAFFIC_XML_URL = 'https://datos.madrid.es/egob/catalogo/202087-0-trafico-intensidad.xml'

# Fetch the current snapshot and show it as the cell output.
xml_data = read_xml_from_url(TRAFFIC_XML_URL)
xml_data

Unnamed: 0,id,intensity,occupation,congestion,datetime
0,3409,551,5,22,2022-11-01 14:30:04
1,4739,169,1,12,2022-11-01 14:30:04
2,4740,90,3,9,2022-11-01 14:30:04
3,4741,180,0,12,2022-11-01 14:30:04
4,4742,270,0,9,2022-11-01 14:30:04
...,...,...,...,...,...
4137,3817,540,5,39,2022-11-01 14:30:04
4138,10660,240,2,15,2022-11-01 14:30:04
4139,10662,600,4,36,2022-11-01 14:30:04
4140,10661,1200,11,70,2022-11-01 14:30:04


In [3]:
# Load the 2022 measurement-point table through the project loader; next()
# takes only the first frame the loader yields. The raw frame keeps its own
# name so the filter below can be re-run without reloading.
stations_raw = next(raw.load_dataset(raw.TRAFFIC_STATIONS, 'csv', 2022, verbose=True))

# Keep the id and coordinates of rows whose `tipo_elem` is 'URB'
# (presumably the urban measurement points — TODO confirm).
# A single .loc selects rows and columns in one step, instead of chaining
# df[cols][mask] with a mask built from the un-projected frame.
traffic_stations = stations_raw.loc[stations_raw['tipo_elem'] == 'URB', ['id', 'longitud', 'latitud']]

Downloading trafico_ubicacion_de_los_puntos_de_medida_del_trafico dataset...
  Found 48 files
    Dataframe size: (4663, 9)


In [4]:
# Display the station table (id, longitud, latitud) as the cell output.
traffic_stations

Unnamed: 0,id,longitud,latitud
0,3840,-3.688323,40.430502
1,3841,-3.687256,40.430524
2,3842,-3.691727,40.422132
3,3843,-3.691929,40.421433
4,3844,-3.688470,40.433782
...,...,...,...
4362,3577,-3.775800,40.399330
4363,5167,-3.775626,40.399044
4364,5164,-3.774217,40.396418
4365,5177,-3.772306,40.394035


In [5]:
# Scratch note: a bare list literal, so the cell simply echoes it back as output.
# Presumably the feature columns of interest for the model — TODO confirm or delete.
['week', 'intensity', 'occupation', 'congestion', 'is_holiday', 'daypart']

['week', 'intensity', 'occupation', 'congestion', 'is_holiday', 'daypart']

In [6]:
from lib.data import clean
import numpy as np

# Attach station coordinates to every measurement; the station id is only
# needed as the join key and is dropped afterwards.
prepared_data = xml_data.merge(traffic_stations, on='id', how='left').drop(columns=['id'])

# Measurements whose station is missing from `traffic_stations` come back with
# NaN coordinates and are discarded. The explicit copy makes the frame
# independent, so the column assignments below cannot trigger
# SettingWithCopyWarning.
prepared_data = prepared_data[prepared_data['longitud'].notna()].copy()

prepared_data['week'] = prepared_data['datetime'].dt.isocalendar().week

# Hours of the day grouped by daypart code.
hours = {
    # daytime
    0: [7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18],
    # night
    1: [23, 0, 1, 2, 3, 4, 5, 6],
    # evening
    2: [19, 20, 21, 22]
}

# Inverted lookup: hour of day -> daypart code.
hour_to_daypart = {hour: daypart for daypart, hour_list in hours.items() for hour in hour_list}


def set_daypart(hour):
    """Return the daypart code (0: daytime, 1: night, 2: evening) for an hour of the day."""
    return hour_to_daypart[hour]


prepared_data['year'] = prepared_data['datetime'].dt.year
prepared_data['month'] = prepared_data['datetime'].dt.month
prepared_data['day'] = prepared_data['datetime'].dt.day
prepared_data['hour'] = prepared_data['datetime'].dt.hour

prepared_data = prepared_data.drop(columns=['datetime'])

# Series.map with the lookup dict replaces np.vectorize over a Python lambda:
# it is a plain pandas operation and, unlike np.vectorize without `otypes`,
# does not raise when the frame is empty.
prepared_data['daypart'] = prepared_data['hour'].map(hour_to_daypart)

# Project helper; judging by the displayed result it adds an `is_holiday` column.
prepared_data = clean.add_holiday_days(prepared_data)

prepared_data

Unnamed: 0,intensity,occupation,congestion,longitud,latitud,is_holiday,week,year,month,day,hour,daypart
0,551,5,22,-3.754152,40.401447,1,44,2022,11,1,14,0
1,169,1,12,-3.753719,40.399708,1,44,2022,11,1,14,0
2,90,3,9,-3.745582,40.400560,1,44,2022,11,1,14,0
3,180,0,12,-3.743635,40.400829,1,44,2022,11,1,14,0
4,270,0,9,-3.744638,40.399902,1,44,2022,11,1,14,0
...,...,...,...,...,...,...,...,...,...,...,...,...
3851,360,3,15,-3.716235,40.470204,1,44,2022,11,1,14,0
3852,240,3,9,-3.714517,40.470489,1,44,2022,11,1,14,0
3853,360,2,12,-3.714962,40.472434,1,44,2022,11,1,14,0
3854,120,2,9,-3.715607,40.471395,1,44,2022,11,1,14,0


In [38]:
from importlib import reload

# Re-import lib.data.raw so that edits made to the module on disk are picked
# up without restarting the kernel.
reload(raw)

# Fetch the real-time traffic dataset through the project loader; next() takes
# only the first frame the loader yields. This rebinds `xml_data`, replacing
# the frame built by read_xml_from_url earlier in the notebook.
xml_data = next(raw.load_dataset('Tráfico. Datos del tráfico en tiempo real', 'xml', 2022, verbose=True))

xml_data

Downloading trafico_datos_del_trafico_en_tiempo_real dataset...
  Found 1 files
    Dataframe size: (4113, 5)


Unnamed: 0,id,intensity,occupation,congestion,datetime
0,3409,45,0,2,2022-11-01 17:45:05
1,4739,0,0,0,2022-11-01 17:45:05
2,4740,270,6,23,2022-11-01 17:45:05
3,4741,90,0,6,2022-11-01 17:45:05
4,4742,225,2,9,2022-11-01 17:45:05
...,...,...,...,...,...
4108,3817,360,3,26,2022-11-01 17:45:05
4109,10660,240,1,14,2022-11-01 17:45:05
4110,10662,60,0,3,2022-11-01 17:45:05
4111,10661,900,10,55,2022-11-01 17:45:05


In [11]:
import lib.app as app

# Build the model-input frame with the packaged helper; this rebinds
# `prepared_data`. Judging by the saved output it yields the same columns as
# the inline preparation cell earlier in the notebook.
# NOTE(review): the saved output shows hour=14, i.e. the 14:30 snapshot, not
# the 17:45 one loaded in the preceding cell — the cells appear to have been
# executed out of order; re-run top to bottom to confirm.
prepared_data = app.prepare_data(xml_data)
prepared_data

Unnamed: 0,intensity,occupation,congestion,longitud,latitud,is_holiday,week,year,month,day,hour,daypart
0,551,5,22,-3.754152,40.401447,1,44,2022,11,1,14,0
1,169,1,12,-3.753719,40.399708,1,44,2022,11,1,14,0
2,90,3,9,-3.745582,40.400560,1,44,2022,11,1,14,0
3,180,0,12,-3.743635,40.400829,1,44,2022,11,1,14,0
4,270,0,9,-3.744638,40.399902,1,44,2022,11,1,14,0
...,...,...,...,...,...,...,...,...,...,...,...,...
3851,360,3,15,-3.716235,40.470204,1,44,2022,11,1,14,0
3852,240,3,9,-3.714517,40.470489,1,44,2022,11,1,14,0
3853,360,2,12,-3.714962,40.472434,1,44,2022,11,1,14,0
3854,120,2,9,-3.715607,40.471395,1,44,2022,11,1,14,0


In [8]:
# Write the station table to CSV without the index column.
# NOTE(review): the relative path assumes the kernel's working directory
# contains a `data/` folder; the file is presumably read back by lib.app —
# confirm.
traffic_stations.to_csv('data/traffic_stations.csv', index=False)

In [19]:
from importlib import reload

# Re-import lib.app so that edits made to the module on disk are picked up
# without restarting the kernel.
reload(app)

# Run the model on the prepared frame and display the result. The saved output
# has one row per input row, with longitud, latitud, avg_noise, p10 and p90.
# NOTE(review): in the saved output p10 > avg_noise > p90, which looks like
# acoustic exceedance levels rather than ordinary percentiles — confirm.
app.get_model_prediction(prepared_data)

Unnamed: 0,longitud,latitud,avg_noise,p10,p90
0,-3.754152,40.401447,64.311729,67.488988,57.643055
1,-3.753719,40.399708,59.679416,62.862142,49.524812
2,-3.745582,40.400560,57.698979,60.702351,46.393349
3,-3.743635,40.400829,59.939120,63.310793,49.691294
4,-3.744638,40.399902,60.655212,64.070581,50.873679
...,...,...,...,...,...
3851,-3.716235,40.470204,61.909601,65.039200,53.703113
3852,-3.714517,40.470489,59.858061,62.830876,50.114417
3853,-3.714962,40.472434,61.924918,65.172354,53.478170
3854,-3.715607,40.471395,58.198526,61.138703,47.174087
