In [1]:
import uuid
import pandas as pd
from datetime import datetime
import mysql.connector

Utilizamos una clase contenida en un archivo python independiente (es más cómodo trabajar así), que hemos incluido más abajo

In [ ]:
class MySQLConnector:
    def __init__(self, host, user, password, database=None):
        self.host = host
        self.user = user
        self.password = password
        self.database = database
        self.connection = None
        self.cursor = None

    def connect(self):
        try:
            self.connection = mysql.connector.connect(
                host=self.host,
                user=self.user,
                password=self.password,
                database=self.database
            )
            self.cursor = self.connection.cursor()
            print("Connected to MySQL database.")
        except mysql.connector.Error as err:
            print("Error: ", err)

    def execute_query(self, sql_query):
        try:
            self.cursor.execute(sql_query)
            self.connection.commit()
            print("Query executed successfully.")
        except mysql.connector.Error as err:
            print("Error: ", err)

    def fetch_data(self, sql_query):
        self.cursor.execute(sql_query)
        return self.cursor.fetchall()

    def fetch_data_as_df(self, sql_query):
        # Ejecutar la consulta
        self.cursor.execute(sql_query)
        result = self.cursor.fetchall()

        # Obtener los nombres de las columnas de los resultados
        col_names = [desc[0] for desc in self.cursor.description]

        # Convertir los resultados en un DataFrame de Pandas
        df = pd.DataFrame(result, columns=col_names)
        return df

    def close(self):
        if self.cursor:
            self.cursor.close()
        if self.connection:
            self.connection.close()
            print("Connection closed.")


In [2]:
df_accidentes = pd.read_csv("Datasets/Data_Combinada.csv", low_memory=False)

In [2]:
connector = MySQLConnector('localhost', 'root', 'root', 'TNT')
connector.connect()

Connected to MySQL database.


In [3]:
df_accidentes.head(10)

Unnamed: 0,num_expediente,fecha,hora,localizacion,numero,cod_distrito,distrito,tipo_accidente,estado_meteorológico,tipo_vehiculo,...,rango_edad,sexo,cod_lesividad,lesividad,coordenada_x_utm,coordenada_y_utm,positiva_alcohol,positiva_droga,numero_pasajeros,fugado
0,2022S000001,01/01/2022,1:30:00,"AVDA. ALBUFERA, 19",19,13.0,PUENTE DE VALLECAS,Alcance,Despejado,Turismo,...,De 30 a 34 años,M,,,443359226,4472082272,0.0,0.0,2,No
1,2022S000001,01/01/2022,1:30:00,"AVDA. ALBUFERA, 19",19,13.0,PUENTE DE VALLECAS,Alcance,Despejado,Turismo,...,De 45 a 49 años,H,,,443359226,4472082272,0.0,0.0,2,No
2,2022S000002,01/01/2022,0:30:00,PLAZA. CANOVAS DEL CASTILLO / PASEO. PRADO,2,3.0,RETIRO,Colisión fronto-lateral,,Motocicleta hasta 125cc,...,De 30 a 34 años,H,,,441155351,4474129588,1.0,0.0,3,No
3,2022S000002,01/01/2022,0:30:00,PLAZA. CANOVAS DEL CASTILLO / PASEO. PRADO,2,3.0,RETIRO,Colisión fronto-lateral,,Motocicleta hasta 125cc,...,De 35 a 39 años,M,,,441155351,4474129588,0.0,0.0,3,No
4,2022S000002,01/01/2022,0:30:00,PLAZA. CANOVAS DEL CASTILLO / PASEO. PRADO,2,3.0,RETIRO,Colisión fronto-lateral,,Turismo,...,De 40 a 44 años,H,,,441155351,4474129588,0.0,0.0,3,No
5,2022S000003,01/01/2022,1:50:00,"CALL. SAN BERNARDO, 53",53,1.0,CENTRO,Atropello a persona,Despejado,Motocicleta > 125cc,...,Desconocido,H,,,439995351,4475212523,0.0,0.0,2,No
6,2022S000003,01/01/2022,1:50:00,"CALL. SAN BERNARDO, 53",53,1.0,CENTRO,Atropello a persona,Despejado,Motocicleta > 125cc,...,De 18 a 20 años,M,7.0,Asistencia sanitaria sólo en el lugar del acci...,439995351,4475212523,0.0,0.0,2,No
7,2022S000004,01/01/2022,3:40:00,CALL. ALCALA / PLAZA. EISENHOWER,728,20.0,SAN BLAS-CANILLEJAS,Choque contra obstáculo fijo,Despejado,Turismo,...,De 50 a 54 años,H,2.0,Ingreso inferior o igual a 24 horas,449693925,4477837552,0.0,0.0,2,No
8,2022S000004,01/01/2022,3:40:00,CALL. ALCALA / PLAZA. EISENHOWER,728,20.0,SAN BLAS-CANILLEJAS,Choque contra obstáculo fijo,Despejado,Turismo,...,De 55 a 59 años,M,3.0,Ingreso superior a 24 horas,449693925,4477837552,0.0,0.0,2,No
9,2022S000005,01/01/2022,4:30:00,"AUTOV. A-42, +00500E",+00500E,12.0,USERA,Choque contra obstáculo fijo,Despejado,Turismo,...,De 45 a 49 años,H,,,438952303,4469985021,1.0,0.0,2,No


# Esquema de BBDD

create table Distritos
(
    codigo_distrito int         not null
        primary key,
    nombre_distrito varchar(50) null
);

create table Fecha
(
    fecha   date not null,
    season  char null,
    weather char not null,
    primary key (fecha, weather)
);

create table Accidentes
(
    num_expediente   varchar(12)  not null
        primary key,
    fecha            date         null,
    hora             time         null,
    calle            varchar(255) null,
    numero           varchar(8)   null,
    codigo_distrito  int          null,
    tipo_accidente   varchar(50)  null,
    coordenada_x_utm double       null,
    coordenada_y_utm double       null,
    positivo_alcohol tinyint(1)   null,
    positivo_droga   tinyint(1)   null,
    fugado           tinyint(1)   null,
    constraint Accidentes_Fecha_fecha_fk
        foreign key (fecha) references Fecha (fecha),
    constraint Accidentes_ibfk_1
        foreign key (codigo_distrito) references Distritos (codigo_distrito)
);

create index codigo_distrito
    on Accidentes (codigo_distrito);

create table Luminarias
(
    uuid           char(36)     not null
        primary key,
    localizacion   varchar(255) null,
    cod_distrito   int          null,
    tipo           char         null,
    num_luminarias int          null,
    constraint Luminarias_ibfk_1
        foreign key (cod_distrito) references Distritos (codigo_distrito)
);

create index cod_distrito
    on Luminarias (cod_distrito);

create table PersonasInvolucradas
(
    uuid             char(36)    not null
        primary key,
    num_expediente   varchar(12) null,
    tipo_vehiculo    varchar(50) null,
    tipo_persona     varchar(50) null,
    rango_edad       varchar(50) null,
    sexo             char        null,
    codigo_lesividad int         null,
    lesividad        varchar(50) null,
    numero_pasajeros int         null,
    constraint PersonasInvolucradas_ibfk_1
        foreign key (num_expediente) references Accidentes (num_expediente)
);

create table RadaresFijos
(
    id                int auto_increment
        primary key,
    ubicacion         varchar(255) null,
    carretera         varchar(255) null,
    m30               varchar(255) null,
    punto_kilometrico varchar(255) null,
    sentido           varchar(255) null,
    tipo              varchar(255) null,
    longitud          float        null,
    latitud           float        null,
    coordenadas       varchar(255) null,
    cod_distrito      int          null,
    constraint RadaresFijos_Distritos_codigo_distrito_fk
        foreign key (cod_distrito) references Distritos (codigo_distrito)
);

create table Vehicles
(
    id                 char(36)    not null
        primary key,
    owner_type         char        null,
    district_code      int         null,
    vehicle_type       char(2)     null,
    sticker            char        null,
    fuel_type          varchar(30) null,
    year_matriculation int         null,
    num_ocurrence      int         null,
    constraint Vehicles_ibfk_1
        foreign key (district_code) references Distritos (codigo_distrito)
);

create index district_code
    on Vehicles (district_code);



# Codificación de los accidentes

Calculamos el máximo para dimensionar correctamente la base de datos

In [4]:
df_accidentes['num_expediente'].max()

'2023S029136'

Se rellenan algunos valores nulos y se formatea correctamente los datos previa inserción

In [11]:
df_accidentes = df_accidentes.sort_values(by=['fecha', 'hora']).ffill()
df_accidentes = df_accidentes.sort_values(by=['fecha', 'hora']).bfill()
df_accidentes['fugado'].replace({'No': 0, 'Si': 1}, inplace=True)
df_accidentes['numero'].fillna('S/N', inplace=True)
df_accidentes['tipo_vehiculo'].fillna('Desconocido', inplace=True)
df_accidentes['lesividad'].fillna('Sin asistencia - Desconocido', inplace=True)
df_accidentes['cod_lesividad'].fillna(0, inplace=True)

In [42]:
fechas = df_accidentes.drop_duplicates(subset=['fecha', 'estado_meteorológico'])

In [43]:
fechas = fechas.filter(items=['fecha', 'estado_meteorológico'])

In [45]:
fechas.reset_index().pop("index").head(0)

Series([], Name: index, dtype: int64)

In [58]:
fechas['estado_meteorológico'] = fechas['estado_meteorológico'].replace({'Despejado':'D', 'Nublado':'n','Lluvia débil':'l', 'LLuvia intensa':'L', 'Granizando':'G', 'Nevando':'N', 'Se desconoce':'S'})

In [74]:
fechas = fechas.drop_duplicates()

Metemos las fechas, que es una tabla que no tiene más FK referenciando a otras tablas. Para ellos, extraemos todas las fechas que existen en la base de datos

Verificamos si existen fechas con algún formato incorrecto

In [17]:
patron = r'\d{2}/\d{2}/\d{4}'

filas_con_formato_incorrecto = df_accidentes[~df_accidentes['fecha'].str.match(patron)]

In [18]:
filas_con_formato_incorrecto

Unnamed: 0,num_expediente,fecha,hora,localizacion,numero,cod_distrito,distrito,tipo_accidente,estado_meteorológico,tipo_vehiculo,...,rango_edad,sexo,cod_lesividad,lesividad,coordenada_x_utm,coordenada_y_utm,positiva_alcohol,positiva_droga,numero_pasajeros,fugado


In [76]:
for index, row in fechas.iterrows():
    fechas_query = f"""INSERT INTO Fecha(fecha, weather) VALUES ('{datetime.strptime(row[0], "%d/%m/%Y")}', '{row[1]}')"""
    print(fechas_query)
    connector.execute_query(fechas_query)

  fechas_query = f"""INSERT INTO Fecha(fecha, weather) VALUES ('{datetime.strptime(row[0], "%d/%m/%Y")}', '{row[1]}')"""


INSERT INTO Fecha(fecha, weather) VALUES ('2022-01-01 00:00:00', 'D')
Query executed successfully.
INSERT INTO Fecha(fecha, weather) VALUES ('2023-01-01 00:00:00', 'n')
Query executed successfully.
INSERT INTO Fecha(fecha, weather) VALUES ('2023-01-01 00:00:00', 'D')
Query executed successfully.
INSERT INTO Fecha(fecha, weather) VALUES ('2023-01-01 00:00:00', 'l')
Query executed successfully.
INSERT INTO Fecha(fecha, weather) VALUES ('2022-02-01 00:00:00', 'D')
Query executed successfully.
INSERT INTO Fecha(fecha, weather) VALUES ('2023-02-01 00:00:00', 'D')
Query executed successfully.
INSERT INTO Fecha(fecha, weather) VALUES ('2022-03-01 00:00:00', 'D')
Query executed successfully.
INSERT INTO Fecha(fecha, weather) VALUES ('2023-03-01 00:00:00', 'D')
Query executed successfully.
INSERT INTO Fecha(fecha, weather) VALUES ('2023-03-01 00:00:00', 'n')
Query executed successfully.
INSERT INTO Fecha(fecha, weather) VALUES ('2022-04-01 00:00:00', 'D')
Query executed successfully.
INSERT INT

Estraemos todos los distritos (únicos) y los insertamos usando su cod_distrito como PK

In [47]:
cod_distrito_unique = df_accidentes['cod_distrito'].unique()
distrito_unique = df_accidentes['distrito'].unique()
emparejados = {cod: dist for cod, dist in zip(cod_distrito_unique, distrito_unique)}

In [7]:
for cod, dist in emparejados.items():
    distrito_query = f"""
        INSERT INTO Distritos (codigo_distrito, nombre_distrito) VALUES ('{cod}', '{dist}')
        """

    connector.execute_query(distrito_query)

distrito_query = f"""
        INSERT INTO Distritos (codigo_distrito, nombre_distrito) VALUES ('0', 'DESCONOCIDO')
        """
connector.execute_query(distrito_query)

Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.


Formateamos correctamente la información de las localizacions para que no tenga carácteres prohibidos

In [15]:
df_accidentes['localizacion'] = df_accidentes['localizacion'].str.replace("/", '-').str.replace("'", "")
df_accidentes['coordenada_y_utm'] = df_accidentes['coordenada_y_utm'].str.replace(",", '.')
df_accidentes['coordenada_x_utm'] = df_accidentes['coordenada_x_utm'].str.replace(",", '.')

In [16]:
accidentes = df_accidentes.filter(
    items=['num_expediente', 'fecha', 'hora', 'localizacion', 'numero', 'cod_distrito', 'tipo_accidente',
           'estado_meteorológico', 'coordenada_x_utm', 'coordenada_y_utm', 'positiva_alcohol', 'positiva_droga',
           'fugado'])
accidentes.head()

Unnamed: 0,num_expediente,fecha,hora,localizacion,numero,cod_distrito,tipo_accidente,estado_meteorológico,coordenada_x_utm,coordenada_y_utm,positiva_alcohol,positiva_droga,fugado
13,2022S000008,01/01/2022,0:25:00,"PLAZA. ENCUENTRO, 2",2,14.0,Choque contra obstáculo fijo,Despejado,444759.786,4472970.631,0.0,0.0,0
2,2022S000002,01/01/2022,0:30:00,PLAZA. CANOVAS DEL CASTILLO - PASEO. PRADO,2,3.0,Colisión fronto-lateral,Despejado,441155.351,4474129.588,1.0,0.0,0
3,2022S000002,01/01/2022,0:30:00,PLAZA. CANOVAS DEL CASTILLO - PASEO. PRADO,2,3.0,Colisión fronto-lateral,Despejado,441155.351,4474129.588,0.0,0.0,0
4,2022S000002,01/01/2022,0:30:00,PLAZA. CANOVAS DEL CASTILLO - PASEO. PRADO,2,3.0,Colisión fronto-lateral,Despejado,441155.351,4474129.588,0.0,0.0,0
165,2022S000088,01/01/2022,11:45:00,AVDA. VALLADOLID - PLAZA. REPUBLICA DE CHILE,10,9.0,Colisión lateral,Despejado,437731.276,4476176.535,0.0,0.0,0


Codificamos el accidente como un suceso único, con varios implicados

In [19]:
accidentes = accidentes.drop_duplicates(subset=['num_expediente'])

In [20]:
i = 0
for index, row in accidentes.iterrows():
    i = i + 1
    try:
        sql = f"""
        INSERT INTO Accidentes (num_expediente, fecha, hora, calle, numero, codigo_distrito, tipo_accidente, estado_meteorologico, coordenada_x_utm, coordenada_y_utm, positivo_alcohol, positivo_droga, fugado)
        VALUES ('{row['num_expediente']}', '{datetime.strptime(row['fecha'], "%d/%m/%Y")}', '{datetime.strptime(row['hora'], "%H:%M:%S")}', '{row['localizacion']}', '{row['numero']}', {row['cod_distrito']}, '{row['tipo_accidente']}', '{row['estado_meteorológico']}', '{float(row['coordenada_x_utm'])}', '{float(row['coordenada_y_utm'])}', {row['positiva_alcohol']}, {row['positiva_droga']}, {row['fugado']});
    """
        print(f"Current row index: {index}, count: {i}")
        connector.execute_query(sql)
    except Exception as e:
        print(e)

Current row index: 13, count: 1
Query executed successfully.
Current row index: 2, count: 2
Query executed successfully.
Current row index: 165, count: 3
Query executed successfully.
Current row index: 74, count: 4
Query executed successfully.
Current row index: 70, count: 5
Query executed successfully.
Current row index: 72, count: 6
Query executed successfully.
Current row index: 33, count: 7
Query executed successfully.
Current row index: 37, count: 8
Query executed successfully.
Current row index: 45, count: 9
Query executed successfully.
Current row index: 43, count: 10
Query executed successfully.
Current row index: 41, count: 11
Query executed successfully.
Current row index: 48, count: 12
Query executed successfully.
Current row index: 16, count: 13
Query executed successfully.
Current row index: 51, count: 14
Query executed successfully.
Current row index: 49, count: 15
Query executed successfully.
Current row index: 0, count: 16
Query executed successfully.
Current row index:

Codificamos las personas implicadas en los accidentes y lo relacionamos con el suceso del accidente

In [63]:
personas = df_accidentes.filter(items=['num_expediente', 'tipo_vehiculo', 'tipo_persona', 'rango_edad', 'sexo', 'cod_lesividad', 'numero_pasajeros'])

In [65]:
personas.sort_values(by=['num_expediente'])

Unnamed: 0,num_expediente,tipo_vehiculo,tipo_persona,rango_edad,sexo,cod_lesividad,numero_pasajeros
0,2022S000001,Turismo,Conductor,De 30 a 34 años,M,1.0,2
1,2022S000001,Turismo,Conductor,De 45 a 49 años,H,1.0,2
2,2022S000002,Motocicleta hasta 125cc,Conductor,De 30 a 34 años,H,14.0,3
3,2022S000002,Motocicleta hasta 125cc,Pasajero,De 35 a 39 años,M,14.0,3
4,2022S000002,Turismo,Conductor,De 40 a 44 años,H,14.0,3
...,...,...,...,...,...,...,...
74954,2023S028337,Turismo,Conductor,De 25 a 29 años,H,14.0,3
74958,2023S028341,Turismo,Conductor,De 35 a 39 años,H,14.0,2
74957,2023S028341,Motocicleta hasta 125cc,Conductor,De 30 a 34 años,H,7.0,2
74959,2023S028352,Motocicleta hasta 125cc,Conductor,De 40 a 44 años,H,1.0,1


In [71]:
personas['num_ocurrence'] = 0
personas.groupby(['num_expediente', 'tipo_vehiculo','tipo_persona','rango_edad','sexo','cod_lesividad','numero_pasajeros']).count().reset_index()

Unnamed: 0,num_expediente,tipo_vehiculo,tipo_persona,rango_edad,sexo,cod_lesividad,numero_pasajeros,num_ocurrence
0,2022S000001,Turismo,Conductor,De 30 a 34 años,M,1.0,2,1
1,2022S000001,Turismo,Conductor,De 45 a 49 años,H,1.0,2,1
2,2022S000002,Motocicleta hasta 125cc,Conductor,De 30 a 34 años,H,14.0,3,1
3,2022S000002,Motocicleta hasta 125cc,Pasajero,De 35 a 39 años,M,14.0,3,1
4,2022S000002,Turismo,Conductor,De 40 a 44 años,H,14.0,3,1
...,...,...,...,...,...,...,...,...
74810,2023S028337,Turismo,Pasajero,De 55 a 59 años,M,14.0,3,1
74811,2023S028341,Motocicleta hasta 125cc,Conductor,De 30 a 34 años,H,7.0,2,1
74812,2023S028341,Turismo,Conductor,De 35 a 39 años,H,14.0,2,1
74813,2023S028352,Motocicleta hasta 125cc,Conductor,De 40 a 44 años,H,1.0,1,1


In [72]:
for index, row in personas.iterrows():
    personas_sql = f"""INSERT INTO PersonasInvolucradas(uuid, num_expediente, tipo_vehiculo,tipo_persona,rango_edad,sexo,codigo_lesividad,numero_pasajeros) values ('{uuid.uuid4()}','{row['num_expediente']}', '{row['tipo_vehiculo']}', '{row['tipo_persona']}', '{row['rango_edad']}', '{row['sexo']}', {row['cod_lesividad']}, {row['numero_pasajeros']})"""
    connector.execute_query(personas_sql)

Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed

# Codificación padrón de vehículos

In [114]:
# noinspection SpellCheckingInspection
df_padron = pd.read_csv("Datasets/padron22.csv", low_memory=False, encoding='UTF-8')

In [115]:
df_padron.head(20)

Unnamed: 0,Ï»¿EJERICIO,COD_TIPO_PERSONA,TIPO_PERSONA,COD_DISTRITO,DISTRITO,COD_BARRIO,BARRIO,COD_TIPO_VEHICULO,TIPO_VEHICULO,ETIQUETA_MEDIOAMBIENTAL,CLASIFICACIÃN_AMBIENTAL,CUOTA,TIPO_CARBURANTE,AÃO_MATRICULACIÃN,CONTADOR
0,2022,E,Ente sin Personalidad Art 35.4,0,--,0,--,CA,CAMION,--,Sin clasificaciÃ³n ambiental,73.0,Tipo carburante desconocido,2013,1
1,2022,E,Ente sin Personalidad Art 35.4,0,--,0,--,CA,CAMION,--,Sin clasificaciÃ³n ambiental,73.0,DIESEL,2006,1
2,2022,E,Ente sin Personalidad Art 35.4,0,--,0,--,CA,CAMION,--,Sin clasificaciÃ³n ambiental,73.0,DIESEL,2007,1
3,2022,E,Ente sin Personalidad Art 35.4,0,--,0,--,CA,CAMION,--,Sin clasificaciÃ³n ambiental,73.0,DIESEL,2009,1
4,2022,E,Ente sin Personalidad Art 35.4,0,--,0,--,CA,CAMION,--,Sin clasificaciÃ³n ambiental,73.0,DIESEL,2015,1
5,2022,E,Ente sin Personalidad Art 35.4,0,--,0,--,CA,CAMION,--,Sin clasificaciÃ³n ambiental,149.0,DIESEL,2010,1
6,2022,E,Ente sin Personalidad Art 35.4,0,--,0,--,CA,CAMION,B,Etiqueta B,73.0,DIESEL,2015,1
7,2022,E,Ente sin Personalidad Art 35.4,0,--,0,--,CA,CAMION,B,Etiqueta B,149.0,DIESEL,2014,1
8,2022,E,Ente sin Personalidad Art 35.4,0,--,0,--,CA,CAMION,C,Etiqueta C,73.0,DIESEL,2018,2
9,2022,E,Ente sin Personalidad Art 35.4,0,--,0,--,CA,CAMION,C,Etiqueta C,149.0,DIESEL,2019,3


Tiramos las columnas que no podemos utilizar para cruzar con accidentalidad. Así como los datos repetidos

In [116]:
df_padron.drop(columns=['Ï»¿EJERICIO', 'DISTRITO', 'BARRIO', 'COD_BARRIO', 'CONTADOR'], inplace=True)

Arreglamos los códigos para que tengan sentido

In [117]:
cod_veh = df_padron['COD_TIPO_VEHICULO'].unique()
tipo_veh = df_padron['TIPO_VEHICULO'].unique()

In [118]:
cod_veh

array(['CA', 'MT', 'RE', 'TR', 'TU', 'CI', 'AU', 'SE'], dtype=object)

In [119]:
tipo_veh

array(['CAMION', 'MOTOCICLETA', 'OTROS', 'TURISMO'], dtype=object)

In [120]:
for row in zip(cod_veh, tipo_veh):
    print(row)

('CA', 'CAMION')
('MT', 'MOTOCICLETA')
('RE', 'OTROS')
('TR', 'TURISMO')


In [121]:
df_padron['COD_TIPO_VEHICULO'] = df_padron['COD_TIPO_VEHICULO'].replace({'CA': 'C', 'MT': 'M', 'RE': 'O', 'TR': 'T', 'TU': 'T'})

In [122]:
cod_veh = df_padron['COD_TIPO_VEHICULO'].unique()
tipo_veh = df_padron['TIPO_VEHICULO'].unique()
for row in zip(cod_veh, tipo_veh):
    print(row)

('C', 'CAMION')
('M', 'MOTOCICLETA')
('O', 'OTROS')
('T', 'TURISMO')


In [123]:
df_padron.pop('TIPO_VEHICULO').head()

0    CAMION
1    CAMION
2    CAMION
3    CAMION
4    CAMION
Name: TIPO_VEHICULO, dtype: object

In [124]:
df_padron.head()

Unnamed: 0,COD_TIPO_PERSONA,TIPO_PERSONA,COD_DISTRITO,COD_TIPO_VEHICULO,ETIQUETA_MEDIOAMBIENTAL,CLASIFICACIÃN_AMBIENTAL,CUOTA,TIPO_CARBURANTE,AÃO_MATRICULACIÃN
0,E,Ente sin Personalidad Art 35.4,0,C,--,Sin clasificaciÃ³n ambiental,73.0,Tipo carburante desconocido,2013
1,E,Ente sin Personalidad Art 35.4,0,C,--,Sin clasificaciÃ³n ambiental,73.0,DIESEL,2006
2,E,Ente sin Personalidad Art 35.4,0,C,--,Sin clasificaciÃ³n ambiental,73.0,DIESEL,2007
3,E,Ente sin Personalidad Art 35.4,0,C,--,Sin clasificaciÃ³n ambiental,73.0,DIESEL,2009
4,E,Ente sin Personalidad Art 35.4,0,C,--,Sin clasificaciÃ³n ambiental,73.0,DIESEL,2015


Codificamos las etiquetas medioambientales, no tiene sentido guardar tanto el código como su descripción

In [125]:
cod_veh = df_padron['ETIQUETA_MEDIOAMBIENTAL'].unique()
tipo_veh = df_padron['CLASIFICACIÃ\x93N_AMBIENTAL'].unique()
for row in zip(cod_veh, tipo_veh):
    print(row)

('--', 'Sin clasificaciÃ³n ambiental')
('B', 'Etiqueta B')
('C', 'Etiqueta C')
(nan, 'Sin distintivo Ambiental')
('0', 'Cero Emisiones')
('E', 'ECO')


In [126]:
df_padron['ETIQUETA_MEDIOAMBIENTAL'] = df_padron['ETIQUETA_MEDIOAMBIENTAL'].replace({'--': 'A'})
df_padron['ETIQUETA_MEDIOAMBIENTAL'] = df_padron['ETIQUETA_MEDIOAMBIENTAL'].fillna('A')
df_padron.pop('CLASIFICACIÃ\x93N_AMBIENTAL').head()

0    Sin clasificaciÃ³n ambiental
1    Sin clasificaciÃ³n ambiental
2    Sin clasificaciÃ³n ambiental
3    Sin clasificaciÃ³n ambiental
4    Sin clasificaciÃ³n ambiental
Name: CLASIFICACIÃN_AMBIENTAL, dtype: object

Codificamos el tipo de persona

In [127]:
df_padron['COD_TIPO_PERSONA'].unique()

array(['E', 'F', 'J', 'O'], dtype=object)

In [128]:
df_padron['COD_TIPO_PERSONA'] = df_padron['COD_TIPO_PERSONA'].replace({'E': 'S'})

Solo existe una fila con la O y no tiene una clasificación clara así que lo he tirado

In [129]:
df_padron = df_padron.drop(df_padron[df_padron['COD_TIPO_PERSONA'] == 'O'].index)

In [130]:
df_padron.pop('TIPO_PERSONA').head()

0    Ente sin Personalidad Art 35.4
1    Ente sin Personalidad Art 35.4
2    Ente sin Personalidad Art 35.4
3    Ente sin Personalidad Art 35.4
4    Ente sin Personalidad Art 35.4
Name: TIPO_PERSONA, dtype: object

Vemos que existe un código distrito 0 que en la documentación aparece como desconocido. Por lo que se codifica como tal también la base de datos

In [131]:
df_padron[df_padron['COD_DISTRITO'] == 0]

Unnamed: 0,COD_TIPO_PERSONA,COD_DISTRITO,COD_TIPO_VEHICULO,ETIQUETA_MEDIOAMBIENTAL,CUOTA,TIPO_CARBURANTE,AÃO_MATRICULACIÃN
0,S,0,C,A,73.0,Tipo carburante desconocido,2013
1,S,0,C,A,73.0,DIESEL,2006
2,S,0,C,A,73.0,DIESEL,2007
3,S,0,C,A,73.0,DIESEL,2009
4,S,0,C,A,73.0,DIESEL,2015
...,...,...,...,...,...,...,...
156973,J,0,T,A,224.0,GASOLINA,1999
156974,J,0,T,A,224.0,GASOLINA,2005
156975,J,0,T,A,224.0,GASOLINA,2007
156976,J,0,T,A,224.0,GASOLINA,2018


Arreglamos la información del tipo de carburante

In [132]:
df_padron['TIPO_CARBURANTE'].unique()

array(['Tipo carburante desconocido', 'DIESEL', 'GASOLINA', 'OTROS',
       'HIBRIDO ENCHUFABLES PHEV', 'HÃ\x8dBRIDO',
       'GAS LICUADO DE PETROLEO', 'GAS NATURAL COMPRIMIDO',
       'ELÃ\x89CTRICO', 'ELECTRICOS RANGO EXTENDIDO',
       'GAS NATURAL LICUADO', 'ETANOL', 'BIOMETANO', 'BIODIESEL'],
      dtype=object)

In [133]:
df_padron['TIPO_CARBURANTE'] = df_padron['TIPO_CARBURANTE'].replace(
    {'HÃBRIDO': 'HIBRIDO', 'ELÃCTRICO': 'ELECTRICO', 'Tipo carburante desconocido': 'DESCONOCIDO'})

In [134]:
df_padron.rename(columns={'AÃ\x91O_MATRICULACIÃ\x93N': 'Y_MATRICULACION'}, inplace=True)

In [135]:
df_padron.head()

Unnamed: 0,COD_TIPO_PERSONA,COD_DISTRITO,COD_TIPO_VEHICULO,ETIQUETA_MEDIOAMBIENTAL,CUOTA,TIPO_CARBURANTE,Y_MATRICULACION
0,S,0,C,A,73.0,DESCONOCIDO,2013
1,S,0,C,A,73.0,DIESEL,2006
2,S,0,C,A,73.0,DIESEL,2007
3,S,0,C,A,73.0,DIESEL,2009
4,S,0,C,A,73.0,DIESEL,2015
...,...,...,...,...,...,...,...
180340,J,21,T,A,129.0,DIESEL,2001
180341,J,21,T,A,129.0,DIESEL,2003
180342,J,21,T,A,129.0,DIESEL,2005
180343,J,21,T,A,129.0,GASOLINA,2000


Vamos a clusterizar los datos ya que hay muchos repetidos

In [136]:
df_padron.pop('CUOTA').head()

0    73.0
1    73.0
2    73.0
3    73.0
4    73.0
Name: CUOTA, dtype: float64

In [137]:
df_padron['OCURRENCIAS'] = 0

In [138]:
vehicles = df_padron.groupby(
    ['COD_TIPO_PERSONA', 'COD_DISTRITO', 'COD_TIPO_VEHICULO', 'ETIQUETA_MEDIOAMBIENTAL', 'TIPO_CARBURANTE',
     'Y_MATRICULACION']).count()

In [139]:
vehicles.reset_index(inplace=True)

In [140]:
vehicles['Y_MATRICULACION'] = vehicles['Y_MATRICULACION'].replace({'Desconocido':-1})

In [141]:
vehicles.head()

Unnamed: 0,COD_TIPO_PERSONA,COD_DISTRITO,COD_TIPO_VEHICULO,ETIQUETA_MEDIOAMBIENTAL,TIPO_CARBURANTE,Y_MATRICULACION,OCURRENCIAS
0,F,0,AU,A,DESCONOCIDO,-1,2
1,F,0,AU,A,DIESEL,1993,1
2,F,0,AU,A,DIESEL,2005,1
3,F,0,AU,B,DIESEL,2008,1
4,F,0,AU,B,DIESEL,2009,1


Procedemos a hacer todas las inserciones

In [143]:
import uuid
i = 0
for index, row in vehicles.iterrows():
    i = i + 1
    try:
        id = uuid.uuid4()
        owner_type = row['COD_TIPO_PERSONA']
        district_code = row['COD_DISTRITO']
        vehicle_type = row['COD_TIPO_VEHICULO']
        sticker = row['ETIQUETA_MEDIOAMBIENTAL']
        fuel_type = row['TIPO_CARBURANTE']
        year_matriculation = row['Y_MATRICULACION']
        num_ocurrence = row['OCURRENCIAS']
    
        # Construir la sentencia SQL INSERT
        insert_query = f"INSERT INTO Vehicles (id, owner_type, district_code, vehicle_type, sticker, fuel_type, year_matriculation, num_ocurrence) VALUES ('{id}', '{owner_type}', {district_code}, '{vehicle_type}', '{sticker}', '{fuel_type}', {year_matriculation}, {num_ocurrence});"
        connector.execute_query(insert_query)
        
        print(f"Inserting row: {index}, item: {i}")
    except Exception as e:
        print(e)


Query executed successfully.
Inserting row: 0, item: 1
Query executed successfully.
Inserting row: 1, item: 2
Query executed successfully.
Inserting row: 2, item: 3
Query executed successfully.
Inserting row: 3, item: 4
Query executed successfully.
Inserting row: 4, item: 5
Query executed successfully.
Inserting row: 5, item: 6
Query executed successfully.
Inserting row: 6, item: 7
Query executed successfully.
Inserting row: 7, item: 8
Query executed successfully.
Inserting row: 8, item: 9
Query executed successfully.
Inserting row: 9, item: 10
Query executed successfully.
Inserting row: 10, item: 11
Query executed successfully.
Inserting row: 11, item: 12
Query executed successfully.
Inserting row: 12, item: 13
Query executed successfully.
Inserting row: 13, item: 14
Query executed successfully.
Inserting row: 14, item: 15
Query executed successfully.
Inserting row: 15, item: 16
Query executed successfully.
Inserting row: 16, item: 17
Query executed successfully.
Inserting row: 17, it

## Leemos el dataset de paradas de metro

In [2]:
df_stops = pd.read_csv('./Datasets/stopsmetro.csv')
df_stops.head()

Unnamed: 0,stop_id,stop_code,stop_name,stop_desc,stop_lat,stop_lon,zone_id,stop_url,location_type,parent_station,stop_timezone,wheelchair_boarding
0,par_4_284,284,HOSPITAL INFANTA SOFIA,Paseo Europa 11,40.55977,-3.61145,B1,http://www.crtm.es,0,,Europe/Madrid,2
1,par_4_279,279,LA MORALEJA,Avda de la Ermita 5,40.53196,-3.63556,B1,http://www.crtm.es,0,,Europe/Madrid,2
2,acc_4_279_684,279,Ascensor,Avda de la Ermita 5,40.53199,-3.63548,,http://www.crtm.es,2,est_4_279,,0
3,par_4_283,283,REYES CATOLICOS,Avda de la Plaza de Toros 7,40.55037,-3.6234,B1,http://www.crtm.es,0,,Europe/Madrid,2
4,par_4_280,280,MARQUES DE LA VALDAVIA,Calle del Marqués de la Valdavia 21,40.54102,-3.63738,B1,http://www.crtm.es,0,,Europe/Madrid,2


 nos quitamos las columnas no deseadas

In [3]:
df_stops.drop(columns=['stop_timezone', 'wheelchair_boarding', 'stop_url', ' location_type'], inplace=True)

KeyError: "['stop_timezone', 'wheelchair_boarding', 'stop_url'] not found in axis"

Como está dando un error extraño, inspeccionamos las columnas para saber cual es el problema

In [4]:
# Inspeccionar los nombres de las columnas para identificar espacios adicionales o caracteres inusuales
column_names = [col.strip() for col in df_stops.columns]

# Mostrar los nombres de las columnas después de eliminar los espacios adicionales
column_names

['stop_id',
 'stop_code',
 'stop_name',
 'stop_desc',
 'stop_lat',
 'stop_lon',
 'zone_id',
 'stop_url',
 'location_type',
 'parent_station',
 'stop_timezone',
 'wheelchair_boarding']

Sustituimos los nombres para que coincidan y nos quitamos las columnas no deseadas

In [5]:
# Eliminar específicamente las columnas 'stop_timezone' y 'wheelchair_boarding' después de ajustar los nombres de las columnas
df_stops.columns = column_names
df_stops = df_stops.drop(['stop_timezone', 'wheelchair_boarding', 'stop_url', 'location_type'], axis=1)

# Mostrar las primeras filas del DataFrame corregido para confirmar la eliminación de las columnas
df_stops.head()

Unnamed: 0,stop_id,stop_code,stop_name,stop_desc,stop_lat,stop_lon,zone_id,parent_station
0,par_4_284,284,HOSPITAL INFANTA SOFIA,Paseo Europa 11,40.55977,-3.61145,B1,
1,par_4_279,279,LA MORALEJA,Avda de la Ermita 5,40.53196,-3.63556,B1,
2,acc_4_279_684,279,Ascensor,Avda de la Ermita 5,40.53199,-3.63548,,est_4_279
3,par_4_283,283,REYES CATOLICOS,Avda de la Plaza de Toros 7,40.55037,-3.6234,B1,
4,par_4_280,280,MARQUES DE LA VALDAVIA,Calle del Marqués de la Valdavia 21,40.54102,-3.63738,B1,


Vamos a realizar la conexion a la BBDD y ejecutar el query para montar la estructura relacional de la misma con tablas dcon la siguiente información:
1. **Tabla Stations**
Columnas:
station_id (VARCHAR, PRIMARY KEY): Basado en stop_id.
station_code (INT): Basado en stop_code.
name (VARCHAR): Basado en stop_name.
description (VARCHAR): Basado en stop_desc.
latitude (FLOAT): Basado en stop_lat.
longitude (FLOAT): Basado en stop_lon.
zone_id (VARCHAR): Basado en zone_id.

2. **Tabla LocationTypes**
Columnas:
location_type_id (INT, PRIMARY KEY)
description (VARCHAR)

3. **Tabla StationLocationTypes**
Columnas:
station_id (VARCHAR): Clave foránea de Stations.
location_type_id (INT): Clave foránea de LocationTypes.
4. **Tabla ParentStations**
Columnas:
parent_station_id (VARCHAR, PRIMARY KEY)
station_id (VARCHAR): Clave foránea de Stations.

In [34]:
# Lista de consultas SQL para crear las tablas
queries = [
    """
    CREATE TABLE `Parent_Stations` (
    `station_id` VARCHAR(255) PRIMARY KEY,
    `name` VARCHAR(255),
    `description` VARCHAR(255),
    `latitude` DOUBLE,
    `longitude` DOUBLE,
    `zone_id` VARCHAR(255)
);

"""
    ,
    """
    CREATE TABLE `Stops` (
    `stop_id` VARCHAR(255) PRIMARY KEY,
    `stop_code` INT,
    `name` VARCHAR(255),
    `description` VARCHAR(255),
    `latitude` DOUBLE,
    `longitude` DOUBLE,
    `zone_id` VARCHAR(255),
    `parent_station_id` VARCHAR(255),
    FOREIGN KEY (`parent_station_id`) REFERENCES `Parent_Stations`(`station_id`)
);
    """
]

# Ejecutar cada consulta para crear las tablas
for query in queries:
    connector.execute_query(query)

Connected to MySQL database.
Query executed successfully.
Query executed successfully.


Extraemos valores únicos de parent_station (excluyendo valores nulos o vacíos)

In [35]:
unique_parent_stations = df_stops[df_stops['parent_station'].notna()]['parent_station'].unique()
unique_parent_stations = pd.DataFrame(unique_parent_stations)
unique_parent_stations.rename(columns={0:'station_id'}, inplace=True)
unique_parent_stations

Unnamed: 0,station_id
0,est_4_279
1,est_4_57
2,est_4_281
3,est_4_280
4,est_90_18
5,est_4_278
6,est_4_142
7,est_90_25
8,est_4_50
9,est_90_71


Extraer información completa para las estaciones padre

In [36]:
# Extraer información completa para las estaciones parentales
parent_stations_info = df_stops[df_stops['stop_id'].isin(unique_parent_stations['station_id'])]
parent_stations_info

Unnamed: 0,stop_id,stop_code,stop_name,stop_desc,stop_lat,stop_lon,zone_id,parent_station
5,est_4_279,279,LA MORALEJA,Avda de la Ermita 5,40.53196,-3.63556,B1,
6,est_4_278,278,LA GRANJA,Calle de Sepúlveda 1,40.5276,-3.65859,B1,
17,est_4_280,280,MARQUES DE LA VALDAVIA,Calle del Marqués de la Valdavia 21,40.54102,-3.63738,B1,
22,est_4_281,281,MANUEL DE FALLA,Calle Manuel de Falla 59,40.55048,-3.64688,B1,


Vemos que solo recibimos 4 entradas de estaciones padre que existen en el csv, pero hay más referenciadas. Procederemos a agregarlas a la tabla de estaciones padre para no perder la integridad de los datos y que estén relacionadas con sus estaciones padre

In [37]:
for station_id in unique_parent_stations['station_id']:
    if station_id not in parent_stations_info['stop_id'].values:
        # Crea un registro nuevo con valores nulos o predeterminados
        new_row = {'stop_id': station_id, 'stop_name': 'Nombre desconocido', 'stop_desc': 'Descripción no disponible',
                   'stop_lat': 0.0, 'stop_lon': 0.0, 'zone_id': 'Desconocido'}
        # Añade este nuevo registro a parent_stations_info
        parent_stations_info = parent_stations_info.append(new_row, ignore_index=True)
parent_stations_info

  parent_stations_info = parent_stations_info.append(new_row, ignore_index=True)
  parent_stations_info = parent_stations_info.append(new_row, ignore_index=True)
  parent_stations_info = parent_stations_info.append(new_row, ignore_index=True)
  parent_stations_info = parent_stations_info.append(new_row, ignore_index=True)
  parent_stations_info = parent_stations_info.append(new_row, ignore_index=True)
  parent_stations_info = parent_stations_info.append(new_row, ignore_index=True)
  parent_stations_info = parent_stations_info.append(new_row, ignore_index=True)
  parent_stations_info = parent_stations_info.append(new_row, ignore_index=True)
  parent_stations_info = parent_stations_info.append(new_row, ignore_index=True)


Unnamed: 0,stop_id,stop_code,stop_name,stop_desc,stop_lat,stop_lon,zone_id,parent_station
0,est_4_279,279.0,LA MORALEJA,Avda de la Ermita 5,40.53196,-3.63556,B1,
1,est_4_278,278.0,LA GRANJA,Calle de Sepúlveda 1,40.5276,-3.65859,B1,
2,est_4_280,280.0,MARQUES DE LA VALDAVIA,Calle del Marqués de la Valdavia 21,40.54102,-3.63738,B1,
3,est_4_281,281.0,MANUEL DE FALLA,Calle Manuel de Falla 59,40.55048,-3.64688,B1,
4,est_4_57,,Nombre desconocido,Descripción no disponible,0.0,0.0,Desconocido,
5,est_90_18,,Nombre desconocido,Descripción no disponible,0.0,0.0,Desconocido,
6,est_4_142,,Nombre desconocido,Descripción no disponible,0.0,0.0,Desconocido,
7,est_90_25,,Nombre desconocido,Descripción no disponible,0.0,0.0,Desconocido,
8,est_4_50,,Nombre desconocido,Descripción no disponible,0.0,0.0,Desconocido,
9,est_90_71,,Nombre desconocido,Descripción no disponible,0.0,0.0,Desconocido,


In [38]:
parent_stations_info.drop(columns=['parent_station','stop_code'], inplace=True)
parent_stations_info.head()

Unnamed: 0,stop_id,stop_name,stop_desc,stop_lat,stop_lon,zone_id
0,est_4_279,LA MORALEJA,Avda de la Ermita 5,40.53196,-3.63556,B1
1,est_4_278,LA GRANJA,Calle de Sepúlveda 1,40.5276,-3.65859,B1
2,est_4_280,MARQUES DE LA VALDAVIA,Calle del Marqués de la Valdavia 21,40.54102,-3.63738,B1
3,est_4_281,MANUEL DE FALLA,Calle Manuel de Falla 59,40.55048,-3.64688,B1
4,est_4_57,Nombre desconocido,Descripción no disponible,0.0,0.0,Desconocido


Introducimos los datos del dataframe parent_stations_info en la tabla Parent_Stations

In [39]:
# Iterar sobre cada fila del DataFrame para preparar y ejecutar las consultas SQL
for index, row in parent_stations_info.iterrows():
    query = f"""
    INSERT INTO `Parent_Stations` (station_id, name, description, latitude, longitude, zone_id)
    VALUES ('{row['stop_id']}', '{row['stop_name']}', '{row['stop_desc']}', {row['stop_lat']}, {row['stop_lon']}, '{row['zone_id']}');
    """
    connector.execute_query(query)

Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.


Preparamos un df para introducir a la tabla Stops sin redundar con la tabla Parent_stations

In [40]:
stops_to_insert = df_stops[~df_stops['stop_id'].isin(parent_stations_info['stop_id'])]

stops_to_insert['parent_station'] = stops_to_insert['parent_station'].fillna('NULL')
stops_to_insert.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  stops_to_insert['parent_station'] = stops_to_insert['parent_station'].fillna('NULL')


Unnamed: 0,stop_id,stop_code,stop_name,stop_desc,stop_lat,stop_lon,zone_id,parent_station
0,par_4_284,284,HOSPITAL INFANTA SOFIA,Paseo Europa 11,40.55977,-3.61145,B1,
1,par_4_279,279,LA MORALEJA,Avda de la Ermita 5,40.53196,-3.63556,B1,
2,acc_4_279_684,279,Ascensor,Avda de la Ermita 5,40.53199,-3.63548,,est_4_279
3,par_4_283,283,REYES CATOLICOS,Avda de la Plaza de Toros 7,40.55037,-3.6234,B1,
4,par_4_280,280,MARQUES DE LA VALDAVIA,Calle del Marqués de la Valdavia 21,40.54102,-3.63738,B1,


Introducimos los datos del dataframe stops_to_insert en la tabla Stops tratando con los nulos en parent_station ya que es un FK

In [41]:
# Iterar sobre cada fila del DataFrame para preparar y ejecutar las consultas SQL
for index, row in stops_to_insert.iterrows():
    # Verificar si parent_station debe ser tratado como NULL en SQL
    parent_station_value = 'NULL' if row['parent_station'] == 'NULL' else f"'{row['parent_station']}'"

    # Preparar la consulta SQL teniendo en cuenta el tratamiento correcto de NULL
    query = f"""
    INSERT INTO `Stops` (stop_id, stop_code, name, description, latitude, longitude, zone_id, parent_station_id)
    VALUES ('{row['stop_id']}', {row['stop_code']}, '{row['stop_name']}', '{row['stop_desc']}', {row['stop_lat']}, {row['stop_lon']}, '{row['zone_id']}', {parent_station_value});
    """

    connector.execute_query(query)

connector.close()


Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed

## Distritos
Creamos una tabla distritos para poder ubicar generalmente cada estacion de metro

In [42]:
queries =['''
DROP TABLE IF EXISTS Distritos;
''',
'''
CREATE TABLE Distritos (
  codigo_distrito int NOT NULL,
  nombre_distrito varchar(50) DEFAULT NULL,
  PRIMARY KEY (codigo_distrito)
);
''',

'''
INSERT INTO Distritos VALUES (0,'DESCONOCIDO'),(1,'CENTRO'),(2,'ARGANZUELA'),(3,'RETIRO'),(4,'SALAMANCA'),(5,'CHAMARTÍN'),(6,'TETUÁN'),(7,'CHAMBERÍ'),(8,'FUENCARRAL-EL PARDO'),(9,'MONCLOA-ARAVACA'),(10,'LATINA'),(11,'CARABANCHEL'),(12,'USERA'),(13,'PUENTE DE VALLECAS'),(14,'MORATALAZ'),(15,'CIUDAD LINEAL'),(16,'HORTALEZA'),(17,'VILLAVERDE'),(18,'VILLA DE VALLECAS'),(19,'VICÁLVARO'),(20,'SAN BLAS-CANILLEJAS'),(21,'BARAJAS');
'''
]

for query in queries:
    connector.execute_query(query)

Connected to MySQL database.
Query executed successfully.
Query executed successfully.
Query executed successfully.


### Agregamos columnas distritos a las tablas Parent_Stations y Stops

In [44]:
queries=["""
ALTER TABLE parent_stations
ADD COLUMN codigo_distrito int,
ADD CONSTRAINT fk_distrito_parents FOREIGN KEY (codigo_distrito) REFERENCES distritos(codigo_distrito)
""",
         """
ALTER TABLE stops
ADD COLUMN codigo_distrito int,
ADD CONSTRAINT fk_distrito_stops FOREIGN KEY (codigo_distrito) REFERENCES distritos(codigo_distrito)
"""]

for query in queries:
    connector.execute_query(query)

Query executed successfully.
Error:  1060 (42S21): Duplicate column name 'codigo_distrito'


# Codificación de radares

Como el archivo de radares ya está preparado, podemos insertarlo directamente

In [ ]:
df_radar = pd.read_csv('Datasets/RadaresFijos.csv')

In [31]:
i = 1
for row in df_radar.iterrows():
    distrito_query = f"""
        INSERT INTO RadaresFijos (id, NUM_RADAR, UBICACION, CARRETARA_O_VIAL, UBICACION_CALLE_30, PK, SENTIDO, TIPO, LONGITUD, LATITUD, COORDENADAS) VALUES ('{i}', '{row[1][0]}', '{row[1][1]}', '{row[1][2]}', '{row[1][3]}', '{row[1][4]}', '{row[1][5]}', '{row[1][6]}', '{row[1][7]}', '{row[1][8]}', '{row[1][9]}')
        """
    i = i + 1
    connector.execute_query(distrito_query)

AttributeError: 'NoneType' object has no attribute 'execute'

# Codificación Luminarias

In [50]:
df_luminarias = pd.read_csv('Datasets/iluminacion.csv', low_memory=False)

Eliminamos información que no necesitamos.
* Eliminamos Barrio porque no tenemos esa información en los accidentes, y aunque puede resultar útil, en la práctica aumenta mucho el tamaño de la base de datos y no merece mucho la pena.

* Se tiran las coordenadas individuales de cada farola porque consideramos que tiene una granularidad excesiva. No obstante, consideramos interesante cargarlo para realizar análisis de datos pero ya en RAM

* Las otras columnas no parecen tener alguna relación con los accidentes

In [51]:
df_luminarias = df_luminarias.drop(['BARRIO', 'TIPO_BLOQU', 'COD_NDP', 'X_UTM', 'Y_UTM'], axis=1)

Existen nulos en Via_par que han de ser rellenados, como solo es el prefijo (Calle, Avenida etc) es fácil

In [52]:
df_luminarias['VIA_PAR'] = df_luminarias['VIA_PAR'].fillna('')

Concadenamos toda la información para no tener tantas columnas

In [53]:
df_luminarias['localizacion'] = df_luminarias['VIA_CLASE'] + " " + df_luminarias['VIA_PAR'] + " " + df_luminarias['VIA_NOMBRE'].astype(str)

Eliminamos más información que consideramos que aumenta demasiado la granularidad de la información.

In [54]:
df_luminarias = df_luminarias.drop(['VIA_CLASE', 'VIA_PAR', 'VIA_CLASE', 'VIA_NOMBRE', 'CLASE_APP', 'NUMERO'], axis=1)

Codificamos Número y kilómetro

In [55]:
regex = 'NUMERO'
regex2 = 'KILÓMETRO'
df_luminarias['localizacion'] = df_luminarias['localizacion'].str.replace(regex, 'Nº')
df_luminarias['localizacion'] = df_luminarias['localizacion'].str.replace(regex2, 'KM')

creamos una columna dummy para poder guardar el número de farolas agrupadas allí

In [56]:
df_luminarias['num_luminarias'] = 0

Hacemos el group by utilizando todas las caracteristicas interesantes

In [57]:
df_luminarias = df_luminarias.groupby(['localizacion', 'DISTRITO', 'TIPO']).count().reset_index()

Codificamos el tipo de farola

In [62]:
df_luminarias['TIPO']=df_luminarias['TIPO'].replace({'DESCARGA': 'D', 'LED': 'L', 'LED-DESCARGA': 'E'})

In [72]:
df_luminarias.head()

Unnamed: 0,localizacion,DISTRITO,TIPO,num_luminarias
0,ACCESO A LA COLONIA MARCONI,17,D,6
1,ACCESO A LA COLONIA MARCONI,17,L,1
2,ACCESO AL PARQUE DE LA ELIPA,14,D,3
3,ACCESO AL PARQUE DE LA ELIPA,14,L,2
4,AUTOVÍA A-1,8,D,52


In [70]:
for index, row in df_luminarias.iterrows():
    sql = f"""insert into Luminarias (uuid, localizacion, cod_distrito, tipo, num_luminarias)
            values ('{uuid.uuid4()}', '{row['localizacion']}', {row['DISTRITO']},'{row['TIPO']}', {row['num_luminarias']});"""
    connector.execute_query(sql)

Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed

In [73]:
connector.close()

Connection closed.


# Consultas