### Imports

In [1]:
import pandas as pd
from my_connector import MySQLConnector
import numpy as np
import matplotlib as plt

## Leemos el dataset de paradas de metro

In [18]:
df_stops = pd.read_csv('./Datasets/stopsmetro.csv')
df_stops.head()

Unnamed: 0,stop_id,stop_code,stop_name,stop_desc,stop_lat,stop_lon,zone_id,stop_url,location_type,parent_station,stop_timezone,wheelchair_boarding
0,par_4_284,284,HOSPITAL INFANTA SOFIA,Paseo Europa 11,40.55977,-3.61145,B1,http://www.crtm.es,0,,Europe/Madrid,2
1,par_4_279,279,LA MORALEJA,Avda de la Ermita 5,40.53196,-3.63556,B1,http://www.crtm.es,0,,Europe/Madrid,2
2,acc_4_279_684,279,Ascensor,Avda de la Ermita 5,40.53199,-3.63548,,http://www.crtm.es,2,est_4_279,,0
3,par_4_283,283,REYES CATOLICOS,Avda de la Plaza de Toros 7,40.55037,-3.6234,B1,http://www.crtm.es,0,,Europe/Madrid,2
4,par_4_280,280,MARQUES DE LA VALDAVIA,Calle del Marqués de la Valdavia 21,40.54102,-3.63738,B1,http://www.crtm.es,0,,Europe/Madrid,2


 nos quitamos las columnas no deseadas

In [3]:
df_stops.drop(columns=['stop_timezone', 'wheelchair_boarding', 'stop_url', ' location_type'], inplace=True)

KeyError: "['stop_timezone', 'wheelchair_boarding', 'stop_url'] not found in axis"

Como está dando un error extraño, inspeccionamos las columnas para saber cual es el problema

In [4]:
# Inspeccionar los nombres de las columnas para identificar espacios adicionales o caracteres inusuales
column_names = [col.strip() for col in df_stops.columns]

# Mostrar los nombres de las columnas después de eliminar los espacios adicionales
column_names

['stop_id',
 'stop_code',
 'stop_name',
 'stop_desc',
 'stop_lat',
 'stop_lon',
 'zone_id',
 'stop_url',
 'location_type',
 'parent_station',
 'stop_timezone',
 'wheelchair_boarding']

Sustituimos los nombres para que coincidan y nos quitamos las columnas no deseadas

In [5]:
# Eliminar específicamente las columnas 'stop_timezone' y 'wheelchair_boarding' después de ajustar los nombres de las columnas
df_stops.columns = column_names
df_stops = df_stops.drop(['stop_timezone', 'wheelchair_boarding', 'stop_url', 'location_type'], axis=1)

# Mostrar las primeras filas del DataFrame corregido para confirmar la eliminación de las columnas
df_stops.head()

Unnamed: 0,stop_id,stop_code,stop_name,stop_desc,stop_lat,stop_lon,zone_id,parent_station
0,par_4_284,284,HOSPITAL INFANTA SOFIA,Paseo Europa 11,40.55977,-3.61145,B1,
1,par_4_279,279,LA MORALEJA,Avda de la Ermita 5,40.53196,-3.63556,B1,
2,acc_4_279_684,279,Ascensor,Avda de la Ermita 5,40.53199,-3.63548,,est_4_279
3,par_4_283,283,REYES CATOLICOS,Avda de la Plaza de Toros 7,40.55037,-3.6234,B1,
4,par_4_280,280,MARQUES DE LA VALDAVIA,Calle del Marqués de la Valdavia 21,40.54102,-3.63738,B1,


Vamos a realizar la conexion a la BBDD y ejecutar el query para montar la estructura relacional de la misma con tablas dcon la siguiente información:
1. **Tabla Stations**
Columnas:
station_id (VARCHAR, PRIMARY KEY): Basado en stop_id.
station_code (INT): Basado en stop_code.
name (VARCHAR): Basado en stop_name.
description (VARCHAR): Basado en stop_desc.
latitude (FLOAT): Basado en stop_lat.
longitude (FLOAT): Basado en stop_lon.
zone_id (VARCHAR): Basado en zone_id.

2. **Tabla LocationTypes**
Columnas:
location_type_id (INT, PRIMARY KEY)
description (VARCHAR)

3. **Tabla StationLocationTypes**
Columnas:
station_id (VARCHAR): Clave foránea de Stations.
location_type_id (INT): Clave foránea de LocationTypes.
4. **Tabla ParentStations**
Columnas:
parent_station_id (VARCHAR, PRIMARY KEY)
station_id (VARCHAR): Clave foránea de Stations.

In [6]:
# Crear una instancia del conector
connector = MySQLConnector(host="localhost", user="lucaramirezo", password="Familiaramirez4", database="tnt")
# Conectar a la base de datos
connector.connect()

# Lista de consultas SQL para crear las tablas
queries = [
    """
    CREATE TABLE `Parent Stations` (
    `station_id` VARCHAR(255) PRIMARY KEY,
    `name` VARCHAR(255),
    `description` VARCHAR(255),
    `latitude` DOUBLE,
    `longitude` DOUBLE,
    `zone_id` VARCHAR(255)
);

"""
    ,
    """
    CREATE TABLE `Stops` (
    `stop_id` VARCHAR(255) PRIMARY KEY,
    `stop_code` INT,
    `name` VARCHAR(255),
    `description` VARCHAR(255),
    `latitude` DOUBLE,
    `longitude` DOUBLE,
    `zone_id` VARCHAR(255),
    `parent_station_id` VARCHAR(255),
    FOREIGN KEY (`parent_station_id`) REFERENCES `Parent Stations`(`station_id`)
);
    """
]

# Ejecutar cada consulta para crear las tablas
for query in queries:
    connector.execute_query(query)

Connected to MySQL database.
Query executed successfully.
Query executed successfully.


Extraemos valores únicos de parent_station (excluyendo valores nulos o vacíos)

In [7]:
unique_parent_stations = df_stops[df_stops['parent_station'].notna()]['parent_station'].unique()
unique_parent_stations = pd.DataFrame(unique_parent_stations)
unique_parent_stations.rename(columns={0:'station_id'}, inplace=True)
unique_parent_stations

Unnamed: 0,station_id
0,est_4_279
1,est_4_57
2,est_4_281
3,est_4_280
4,est_90_18
5,est_4_278
6,est_4_142
7,est_90_25
8,est_4_50
9,est_90_71


Extraer información completa para las estaciones padre

In [8]:
# Extraer información completa para las estaciones parentales
parent_stations_info = df_stops[df_stops['stop_id'].isin(unique_parent_stations['station_id'])]
parent_stations_info

Unnamed: 0,stop_id,stop_code,stop_name,stop_desc,stop_lat,stop_lon,zone_id,parent_station
5,est_4_279,279,LA MORALEJA,Avda de la Ermita 5,40.53196,-3.63556,B1,
6,est_4_278,278,LA GRANJA,Calle de Sepúlveda 1,40.5276,-3.65859,B1,
17,est_4_280,280,MARQUES DE LA VALDAVIA,Calle del Marqués de la Valdavia 21,40.54102,-3.63738,B1,
22,est_4_281,281,MANUEL DE FALLA,Calle Manuel de Falla 59,40.55048,-3.64688,B1,


Vemos que solo recibimos 4 entradas de estaciones padre que existen en el csv, pero hay más referenciadas. Procederemos a agregarlas a la tabla de estaciones padre para no perder la integridad de los datos y que estén relacionadas con sus estaciones padre

In [9]:
for station_id in unique_parent_stations['station_id']:
    if station_id not in parent_stations_info['stop_id'].values:
        # Crea un registro nuevo con valores nulos o predeterminados
        new_row = {'stop_id': station_id, 'stop_name': 'Nombre desconocido', 'stop_desc': 'Descripción no disponible',
                   'stop_lat': 0.0, 'stop_lon': 0.0, 'zone_id': 'Desconocido'}
        # Añade este nuevo registro a parent_stations_info
        parent_stations_info = parent_stations_info.append(new_row, ignore_index=True)
parent_stations_info

  parent_stations_info = parent_stations_info.append(new_row, ignore_index=True)
  parent_stations_info = parent_stations_info.append(new_row, ignore_index=True)
  parent_stations_info = parent_stations_info.append(new_row, ignore_index=True)
  parent_stations_info = parent_stations_info.append(new_row, ignore_index=True)
  parent_stations_info = parent_stations_info.append(new_row, ignore_index=True)
  parent_stations_info = parent_stations_info.append(new_row, ignore_index=True)
  parent_stations_info = parent_stations_info.append(new_row, ignore_index=True)
  parent_stations_info = parent_stations_info.append(new_row, ignore_index=True)
  parent_stations_info = parent_stations_info.append(new_row, ignore_index=True)


Unnamed: 0,stop_id,stop_code,stop_name,stop_desc,stop_lat,stop_lon,zone_id,parent_station
0,est_4_279,279.0,LA MORALEJA,Avda de la Ermita 5,40.53196,-3.63556,B1,
1,est_4_278,278.0,LA GRANJA,Calle de Sepúlveda 1,40.5276,-3.65859,B1,
2,est_4_280,280.0,MARQUES DE LA VALDAVIA,Calle del Marqués de la Valdavia 21,40.54102,-3.63738,B1,
3,est_4_281,281.0,MANUEL DE FALLA,Calle Manuel de Falla 59,40.55048,-3.64688,B1,
4,est_4_57,,Nombre desconocido,Descripción no disponible,0.0,0.0,Desconocido,
5,est_90_18,,Nombre desconocido,Descripción no disponible,0.0,0.0,Desconocido,
6,est_4_142,,Nombre desconocido,Descripción no disponible,0.0,0.0,Desconocido,
7,est_90_25,,Nombre desconocido,Descripción no disponible,0.0,0.0,Desconocido,
8,est_4_50,,Nombre desconocido,Descripción no disponible,0.0,0.0,Desconocido,
9,est_90_71,,Nombre desconocido,Descripción no disponible,0.0,0.0,Desconocido,


In [10]:
parent_stations_info.drop(columns=['parent_station','stop_code'], inplace=True)
parent_stations_info.head()

Unnamed: 0,stop_id,stop_name,stop_desc,stop_lat,stop_lon,zone_id
0,est_4_279,LA MORALEJA,Avda de la Ermita 5,40.53196,-3.63556,B1
1,est_4_278,LA GRANJA,Calle de Sepúlveda 1,40.5276,-3.65859,B1
2,est_4_280,MARQUES DE LA VALDAVIA,Calle del Marqués de la Valdavia 21,40.54102,-3.63738,B1
3,est_4_281,MANUEL DE FALLA,Calle Manuel de Falla 59,40.55048,-3.64688,B1
4,est_4_57,Nombre desconocido,Descripción no disponible,0.0,0.0,Desconocido


Introducimos los datos del dataframe parent_stations_info en la tabla Parent stations

In [11]:
# Iterar sobre cada fila del DataFrame para preparar y ejecutar las consultas SQL
for index, row in parent_stations_info.iterrows():
    query = f"""
    INSERT INTO `Parent Stations` (station_id, name, description, latitude, longitude, zone_id)
    VALUES ('{row['stop_id']}', '{row['stop_name']}', '{row['stop_desc']}', {row['stop_lat']}, {row['stop_lon']}, '{row['zone_id']}');
    """
    connector.execute_query(query)

Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.


Preparamos un df para introducir a la tabla Stops sin redundar con la tabla Parent stations

In [12]:
stops_to_insert = df_stops[~df_stops['stop_id'].isin(parent_stations_info['stop_id'])]

stops_to_insert['parent_station'] = stops_to_insert['parent_station'].fillna('NULL')
stops_to_insert.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  stops_to_insert['parent_station'] = stops_to_insert['parent_station'].fillna('NULL')


Unnamed: 0,stop_id,stop_code,stop_name,stop_desc,stop_lat,stop_lon,zone_id,parent_station
0,par_4_284,284,HOSPITAL INFANTA SOFIA,Paseo Europa 11,40.55977,-3.61145,B1,
1,par_4_279,279,LA MORALEJA,Avda de la Ermita 5,40.53196,-3.63556,B1,
2,acc_4_279_684,279,Ascensor,Avda de la Ermita 5,40.53199,-3.63548,,est_4_279
3,par_4_283,283,REYES CATOLICOS,Avda de la Plaza de Toros 7,40.55037,-3.6234,B1,
4,par_4_280,280,MARQUES DE LA VALDAVIA,Calle del Marqués de la Valdavia 21,40.54102,-3.63738,B1,


Introducimos los datos del dataframe stops_to_insert en la tabla Stops tratando con los nulos en parent_station ya que es un FK

In [13]:
# Iterar sobre cada fila del DataFrame para preparar y ejecutar las consultas SQL
for index, row in stops_to_insert.iterrows():
    # Verificar si parent_station debe ser tratado como NULL en SQL
    parent_station_value = 'NULL' if row['parent_station'] == 'NULL' else f"'{row['parent_station']}'"

    # Preparar la consulta SQL teniendo en cuenta el tratamiento correcto de NULL
    query = f"""
    INSERT INTO `Stops` (stop_id, stop_code, name, description, latitude, longitude, zone_id, parent_station_id)
    VALUES ('{row['stop_id']}', {row['stop_code']}, '{row['stop_name']}', '{row['stop_desc']}', {row['stop_lat']}, {row['stop_lon']}, '{row['zone_id']}', {parent_station_value});
    """

    connector.execute_query(query)


Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed successfully.
Query executed

Leemos ahora datos especificos en las tablas ejecutando sentencias como SELECT, UPDATE o VIEW

## Casos de Uso de las Tablas

### Consultar todas las paradas junto con su estación parental

In [15]:
query ="""
SELECT s.stop_id, s.name AS stop_name, s.latitude, s.longitude, p.station_id AS parent_station_id, p.name AS parent_station_name
FROM Stops s
LEFT JOIN `Parent Stations` p ON s.parent_station_id = p.station_id;
"""
df_padres_hijos = connector.fetch_data_as_df(query)
df_padres_hijos

Unnamed: 0,stop_id,stop_name,latitude,longitude,parent_station_id,parent_station_name
0,acc_4_278_685,La Granja,40.52763,-3.65865,est_4_278,LA GRANJA
1,acc_4_278_686,Ascensor,40.52772,-3.65855,est_4_278,LA GRANJA
2,acc_4_279_683,Avda. Bruselas,40.53187,-3.6355,est_4_279,LA MORALEJA
3,acc_4_279_684,Ascensor,40.53199,-3.63548,est_4_279,LA MORALEJA
4,acc_4_280_680,Marqués de la Valdavia,40.54112,-3.63739,est_4_280,MARQUES DE LA VALDAVIA
5,acc_4_280_681,Ascensor,40.54107,-3.63755,est_4_280,MARQUES DE LA VALDAVIA
6,acc_4_281_678,Manuel de Falla,40.55044,-3.64708,est_4_281,MANUEL DE FALLA
7,acc_4_281_679,Ascensor,40.55063,-3.64682,est_4_281,MANUEL DE FALLA
8,par_4_187,FUENCARRAL,40.49509,-3.69283,,
9,par_4_188,BEGOÑA,40.48041,-3.68585,,


### Encontrar paradas sin estación parental

In [16]:
query= '''
SELECT stop_id, name, description
FROM Stops
WHERE parent_station_id IS NULL;
'''
df_stops_without_parent= connector.fetch_data_as_df(query)
df_stops_without_parent

Unnamed: 0,stop_id,name,description
0,par_4_187,FUENCARRAL,Calle de la Fuente Chica 21
1,par_4_188,BEGOÑA,Paseo de la Castellana 261
2,par_4_191,CUZCO,Paseo de la Castellana 162
3,par_4_192,SANTIAGO BERNABEU,Paseo de la Castellana 97
4,par_4_199,LAGO,Ronda del Lago 3
5,par_4_200,BATAN,Paseo de la Venta 1
6,par_4_202,COLONIA JARDIN,Calle de Arenas de San Pedro 2
7,par_4_203,CUATRO VIENTOS,Col Militar de Cuatro Vientos 6 B
8,par_4_204,JOAQUIN VILUMBRALES,Avda de la Libertad 13
9,par_4_205,PUERTA DEL SUR,Avda de la Libertad 2
