## Carga de librerías y datasets

In [134]:
# 02_preprocess.ipynb
import pandas as pd
import numpy as np
from pathlib import Path

RAW = Path("C:/Users/Martin/Documents/GitHub/Aeropuertos/data/raw")

s1_routes = pd.read_csv(RAW / "S1_routes.csv")
s1_airlines = pd.read_csv(RAW / "S1_airlines.csv")
s1_airports = pd.read_csv(RAW / "S1_airports.csv")
s2_airports = pd.read_csv(RAW / "S2_airports.csv")
s2_rawflightdata = pd.read_csv(RAW / "S2_raw-flight-data.csv")
s2_flights = pd.read_csv(RAW / "S2_flights.csv")

In [135]:
# Encabezado del set de datos
s1_routes.head()

Unnamed: 0,Airline,Airline ID,Source Airport,Source Airport ID,Destination Airport,Destination Airport ID,Codeshare,Stops,Equipment
0,2B,410,AER,2965,KZN,2990,,0,CR2
1,2B,410,ASF,2966,KZN,2990,,0,CR2
2,2B,410,ASF,2966,MRV,2962,,0,CR2
3,2B,410,CEK,2968,KZN,2990,,0,CR2
4,2B,410,CEK,2968,OVB,4078,,0,CR2


In [136]:
# Información básica del set de datos
s1_routes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 67663 entries, 0 to 67662
Data columns (total 9 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   Airline                 67663 non-null  object
 1   Airline ID              67663 non-null  object
 2   Source Airport          67663 non-null  object
 3   Source Airport ID       67663 non-null  object
 4   Destination Airport     67663 non-null  object
 5   Destination Airport ID  67663 non-null  object
 6   Codeshare               14597 non-null  object
 7   Stops                   67663 non-null  int64 
 8   Equipment               67645 non-null  object
dtypes: int64(1), object(8)
memory usage: 4.6+ MB


In [137]:
# contar datos únicos
s1_routes.nunique()

Airline                    568
Airline ID                 548
Source Airport            3409
Source Airport ID         3321
Destination Airport       3418
Destination Airport ID    3327
Codeshare                    1
Stops                        2
Equipment                 3942
dtype: int64

In [138]:
s1_routes1 = s1_routes
# Sumar datos nulos por columna
s1_routes.isnull().sum()

Airline                       0
Airline ID                    0
Source Airport                0
Source Airport ID             0
Destination Airport           0
Destination Airport ID        0
Codeshare                 53066
Stops                         0
Equipment                    18
dtype: int64

In [139]:
# Ver filas s1_routes['Equipment'].isnull()
s1_routes[s1_routes['Equipment'].isnull()]

Unnamed: 0,Airline,Airline ID,Source Airport,Source Airport ID,Destination Airport,Destination Airport ID,Codeshare,Stops,Equipment
2963,7S,\N,RSH,7098,ANI,5967,,0,
2964,7S,\N,SHX,7090,ANI,5967,,0,
3571,9E,3976,ATL,3682,MSP,3858,,0,
3583,9E,3976,GFK,3442,TVF,7018,,0,
3593,9E,3976,MSP,3858,ATL,3682,,0,
3602,9E,3976,TVF,7018,GFK,3442,,0,
4722,AA,24,AUS,3673,MSY,3861,,0,
5909,AA,24,LIR,1881,SJO,1885,,0,
6147,AA,24,MSY,3861,AUS,3673,,0,
14417,BA,1355,ATL,3682,DFW,3670,,0,


In [140]:
s1_routes = s1_routes1
# Eliminar 'Codeshare' por cantidad exesiva de nulos
s1_routes = s1_routes.drop(columns=['Codeshare'])

In [141]:
# Limpieza: normalizar espacios, convertir '' a NA, partir y explotar
s1_routes['Equipment'] = s1_routes['Equipment'].astype(str).str.replace(r'\s+', ' ', regex=True).str.strip()
s1_routes['Equipment'].replace('', pd.NA, inplace=True)

# Explotar usando split por whitespace (no deja tokens vacíos)
s1_routes = s1_routes.assign(Equipment=s1_routes['Equipment'].dropna().str.split()).explode('Equipment')

# Asegurar limpieza final y eliminar nulos
s1_routes['Equipment'] = s1_routes['Equipment'].astype(str).str.strip()
s1_routes.loc[s1_routes['Equipment'] == '', 'Equipment'] = pd.NA
s1_routes= s1_routes.loc[s1_routes['Equipment']!='nan']
s1_routes = s1_routes.dropna(subset=['Equipment']).reset_index(drop=True)
s1_routes.info()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  s1_routes['Equipment'].replace('', pd.NA, inplace=True)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 93231 entries, 0 to 93230
Data columns (total 8 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   Airline                 93231 non-null  object
 1   Airline ID              93231 non-null  object
 2   Source Airport          93231 non-null  object
 3   Source Airport ID       93231 non-null  object
 4   Destination Airport     93231 non-null  object
 5   Destination Airport ID  93231 non-null  object
 6   Stops                   93231 non-null  int64 
 7   Equipment               93231 non-null  object
dtypes: int64(1), object(7)
memory usage: 5.7+ MB


In [142]:
# Guardar dataset limpio en interim
INTERIM = Path("C:/Users/Martin/Documents/GitHub/Aeropuertos/data/interim")
s1_routes.to_csv(INTERIM / "S1_routes_clean.csv", index=False)