# Análisis de vuelos
# Scraping y extracción

En este laboratorio vamos a abordar la extracción de información mediante web scraping, para luego procesar esa información y almacenarla en un archivo CSV.


## Instalación e Importación de librerías

In [1]:
!pip install xlsxwriter
!pip install tabulate

Collecting xlsxwriter
  Downloading xlsxwriter-3.2.9-py3-none-any.whl.metadata (2.7 kB)
Downloading xlsxwriter-3.2.9-py3-none-any.whl (175 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 175.3/175.3 kB 3.3 MB/s eta 0:00:00
Installing collected packages: xlsxwriter
Successfully installed xlsxwriter-3.2.9


In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import random
from datetime import datetime
import os

## Declaración de constantes

In [3]:
# Base endpoint; the date to query (YYYY-MM-DD) is appended to it.
url = "https://failbondi.fail/?date="

# Browser-like User-Agent header sent with every request.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    ),
}

# Year to scrape, kept as a string because it is concatenated into the date.
year = "2025"

# Number of days in each month, keyed by zero-padded month number.
months_max_days = {
    "01": 31,
    "02": 28,
    "03": 31,
    "04": 30,
    "05": 31,
    "06": 30,
    "07": 31,
    "08": 31,
    "09": 30,
    "10": 31,
    "11": 30,
    "12": 31,
}

# Zero-padded day-of-month labels "01".."31"; position i holds day i + 1.
month_days = [f"{day:02d}" for day in range(1, 32)]

## Funciones reutilizables

In [4]:
def get_html_from_url(url, headers, timeout=30):
    """
    Download a page and parse it with BeautifulSoup.

    Args:
        url (str): Address of the page to download.
        headers (dict): HTTP headers sent with the request.
        timeout (float): Seconds to wait for the server before giving up.
            Without a timeout a stalled connection would block the whole run.

    Returns:
        bs4.BeautifulSoup or None: The parsed document when the server answers
        with status 200; None on any other status or on a network error.
    """
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as e:
        # Report network failures the same way as a bad status: with None.
        print(f"Error de red al consultar {url}: {e}")
        return None

    if response.status_code != 200:
        return None

    return BeautifulSoup(response.content, "html.parser")

def scraping_vuelos(html, date):
    """
    Build a DataFrame of flights from the table found in a parsed page.

    Column names are read from the <th> cells of the first <thead>; every
    <tr> of the first <tbody> becomes one row made of the stripped text of
    its <td> cells.

    Args:
        html (bs4.BeautifulSoup): Parsed page containing the table.
        date (str or datetime): Date stamped on every row.

    Returns:
        pd.DataFrame: The table contents plus a 'fecha' column (the given
                      date converted with pd.to_datetime) and a 'mes' column
                      holding the month number of that date.
    """
    # Column titles come from the table header.
    column_names = [th.text.strip() for th in html.find('thead').find_all('th')]

    # One list of cell texts per table row.
    records = [
        [td.text.strip() for td in tr.find_all('td')]
        for tr in html.find('tbody').find_all('tr')
    ]

    # Assemble the frame and tag every row with the date and its month.
    flights = pd.DataFrame(records, columns=column_names)
    flights['fecha'] = date
    flights['fecha'] = pd.to_datetime(flights['fecha'])
    flights['mes'] = flights['fecha'].dt.month

    return flights


def get_report_by_month(year_month, max_days):
    """
    Scrape every day of one month and combine the results into one DataFrame.

    For each day from 1 to max_days the page for that date is downloaded and
    parsed. A day that cannot be downloaded or parsed is reported on stdout
    and skipped, so a single bad day does not abort the month.

    Args:
        year_month (str): Month to scrape, formatted as 'YYYY-MM'.
        max_days (int): Number of days to request for that month.

    Returns:
        pd.DataFrame: All rows collected for the month; an empty DataFrame
                      when no day produced data.
    """
    lista_dfs = []
    for day in range(1, max_days + 1):
        date = f"{year_month}-{day:02d}"
        try:
            # The download is inside the try so a network failure only skips this day.
            main_content = get_html_from_url(url + date, headers)
            if main_content is None:
                print(f"Error en fecha {date}: no se pudo obtener la página")
                continue
            lista_dfs.append(scraping_vuelos(main_content, date))
        except Exception as e:
            print(f"Error en fecha {date}: {e}")

    # pd.concat raises ValueError on an empty list, so fall back to an empty frame.
    df_month = pd.concat(lista_dfs, ignore_index=True) if lista_dfs else pd.DataFrame()
    print(f"[{year_month}] - Filas obtenidas: {len(df_month)}")
    # Random pause before the caller moves on to the next month.
    time.sleep(random.uniform(2, 5))
    print(df_month.head(5))
    return df_month


In [5]:
# One DataFrame per month is collected here; the list is exported in a later cell.
lista_dfs = []

# Wall-clock start of the whole run.
inicio_peticion_total = time.time()

for month, max_days in months_max_days.items():
    # Build the "YYYY-MM" prefix that get_report_by_month extends with each day.
    year_month = year + "-" + month
    inicio_peticion = time.time()
    lista_dfs.append(get_report_by_month(year_month, max_days))

    # Per-month elapsed time, logged together with the current clock time.
    fin_peticion = time.time()
    duracion = fin_peticion - inicio_peticion
    hora_actual = datetime.now().strftime('%H:%M:%S')

    print(f"[{hora_actual}] Finalizado: {year_month} | Tiempo: {duracion:.2f}s")

# Elapsed time for all months, in seconds.
fin_peticion_total = time.time()
duracion_total = fin_peticion_total - inicio_peticion_total

print("Duración total del proceso: ", duracion_total)


[2025-01] - Filas obtenidas: 2205
     Vuelo                         Ruta Hora Programada Hora Real  \
0  FO 5912  Aeroparque → Rio de Janeiro           13:05             
1  FO 5237       Bariloche → Aeroparque           07:50     19:22   
2  FO 5236           Ezeiza → Bariloche           05:00     15:16   
3  FO 5027             Córdoba → Ezeiza           20:40  02:18 +1   
4  FO 5056         Aeroparque → Mendoza           16:00     21:27   

  Demora en despegar      fecha  mes  
0          Cancelado 2025-01-01    1  
1   11hs 32min tarde 2025-01-01    1  
2   10hs 16min tarde 2025-01-01    1  
3    5hs 38min tarde 2025-01-01    1  
4    5hs 27min tarde 2025-01-01    1  
[20:50:57] Finalizado: 2025-01 | Tiempo: 19.63s
[2025-02] - Filas obtenidas: 1880
     Vuelo                  Ruta Hora Programada Hora Real Demora en despegar  \
0  FO 5069  Mendoza → Aeroparque           21:59                    Cancelado   
1  FO 5055      Mendoza → Ezeiza           20:55                    Cance

In [6]:
def export_data(master_list, file_name, file_format='csv', reference_column='Vuelo'):
    """
    Merge a list of DataFrames and save the result as CSV or Excel.

    Rows whose reference_column value equals the column name itself (header
    lines that ended up as data) are dropped before saving.

    Args:
        master_list (list): The DataFrames to concatenate and save.
        file_name (str): Output path without extension.
        file_format (str): 'csv' or 'excel' (case-insensitive). Defaults to 'csv'.
        reference_column (str): Column used to detect repeated header rows.
            Defaults to 'Vuelo'. The filter is skipped when the column is absent.

    Returns:
        str or None: Path of the written file, or None when there was nothing
        to export, the format is not supported, or writing failed.
    """
    # pd.concat raises ValueError on an empty list; treat it as a failed export.
    if not master_list:
        print("❌ No hay datos para exportar.")
        return None

    df = pd.concat(master_list, ignore_index=True)
    print(df[:10])

    # Keep only rows where the value is NOT equal to the header name.
    if reference_column in df.columns:
        df = df[df[reference_column] != reference_column]

    fmt = file_format.lower()
    if fmt == 'csv':
        file_path = f"{file_name}.csv"
    elif fmt == 'excel':
        file_path = f"{file_name}.xlsx"
    else:
        print("❌ Formato no soportado. Usa 'csv' o 'excel'.")
        return None

    try:
        if fmt == 'csv':
            # utf-8-sig writes a BOM at the start of the file.
            df.to_csv(file_path, index=False, encoding='utf-8-sig')
            print(f"✅ Archivo CSV guardado como: {file_path}")
        else:
            # No explicit engine: pandas picks whichever xlsx writer is
            # installed (xlsxwriter or openpyxl).
            df.to_excel(file_path, index=False)
            print(f"✅ Archivo Excel guardado como: {file_path}")
    except Exception as e:
        print(f"Error al exportar: {e}")
        return None

    return file_path

In [8]:
ruta_archivo = export_data(lista_dfs, "reporte_final_anual", file_format='csv')

# export_data does not return a usable path when the export fails, so only
# read the file size and report success when a path actually came back.
if ruta_archivo:
    file_size = os.path.getsize(ruta_archivo)
    print(f"\n¡Éxito! El archivo '{ruta_archivo}' ha sido creado. Tamaño: {file_size} bytes")
else:
    print("\nNo se pudo generar el archivo de salida.")

     Vuelo                         Ruta Hora Programada Hora Real  \
0  FO 5912  Aeroparque → Rio de Janeiro           13:05             
1  FO 5237       Bariloche → Aeroparque           07:50     19:22   
2  FO 5236           Ezeiza → Bariloche           05:00     15:16   
3  FO 5027             Córdoba → Ezeiza           20:40  02:18 +1   
4  FO 5056         Aeroparque → Mendoza           16:00     21:27   
5  FO 5057         Mendoza → Aeroparque           18:25     23:44   
6  FO 5472            Córdoba → Neuquen           16:15     21:01   
7  FO 5272           Ezeiza → Bariloche           12:55     16:13   
8  FO 5016         Aeroparque → Córdoba           15:45     19:01   
9  FO 5061             Mendoza → Ezeiza           21:30  00:26 +1   

  Demora en despegar      fecha  mes  
0          Cancelado 2025-01-01    1  
1   11hs 32min tarde 2025-01-01    1  
2   10hs 16min tarde 2025-01-01    1  
3    5hs 38min tarde 2025-01-01    1  
4    5hs 27min tarde 2025-01-01    1  
5    5

Este notebook finaliza con el archivo resultante, que contiene todos los vuelos publicados en la página para todo el año 2025.