In [1]:
import zipfile
import os
import pandas as pd


zip_file_path = os.path.join(os.getcwd(), "..", "data", "raw", "airports-database.zip")
with zipfile.ZipFile(zip_file_path, "r") as zip_ref:
    with zip_ref.open("airports-database.csv") as csv_file:
        df = pd.read_csv(csv_file)

In [20]:
df.to_csv("../data/processed/airports-database.csv", index=False)

In [2]:
import os
from dotenv import load_dotenv


# API keys do arquivo .env
load_dotenv()
airportdb_key = os.getenv("AIRPORT_DB")
weatherbit_key = os.getenv("WEATHERBIT")

In [3]:
import requests


def get_airport_data(airport_code, api_token):
    """
    Obtém informações sobre um aeroporto a partir de seu código usando a API do AirportDB.

    Parameters
    ----------
    airport_code : str
        Código do aeroporto (ex.: 'JFK').
    api_token : str
        Token da API para autenticação.

    Returns
    -------
    dict
        Dados do aeroporto em formato JSON.

    Examples
    --------
    >>> airport_data = get_airport_data('JFK', 'seu_api_token_aqui')
    >>> print(airport_data)
    {'ident': 'KJFK', 'type': 'large_airport', 'name': 'John F Kennedy International Airport', ...}
    """
    url = f"https://airportdb.io/api/v1/airport/K{airport_code}?apiToken={api_token}"
    response = requests.get(url)
    if response.status_code == 200:
        return response.json()
    else:
        response.raise_for_status()


def get_weather_history(lat, lon, start_date, end_date, api_key):
    """
    Obtém dados históricos de clima para uma localização específica usando a API do Weatherbit.

    Parameters
    ----------
    lat : float
        Latitude da localização (ex.: 40.7128).
    lon : float
        Longitude da localização (ex.: -74.0060).
    start_date : str
        Data de início no formato 'YYYY-MM-DD'.
    end_date : str
        Data de término no formato 'YYYY-MM-DD'.
    api_key : str
        Chave da API para autenticação.

    Returns
    -------
    dict
        Dados do clima em formato JSON.

    Examples
    --------
    >>> weather_data = get_weather_history(40.7128, -74.0060, '2023-01-01', '2023-01-02', 'seu_api_key_aqui')
    >>> print(weather_data)
    {'data': 'city_id': '5128581', 'city_name': 'New York City', 'country_code': 'US', ...}
    """
    url = "https://api.weatherbit.io/v2.0/history/daily"
    params = {
        'lat': lat,
        'lon': lon,
        'start_date': start_date,
        'end_date': end_date,
        'key': api_key
    }
    headers = {
        'Accept': 'application/json'
    }
    response = requests.get(url, params=params, headers=headers)
    if response.status_code == 200:
        return response.json()
    else:
        response.raise_for_status()


def enrich_airport_data(df, api_token):
    """
    Enriquece um DataFrame com informações de latitude e longitude dos aeroportos.

    Parameters
    ----------
    df : pandas.DataFrame
        DataFrame contendo a coluna 'airport_cod' com os códigos dos aeroportos.
    api_token : str
        Token da API para autenticação.

    Returns
    -------
    pandas.DataFrame
        DataFrame enriquecido com as colunas 'latitude_deg' e 'longitude_deg'.
    """
    latitudes = []
    longitudes = []

    for code in df["airport_cod"].unique().tolist():
        try:
            airport_data = get_airport_data(code, api_token)
            latitudes.append(airport_data.get('latitude_deg'))
            longitudes.append(airport_data.get('longitude_deg'))
        except requests.exceptions.RequestException as e:
            print(f"Erro ao obter dados do aeroporto {code}: {e}")
            latitudes.append(None)
            longitudes.append(None)

    df['latitude_deg'] = latitudes
    df['longitude_deg'] = longitudes
    return df


def get_unique_airports_by_date(df, type):
    """
    Obtém um DataFrame contendo aeroportos únicos por data.

    Parameters
    ----------
    df : pandas.DataFrame
        DataFrame contendo os dados de voos.
    type : str
        Campo de origem ou destipo ('origin' ou 'dest').

    Returns
    -------
    pandas.DataFrame
        DataFrame contendo aeroportos únicos e suas respectivas datas.

    Examples
    --------
    >>> unique_airports = get_unique_airports_by_date(dataset, 'origin')
    >>> print(unique_airports)
      airport_cod        date
    0        DCA  2013-01-01
    1        HDN  2013-01-01
    """
    df = df[[type, "time_hour"]]
    df["time_hour"] = pd.to_datetime(df["time_hour"])
    df["date"] = df['time_hour'].dt.date
    df = df.drop(columns="time_hour")
    df.columns = ["airport_cod", "date"]
    df = df.drop_duplicates()
    return df


def enrich_weather_data(df, api_key):
    """
    Enriquece um DataFrame com informações de velocidade do vento para as coordenadas do aeroporto.

    Parameters
    ----------
    df : pandas.DataFrame
        DataFrame contendo colunas de data, latitude e longitude do aeroporto.
    api_key : str
        Chave da API para autenticação.

    Returns
    -------
    pandas.DataFrame
        DataFrame enriquecido com a coluna 'wind_spd'.
    """
    wind_spd = []

    for index, row in df.iterrows():
        print(f"{index} de {df.shape[0]}...")
        date = row["date"]
        end_date = (pd.to_datetime(date) + pd.Timedelta(days=1)).strftime('%Y-%m-%d')

        try:
            if pd.notna(row["latitude_deg"]) and pd.notna(row["longitude_deg"]):
                weather_data = get_weather_history(row["latitude_deg"], row["longitude_deg"], date, end_date, api_key)
                wind_spd.append(weather_data["data"][0]["wind_spd"])
            else:
                wind_spd.append(None)
        except requests.exceptions.RequestException as e:
            print(f"Erro ao obter dados de clima (index {index}): {e}")
            wind_spd.append(None)

    df["wind_spd"] = wind_spd
    return df

In [4]:
print(df.info())
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 336776 entries, 0 to 336775
Data columns (total 21 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   id              336776 non-null  int64  
 1   year            336776 non-null  int64  
 2   month           336776 non-null  int64  
 3   day             336776 non-null  int64  
 4   dep_time        328521 non-null  float64
 5   sched_dep_time  336776 non-null  int64  
 6   dep_delay       328521 non-null  float64
 7   arr_time        328063 non-null  float64
 8   sched_arr_time  336776 non-null  int64  
 9   arr_delay       327346 non-null  float64
 10  carrier         336776 non-null  object 
 11  flight          336776 non-null  int64  
 12  tailnum         334264 non-null  object 
 13  origin          336776 non-null  object 
 14  dest            336776 non-null  object 
 15  air_time        327346 non-null  float64
 16  distance        336776 non-null  int64  
 17  hour      

Unnamed: 0,id,year,month,day,dep_time,sched_dep_time,dep_delay,arr_time,sched_arr_time,arr_delay,...,flight,tailnum,origin,dest,air_time,distance,hour,minute,time_hour,name
0,0,2013,1,1,517.0,515,2.0,830.0,819,11.0,...,1545,N14228,EWR,IAH,227.0,1400,5,15,2013-01-01 05:00:00,United Air Lines Inc.
1,1,2013,1,1,533.0,529,4.0,850.0,830,20.0,...,1714,N24211,LGA,IAH,227.0,1416,5,29,2013-01-01 05:00:00,United Air Lines Inc.
2,2,2013,1,1,542.0,540,2.0,923.0,850,33.0,...,1141,N619AA,JFK,MIA,160.0,1089,5,40,2013-01-01 05:00:00,American Airlines Inc.
3,3,2013,1,1,544.0,545,-1.0,1004.0,1022,-18.0,...,725,N804JB,JFK,BQN,183.0,1576,5,45,2013-01-01 05:00:00,JetBlue Airways
4,4,2013,1,1,554.0,600,-6.0,812.0,837,-25.0,...,461,N668DN,LGA,ATL,116.0,762,6,0,2013-01-01 06:00:00,Delta Air Lines Inc.


In [5]:
# códigos dos aeroportos
origin = df.origin.unique().tolist()
dest = df.dest.unique().tolist()
airports_cod = pd.DataFrame({"airport_cod": list(set(origin + dest))})

print(airports_cod.info())
airports_cod.head()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 107 entries, 0 to 106
Data columns (total 1 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   airport_cod  107 non-null    object
dtypes: object(1)
memory usage: 984.0+ bytes
None


Unnamed: 0,airport_cod
0,TVC
1,SNA
2,LGB
3,EYW
4,LAX


In [6]:
# DataFrame com o lat/lon por aeroporto
latlon_airport_data_df = enrich_airport_data(airports_cod, airportdb_key)

print(latlon_airport_data_df.info())
latlon_airport_data_df.head()

Erro ao obter dados do aeroporto HNL: 404 Client Error: Not Found for url: https://airportdb.io/api/v1/airport/KHNL?apiToken=e1974b7ce50088c4bf8c096e8722010381ebe6e8aef06041cb3cb2901699c023addbc1da0ae7ff96a58434dad6b76639
Erro ao obter dados do aeroporto BQN: 404 Client Error: Not Found for url: https://airportdb.io/api/v1/airport/KBQN?apiToken=e1974b7ce50088c4bf8c096e8722010381ebe6e8aef06041cb3cb2901699c023addbc1da0ae7ff96a58434dad6b76639
Erro ao obter dados do aeroporto STT: 404 Client Error: Not Found for url: https://airportdb.io/api/v1/airport/KSTT?apiToken=e1974b7ce50088c4bf8c096e8722010381ebe6e8aef06041cb3cb2901699c023addbc1da0ae7ff96a58434dad6b76639
Erro ao obter dados do aeroporto PSE: 404 Client Error: Not Found for url: https://airportdb.io/api/v1/airport/KPSE?apiToken=e1974b7ce50088c4bf8c096e8722010381ebe6e8aef06041cb3cb2901699c023addbc1da0ae7ff96a58434dad6b76639
Erro ao obter dados do aeroporto SJU: 404 Client Error: Not Found for url: https://airportdb.io/api/v1/airport/K

Unnamed: 0,airport_cod,latitude_deg,longitude_deg
0,TVC,44.741402,-85.582199
1,SNA,33.675701,-117.867996
2,LGB,33.817699,-118.152
3,EYW,24.556101,-81.759598
4,LAX,33.942501,-118.407997


In [7]:
# DataFrame com aeroportos por data
origin_df = get_unique_airports_by_date(df, "origin")
dest_df = get_unique_airports_by_date(df, "dest")
unique_airports_by_date_df = pd.concat([origin_df, dest_df]).reset_index(drop=True)

print(unique_airports_by_date_df.info())
unique_airports_by_date_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["time_hour"] = pd.to_datetime(df["time_hour"])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["date"] = df['time_hour'].dt.date
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["time_hour"] = pd.to_datetime(df["time_hour"])
A value is trying to be set on a copy of a slice from a DataFrame.
T

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32324 entries, 0 to 32323
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   airport_cod  32324 non-null  object
 1   date         32324 non-null  object
dtypes: object(2)
memory usage: 505.2+ KB
None


Unnamed: 0,airport_cod,date
0,EWR,2013-01-01
1,LGA,2013-01-01
2,JFK,2013-01-01
3,JFK,2013-01-02
4,EWR,2013-01-02


In [8]:
# DataFrame com data e lat/lon por aeroporto
airport_data_df = unique_airports_by_date_df.merge(latlon_airport_data_df, on="airport_cod")
airport_data_df = airport_data_df.dropna()

print(airport_data_df.info())
airport_data_df.head()

<class 'pandas.core.frame.DataFrame'>
Index: 30551 entries, 0 to 32315
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   airport_cod    30551 non-null  object 
 1   date           30551 non-null  object 
 2   latitude_deg   30551 non-null  float64
 3   longitude_deg  30551 non-null  float64
dtypes: float64(2), object(2)
memory usage: 1.2+ MB
None


Unnamed: 0,airport_cod,date,latitude_deg,longitude_deg
0,EWR,2013-01-01,40.692501,-74.168701
1,EWR,2013-01-02,40.692501,-74.168701
2,EWR,2013-01-03,40.692501,-74.168701
3,EWR,2013-01-04,40.692501,-74.168701
4,EWR,2013-01-05,40.692501,-74.168701


In [9]:
airport_data_df["date"] = pd.to_datetime(airport_data_df["date"])
airport_data_df["year_quarter"] = airport_data_df["date"].dt.to_period("Q")
result = airport_data_df.groupby(["airport_cod", "year_quarter"]).first().reset_index()
result


Unnamed: 0,airport_cod,year_quarter,date,latitude_deg,longitude_deg
0,ABQ,2013Q2,2013-04-22,35.040199,-106.609001
1,ABQ,2013Q3,2013-07-01,35.040199,-106.609001
2,ABQ,2013Q4,2013-10-01,35.040199,-106.609001
3,ACK,2013Q2,2013-05-16,41.253101,-70.060204
4,ACK,2013Q3,2013-07-01,41.253101,-70.060204
...,...,...,...,...,...
374,TYS,2013Q4,2013-10-01,35.811001,-83.994003
375,XNA,2013Q1,2013-01-01,36.281898,-94.306801
376,XNA,2013Q2,2013-04-01,36.281898,-94.306801
377,XNA,2013Q3,2013-07-01,36.281898,-94.306801


In [16]:
# get_weather_history(
#     lat=35.040199,
#     lon=-106.609001,
#     start_date="2015-04-22",
#     end_date="2015-04-23",
#     api_key=weatherbit_key
# )

HTTPError: 429 Client Error: Too Many Requests for url: https://api.weatherbit.io/v2.0/history/daily?lat=35.040199&lon=-106.609001&start_date=2015-04-22&end_date=2015-04-23&key=58912042febe4d579c7cf8f7a50f043a

In [11]:
# # dados de clima por aeroporto e respectiva data
# weather_data_df = enrich_weather_data(airport_data_df, weatherbit_key)

# print(weather_data_df.info())
# weather_data_df.head()