### Imports

In [39]:
#!pip install pydantic==2.0.3

In [40]:
import os
import requests
import zipfile
import pandas as pd

PARENT_PATH = os.path.dirname(os.getcwd())

In [41]:
from pydantic import BaseModel
from pydantic import Field
from pydantic import field_validator
from typing import Optional

---
### Class Definition

In [42]:
class FlightDataAnalyzer(BaseModel):    # Project Instructions were unclear about the class name
    """
    Initilizes a tool for analyzing commercial airflight data to support
    sustainability studies. When the class is called, the flight data is
    automatically downloaded and stored into pandas dataframes (after
    removing superfluous columns).
    
    Attributes
    ----------
    airlines: pd.DataFrame
        Dataframe with airline information such as Name, IATA and ICAO
        codes, Country, Active status, etc
    airplanes: pd.DataFrame
        Dataframe with Name and IATA and ICAO codes of airplanes
    airports: pd.DataFrame
        Dataframe with airport information, such as Name, City, Country,
        IATA and ICAO codes, Latitude & Longitude, Timezone, etc.
    routes: pd.DataFrame
        Dataframe with route information, such as Airline, Source Airport,
        Destination Airport, number of Stops, etc.
               
    Methods
    --------
    ..()
        ..
    """
    airlines: Optional[pd.DataFrame] = None
    airplanes: Optional[pd.DataFrame] = None
    airports: Optional[pd.DataFrame] = None
    routes: Optional[pd.DataFrame] = None

    class Config:   # Include this to allow pandas dataframes as attributes
        arbitrary_types_allowed = True

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self._download()

    def _download(self):
        """
        Downloads the commercial airflight datasets into downloads/
        directory, and stores it into pandas dataframes. If data is 
        already downloaded, it won't be redownloaded. Automatically
        called when class is initialized.
        
        Parameters
        -----------
        self: class
            The FlightDataAnalyzer class itself
            
        Returns
        --------
        Nothing. Defines attributes for the FlightDataAnalyzer, one for
        each dataset:
        - self.airlines: airlines.csv
        - self.airplanes: airplanes.csv
        - self.airports: airports.csv
        - self.routes: routes.csv
        """
        # Check if the directory exists
        DATA_DIRECTORY = os.path.join(PARENT_PATH, "downloads")
        if not os.path.exists(DATA_DIRECTORY):
            os.makedirs(os.path.join(PARENT_PATH, "downloads"))
            print("Created downloads directory")
        
        # Check if the .zip file exists\
        DATA_PATH = os.path.join(DATA_DIRECTORY, "flight_data.zip")
        if not os.path.isfile(DATA_PATH):
            DATA_URL = "https://gitlab.com/adpro1/adpro2024/-/raw/main/Files/flight_data.zip?inline=false"
            try:
                response = requests.get(DATA_URL)
                response.raise_for_status() # raises an HTTPError for http requests status codes between 400 and 600

                # Save downloaded file
                with open(DATA_PATH, "wb") as f:
                    f.write(response.content)
                    print(f"Data downloaded and saved to {DATA_DIRECTORY}")
            
            except requests.RequestException as error:
                print(f"Failed to download data: {error}")
        else:
            print("Data already downloaded")
        
        # Load the data into pandas dataframes and store the useful columns in class attributes
        with zipfile.ZipFile(DATA_PATH, "r") as z:
            with z.open("airlines.csv") as f:
                self.airlines = pd.read_csv(f, index_col=0).drop(
                    "Alias", axis=1)

            with z.open("airplanes.csv") as f:
                self.airplanes = pd.read_csv(f, index_col=0)

            with z.open("airports.csv") as f:
                self.airports = pd.read_csv(f, index_col=0).drop(
                    ["Type", "Source"], axis=1)

            with z.open("routes.csv") as f:
                self.routes = pd.read_csv(f, index_col=0)

---
### Testing Functionalities

In [43]:
# Testing HTTP request
test_request = requests.get("https://gitlab.com/adpro1/adpro2024/-/raw/main/Files/flight_data.zip?inline=false")
print(test_request.content)
print(test_request.status_code)

b'PK\x03\x04-\x00\x00\x00\x08\x00\xfa|\x86U\xe2R"\xa0\xff\xff\xff\xff\xff\xff\xff\xff\x0c\x00\x14\x00airlines.csv\x01\x00\x10\x00\xec]\x05\x00\x00\x00\x00\x00\x94\x1b\x02\x00\x00\x00\x00\x00\x94\xbd\xdbv\xea\xcc\x925x_O\xa1\xab\xda\xbb\xc6H\xef\xe6,\xe9R\x80\x00\x19!aI`\xe3\xd172\x96m\xbe\x85\xc1\xc5\xc1\xeb\xf3z\xa3\xbe\xea\x87\xf8_\xacgdFJ)`\xd7\xa8\x1e\x03\xd9k\x19\xa5\x0e\x193##fFDnv\xaf\xc5\xdf\xc2\xdb\x1c\xb6\x9b]a\x05C\x11\xe5\x9f\x85\xf0\xb6\x9b\xfc(\x02/\xf3D0\xf0b1\xc8\xb7\xdb\xe3\xe6}\'\x06\xfb\xf3\xeet\xf8\x11\xde\xfa\xb4\xf9.\xfe\xa3!\xee\x9ab\xb1\xfb\xb5\xdb\xff\xde\x89\xff;\x12w\x82~\xe2\xb3\xfa\x8f\xa6h\x8a\xf9a\xf3\x9d\x9f\n\xebm\xbby\xff8\xf1\t\x02_\xb6DK4\xdb]\x0b\xf7\xfd\x9d\xff\x1c\xe9\x1b1\x8eB1\xf6#?\xf1B\\rs*^\xad\xf4\x84\xd6G\x11\xfdG[\xb4E3\xdb|\x16\x16?)\xb5hf"\x89\x9eD\xe4?e\xc1\xcc\x17\xe9\xfe|\xfa\xb0\xbc\xb7\xc3f\x9d\xe3\x16\x1d\xd1\x11-+\xfd\xef\x9d\x15\xed\xad\xa6\xe5o\x8b\xcfbw\xca\x0f?\xd6h\xfb\xb3\xd9\xbd[\xd9!\xdf\xec\xe8\x1f\xe9\xfac\xbf\xdf\xcagx\

In [44]:
# Testing Save
with open("day1-request-test.zip", "wb") as f:
    f.write(test_request.content)
# opened in file explorer and zip file saved correctly

In [45]:
# Testing HTTP error (delete part of url)
test_error = requests.get("https://gitlab.com/adpro1/adpro2024/-/raw/main/File")
print(test_error.status_code)
test_error.raise_for_status()

404


HTTPError: 404 Client Error: Not Found for url: https://gitlab.com/adpro1/adpro2024/-/raw/main/File

In [46]:
# Test try and except
try:
    test_error.raise_for_status()
except requests.RequestException as error:
    print(f"Failed to download data: {error}")

Failed to download data: 404 Client Error: Not Found for url: https://gitlab.com/adpro1/adpro2024/-/raw/main/File


In [47]:
# Test reading the zip file and storing a csv file into a pandas dataframe
with zipfile.ZipFile("..\\downloads\\flight_data.zip", "r") as z:
    print(z.namelist())
    with z.open("airlines.csv") as f:
        display(pd.read_csv(f))

['airlines.csv', 'airplanes.csv', 'airports.csv', 'routes.csv']


Unnamed: 0,index,Airline ID,Name,Alias,IATA,ICAO,Callsign,Country,Active
0,0,-1,Unknown,\N,-,,\N,\N,Y
1,1,1,Private flight,\N,-,,,,Y
2,2,2,135 Airways,\N,,GNL,GENERAL,United States,N
3,3,3,1Time Airline,\N,1T,RNX,NEXTIME,South Africa,Y
4,4,4,2 Sqn No 1 Elementary Flying Training School,\N,,WYT,,United Kingdom,N
...,...,...,...,...,...,...,...,...,...
6157,6157,21248,GX Airlines,,,CBG,SPRAY,China,Y
6158,6158,21251,Lynx Aviation (L3/SSX),,,SSX,Shasta,United States,N
6159,6159,21268,Jetgo Australia,,JG,\N,,Australia,Y
6160,6160,21270,Air Carnival,,2S,\N,,India,Y


In [48]:
# Call the class without directory
test = FlightDataAnalyzer()
display(test_1.airlines.head())

Data already downloaded


Unnamed: 0_level_0,Airline ID,Name,IATA,ICAO,Callsign,Country,Active
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,-1,Unknown,-,,\N,\N,Y
1,1,Private flight,-,,,,Y
2,2,135 Airways,,GNL,GENERAL,United States,N
3,3,1Time Airline,1T,RNX,NEXTIME,South Africa,Y
4,4,2 Sqn No 1 Elementary Flying Training School,,WYT,,United Kingdom,N


In [49]:
# Call class with directory but not data
test = FlightDataAnalyzer()

Data already downloaded


In [50]:
# Call class with directory and data ready
test = FlightDataAnalyzer()

Data already downloaded


---
### Inspect Data (to see what columns are important)

In [51]:
with zipfile.ZipFile("..\\downloads\\flight_data.zip", "r") as z:
    for file in z.namelist():
        with z.open(file) as f:
            display(pd.read_csv(f))

Unnamed: 0,index,Airline ID,Name,Alias,IATA,ICAO,Callsign,Country,Active
0,0,-1,Unknown,\N,-,,\N,\N,Y
1,1,1,Private flight,\N,-,,,,Y
2,2,2,135 Airways,\N,,GNL,GENERAL,United States,N
3,3,3,1Time Airline,\N,1T,RNX,NEXTIME,South Africa,Y
4,4,4,2 Sqn No 1 Elementary Flying Training School,\N,,WYT,,United Kingdom,N
...,...,...,...,...,...,...,...,...,...
6157,6157,21248,GX Airlines,,,CBG,SPRAY,China,Y
6158,6158,21251,Lynx Aviation (L3/SSX),,,SSX,Shasta,United States,N
6159,6159,21268,Jetgo Australia,,JG,\N,,Australia,Y
6160,6160,21270,Air Carnival,,2S,\N,,India,Y


Unnamed: 0,index,Name,IATA code,ICAO code
0,0,Aerospatiale (Nord) 262,ND2,N262
1,1,Aerospatiale (Sud Aviation) Se.210 Caravelle,CRV,S210
2,2,Aerospatiale SN.601 Corvette,NDC,S601
3,3,Aerospatiale/Alenia ATR 42-300,AT4,AT43
4,4,Aerospatiale/Alenia ATR 42-500,AT5,AT45
...,...,...,...,...
241,241,Tupolev Tu-144,\N,T144
242,242,Tupolev Tu-154,TU5,T154
243,243,Tupolev Tu-204,T20,T204
244,244,Yakovlev Yak-40,YK4,YK40


Unnamed: 0,index,Airport ID,Name,City,Country,IATA,ICAO,Latitude,Longitude,Altitude,Timezone,DST,Tz database time zone,Type,Source
0,0,1,Goroka Airport,Goroka,Papua New Guinea,GKA,AYGA,-6.081690,145.391998,5282,10,U,Pacific/Port_Moresby,airport,OurAirports
1,1,2,Madang Airport,Madang,Papua New Guinea,MAG,AYMD,-5.207080,145.789001,20,10,U,Pacific/Port_Moresby,airport,OurAirports
2,2,3,Mount Hagen Kagamuga Airport,Mount Hagen,Papua New Guinea,HGU,AYMH,-5.826790,144.296005,5388,10,U,Pacific/Port_Moresby,airport,OurAirports
3,3,4,Nadzab Airport,Nadzab,Papua New Guinea,LAE,AYNZ,-6.569803,146.725977,239,10,U,Pacific/Port_Moresby,airport,OurAirports
4,4,5,Port Moresby Jacksons International Airport,Port Moresby,Papua New Guinea,POM,AYPY,-9.443380,147.220001,146,10,U,Pacific/Port_Moresby,airport,OurAirports
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7693,7693,14106,Rogachyovo Air Base,Belaya,Russia,\N,ULDA,71.616699,52.478298,272,\N,\N,\N,airport,OurAirports
7694,7694,14107,Ulan-Ude East Airport,Ulan Ude,Russia,\N,XIUW,51.849998,107.737999,1670,\N,\N,\N,airport,OurAirports
7695,7695,14108,Krechevitsy Air Base,Novgorod,Russia,\N,ULLK,58.625000,31.385000,85,\N,\N,\N,airport,OurAirports
7696,7696,14109,Desierto de Atacama Airport,Copiapo,Chile,CPO,SCAT,-27.261200,-70.779198,670,\N,\N,\N,airport,OurAirports


Unnamed: 0,index,Airline,Airline ID,Source airport,Source airport ID,Destination airport,Destination airport ID,Codeshare,Stops,Equipment
0,0,2B,410,AER,2965,KZN,2990,,0,CR2
1,1,2B,410,ASF,2966,KZN,2990,,0,CR2
2,2,2B,410,ASF,2966,MRV,2962,,0,CR2
3,3,2B,410,CEK,2968,KZN,2990,,0,CR2
4,4,2B,410,CEK,2968,OVB,4078,,0,CR2
...,...,...,...,...,...,...,...,...,...,...
67658,67658,ZL,4178,WYA,6334,ADL,3341,,0,SF3
67659,67659,ZM,19016,DME,4029,FRU,2912,,0,734
67660,67660,ZM,19016,FRU,2912,DME,4029,,0,734
67661,67661,ZM,19016,FRU,2912,OSS,2913,,0,734


---
### Test the final output

In [52]:
display(test.airlines, test.airplanes, test.airports, test.routes)

Unnamed: 0_level_0,Airline ID,Name,IATA,ICAO,Callsign,Country,Active
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,-1,Unknown,-,,\N,\N,Y
1,1,Private flight,-,,,,Y
2,2,135 Airways,,GNL,GENERAL,United States,N
3,3,1Time Airline,1T,RNX,NEXTIME,South Africa,Y
4,4,2 Sqn No 1 Elementary Flying Training School,,WYT,,United Kingdom,N
...,...,...,...,...,...,...,...
6157,21248,GX Airlines,,CBG,SPRAY,China,Y
6158,21251,Lynx Aviation (L3/SSX),,SSX,Shasta,United States,N
6159,21268,Jetgo Australia,JG,\N,,Australia,Y
6160,21270,Air Carnival,2S,\N,,India,Y


Unnamed: 0_level_0,Name,IATA code,ICAO code
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,Aerospatiale (Nord) 262,ND2,N262
1,Aerospatiale (Sud Aviation) Se.210 Caravelle,CRV,S210
2,Aerospatiale SN.601 Corvette,NDC,S601
3,Aerospatiale/Alenia ATR 42-300,AT4,AT43
4,Aerospatiale/Alenia ATR 42-500,AT5,AT45
...,...,...,...
241,Tupolev Tu-144,\N,T144
242,Tupolev Tu-154,TU5,T154
243,Tupolev Tu-204,T20,T204
244,Yakovlev Yak-40,YK4,YK40


Unnamed: 0_level_0,Airport ID,Name,City,Country,IATA,ICAO,Latitude,Longitude,Altitude,Timezone,DST,Tz database time zone
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,1,Goroka Airport,Goroka,Papua New Guinea,GKA,AYGA,-6.081690,145.391998,5282,10,U,Pacific/Port_Moresby
1,2,Madang Airport,Madang,Papua New Guinea,MAG,AYMD,-5.207080,145.789001,20,10,U,Pacific/Port_Moresby
2,3,Mount Hagen Kagamuga Airport,Mount Hagen,Papua New Guinea,HGU,AYMH,-5.826790,144.296005,5388,10,U,Pacific/Port_Moresby
3,4,Nadzab Airport,Nadzab,Papua New Guinea,LAE,AYNZ,-6.569803,146.725977,239,10,U,Pacific/Port_Moresby
4,5,Port Moresby Jacksons International Airport,Port Moresby,Papua New Guinea,POM,AYPY,-9.443380,147.220001,146,10,U,Pacific/Port_Moresby
...,...,...,...,...,...,...,...,...,...,...,...,...
7693,14106,Rogachyovo Air Base,Belaya,Russia,\N,ULDA,71.616699,52.478298,272,\N,\N,\N
7694,14107,Ulan-Ude East Airport,Ulan Ude,Russia,\N,XIUW,51.849998,107.737999,1670,\N,\N,\N
7695,14108,Krechevitsy Air Base,Novgorod,Russia,\N,ULLK,58.625000,31.385000,85,\N,\N,\N
7696,14109,Desierto de Atacama Airport,Copiapo,Chile,CPO,SCAT,-27.261200,-70.779198,670,\N,\N,\N


Unnamed: 0_level_0,Airline,Airline ID,Source airport,Source airport ID,Destination airport,Destination airport ID,Codeshare,Stops,Equipment
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,2B,410,AER,2965,KZN,2990,,0,CR2
1,2B,410,ASF,2966,KZN,2990,,0,CR2
2,2B,410,ASF,2966,MRV,2962,,0,CR2
3,2B,410,CEK,2968,KZN,2990,,0,CR2
4,2B,410,CEK,2968,OVB,4078,,0,CR2
...,...,...,...,...,...,...,...,...,...
67658,ZL,4178,WYA,6334,ADL,3341,,0,SF3
67659,ZM,19016,DME,4029,FRU,2912,,0,734
67660,ZM,19016,FRU,2912,DME,4029,,0,734
67661,ZM,19016,FRU,2912,OSS,2913,,0,734


---
### Run black and pylint & Test mypy

In [7]:
!black ..\tests\day_1\class_download_pylint.py

reformatted ..\final_scripts\day_1\class_flight_data.py

All done! ✨ 🍰 ✨
1 file reformatted.


In [1]:
# First pylint run (implement fixes in the final_scripts file) - some issues found
!pylint ..\tests\day_1\class_download_pylint.py

************* Module class_download_pylint
c:\Users\marti\OneDrive - Nova SBE\Nova\Master's in Business Analytics\Disciplinas\2nd Semester\t1\Advanced Programming for Data Science\ADPRO_Project\tests\day_1\class_download_pylint.py:84:0: C0301: Line too long (106/100) (line-too-long)
c:\Users\marti\OneDrive - Nova SBE\Nova\Master's in Business Analytics\Disciplinas\2nd Semester\t1\Advanced Programming for Data Science\ADPRO_Project\tests\day_1\class_download_pylint.py:47:4: C0115: Missing class docstring (missing-class-docstring)
c:\Users\marti\OneDrive - Nova SBE\Nova\Master's in Business Analytics\Disciplinas\2nd Semester\t1\Advanced Programming for Data Science\ADPRO_Project\tests\day_1\class_download_pylint.py:47:4: R0903: Too few public methods (0/2) (too-few-public-methods)
c:\Users\marti\OneDrive - Nova SBE\Nova\Master's in Business Analytics\Disciplinas\2nd Semester\t1\Advanced Programming for Data Science\ADPRO_Project\tests\day_1\class_download_pylint.py:76:8: C0103: Variable 

In [4]:
# Second pylint run - list of fixes detailed below
!pylint ..\final_scripts\day_1\class_flight_data.py

"""
List of fixes:
- Shortened the lines over 100 characters 
- Added timeout parameter to the requests.get() call (limited to 15 sec)
- Rearranged the order of library imports, first standard libraries then
3rd party libraries
- Removed unused imports (Field and field_validator from pydantic)

Ignored suggestions:
- C0103: Variable name "DATA_DIRECTORY" doesn't conform to snake_case naming style (invalid-name)
    Since it is defined as a constant, it is actually compliant with PEP 8 (same with other paths)
- R0903: Too few public methods (0/2) (too-few-public-methods)
    More methods will be added in future phases
- C0115: Missing class docstring (missing-class-docstring)
    Docstring is present after the class definition
"""

************* Module class_flight_data
c:\Users\marti\OneDrive - Nova SBE\Nova\Master's in Business Analytics\Disciplinas\2nd Semester\t1\Advanced Programming for Data Science\ADPRO_Project\final_scripts\day_1\class_flight_data.py:47:4: C0115: Missing class docstring (missing-class-docstring)
c:\Users\marti\OneDrive - Nova SBE\Nova\Master's in Business Analytics\Disciplinas\2nd Semester\t1\Advanced Programming for Data Science\ADPRO_Project\final_scripts\day_1\class_flight_data.py:47:4: R0903: Too few public methods (0/2) (too-few-public-methods)
c:\Users\marti\OneDrive - Nova SBE\Nova\Master's in Business Analytics\Disciplinas\2nd Semester\t1\Advanced Programming for Data Science\ADPRO_Project\final_scripts\day_1\class_flight_data.py:76:8: C0103: Variable name "DATA_DIRECTORY" doesn't conform to snake_case naming style (invalid-name)
c:\Users\marti\OneDrive - Nova SBE\Nova\Master's in Business Analytics\Disciplinas\2nd Semester\t1\Advanced Programming for Data Science\ADPRO_Project\fi

'\nList of fixes:\n- Shortened the lines over 100 characters \n- Added timeout parameter to the requests.get() call (limited to 15 sec)\n- Rearranged the order of library imports, first standard libraries then\n3rd party libraries\n- Removed unused imports (Field and field_validator from pydantic)\n\nIgnored suggestions:\n- C0103: Variable name "DATA_DIRECTORY" doesn\'t conform to snake_case naming style (invalid-name)\n    Since it is defined as a constant, it is actually compliant with PEP 8 (same with other paths)\n- R0903: Too few public methods (0/2) (too-few-public-methods)\n    More methods will be added in future phases\n- C0115: Missing class docstring (missing-class-docstring)\n    Docstring is present after the class definition\n'

In [1]:
# mypy - no issues
!mypy ..\tests\day_1\class_download_mypy.py

[1m[92mSuccess: no issues found in 1 source file[0m
