 # Description
 In this notebook, we will load the previously downloaded files into the database. Thanks to this, we will create a central place for storing them, which we will use both for data analysis and the subsequent construction of a reporting system.

 > Note: Due to the volume of data contained in the `flight.csv` file, this notebook may take several dozen minutes or more to complete!

In [1]:
import os
from urllib.parse import quote

import pandas as pd
from psycopg2 import connect
from sqlalchemy import create_engine

 ## Connection to the database

In [2]:
# Connection settings. Each value can be overridden with the standard
# PostgreSQL client environment variables, so real credentials do not have to
# be stored in the notebook; the fallbacks are the local defaults used so far.
username = os.environ.get('PGUSER', 'postgres')
password = os.environ.get('PGPASSWORD', 'password')

host = os.environ.get('PGHOST', 'localhost')
database = os.environ.get('PGDATABASE', 'airlines')
# Environment values are strings, so normalise the port to an integer.
port = int(os.environ.get('PGPORT', 5432))

In [3]:
# Percent-encode the credentials so that characters such as '@', ':' or '/'
# inside the user name or password cannot be mistaken for URL delimiters.
url = (
    f"postgresql://{quote(username, safe='')}:{quote(password, safe='')}"
    f"@{host}:{port}/{database}"
)

# Engine shared by the export and verification helpers further down.
engine = create_engine(url)

 # Loading data frames into the workspace

In [4]:
def load_raw_data(file_name):
    """Read one raw CSV file into a data frame with lower-cased column names.

    Parameters
    ----------
    file_name : str
        Base name of the file (without the ``.csv`` extension) located in
        ``../data/raw/`` relative to the current working directory.

    Returns
    -------
    pandas.DataFrame
        Content of the semicolon-separated file; every column label is
        converted to lower case.
    """
    csv_path = f"../data/raw/{file_name}.csv"
    frame = pd.read_csv(csv_path, sep=';', decimal='.', encoding='UTF-8')

    # Lower-case the headers of the loaded frame.
    lowered_labels = frame.columns.str.lower()
    frame.columns = lowered_labels
    return frame

 # Loading individual files into data frames

In [5]:
# Read the four raw files, in this order, into their data frames.
aircraft_df, airport_weather_df, flight_df, airport_list_df = (
    load_raw_data(raw_name)
    for raw_name in ('aircraft', 'airport_weather', 'flight', 'airport_list')
)


 # Data export to the database

In [6]:
def export_table_to_db(df, table_name, batch_size=10**5):
    """Append a data frame to a database table through the shared ``engine``.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to upload; the frame's index is not written.
    table_name : str
        Name of the target table. Rows are appended (``if_exists='append'``),
        so calling this twice for the same frame inserts the rows twice.
    batch_size : int, optional
        Number of rows written per chunk. Defaults to 100 000, the value that
        was previously hard-coded.
    """
    # Announce the upload before it starts: large tables take a long time and
    # the message is only useful if it is visible while the load is running.
    print(f'Loading data into {table_name}')

    df.to_sql(
        name=table_name,
        con=engine,
        if_exists='append',
        index=False,
        chunksize=batch_size
    )

 ## Uploading data

 ###  `aircraft_df` to `aircraft`

In [7]:
# Append the aircraft frame to the `aircraft` table.
export_table_to_db(df=aircraft_df, table_name='aircraft')

Loading data into aircraft


 ### `airport_weather_df` to `airport_weather`

In [8]:
# Append the weather frame to the `airport_weather` table.
export_table_to_db(df=airport_weather_df, table_name='airport_weather')

Loading data into airport_weather


 ### `flight_df` to `flight`
 > Running this cell will be time-consuming due to the amount of data in the data frame.

In [9]:
# Append the flight frame to the `flight` table (the largest upload).
export_table_to_db(df=flight_df, table_name='flight')

Loading data into flight


 ### `airport_list_df` to `airport_list`

In [10]:
# Append the airport list frame to the `airport_list` table.
export_table_to_db(df=airport_list_df, table_name='airport_list')

Loading data into airport_list


 # Checking


In [11]:
def test_data_export(table_name, expected_count, expected_schema):
    real_count = pd.read_sql(f"SELECT COUNT(*) as cnt FROM {table_name}", engine).iloc[0][0]
    
    real_schema = pd.read_sql(f"SELECT * FROM {table_name} LIMIT 0", engine)
    real_schema = set(real_schema.columns)

    expected_schema = set(expected_schema)

    diff = real_schema.symmetric_difference(expected_schema)

    assert len(diff) == 0, ('Nie zgadzają się kolumny tabel....'
    f'\tOczekiwano: {expected_schema}'
    f'\tOtrzymano: {real_schema}'
    f'\tRóżnica: {diff}')

    assert expected_count == real_count, f'Nie zgadza się liczba wierszy, oczekiwano {expected_count}, otrzymano {real_count}'

 ## Checking `aircraft`

In [12]:
# Reference row count and column names for the `aircraft` table.
aircraft_expected_count = 7_383
aircraft_expected_schema = [
    'id',
    'manufacture_year',
    'tail_num',
    'number_of_seats',
]

test_data_export(
    table_name='aircraft',
    expected_count=aircraft_expected_count,
    expected_schema=aircraft_expected_schema,
)

  real_count = pd.read_sql(f"SELECT COUNT(*) as cnt FROM {table_name}", engine).iloc[0][0]


 ## Checking `airport_weather`

In [13]:
# Reference row count and column names for the `airport_weather` table.
airport_weather_expected_count = 46_226
airport_weather_expected_schema = [
    'id', 'station', 'name', 'date',
    'awnd', 'prcp', 'snow', 'snwd',
    'tavg', 'tmax', 'tmin',
    'wdf2', 'wdf5', 'wsf2', 'wsf5',
    'wt01', 'wt08', 'wt02', 'wt03', 'wt04', 'wt09', 'wt06', 'wt05',
    'pgtm', 'wt10', 'wesd', 'sn32', 'sx32',
    'psun', 'tsun', 'tobs',
    'wt07', 'wt11', 'wt18',
]

test_data_export(
    table_name='airport_weather',
    expected_count=airport_weather_expected_count,
    expected_schema=airport_weather_expected_schema,
)

  real_count = pd.read_sql(f"SELECT COUNT(*) as cnt FROM {table_name}", engine).iloc[0][0]


 ## Checking `flight`


In [14]:
# Reference row count and column names for the `flight` table.
flight_expected_count = 9_251_880
flight_expected_schema = [
    'id', 'month', 'day_of_month', 'day_of_week',
    'op_unique_carrier', 'tail_num', 'op_carrier_fl_num',
    'origin_airport_id', 'dest_airport_id',
    'crs_dep_time', 'dep_time', 'dep_delay_new', 'dep_time_blk',
    'crs_arr_time', 'arr_time', 'arr_delay_new', 'arr_time_blk',
    'cancelled', 'crs_elapsed_time', 'actual_elapsed_time',
    'distance', 'distance_group', 'year',
    'carrier_delay', 'weather_delay', 'nas_delay',
    'security_delay', 'late_aircraft_delay',
]

test_data_export(
    table_name='flight',
    expected_count=flight_expected_count,
    expected_schema=flight_expected_schema,
)

  real_count = pd.read_sql(f"SELECT COUNT(*) as cnt FROM {table_name}", engine).iloc[0][0]


 ## Checking `airport_list`

In [15]:
# Reference row count and column names for the `airport_list` table.
# The variables were previously prefixed `aircraft_list_`, which did not match
# the table being checked; they now follow the `<table>_expected_*` pattern
# used by the other check cells.
airport_list_expected_count = 97
airport_list_expected_schema = ['id', 'origin_airport_id', 'display_airport_name', 'origin_city_name', 'name']

test_data_export('airport_list', airport_list_expected_count, airport_list_expected_schema)


  real_count = pd.read_sql(f"SELECT COUNT(*) as cnt FROM {table_name}", engine).iloc[0][0]


In [16]:
# Final confirmation message; when the notebook is run top to bottom this cell
# is reached only if none of the check cells above raised an AssertionError.
msg = "Everything looks good OK :)"
print(msg)

Everything looks good OK :)
