In [1]:
import json
import re
import time
from urllib.parse import quote_plus

import numpy as np
import pandas as pd
import psycopg2
from sqlalchemy import create_engine

from config import db_password

In [2]:
# Directory holding the input data, and the path of the raw flight-delay CSV inside it.
file_dir = '.'
flightdata_file = f'{file_dir}/flightdelay_data.csv'

# Read the whole CSV into a single DataFrame. low_memory=False turns off
# pandas' internal chunked parsing, which avoids mixed-type column inference.
flightdata_df = pd.read_csv(
    flightdata_file,
    low_memory=False,
)

# Preview the first rows as the cell output.
flightdata_df.head()



Unnamed: 0,MONTH,DAY_OF_WEEK,DEP_DEL15,DEP_TIME_BLK,DISTANCE_GROUP,SEGMENT_NUMBER,CONCURRENT_FLIGHTS,NUMBER_OF_SEATS,CARRIER_NAME,AIRPORT_FLIGHTS_MONTH,...,PLANE_AGE,DEPARTING_AIRPORT,LATITUDE,LONGITUDE,PREVIOUS_AIRPORT,PRCP,SNOW,SNWD,TMAX,AWND
0,1,7,0,0800-0859,2,1,25,143,Southwest Airlines Co.,13056,...,8,McCarran International,36.08,-115.152,NONE,0.0,0.0,0.0,65.0,2.91
1,1,7,0,0700-0759,7,1,29,191,Delta Air Lines Inc.,13056,...,3,McCarran International,36.08,-115.152,NONE,0.0,0.0,0.0,65.0,2.91
2,1,7,0,0600-0659,7,1,27,199,Delta Air Lines Inc.,13056,...,18,McCarran International,36.08,-115.152,NONE,0.0,0.0,0.0,65.0,2.91
3,1,7,0,0600-0659,9,1,27,180,Delta Air Lines Inc.,13056,...,2,McCarran International,36.08,-115.152,NONE,0.0,0.0,0.0,65.0,2.91
4,1,7,0,0001-0559,7,1,10,182,Spirit Air Lines,13056,...,1,McCarran International,36.08,-115.152,NONE,0.0,0.0,0.0,65.0,2.91


In [3]:
# Sort the full dataset on the "DEPARTING_AIRPORT" column.
sorted_df = flightdata_df.sort_values("DEPARTING_AIRPORT", ascending=True)

# Airport lookup table: only the "DEPARTING_AIRPORT", "LATITUDE" and
# "LONGITUDE" columns, in airport order.
airport_columns = ["DEPARTING_AIRPORT", "LATITUDE", "LONGITUDE"]
departing_df = sorted_df[airport_columns]

# Report how many rows are duplicates (True) versus first occurrences (False).
# The method has to be called, and its result printed, because an expression
# in the middle of a cell is otherwise silently discarded.
print(departing_df.duplicated().value_counts())

# Keep one row per distinct (airport, latitude, longitude) combination and
# renumber the rows from 0.
departing_df = departing_df.drop_duplicates().reset_index(drop=True)

# Input dataset without the "LATITUDE" and "LONGITUDE" columns; those values
# now live in departing_df. flightdata_df itself is left unmodified.
flightdata_droplatlng_df = flightdata_df.drop(columns=["LATITUDE", "LONGITUDE"])
flightdata_droplatlng_df.head()

Unnamed: 0,MONTH,DAY_OF_WEEK,DEP_DEL15,DEP_TIME_BLK,DISTANCE_GROUP,SEGMENT_NUMBER,CONCURRENT_FLIGHTS,NUMBER_OF_SEATS,CARRIER_NAME,AIRPORT_FLIGHTS_MONTH,...,FLT_ATTENDANTS_PER_PASS,GROUND_SERV_PER_PASS,PLANE_AGE,DEPARTING_AIRPORT,PREVIOUS_AIRPORT,PRCP,SNOW,SNWD,TMAX,AWND
0,1,7,0,0800-0859,2,1,25,143,Southwest Airlines Co.,13056,...,6.2e-05,9.9e-05,8,McCarran International,NONE,0.0,0.0,0.0,65.0,2.91
1,1,7,0,0700-0759,7,1,29,191,Delta Air Lines Inc.,13056,...,0.000144,0.000149,3,McCarran International,NONE,0.0,0.0,0.0,65.0,2.91
2,1,7,0,0600-0659,7,1,27,199,Delta Air Lines Inc.,13056,...,0.000144,0.000149,18,McCarran International,NONE,0.0,0.0,0.0,65.0,2.91
3,1,7,0,0600-0659,9,1,27,180,Delta Air Lines Inc.,13056,...,0.000144,0.000149,2,McCarran International,NONE,0.0,0.0,0.0,65.0,2.91
4,1,7,0,0001-0559,7,1,10,182,Spirit Air Lines,13056,...,9e-06,0.000125,1,McCarran International,NONE,0.0,0.0,0.0,65.0,2.91


In [4]:
# Save the dataset without the "LATITUDE" and "LONGITUDE" columns as a UTF-8
# CSV in the current working directory. index=False keeps the DataFrame's row
# index out of the file.
flightdata_droplatlng_df.to_csv(
    'flightdelay_nolatlng_data.csv',
    index=False,
    encoding="utf-8",
)


In [5]:

# Connect to the local PostgreSQL server (127.0.0.1:5432) as user "postgres"
# and target the database named "Flightdelay_data-1".
#
# The password is URL-encoded before being placed in the connection URL:
# characters such as "@", "/", ":" or "#" would otherwise be parsed as URL
# delimiters and break the connection string.
# NOTE(review): assumes config.db_password holds the raw (not already
# percent-encoded) password -- confirm.
encoded_password = quote_plus(str(db_password))
db_string = f"postgresql://postgres:{encoded_password}@127.0.0.1:5432/Flightdelay_data-1"
engine = create_engine(db_string)

# Write "departing_df" (the "DEPARTING_AIRPORT", "LATITUDE" and "LONGITUDE"
# columns) to the "airports" table. if_exists='replace' drops and recreates
# the table, so re-running this cell does not duplicate rows; index=False
# keeps the DataFrame index out of the table.
departing_df.to_sql(name='airports', con=engine, if_exists='replace', index=False)
    
    
    

In [8]:
# Load flightdelay_data.csv into the "delaytable" table of the
# Flightdelay_data-1 database in chunks of 1,000,000 rows, reporting load times.
rows_imported = 0
start_time = time.time()
for chunk_number, data in enumerate(pd.read_csv('flightdelay_data.csv', chunksize=1000000)):
    print(f'importing rows {rows_imported} to {rows_imported + len(data)}...', end='')
    # Replace the table on the first chunk and append the remaining chunks, so
    # that re-running this cell reloads the table instead of stacking another
    # full copy of the data on top of the previous run.
    write_mode = 'replace' if chunk_number == 0 else 'append'
    data.to_sql(name='delaytable', con=engine, if_exists=write_mode)
    rows_imported += len(data)

    # Cumulative time since the load started, printed after every chunk.
    print(f'Done. {time.time() - start_time} total seconds elapsed')

importing rows 0 to 1000000...Done. 123.9223952293396 total seconds elapsed
importing rows 1000000 to 2000000...Done. 245.80833768844604 total seconds elapsed
importing rows 2000000 to 3000000...Done. 367.31092596054077 total seconds elapsed
importing rows 3000000 to 4000000...Done. 490.0491998195648 total seconds elapsed
importing rows 4000000 to 5000000...Done. 610.7951285839081 total seconds elapsed
importing rows 5000000 to 6000000...Done. 733.0542676448822 total seconds elapsed
importing rows 6000000 to 6489062...Done. 795.7553715705872 total seconds elapsed


In [None]:
# Load delay_clean.csv into the "clean_delaytable" table of the
# Flightdelay_data-1 database in chunks of 1,000,000 rows, reporting load times.
# NOTE(review): delay_clean.csv is not produced by any cell shown in this
# notebook -- it must already exist in the working directory.
rows_imported = 0
start_time = time.time()
for chunk_number, data in enumerate(pd.read_csv('delay_clean.csv', chunksize=1000000)):
    print(f'importing rows {rows_imported} to {rows_imported + len(data)}...', end='')
    # Replace the table on the first chunk and append the remaining chunks, so
    # that re-running this cell reloads the table instead of stacking another
    # full copy of the data on top of the previous run.
    write_mode = 'replace' if chunk_number == 0 else 'append'
    data.to_sql(name='clean_delaytable', con=engine, if_exists=write_mode)
    rows_imported += len(data)

    # Cumulative time since the load started, printed after every chunk.
    print(f'Done. {time.time() - start_time} total seconds elapsed')

In [9]:
# Load flightdelay_nolatlng_data.csv into the "fltdeltabnolatlng" table of the
# Flightdelay_data-1 database in chunks of 1,000,000 rows, reporting load times.
rows_imported = 0
start_time = time.time()
for chunk_number, data in enumerate(pd.read_csv('flightdelay_nolatlng_data.csv', chunksize=1000000)):
    print(f'importing rows {rows_imported} to {rows_imported + len(data)}...', end='')
    # Replace the table on the first chunk and append the remaining chunks, so
    # that re-running this cell reloads the table instead of stacking another
    # full copy of the data on top of the previous run.
    write_mode = 'replace' if chunk_number == 0 else 'append'
    data.to_sql(name='fltdeltabnolatlng', con=engine, if_exists=write_mode)
    rows_imported += len(data)

    # Cumulative time since the load started, printed after every chunk.
    print(f'Done. {time.time() - start_time} total seconds elapsed')

importing rows 0 to 1000000...Done. 111.41214084625244 total seconds elapsed
importing rows 1000000 to 2000000...Done. 221.43075370788574 total seconds elapsed
importing rows 2000000 to 3000000...Done. 333.37523007392883 total seconds elapsed
importing rows 3000000 to 4000000...Done. 444.39389419555664 total seconds elapsed
importing rows 4000000 to 5000000...Done. 555.0871002674103 total seconds elapsed
importing rows 5000000 to 6000000...Done. 666.9523801803589 total seconds elapsed
importing rows 6000000 to 6489062...Done. 722.0680227279663 total seconds elapsed
