In [1]:
# import libraries
import pandas as pd
import glob
import datetime
import numpy as np

In [2]:
# prepare data for databases
print("Preparing data for databases...")

# get all files from extracted_data/db/
files = glob.glob("../extracted_data/db/*.csv")
files.sort()

for file in files:
    i = file.split("/")[3].split('.')[0]
    print("Phase " + str(i) + " is running...")
    
    # load csv file to dataframe
    df = pd.read_csv("../extracted_data/db/" + str(i) + ".csv")
    df = df.iloc[:, :61]
    df.columns = ['Year', 'Quarter', 'Month', 'DayofMonth', 'DayOfWeek', 'FlightDate',
       'Reporting_Airline', 'DOT_ID_Reporting_Airline',
       'IATA_CODE_Reporting_Airline', 'Tail_Number',
       'Flight_Number_Reporting_Airline', 'OriginAirportID',
       'OriginAirportSeqID', 'OriginCityMarketID', 'Origin', 'OriginCityName',
       'OriginState', 'OriginStateFips', 'OriginStateName', 'OriginWac',
       'DestAirportID', 'DestAirportSeqID', 'DestCityMarketID', 'Dest',
       'DestCityName', 'DestState', 'DestStateFips', 'DestStateName',
       'DestWac', 'CRSDepTime', 'DepTime', 'DepDelay', 'DepDelayMinutes',
       'DepDel15', 'DepartureDelayGroups', 'DepTimeBlk', 'TaxiOut',
       'WheelsOff', 'WheelsOn', 'TaxiIn', 'CRSArrTime', 'ArrTime', 'ArrDelay',
       'ArrDelayMinutes', 'ArrDel15', 'ArrivalDelayGroups', 'ArrTimeBlk',
       'Cancelled', 'CancellationCode', 'Diverted', 'CRSElapsedTime',
       'ActualElapsedTime', 'AirTime', 'Flights', 'Distance', 'DistanceGroup',
       'CarrierDelay', 'WeatherDelay', 'NASDelay', 'SecurityDelay',
       'LateAircraftDelay']
    
    # add timestamp column to dataframe
    df['EntryDate'] = df['Year'].apply(lambda x: datetime.datetime.now()) # better to create real data
    # df['EntryDate'] = np.where(True, datetime.datetime.now(), 0) # better to raise performance
    
    # create index for mongodb and mysql
    idx_mongodb = []
    idx_mysql = []
    for j in df.index.values:
        if j%2==0:
            idx_mongodb.append(j)
        else:
            idx_mysql.append(j)
    
    # initialize mongodb files
    print("Initializing mongodb files...")
    df_mongodb = df.filter(items = idx_mongodb, axis=0)
    df_mongodb.to_csv("../extracted_data/mongodb/db/" + str(i) + ".csv", index=False)
    
    # initialize mysql files
    print("Initializing mysql files...")
    df_mysql = df.filter(items = idx_mysql, axis=0)
    df_mysql.to_csv("../extracted_data/mysql/db/" + str(i) + ".csv", index=False)
    
    print("Phase " + str(i) + " is completed.")
    
print("Data for databases are all prepared.\n")

Preparing data for databases...
Phase 2001 is running...


  df = pd.read_csv("../extracted_data/db/" + str(i) + ".csv")


Initializing mongodb files...
Initializing mysql files...
Phase 2001 is completed.
Phase 2002 is running...


  df = pd.read_csv("../extracted_data/db/" + str(i) + ".csv")


Initializing mongodb files...
Initializing mysql files...
Phase 2002 is completed.
Phase 2003 is running...


  df = pd.read_csv("../extracted_data/db/" + str(i) + ".csv")


Initializing mongodb files...
Initializing mysql files...
Phase 2003 is completed.
Phase 2004 is running...


  df = pd.read_csv("../extracted_data/db/" + str(i) + ".csv")


Initializing mongodb files...
Initializing mysql files...
Phase 2004 is completed.
Phase 2005 is running...


  df = pd.read_csv("../extracted_data/db/" + str(i) + ".csv")


Initializing mongodb files...
Initializing mysql files...
Phase 2005 is completed.
Phase 2006 is running...


  df = pd.read_csv("../extracted_data/db/" + str(i) + ".csv")


Initializing mongodb files...
Initializing mysql files...
Phase 2006 is completed.
Phase 2007 is running...


  df = pd.read_csv("../extracted_data/db/" + str(i) + ".csv")


Initializing mongodb files...
Initializing mysql files...
Phase 2007 is completed.
Phase 2008 is running...


  df = pd.read_csv("../extracted_data/db/" + str(i) + ".csv")


Initializing mongodb files...
Initializing mysql files...
Phase 2008 is completed.
Phase 2009 is running...


  df = pd.read_csv("../extracted_data/db/" + str(i) + ".csv")


Initializing mongodb files...
Initializing mysql files...
Phase 2009 is completed.
Phase 2010 is running...


  df = pd.read_csv("../extracted_data/db/" + str(i) + ".csv")


Initializing mongodb files...
Initializing mysql files...
Phase 2010 is completed.
Phase 2011 is running...


  df = pd.read_csv("../extracted_data/db/" + str(i) + ".csv")


Initializing mongodb files...
Initializing mysql files...
Phase 2011 is completed.
Phase 2012 is running...


  df = pd.read_csv("../extracted_data/db/" + str(i) + ".csv")


Initializing mongodb files...
Initializing mysql files...
Phase 2012 is completed.
Data for databases are all prepared.



In [3]:
# prepare data for streaming
print("Preparing data for streaming...")

# get all files from extracted_data/streaming/
files = glob.glob("../extracted_data/streaming/*.csv")
files.sort()

for file in files:
    i = file.split("/")[3].split('.')[0]
    print("Phase " + str(i) + " is running...")
    
    # load csv file to dataframe
    df = pd.read_csv("../extracted_data/streaming/" + str(i) + ".csv")
    df = df.iloc[:, :61]
    df.columns = ['Year', 'Quarter', 'Month', 'DayofMonth', 'DayOfWeek', 'FlightDate',
       'Reporting_Airline', 'DOT_ID_Reporting_Airline',
       'IATA_CODE_Reporting_Airline', 'Tail_Number',
       'Flight_Number_Reporting_Airline', 'OriginAirportID',
       'OriginAirportSeqID', 'OriginCityMarketID', 'Origin', 'OriginCityName',
       'OriginState', 'OriginStateFips', 'OriginStateName', 'OriginWac',
       'DestAirportID', 'DestAirportSeqID', 'DestCityMarketID', 'Dest',
       'DestCityName', 'DestState', 'DestStateFips', 'DestStateName',
       'DestWac', 'CRSDepTime', 'DepTime', 'DepDelay', 'DepDelayMinutes',
       'DepDel15', 'DepartureDelayGroups', 'DepTimeBlk', 'TaxiOut',
       'WheelsOff', 'WheelsOn', 'TaxiIn', 'CRSArrTime', 'ArrTime', 'ArrDelay',
       'ArrDelayMinutes', 'ArrDel15', 'ArrivalDelayGroups', 'ArrTimeBlk',
       'Cancelled', 'CancellationCode', 'Diverted', 'CRSElapsedTime',
       'ActualElapsedTime', 'AirTime', 'Flights', 'Distance', 'DistanceGroup',
       'CarrierDelay', 'WeatherDelay', 'NASDelay', 'SecurityDelay',
       'LateAircraftDelay']
    
    # add timestamp column to dataframe
    df['EntryDate'] = df['Year'].apply(lambda x: datetime.datetime.now()) # better to create real data
    # df['EntryDate'] = np.where(True, datetime.datetime.now(), 0) # better to raise performance
    
    # create index for mongodb and mysql
    idx_mongodb = []
    idx_mysql = []
    for j in df.index.values:
        if j%2==0:
            idx_mongodb.append(j)
        else:
            idx_mysql.append(j)
    
    # initialize mongodb files
    print("Initializing mongodb files...")
    df_mongodb = df.filter(items = idx_mongodb, axis=0)
    df_mongodb.to_csv("../extracted_data/mongodb/streaming/" + str(i) + ".csv", index=False)
    
    # initialize mysql files
    print("Initializing mysql files...")
    df_mysql = df.filter(items = idx_mysql, axis=0)
    df_mysql.to_csv("../extracted_data/mysql/streaming/" + str(i) + ".csv", index=False)
    
    print("Phase " + str(i) + " is completed.")

print("Data for streaming are all prepared.")

Preparing data for streaming...
Phase 2101 is running...


  df = pd.read_csv("../extracted_data/streaming/" + str(i) + ".csv")


Initializing mongodb files...
Initializing mysql files...
Phase 2101 is completed.
Phase 2102 is running...


  df = pd.read_csv("../extracted_data/streaming/" + str(i) + ".csv")


Initializing mongodb files...
Initializing mysql files...
Phase 2102 is completed.
Phase 2103 is running...


  df = pd.read_csv("../extracted_data/streaming/" + str(i) + ".csv")


Initializing mongodb files...
Initializing mysql files...
Phase 2103 is completed.
Phase 2104 is running...


  df = pd.read_csv("../extracted_data/streaming/" + str(i) + ".csv")


Initializing mongodb files...
Initializing mysql files...
Phase 2104 is completed.
Phase 2105 is running...


  df = pd.read_csv("../extracted_data/streaming/" + str(i) + ".csv")


Initializing mongodb files...
Initializing mysql files...
Phase 2105 is completed.
Phase 2106 is running...


  df = pd.read_csv("../extracted_data/streaming/" + str(i) + ".csv")


Initializing mongodb files...
Initializing mysql files...
Phase 2106 is completed.
Phase 2107 is running...


  df = pd.read_csv("../extracted_data/streaming/" + str(i) + ".csv")


Initializing mongodb files...
Initializing mysql files...
Phase 2107 is completed.
Phase 2108 is running...


  df = pd.read_csv("../extracted_data/streaming/" + str(i) + ".csv")


Initializing mongodb files...
Initializing mysql files...
Phase 2108 is completed.
Phase 2109 is running...


  df = pd.read_csv("../extracted_data/streaming/" + str(i) + ".csv")


Initializing mongodb files...
Initializing mysql files...
Phase 2109 is completed.
Phase 2110 is running...


  df = pd.read_csv("../extracted_data/streaming/" + str(i) + ".csv")


Initializing mongodb files...
Initializing mysql files...
Phase 2110 is completed.
Phase 2111 is running...


  df = pd.read_csv("../extracted_data/streaming/" + str(i) + ".csv")


Initializing mongodb files...
Initializing mysql files...
Phase 2111 is completed.
Phase 2112 is running...


  df = pd.read_csv("../extracted_data/streaming/" + str(i) + ".csv")


Initializing mongodb files...
Initializing mysql files...
Phase 2112 is completed.
Phase 2201 is running...


  df = pd.read_csv("../extracted_data/streaming/" + str(i) + ".csv")


Initializing mongodb files...
Initializing mysql files...
Phase 2201 is completed.
Phase 2202 is running...


  df = pd.read_csv("../extracted_data/streaming/" + str(i) + ".csv")


Initializing mongodb files...
Initializing mysql files...
Phase 2202 is completed.
Phase 2203 is running...


  df = pd.read_csv("../extracted_data/streaming/" + str(i) + ".csv")


Initializing mongodb files...
Initializing mysql files...
Phase 2203 is completed.
Phase 2204 is running...


  df = pd.read_csv("../extracted_data/streaming/" + str(i) + ".csv")


Initializing mongodb files...
Initializing mysql files...
Phase 2204 is completed.
Phase 2205 is running...


  df = pd.read_csv("../extracted_data/streaming/" + str(i) + ".csv")


Initializing mongodb files...
Initializing mysql files...
Phase 2205 is completed.
Phase 2206 is running...


  df = pd.read_csv("../extracted_data/streaming/" + str(i) + ".csv")


Initializing mongodb files...
Initializing mysql files...
Phase 2206 is completed.
Phase 2207 is running...


  df = pd.read_csv("../extracted_data/streaming/" + str(i) + ".csv")


Initializing mongodb files...
Initializing mysql files...
Phase 2207 is completed.
Phase 2208 is running...


  df = pd.read_csv("../extracted_data/streaming/" + str(i) + ".csv")


Initializing mongodb files...
Initializing mysql files...
Phase 2208 is completed.
Phase 2209 is running...


  df = pd.read_csv("../extracted_data/streaming/" + str(i) + ".csv")


Initializing mongodb files...
Initializing mysql files...
Phase 2209 is completed.
Phase 2210 is running...


  df = pd.read_csv("../extracted_data/streaming/" + str(i) + ".csv")


Initializing mongodb files...
Initializing mysql files...
Phase 2210 is completed.
Phase 2211 is running...


  df = pd.read_csv("../extracted_data/streaming/" + str(i) + ".csv")


Initializing mongodb files...
Initializing mysql files...
Phase 2211 is completed.
Phase 2212 is running...


  df = pd.read_csv("../extracted_data/streaming/" + str(i) + ".csv")


Initializing mongodb files...
Initializing mysql files...
Phase 2212 is completed.
Data for streaming are all prepared.
