## Importing the required libraries

In [None]:
import dask.dataframe as dd
import pandas as pd
import time
import numpy as np


## Using Dask to read the data 

In [None]:
import dask.dataframe as dd

# Time the read_csv call with a monotonic, high-resolution clock
# (time.time() is wall-clock and can jump if the system clock is adjusted).
# NOTE: dd.read_csv only builds a lazy collection, so the printed figure is
# the setup cost of the call, not the time to scan the whole file.
start = time.perf_counter()
# CANCELLATION_CODE is pinned to object instead of being inferred
# (presumably because inference mis-types it -- confirm).
df = dd.read_csv("../Datasets/airline.csv", dtype={'CANCELLATION_CODE': 'object'})
end = time.perf_counter()
print(end - start)

In [None]:
# Row count of the main dataset; .compute() forces dask to evaluate the lazy value.
df.shape[0].compute()

In [None]:
# Preview the first 10 rows (the notebook displays the returned frame).
df.head(10)

In [None]:
# Show the column labels as read from the CSV, before any header cleaning.
df.columns

## Importing the utility file and cleaning the columns

In [None]:
import testUtil

In [None]:
# Replace the column labels with the output of the project helper.
# NOTE(review): presumably this normalises headers to the lowercase snake_case
# names listed in config.yaml -- confirm against testUtil.cleanColumnHeaders.
df.columns = testUtil.cleanColumnHeaders(df.columns)

In [None]:
# Spot-check two rows to confirm the reassigned column labels.
df.head(2)

## Writing the YAML file

In [None]:
%%writefile config.yaml
file_type: csv
dataset_name: airlineDelayData
file_name: airline_2014
inbound_delimiter: ","
outbound_delimiter: "|"
outbound_file_type: "gzip"
output_file: "Datasets/airlineDelayData2014-2018.gz"
columns:
  - fl_date
  - op_carrier
  - op_carrier_fl_num
  - origin
  - dest
  - crs_dep_time
  - dep_time
  - dep_delay
  - taxi_out
  - wheels_off
  - wheels_on
  - taxi_in
  - crs_arr_time
  - arr_time
  - arr_delay
  - cancelled
  - cancellation_code
  - diverted
  - crs_elapsed_time
  - actual_elapsed_time
  - air_time
  - distance
  - carrier_delay
  - weather_delay
  - nas_delay
  - security_delay
  - late_aircraft_delay


In [None]:
# Load config.yaml through the project helper; later cells index the result
# by key (e.g. "output_file", "outbound_delimiter").
config_data = testUtil.read_config_file("config.yaml")

## Validate the new data file

In [None]:
# Validate the raw 2014 file against the loaded config.
# NOTE(review): the actual checks live in testUtil.validate_dataset --
# presumably a comparison of file headers with config "columns"; confirm.
testUtil.validate_dataset(config_data,"../Datasets/airline_2014.csv")

### Remove the unwanted column and test again

In [None]:
# Read the raw 2014 file so the extra column can be removed.
# NOTE(review): unlike the main read, no dtype override is given here, so
# every column type is inferred -- confirm that is intended.
tempData = dd.read_csv("../Datasets/airline_2014.csv")

In [None]:
# Discard the stray 'unnamed: 27' column so the remaining headers can be
# re-validated against the config.
tempData = tempData.drop(columns=['unnamed: 27'])

In [None]:
# Write the trimmed frame as one CSV file (single_file=True) without the index column.
tempData.to_csv("../Datasets/airline_2014_updated.csv", single_file=True, index=False)

## Validating the updated Dataset

In [None]:
# Re-run the same validation on the file written after dropping the extra column.
testUtil.validate_dataset(config_data,"../Datasets/airline_2014_updated.csv")

## Reading the new Data using dask

In [None]:
# Load the cleaned 2014 file as a dask DataFrame for merging.
# NOTE(review): dtypes are inferred here, whereas the main read pins
# CANCELLATION_CODE to object -- confirm the two frames end up with matching dtypes.
airline_2014 = dd.read_csv("../Datasets/airline_2014_updated.csv")

In [None]:
# Print a summary of the re-read 2014 frame.
airline_2014.info()

In [None]:
# Row count of the 2014 data; .compute() forces evaluation of the lazy value.
airline_2014.shape[0].compute()

## Merging the two Datasets

In [None]:
# Stack the main dataset and the 2014 data into a single dask DataFrame.
# NOTE(review): this assumes both frames share the same column labels
# (df's were reassigned via cleanColumnHeaders) -- confirm they line up.
fullData = dd.concat([df,airline_2014])

In [None]:
# Row count of the merged data; it should equal the sum of the two counts shown earlier.
fullData.shape[0].compute()

## Writing the data to a | separated gz file

In [None]:
# Echo the configured field separator used for the output file.
config_data["outbound_delimiter"]

In [None]:
# Echo the configured value that is passed as the compression argument on write.
config_data["outbound_file_type"]

In [None]:
# Echo the configured destination path for the output file.
config_data["output_file"]

In [None]:
# Persist the merged dataset (the concatenation of df and airline_2014), not
# just the original df: the configured output name spans 2014-2018, and
# fullData was otherwise built but never written.
# The separator, compression and destination all come from config.yaml.
# NOTE(review): output_file is "Datasets/..." while the inputs are read from
# "../Datasets/..." -- confirm the target directory relative to the notebook.
fullData.to_csv(
    config_data["output_file"],
    single_file=True,
    sep=config_data["outbound_delimiter"],
    compression=config_data["outbound_file_type"],
    index=False,
)

## Summary of final data