In [1]:
# -*- coding: utf-8 -*-
"""
Created on Sat Aug 10 09:39:59 2019

@author: mor
"""

import glob
import json
import os
import time

import dask
import dask.array as da
import dask.bag as db
import dask.dataframe as dd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from dask import delayed



In [2]:


# Lazy loader for a single flight-delay CSV (wrapped with @delayed)
@delayed
def read_flights(filename):
    """Load one flight-delay CSV with zero weather delays marked as missing.

    Because of the ``@delayed`` decorator, calling this function only
    records the work; the DataFrame is built when the result is computed.

    Parameters
    ----------
    filename : str
        Path of a CSV file containing 'FL_DATE' and 'WEATHER_DELAY' columns.

    Returns
    -------
    pandas.DataFrame
        The file's rows with 'FL_DATE' parsed as dates and every 0 in
        'WEATHER_DELAY' replaced by NaN.
    """
    flights = pd.read_csv(filename, parse_dates=['FL_DATE'])

    # Zeros become NaN so that NaN-skipping reductions (count, mean) ignore them
    cleaned_delay = flights['WEATHER_DELAY'].replace(0, np.nan)
    return flights.assign(WEATHER_DELAY=cleaned_delay)



In [3]:

    
# glob.glob() returns matches in arbitrary (OS-dependent) order; sort them so
# the partitions built from this list are in a reproducible order.
filenames = sorted(glob.glob('data/flightdelays/*.csv'))
print(filenames)



['data/flightdelays\\flightdelays-2016-1.csv', 'data/flightdelays\\flightdelays-2016-2.csv', 'data/flightdelays\\flightdelays-2016-3.csv', 'data/flightdelays\\flightdelays-2016-4.csv', 'data/flightdelays\\flightdelays-2016-5.csv']


In [4]:


# Build one lazy read_flights task per CSV file
dataframes = [read_flights(filename) for filename in filenames]

# Combine the delayed pandas frames into a single dask DataFrame: flight_delays
flight_delays = dd.from_delayed(dataframes)

# Average of the non-missing 'WEATHER_DELAY' values across all files
mean_weather_delay = flight_delays['WEATHER_DELAY'].mean().compute()
print(mean_weather_delay)



51.29467680608365


In [5]:


# Lazy loader for a single airport weather CSV (wrapped with @delayed)
@delayed
def read_weather(filename):
    """Load one weather CSV and tag every row with its airport code.

    Because of the ``@delayed`` decorator, calling this function only
    records the work; the DataFrame is built when the result is computed.

    Parameters
    ----------
    filename : str or os.PathLike
        Path of a CSV file containing 'Date' and 'PrecipitationIn' columns.
        The airport code is taken from the last three characters of the
        file's base name (extension removed), e.g. ``.../ATL.csv`` -> 'ATL'.

    Returns
    -------
    pandas.DataFrame
        The file's rows with 'Date' parsed as dates, 'PrecipitationIn'
        converted to numbers (unparseable entries become NaN), and a new
        constant 'Airport' column.
    """
    # Read in filename: df
    df = pd.read_csv(filename, parse_dates=['Date'])

    # Clean 'PrecipitationIn': anything that is not a number is coerced to NaN
    df['PrecipitationIn'] = pd.to_numeric(df['PrecipitationIn'], errors='coerce')

    # Create the 'Airport' column from the file name only. Splitting the whole
    # path on the first '.' breaks for paths such as './data/ATL.csv' or
    # '../data/ATL.csv', so strip the directory and the extension explicitly.
    stem = os.path.splitext(os.path.basename(filename))[0]
    df['Airport'] = stem[-3:]

    # Return df
    return df



In [6]:


# Collect the weather CSVs whose base name is exactly three characters, then
# order them alphabetically
filenames = glob.glob('data/weatherdata/???.csv')
filenames.sort()


In [7]:

# Build one lazy read_weather task per file
weather_dfs = [read_weather(filename) for filename in filenames]

# Combine the delayed pandas frames into a single dask DataFrame: weather
weather = dd.from_delayed(weather_dfs)

# Row with the single highest 'Max TemperatureF' across all files
hottest_day = weather.nlargest(1, 'Max TemperatureF')
print(hottest_day.compute())




          Date  Max TemperatureF  Mean TemperatureF  Min TemperatureF  \
224 2016-08-12               107                 93                79   

     Max Dew PointF  MeanDew PointF  Min DewpointF  Max Humidity  \
224              75              71             66            79   

     Mean Humidity  Min Humidity   ...     Mean VisibilityMiles  \
224             53            27   ...                        8   

     Min VisibilityMiles  Max Wind SpeedMPH  Mean Wind SpeedMPH  \
224                    0                 41                  10   

     Max Gust SpeedMPH  PrecipitationIn  CloudCover             Events  \
224               54.0             0.82           5  Rain-Thunderstorm   

     WindDirDegrees  Airport  
224             214      DFW  

[1 rows x 24 columns]


In [8]:


# Boolean mask of rows whose 'Events' text mentions 'Snow'; rows with a
# missing 'Events' value are treated as not snowy: is_snowy
events = weather['Events']
is_snowy = events.str.contains('Snow').fillna(False)

# Keep only the snowy rows: got_snow
got_snow = weather.loc[is_snowy]

# Total 'PrecipitationIn' per airport over the snowy rows: result
snow_by_airport = got_snow.groupby('Airport')
result = snow_by_airport['PrecipitationIn'].sum()

# Trigger the computation and show the totals
print(result.compute())



Airport
ATL    1.94
DEN    5.59
ORD    3.91
Name: PrecipitationIn, dtype: float64


In [9]:


# Columns needed from each side of the join
flight_cols = flight_delays[['FL_DATE', 'ORIGIN', 'WEATHER_DELAY']]
weather_cols = weather[['Date', 'Events', 'Airport']]

# Inner join: match each flight to the weather row for the same airport and date
weather_delays = dd.merge(
    flight_cols,
    weather_cols,
    left_on=['ORIGIN', 'FL_DATE'],
    right_on=['Airport', 'Date'],
    how='inner',
)



In [10]:

def percent_delayed(df):
    """Return the percentage of rows whose 'WEATHER_DELAY' is not missing.

    Parameters
    ----------
    df : pandas.DataFrame or dask.dataframe.DataFrame
        Frame with a 'WEATHER_DELAY' column in which missing values (NaN)
        mark rows that should not be counted.

    Returns
    -------
    float or lazy dask scalar
        ``100 * (non-missing values) / (number of rows)``. For a dask frame
        the result is lazy and must be evaluated with ``.compute()``.
    """
    delays = df['WEATHER_DELAY']
    # Use Series.size rather than len(df): on a dask DataFrame len() runs the
    # whole graph immediately, so the work would be done once here and again
    # when the caller computes the result. size stays lazy in dask and is a
    # plain int in pandas.
    return (delays.count() / delays.size) * 100



In [11]:


# Print time in milliseconds to compute percent_delayed on weather_delays.
# time.perf_counter() is used instead of time.time(): it is monotonic and has
# the highest available resolution, which matters for the short (few ms)
# measurement on the persisted frame below.
t_start = time.perf_counter()
print(percent_delayed(weather_delays).compute())
t_end = time.perf_counter()
print((t_end-t_start)*1000)

# Call weather_delays.persist(): persisted_weather_delays
persisted_weather_delays = weather_delays.persist()

# Print time in milliseconds to compute percent_delayed on persisted_weather_delays
t_start = time.perf_counter()
print(percent_delayed(persisted_weather_delays).compute())
t_end = time.perf_counter()
print((t_end-t_start)*1000)




1.304773408863712
133991.59574508667
1.304773408863712
6.914615631103516


In [12]:


# Group the persisted join by weather event type: by_event
by_event = persisted_weather_delays.groupby('Events')

# Weather-delayed flights per event type, as a percentage of all
# weather-delayed flights: pct_delayed
delays_per_event = by_event['WEATHER_DELAY'].count()
total_delays = persisted_weather_delays['WEATHER_DELAY'].count()
pct_delayed = delays_per_event / total_delays * 100

# Show the five event types with the largest share
top_pct_delayed = pct_delayed.nlargest(5)
print(top_pct_delayed.compute())

# Mean 'WEATHER_DELAY' per event type, keeping the five largest: avg_delay_time
mean_delay_per_event = by_event['WEATHER_DELAY'].mean()
avg_delay_time = mean_delay_per_event.nlargest(5)

# Show the five event types with the longest average delay
print(avg_delay_time.compute())





Events
Rain-Thunderstorm    34.237829
Snow                 13.647247
Rain                  7.023144
Fog-Snow              5.107741
Rain-Snow             3.591381
Name: WEATHER_DELAY, dtype: float64
Events
Rain-Snow                84.755556
Thunderstorm             56.000000
Fog-Rain                 55.000000
Rain-Thunderstorm        50.421911
Fog-Rain-Thunderstorm    37.769231
Name: WEATHER_DELAY, dtype: float64
