In [24]:
import requests
import datetime
import time
import json
import pandas as pd
import numpy as np

In [72]:
## Global variable

YEAR = 2017

In [2]:
## Number of days in a month
def days_in_month(year, month):
    """
    Input:
        :int year
        :int month
    Output: 
        :int :number of days in a month
    """
    if month in [1, 3, 5, 7, 8, 10, 12]:
        return 31
    elif month in [4, 6, 9, 11]:
        return 30
    elif (year % 4 == 0) and (year % 100 != 0) or (year % 400 == 0):
        return 29
    else:
        return 28

## Query daily weather data of Chicago
Data available at [API](https://darksky.net/dev/docs)

In [3]:
# Get API key (registration needed)
with open("api_key_darksky", 'r') as f:
    api_key = f.readline().strip()

# GPS coordinates of Chicago
lat = '41.836944'
long = '-87.684722'

def get_weather_from_darksky(year, month, day):
    """
    Input:
        :int year
        :int month
        :int day
    Output:
        :dict :dictionary of weather data from web api
    """
    # Get unixtime
    dt = datetime.datetime(year, month, day)
    unixtime = str(int(time.mktime(dt.timetuple())))
    
    # Request darksky.net for data
    r = requests.get('https://api.darksky.net/forecast/'+api_key+'/'+lat+','+long+','+unixtime)

    return json.loads(r.text)    

In [89]:
# Query data for a month and save to file
def save_weather_per_month(year, month):
    """
    Input:
        :int year
        :int month
    Output:
        :void
    """
    month_dict = {}
    
    days = days_in_month(year, month)
    
    for day in range(1, days+2):
        daily_dict = get_weather_from_darksky(year, month, day)
        month_dict[str(year)+'-'+str(month)+'-'+str(day)] = daily_dict
        
    with open("weather_data/"+str(year)+'-'+str(month)+'.json', 'w') as jf:
        json.dump(month_dict, jf)

In [5]:
# ## Query and save year 2017 weather data from darksky

# for m in range(1, 13):
#     save_weather_per_month(YEAR, m)

## Read data from saved file

In [150]:
# drop info that won't be needed
labels_to_drop = ['time', 'summary', 'temperatureMin', 'temperatureMinTime', 'temperatureMax', 'temperatureMaxTime']

# dataframe for yearly weather data
weather_df = pd.DataFrame()

# save data into dataframe
for m in range(1, 13):
    with open("weather_data/"+str(YEAR)+"-"+str(m)+".json", "r") as jf:
        data = json.load(jf)
    for d in range(1, 32):
        try:
            data_dict = data[str(YEAR)+'-'+str(m)+'-'+str(d)]['daily']['data'][0]
            data_series = pd.Series(data_dict).drop(labels=labels_to_drop)
            data_series['day'] = d
            data_series['month'] = m
            data_series['year'] = YEAR
            weather_df = weather_df.append(data_series, ignore_index=True)
        except:
            pass

In [151]:
# Post processing: fill missing data
weather_df['precipType'] = weather_df['precipType'].fillna('None')
weather_df['precipAccumulation'] = weather_df['precipAccumulation'].fillna(0)
weather_df['precipIntensityMaxTime'] = weather_df['precipIntensityMaxTime'].fillna(-1)

In [152]:
# Convert unix time stamp
def convert_unix_timestamp(unixtime):
    """
    Input:
        :float unixtime
    Output:
        :float converted time
    """
    if unixtime == -1:
        return 0
    
    dt = datetime.datetime.fromtimestamp(unixtime)
    
    return dt.hour+dt.minute/60

for tc in ['apparentTemperatureHighTime', 'apparentTemperatureLowTime', 'apparentTemperatureMaxTime', 'apparentTemperatureMinTime',
           'temperatureHighTime', 'temperatureLowTime', 'uvIndexTime', 'windGustTime', 
           'sunsetTime', 'sunriseTime', 'precipIntensityMaxTime']:
    weather_df[tc] = weather_df[tc].apply(lambda x: convert_unix_timestamp(x))

In [153]:
# Add daylight_duration column
weather_df['daylight_duration'] = weather_df['sunsetTime'] - weather_df['sunriseTime']

In [158]:
# One hot encode categorical features
weather_df = pd.get_dummies(weather_df, columns=['icon', 'precipType'])

In [169]:
# Save to csv file
weather_df = weather_df.set_index(['month', 'day', 'year'])
weather_df.to_csv('weather_data/weather_2017_chicago.csv')

In [172]:
weather_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,apparentTemperatureHigh,apparentTemperatureHighTime,apparentTemperatureLow,apparentTemperatureLowTime,apparentTemperatureMax,apparentTemperatureMaxTime,apparentTemperatureMin,apparentTemperatureMinTime,cloudCover,dewPoint,...,daylight_duration,icon_clear-day,icon_cloudy,icon_fog,icon_partly-cloudy-day,icon_partly-cloudy-night,icon_rain,precipType_None,precipType_rain,precipType_snow
month,day,year,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
1.0,1.0,2017.0,39.58,15.0,30.56,2.0,39.58,15.0,20.62,6.0,0.13,21.29,...,9.2,0,0,0,0,1,0,1,0,0
1.0,2.0,2017.0,39.17,13.0,38.69,4.0,39.18,21.0,30.56,2.0,0.75,32.83,...,9.216667,0,0,1,0,0,0,0,1,0
1.0,3.0,2017.0,39.72,10.0,-1.21,8.0,39.72,10.0,13.97,23.0,0.96,34.84,...,9.233333,0,0,1,0,0,0,0,1,0
1.0,4.0,2017.0,8.19,15.0,-3.91,3.0,9.79,0.0,-1.21,8.0,0.18,5.19,...,9.25,0,0,0,0,1,0,1,0,0
1.0,5.0,2017.0,3.0,15.0,-9.56,5.0,3.0,15.0,-4.38,23.0,0.72,-0.07,...,9.266667,0,0,0,1,0,0,1,0,0


In [159]:
weather_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 365 entries, 0 to 364
Data columns (total 44 columns):
apparentTemperatureHigh        365 non-null float64
apparentTemperatureHighTime    365 non-null float64
apparentTemperatureLow         365 non-null float64
apparentTemperatureLowTime     365 non-null float64
apparentTemperatureMax         365 non-null float64
apparentTemperatureMaxTime     365 non-null float64
apparentTemperatureMin         365 non-null float64
apparentTemperatureMinTime     365 non-null float64
cloudCover                     365 non-null float64
day                            365 non-null float64
dewPoint                       365 non-null float64
humidity                       365 non-null float64
month                          365 non-null float64
moonPhase                      365 non-null float64
precipIntensity                365 non-null float64
precipIntensityMax             365 non-null float64
precipProbability              365 non-null float64
pressure   