In [2]:
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt
import datetime
%matplotlib inline

os.listdir('datasets/')

['avy_X-y.csv',
 'forecasts_no_dups.csv',
 'clean_weather_data.csv',
 'timberline_cleaned_weather.csv',
 'cleaner_forecast_data.csv']

In [3]:
forecasts = pd.read_csv('datasets/forecasts_no_dups.csv')
forecasts.forecast_for_date = pd.to_datetime(forecasts.forecast_for_date)

In [4]:
# set forecast to 12:00pm - on average, peak accuracy for a given forecast would be at noon although it varies...
forecasts.set_index(forecasts.forecast_for_date.apply(lambda x: x + datetime.timedelta(hours=12)), inplace=True)

In [5]:
forecasts.index

DatetimeIndex(['2013-12-11 12:00:00', '2013-12-12 12:00:00',
               '2013-12-13 12:00:00', '2013-12-14 12:00:00',
               '2013-12-15 12:00:00', '2013-12-16 12:00:00',
               '2013-12-17 12:00:00', '2013-12-18 12:00:00',
               '2013-12-19 12:00:00', '2013-12-20 12:00:00',
               ...
               '2017-11-28 12:00:00', '2017-11-29 12:00:00',
               '2017-11-30 12:00:00', '2017-12-01 12:00:00',
               '2017-12-02 12:00:00', '2017-12-03 12:00:00',
               '2017-12-04 12:00:00', '2017-12-05 12:00:00',
               '2017-12-06 12:00:00', '2017-12-07 12:00:00'],
              dtype='datetime64[ns]', name='forecast_for_date', length=547, freq=None)

In [6]:
weather = pd.read_csv('datasets/clean_weather_data.csv', index_col='Date/Time (PST)')
weather.index = pd.to_datetime(weather.index)

In [7]:
weather.index

DatetimeIndex(['2014-09-23 09:00:00', '2014-09-23 10:00:00',
               '2014-09-23 11:00:00', '2014-09-23 12:00:00',
               '2014-09-23 13:00:00', '2014-09-23 14:00:00',
               '2014-09-23 15:00:00', '2014-09-23 16:00:00',
               '2014-09-23 17:00:00', '2014-09-23 18:00:00',
               ...
               '2017-12-08 03:00:00', '2017-12-08 04:00:00',
               '2017-12-08 05:00:00', '2017-12-08 06:00:00',
               '2017-12-08 07:00:00', '2017-12-08 08:00:00',
               '2017-12-08 09:00:00', '2017-12-08 10:00:00',
               '2017-12-08 11:00:00', '2017-12-08 12:00:00'],
              dtype='datetime64[ns]', name='Date/Time (PST)', length=28094, freq=None)

In [8]:
combined = weather.join(forecasts)
combined.columns

Index(['Temperature_m', 'Relative_Humidity_m', 'Temperature_tl',
       'Relative_Humidity_tl', 'Wind_Speed_Average_tl',
       'Wind_Speed_Maximum_tl', 'Wind_Direction_(deg.)_tl', 'precip', 'depth',
       'pressure', 'forecast_for_date', 'above', 'below', 'discussion',
       'sizes', 'octagons', 'near', 'likelihoods', 'problems', 'storm_slabs',
       'wet_slabs', 'persistent', 'wind_slab', 'loose_wet', 'glide',
       'cornices', 'loose_dry'],
      dtype='object')

In [9]:
forecast_cols = ['above', 'below', 'near', 'storm_slabs', 'wet_slabs', 'persistent', 'wind_slab', 'loose_wet', 'glide',
'cornices', 'loose_dry',]
for col in forecast_cols:
    combined[col] = combined[col].interpolate(limit=24)

In [10]:
combined.count()

Temperature_m               28096
Relative_Humidity_m         28096
Temperature_tl              28096
Relative_Humidity_tl        28096
Wind_Speed_Average_tl       28096
Wind_Speed_Maximum_tl       28096
Wind_Direction_(deg.)_tl    28096
precip                      28096
depth                       28096
pressure                    28096
forecast_for_date             414
above                        9911
below                        9911
discussion                    414
sizes                         414
octagons                      414
near                         9911
likelihoods                   414
problems                      414
storm_slabs                  9911
wet_slabs                    9911
persistent                   9911
wind_slab                    9911
loose_wet                    9911
glide                        9911
cornices                     9911
loose_dry                    9911
dtype: int64

In [11]:
combined.discussion.fillna(method='ffill', limit=24, inplace=True)  # propagate rest of forecast over whole day

In [12]:
combined.count()

Temperature_m               28096
Relative_Humidity_m         28096
Temperature_tl              28096
Relative_Humidity_tl        28096
Wind_Speed_Average_tl       28096
Wind_Speed_Maximum_tl       28096
Wind_Direction_(deg.)_tl    28096
precip                      28096
depth                       28096
pressure                    28096
forecast_for_date             414
above                        9911
below                        9911
discussion                   9911
sizes                         414
octagons                      414
near                         9911
likelihoods                   414
problems                      414
storm_slabs                  9911
wet_slabs                    9911
persistent                   9911
wind_slab                    9911
loose_wet                    9911
glide                        9911
cornices                     9911
loose_dry                    9911
dtype: int64

In [None]:
# let's see how ugly it is to extract additional data from images: sizes and octagons
import matplotlib.pyplot as plt
from PIL import Image
import requests
import json
from io import BytesIO
%matplotlib inline

In [None]:
print(len(combined.octagons.unique()))
combined.octagons.unique()

In [None]:
print(len(combined.sizes.unique()))  # 50 discrete urls
combined.sizes.unique()

In [None]:
combined.sizes[-1]  # json doesn't like single-quotes

In [None]:
imgs = json.loads(combined.sizes[-1].replace('\'', '\"'))
imgs[0]

In [None]:
data = requests.get(imgs[0]).content
img = Image.open(BytesIO(data))

In [None]:
plt.imshow(img)   # would be easier with sizes to extract information from the filename;

In [None]:
oct_url = 'https://www.nwac.us/avalanche-forecast/octagon/problem/1985.png' # from col 'octagons'
octagon = requests.get(oct_url).content
oct_img = Image.open(BytesIO(octagon))
plt.imshow(oct_img)
# too many different octagons to worry about this right now... 

In [13]:
# 'discussion' is full html files; we just made one-hot for 'problems' (which is a list)
# if we're dropping that we can also drop the images associated with each item in problems...
combined.drop(['forecast_for_date', 'problems', 'likelihoods', 'octagons', 'sizes'], axis=1, inplace=True)

In [14]:
# save this dataset to file to use elsewhere
combined.to_csv('datasets/avy_X-y.csv')