# Snowfall data EDA

In this notebook we are simply going to convert the data files into a format suitable for building a model.

In [1]:
import numpy as np
import pandas as pd
from plotnine import *

import itertools as it
import functools as ft
import os

In [2]:
def map_list(*args, **kwargs):
    return list(map(*args, **kwargs))

def map_dfr(*args, **kwargs):
    return pd.concat(map_list(*args, **kwargs))

In [3]:
data_dir = '../../data/colorado_snow/'

files = os.listdir(data_dir)
cities = map_list(lambda x: x[0].split('_')[1], map(os.path.splitext, files))

def read_snow_csv(city):
    city_df = pd.read_csv(os.path.join(data_dir, f'Stndata_{city}.csv'),
                          skiprows = 1, names=['date', 'temp_max', 'temp_min', 'rain', 'snow'])
    city_df['city'] = city
    return city_df

snow_df = map_dfr(read_snow_csv, cities)
print(snow_df.shape)
snow_df.head()

(61369, 6)


Unnamed: 0,date,temp_max,temp_min,rain,snow,city
0,2000-01-01,44,16,0.04,0.1,aspen
1,2000-01-02,36,14,0.22,2.0,aspen
2,2000-01-03,32,18,0.13,2.5,aspen
3,2000-01-04,29,-3,0.0,0.0,aspen
4,2000-01-05,29,M,0.12,1.8,aspen


There are some weird missing data rules:
M: no or missing observations
T: non-measurable trace of precipitation or snow
S: for precipitation or snowfall, look to a subsequent day for information
A: for precipitation or snowfall, this value is a multi-day accumulation 

In [4]:
snow_df['snow'].loc[snow_df['snow'].isin(['M', 'T', 'S', 'A'])].value_counts()

snow
M    6484
T    2818
Name: count, dtype: int64

In [5]:
snow_df['snow'] = snow_df['snow'].replace({'M': np.nan, 'T': '0.01'})
snow_df['snow'] = snow_df['snow'].astype('float')
snow_df['snow'].describe()

count    54885.000000
mean         0.369192
std          1.276374
min          0.000000
25%          0.000000
50%          0.000000
75%          0.000000
max         30.000000
Name: snow, dtype: float64

In [6]:
snow_df['date'] = pd.to_datetime(snow_df['date'], format = '%Y-%m-%d')
snow_df['year'] = snow_df['date'].dt.year
snow_df['month'] = snow_df['date'].dt.month
snow_df['day_of_week'] = snow_df['date'].dt.day_of_week
snow_df['day_of_year'] = snow_df['date'].dt.day_of_year
snow_df.head()

Unnamed: 0,date,temp_max,temp_min,rain,snow,city,year,month,day_of_week,day_of_year
0,2000-01-01,44,16,0.04,0.1,aspen,2000,1,5,1
1,2000-01-02,36,14,0.22,2.0,aspen,2000,1,6,2
2,2000-01-03,32,18,0.13,2.5,aspen,2000,1,0,3
3,2000-01-04,29,-3,0.0,0.0,aspen,2000,1,1,4
4,2000-01-05,29,M,0.12,1.8,aspen,2000,1,2,5


In [7]:
annual_snowfall_df = (
    snow_df.
    groupby(['city', 'year']).
    apply(lambda df: pd.Series(dict(
        days_with_snow = np.sum(df['snow'] > 0.01),
        total_snow = np.sum(df['snow'])
    ))).
    reset_index()
)

annual_snowfall_df.head()

Unnamed: 0,city,year,days_with_snow,total_snow
0,aspen,2000,61.0,140.1
1,aspen,2001,56.0,141.72
2,aspen,2002,60.0,132.67
3,aspen,2003,69.0,164.15
4,aspen,2004,54.0,138.71


In [8]:
#ggplot(annual_snowfall_df, aes(x = 'year', y = 'total_snow')) + geom_point()

In order to convert the data into a snow season time series that begins on november 1st we need to control for leap years. Or, just take a date difference from november 1st of that year....

In [9]:
snow_season_df = snow_df.query('(month > 10) | (month < 4)').query('(year > 2000) | (month > 4)').copy()
print(snow_season_df.shape)
## if its after the new year move the anchor back
snow_season_df['year_season'] = snow_season_df['year'] - 1 * (snow_season_df['month'] < 4)
snow_season_df['season_start_date'] = pd.to_datetime(snow_season_df['year_season'].apply(lambda x: f'11/01/{x}'))
snow_season_df['day_of_season'] = (snow_season_df['date'] - snow_season_df['season_start_date']).dt.days + 1

(24780, 10)


In [10]:
print(snow_season_df['day_of_season'].max())
print(np.sum(snow_season_df['day_of_season'] == snow_season_df['day_of_season'].max()))
snow_season_df = snow_season_df.query('day_of_season < 152')
print(snow_season_df.shape)

152
35
(24745, 13)


In [11]:
def complete(df, col_list, fill = dict()):
    all_combos = it.product(*[df[c].unique() for c in col_list])
    all_combos_df = pd.DataFrame(all_combos, columns = col_list)
    print(all_combos_df.shape)
    
    new_df = pd.merge(all_combos_df, df, on = col_list, how = 'left')
    if len(fill) > 0:
        new_df.fillna(fill, inplace = True)
    
    return new_df.loc[:, df.columns]

def all_equal(col):
    if len(col.unique()) == 1:
        return True
    else:
        return False

# snow_all_df = complete(snow_season_df, ['city', 'date'], fill = {'snow': 0})
# print(snow_season_df.shape)
# print(snow_all_df.shape)
snow_season_df.query('year_season < 2023').groupby(['year_season', 'city']).date.count().reset_index().pipe(lambda df: all_equal(df['date']))

True

In [12]:
# (ggplot(monthly_sum_df, aes(x = 'month_of_season', y = 'total_snow')) + 
#   geom_line(monthly_sum_df.query('(year == 2023) & (month > 10)'), color = 'white', size = 3) +
#   geom_line(aes(group = 'year', color = 'year'), alpha = .5))

In [13]:
def bitmap_encoding(cat_col):
    '''Convert a string column into bit labels
    
    Each label will receive a unique bit encoding of 0's and 1's that
    Can identify that label. This helps trees and deep learning libraries
    narrow down to specific categories faster.'''
    col_name = cat_col.name
    cat_str = cat_col.copy().astype('str')
    cat_str = cat_str.fillna('Missing')
    cat_values = cat_str.unique()
    num_values = len(cat_values)
    cat_df = pd.DataFrame({col_name: cat_values, f'{col_name}_index': np.arange(num_values)})
    num_bits = round(np.ceil(np.log2(num_values)))
    bit_combos = it.product(*it.repeat([0, 1], num_bits))
    bit_df = pd.DataFrame(bit_combos, columns = [f'bit_{i}' for i in range(num_bits)])
    bit_df[f'{col_name}_index'] = np.arange(bit_df.shape[0])
    bit_df = pd.merge(cat_df, bit_df, on = f'{col_name}_index', how = 'inner')
    return bit_df

city_encoding_df = bitmap_encoding(snow_season_df['city'])
city_encoding_df.head()

Unnamed: 0,city,city_index,bit_0,bit_1,bit_2
0,aspen,0,0,0,0
1,boulder,1,0,0,1
2,coloradosprings,2,0,1,0
3,steamboat,3,0,1,1
4,breck,4,1,0,0


In [28]:
snow_matrix_df = (
    snow_season_df.
    pivot(columns = 'day_of_season', values = 'snow', index = ['year_season', 'city']).
    fillna(0).
    ## transform(np.log1p). save this for script
    reset_index().
    pipe(pd.merge, city_encoding_df, on = 'city', how = 'left')
)

x_cols = list(range(1, 62)) + list(filter(lambda x: 'city' not in x, city_encoding_df.columns))
y_cols = list(range(62, 152))

X = snow_matrix_df.loc[:, x_cols].to_numpy()
Y = snow_matrix_df.loc[:, y_cols].to_numpy()
print(X.shape)
print(Y.shape)
    

(168, 64)
(168, 90)


In [29]:
import pickle

with open('X_array.pkl', 'wb') as f:
    pickle.dump(X, f)
    
with open('Y_array.pkl', 'wb') as f:
    pickle.dump(Y, f)
    
snow_matrix_df.to_parquet('snow_matrix_df.parquet')
snow_season_df['temp_max'] = pd.to_numeric(snow_season_df['temp_max'], errors = 'coerce')
snow_season_df['temp_min'] = pd.to_numeric(snow_season_df['temp_min'], errors = 'coerce')
snow_season_df.to_parquet('snow_season_df.parquet')



### Functools

* **partial** accepts a function and returns a function with default args
* **reduce** accepts a function and an iterable and applies the function cumulatively to two arguments until it returns a single value. You can pass in an initializer argument as well. 

### itertools

* **accumulate** accepts an interable and a function and works through each result keeping a cumulative return
* **chain** accepts a list of iterables and returns a flattened chained working through each before moving on
* **product** accepts multiple lists and returns the cartesian product of the two
* **starmap** accepts a function and then a list of the arguments to successively apply to the function


In [27]:
snow_matrix_df.head()

Unnamed: 0,year_season,city,1,2,3,4,5,6,7,8,...,146,147,148,149,150,151,city_index,bit_0,bit_1,bit_2
0,2000,aspen,1.193922,1.280934,0.0,0.0,2.70805,1.252763,0.0,0.0,...,0.0,1.704748,0.0,0.788457,0.405465,0.00995,0,0,0,0
1,2000,boulder,0.0,0.0,0.641854,0.0,0.693147,0.00995,0.0,0.0,...,2.054124,0.0,0.0,0.0,0.00995,0.741937,1,0,0,1
2,2000,breck,1.609438,0.0,0.0,1.609438,0.0,1.252763,0.405465,0.0,...,0.693147,1.386294,0.00995,1.252763,0.0,0.916291,4,1,0,0
3,2000,coloradosprings,0.0,0.182322,0.405465,0.0,0.262364,0.00995,0.00995,0.0,...,0.09531,0.09531,0.0,0.336472,0.09531,0.00995,2,0,1,0
4,2000,denver,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,5,1,0,1
