# Streamline Blog Post Code Using Pandas

In [1]:
import pandas as pd
import numpy as np
import os

# Working directory with a trailing platform-specific separator, so that
# `path + filename` yields a valid path on Windows, macOS and Linux alike
# (a hard-coded '\\' only works on Windows).
path = os.path.join(os.getcwd(), '')

## Access data

In [2]:
## Source file; it is expected to sit in the directory stored in `path`
csvFile = 'alta-noaa-1980-2019.csv'

## Load the raw observations, parsing DATE into datetime64 so it can be grouped by week later
df_raw = pd.read_csv(filepath_or_buffer = path + csvFile, parse_dates = ['DATE'])

In [3]:
## Preview the first five raw rows
df_raw.head(n = 5)

Unnamed: 0,STATION,NAME,LATITUDE,LONGITUDE,ELEVATION,DATE,DAPR,DASF,MDPR,MDSF,...,SNWD,TMAX,TMIN,TOBS,WT01,WT03,WT04,WT05,WT06,WT11
0,USC00420072,"ALTA, UT US",40.5905,-111.6369,2660.9,1980-01-01,,,,,...,29.0,38.0,25.0,25.0,,,,,,
1,USC00420072,"ALTA, UT US",40.5905,-111.6369,2660.9,1980-01-02,,,,,...,34.0,27.0,18.0,18.0,,,,,,
2,USC00420072,"ALTA, UT US",40.5905,-111.6369,2660.9,1980-01-03,,,,,...,30.0,27.0,12.0,18.0,,,,,,
3,USC00420072,"ALTA, UT US",40.5905,-111.6369,2660.9,1980-01-04,,,,,...,30.0,31.0,18.0,27.0,,,,,,
4,USC00420072,"ALTA, UT US",40.5905,-111.6369,2660.9,1980-01-05,,,,,...,30.0,34.0,26.0,34.0,,,,,,


In [4]:
## Check the column types; DATE must be datetime64 for the weekly grouping used below
df_raw.dtypes

STATION              object
NAME                 object
LATITUDE            float64
LONGITUDE           float64
ELEVATION           float64
DATE         datetime64[ns]
DAPR                float64
DASF                float64
MDPR                float64
MDSF                float64
PRCP                float64
SNOW                float64
SNWD                float64
TMAX                float64
TMIN                float64
TOBS                float64
WT01                float64
WT03                float64
WT04                float64
WT05                float64
WT06                float64
WT11                float64
dtype: object

## Step 1 - Drop, group and aggregate

In [5]:
## Columns that the weekly summary does not use
dropColumns = ['WT01','WT05','MDPR','MDSF','WT03','DASF','WT04','DAPR','WT06','WT11']

df_final = (
    df_raw
    ## Remove the unused columns
    .drop(dropColumns, axis = 'columns')
    ## One group per week, keyed on DATE
    .groupby(pd.Grouper(key = 'DATE', freq = 'W'))
    ## One aggregate per measurement, keeping the original column names
    .agg(PRCP = ('PRCP', 'sum'),
         TMAX = ('TMAX', 'max'),
         TMIN = ('TMIN', 'min'),
         SNOW = ('SNOW', 'sum'),
         SNWD = ('SNWD', 'mean'))
)
df_final

Unnamed: 0_level_0,PRCP,TMAX,TMIN,SNOW,SNWD
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1980-01-06,0.68,42.0,12.0,7.0,30.500000
1980-01-13,8.76,33.0,-5.0,63.0,48.142857
1980-01-20,4.68,40.0,15.0,34.0,63.285714
1980-01-27,0.00,43.0,10.0,0.0,57.000000
1980-02-03,2.58,38.0,-6.0,40.0,70.571429
...,...,...,...,...,...
2019-08-11,1.09,78.0,42.0,0.0,0.000000
2019-08-18,0.00,75.0,42.0,0.0,0.000000
2019-08-25,0.00,76.0,45.0,0.0,0.000000
2019-09-01,0.02,78.0,41.0,0.0,0.000000


## Step 2 - Drop, group and aggregate, rename, reset index

In [6]:
## Columns that the weekly summary does not use
dropColumns = ['WT01','WT05','MDPR','MDSF','WT03','DASF','WT04','DAPR','WT06','WT11']

df_final = (
    df_raw
    ## Remove the unused columns
    .drop(dropColumns, axis = 'columns')
    ## One group per week, keyed on DATE
    .groupby(pd.Grouper(key = 'DATE', freq = 'W'))
    ## One aggregate per measurement, keeping the original column names
    .agg(PRCP = ('PRCP', 'sum'),   ## total precip
         TMAX = ('TMAX', 'max'),
         TMIN = ('TMIN', 'min'),
         SNOW = ('SNOW', 'sum'),
         SNWD = ('SNWD', 'mean'))
    ## Give the aggregated columns descriptive names
    ## NOTE(review): 'SWND_MEAN' looks like a transposition of 'SNWD'; kept because later cells/outputs use this spelling
    .rename({'PRCP': 'PRCP_TOTAL',
             'SNOW': 'SNOW_TOTAL',
             'SNWD': 'SWND_MEAN'},
            axis = 'columns')
    ## Move DATE from the index back into a regular column
    .reset_index()
)

df_final

Unnamed: 0,DATE,PRCP_TOTAL,TMAX,TMIN,SNOW_TOTAL,SWND_MEAN
0,1980-01-06,0.68,42.0,12.0,7.0,30.500000
1,1980-01-13,8.76,33.0,-5.0,63.0,48.142857
2,1980-01-20,4.68,40.0,15.0,34.0,63.285714
3,1980-01-27,0.00,43.0,10.0,0.0,57.000000
4,1980-02-03,2.58,38.0,-6.0,40.0,70.571429
...,...,...,...,...,...,...
2066,2019-08-11,1.09,78.0,42.0,0.0,0.000000
2067,2019-08-18,0.00,75.0,42.0,0.0,0.000000
2068,2019-08-25,0.00,76.0,45.0,0.0,0.000000
2069,2019-09-01,0.02,78.0,41.0,0.0,0.000000


## Step 3 - Drop, group and aggregate, rename, reset index, create columns

In [7]:
dropColumns = ['WT01','WT05','MDPR','MDSF','WT03','DASF','WT04','DAPR','WT06','WT11']

df_final = (df_raw

            ## Drop unnecessary columns
            .drop(columns = dropColumns)

            ## Group by weekly dates
            .groupby(pd.Grouper(key = 'DATE', freq = 'W'))

            ## Aggregate the following columns by week
            .agg({'PRCP': 'sum',   ## total precip for the week
                  'TMAX': 'max',   ## max temp for the week
                  'TMIN': 'min',   ## min temp for the week
                  'SNOW': 'sum',   ## total snow for the week
                  'SNWD': 'mean'}) ## avg snow depth for the week

            ## Rename the new aggregated columns
            ## NOTE(review): 'SWND_MEAN' looks like a transposition of 'SNWD'; kept because the displayed outputs use this spelling
            .rename(columns = {'PRCP':'PRCP_TOTAL',
                               'SNOW':'SNOW_TOTAL',
                               'SNWD':'SWND_MEAN'})

            ## Reset the index
            .reset_index()

            ## Create the new columns
            .assign(
                ## Year 2000 and prior it was called Alta, later renamed to Alta Ski Resort.
                ## The lambda receives the frame built so far in THIS chain, so the cell no
                ## longer depends on a `df_final` left over from a previously executed cell.
                ## The two cases are exhaustive, so np.where replaces np.select (whose implicit
                ## integer default of 0 cannot be combined with string choices on recent NumPy).
                LOCATION = lambda _df: np.where(_df.DATE.dt.year <= 2000,
                                                'Alta',
                                                'Alta Ski Resort'),
                ## Find the temperature range
                T_RANGE = lambda _df: _df.TMAX - _df.TMIN,
            )
)

df_final

Unnamed: 0,DATE,PRCP_TOTAL,TMAX,TMIN,SNOW_TOTAL,SWND_MEAN,LOCATION,T_RANGE
0,1980-01-06,0.68,42.0,12.0,7.0,30.500000,Alta,30.0
1,1980-01-13,8.76,33.0,-5.0,63.0,48.142857,Alta,38.0
2,1980-01-20,4.68,40.0,15.0,34.0,63.285714,Alta,25.0
3,1980-01-27,0.00,43.0,10.0,0.0,57.000000,Alta,33.0
4,1980-02-03,2.58,38.0,-6.0,40.0,70.571429,Alta,44.0
...,...,...,...,...,...,...,...,...
2066,2019-08-11,1.09,78.0,42.0,0.0,0.000000,Alta Ski Resort,36.0
2067,2019-08-18,0.00,75.0,42.0,0.0,0.000000,Alta Ski Resort,33.0
2068,2019-08-25,0.00,76.0,45.0,0.0,0.000000,Alta Ski Resort,31.0
2069,2019-09-01,0.02,78.0,41.0,0.0,0.000000,Alta Ski Resort,37.0


## Step 4 - Drop, group and aggregate, rename, reset index, create columns (advanced)

In [64]:
dropColumns = ['WT01','WT05','MDPR','MDSF','WT03','DASF','WT04','DAPR','WT06','WT11']

df_final = (df_raw

            ## Drop unnecessary columns
            .drop(columns = dropColumns)

            ## Group by weekly dates
            .groupby(pd.Grouper(key = 'DATE', freq = 'W'))

            ## Aggregate the following columns by week
            .agg({'PRCP': 'sum',   ## total precip for the week
                  'TMAX': 'max',   ## max temp for the week
                  'TMIN': 'min',   ## min temp for the week
                  'SNOW': 'sum',   ## total snow for the week
                  'SNWD': 'mean'}) ## avg snow depth for the week

            ## Rename the new aggregated columns
            ## NOTE(review): 'SWND_MEAN' looks like a transposition of 'SNWD'; kept because the displayed outputs use this spelling
            .rename(columns = {'PRCP':'PRCP_TOTAL',
                               'SNOW':'SNOW_TOTAL',
                               'SNWD':'SWND_MEAN'})

            ## Reset the index
            .reset_index()

            ## Create the new columns
            .assign(

                ## Find the temperature range
                T_RANGE = lambda _df: _df.TMAX - _df.TMIN,

                ## Store the year
                YEAR = lambda _df: _df.DATE.dt.year,

                LOCATION = 'Alta',

                ## Find the season. Months 5 (May) through 9 (September) are the summer season.
                ## Every other month is the winter season.
                SEASON_VALUE = lambda _df: np.select([(_df.DATE.dt.month >= 5) & (_df.DATE.dt.month < 10)],
                                                     ['Summer'],  ## If true
                                                     'Winter'),   ## Else

                ## Create a string that indicates the season and year, using the same month
                ## split as SEASON_VALUE so the two columns never disagree:
                ##   May-Sep -> 'Summer <year>'
                ##   Jan-Apr -> ski season that started the year before, so subtract a year
                ##   Oct-Dec -> ski season that starts in the current year
                SEASON = lambda _df: np.select(
                    [(_df.DATE.dt.month >= 5) & (_df.DATE.dt.month < 10),
                     _df.DATE.dt.month < 5],
                    ['Summer ' + _df.DATE.dt.year.astype('str'),
                     'Alta Ski Resort ' + _df.DATE.dt.year.subtract(1).astype('str')],
                    'Alta Ski Resort ' + _df.DATE.dt.year.astype('str'))
            )
)

df_final

Unnamed: 0,DATE,PRCP_TOTAL,TMAX,TMIN,SNOW_TOTAL,SWND_MEAN,T_RANGE,YEAR,LOCATION,SEASON_VALUE,SEASON
0,1980-01-06,0.68,42.0,12.0,7.0,30.500000,30.0,1980,Alta,Winter,Alta Ski Resort 1979
1,1980-01-13,8.76,33.0,-5.0,63.0,48.142857,38.0,1980,Alta,Winter,Alta Ski Resort 1979
2,1980-01-20,4.68,40.0,15.0,34.0,63.285714,25.0,1980,Alta,Winter,Alta Ski Resort 1979
3,1980-01-27,0.00,43.0,10.0,0.0,57.000000,33.0,1980,Alta,Winter,Alta Ski Resort 1979
4,1980-02-03,2.58,38.0,-6.0,40.0,70.571429,44.0,1980,Alta,Winter,Alta Ski Resort 1979
...,...,...,...,...,...,...,...,...,...,...,...
2066,2019-08-11,1.09,78.0,42.0,0.0,0.000000,36.0,2019,Alta,Summer,Summer 2019
2067,2019-08-18,0.00,75.0,42.0,0.0,0.000000,33.0,2019,Alta,Summer,Summer 2019
2068,2019-08-25,0.00,76.0,45.0,0.0,0.000000,31.0,2019,Alta,Summer,Summer 2019
2069,2019-09-01,0.02,78.0,41.0,0.0,0.000000,37.0,2019,Alta,Summer,Summer 2019


## Step 5 - Drop, group and aggregate, rename, reset index, create columns (using user-defined functions)

In [75]:
def monthly_summary_df(df_raw):
    """Aggregate daily observations into one summary row per week.

    Despite the function name, rows are grouped by week (``freq='W'``), not by month;
    the name is kept so existing callers keep working.

    Parameters
    ----------
    df_raw : pandas.DataFrame
        Daily observations. Must contain a datetime ``DATE`` column, the measurement
        columns ``PRCP``, ``TMAX``, ``TMIN``, ``SNOW`` and ``SNWD``, and every column
        listed in ``dropColumns`` below (``drop`` raises ``KeyError`` otherwise).

    Returns
    -------
    pandas.DataFrame
        One row per week with the columns ``DATE``, ``PRCP_TOTAL``, ``TMAX``, ``TMIN``,
        ``SNOW_TOTAL``, ``SWND_MEAN``, ``T_RANGE``, ``YEAR``, ``LOCATION``,
        ``SEASON_VALUE`` and ``SEASON``. The input frame is not modified.
    """

    ##
    ## Columns to drop
    ##
    dropColumns = ['WT01','WT05','MDPR','MDSF','WT03','DASF','WT04','DAPR','WT06','WT11']


    ##
    ## Functions to create columns
    ##

    def get_t_range(_df):
        ## Spread between the weekly max and min temperature
        return _df.TMAX - _df.TMIN


    def get_year(_df):
        ## Calendar year of the week's DATE label
        return _df.DATE.dt.year

    def get_season_value(_df):
        ## Months 5 (May) through 9 (September) are the summer season. Otherwise it's the winter season
        return np.select([(_df.DATE.dt.month >= 5) & (_df.DATE.dt.month < 10)],
                         ['Summer'], ## If true
                         'Winter')   ## else return

    def get_season(_df):
        ## Uses the same month split as get_season_value so the two columns never disagree:
        ##   May-Sep -> 'Summer <year>'
        ##   Jan-Apr -> ski season that started the year before, so subtract a year
        ##   Oct-Dec -> ski season that starts in the current year
        month = _df.DATE.dt.month
        year = _df.DATE.dt.year
        return np.select([(month >= 5) & (month < 10),
                          month < 5],
                         ['Summer ' + year.astype('str'),
                          'Alta Ski Resort ' + year.subtract(1).astype('str')],
                         'Alta Ski Resort ' + year.astype('str'))


    return (df_raw

            ## Drop unnecessary columns
            .drop(columns = dropColumns)

            ## Group by weekly dates
            .groupby(pd.Grouper(key = 'DATE', freq = 'W'))

            ## Aggregate the following columns by week
            .agg({'PRCP': 'sum',   ## total precip for the week
                  'TMAX': 'max',   ## max temp for the week
                  'TMIN': 'min',   ## min temp for the week
                  'SNOW': 'sum',   ## total snow for the week
                  'SNWD': 'mean'}) ## avg snow depth for the week

            ## Rename the new aggregated columns
            ## NOTE(review): 'SWND_MEAN' looks like a transposition of 'SNWD'; kept because callers/outputs use this spelling
            .rename(columns = {'PRCP':'PRCP_TOTAL',
                               'SNOW':'SNOW_TOTAL',
                               'SNWD':'SWND_MEAN'})

            ## Reset the index
            .reset_index()

            ## Create the new columns
            .assign(

                ## Find the temperature range
                T_RANGE = get_t_range,

                ## Store the year
                YEAR = get_year,

                LOCATION = 'Alta',

                ## Find the season category (winter/summer)
                SEASON_VALUE = get_season_value,

                ## Create a string that indicates the season and year
                SEASON = get_season
            )
    )

## Re-read the raw CSV and push it straight through the summary function
csvFile = 'alta-noaa-1980-2019.csv'
df_final = (pd.read_csv(path + csvFile, parse_dates = ['DATE'])
            .pipe(monthly_summary_df))

df_final

Unnamed: 0,DATE,PRCP_TOTAL,TMAX,TMIN,SNOW_TOTAL,SWND_MEAN,T_RANGE,YEAR,LOCATION,SEASON_VALUE,SEASON
0,1980-01-06,0.68,42.0,12.0,7.0,30.500000,30.0,1980,Alta,Winter,Alta Ski Resort 1979
1,1980-01-13,8.76,33.0,-5.0,63.0,48.142857,38.0,1980,Alta,Winter,Alta Ski Resort 1979
2,1980-01-20,4.68,40.0,15.0,34.0,63.285714,25.0,1980,Alta,Winter,Alta Ski Resort 1979
3,1980-01-27,0.00,43.0,10.0,0.0,57.000000,33.0,1980,Alta,Winter,Alta Ski Resort 1979
4,1980-02-03,2.58,38.0,-6.0,40.0,70.571429,44.0,1980,Alta,Winter,Alta Ski Resort 1979
...,...,...,...,...,...,...,...,...,...,...,...
2066,2019-08-11,1.09,78.0,42.0,0.0,0.000000,36.0,2019,Alta,Summer,Summer 2019
2067,2019-08-18,0.00,75.0,42.0,0.0,0.000000,33.0,2019,Alta,Summer,Summer 2019
2068,2019-08-25,0.00,76.0,45.0,0.0,0.000000,31.0,2019,Alta,Summer,Summer 2019
2069,2019-09-01,0.02,78.0,41.0,0.0,0.000000,37.0,2019,Alta,Summer,Summer 2019


## Step 6 - Visualize data

In [77]:
## Preview the first five rows of the weekly summary
df_final.head(n = 5)

Unnamed: 0,DATE,PRCP_TOTAL,TMAX,TMIN,SNOW_TOTAL,SWND_MEAN,T_RANGE,YEAR,LOCATION,SEASON_VALUE,SEASON
0,1980-01-06,0.68,42.0,12.0,7.0,30.5,30.0,1980,Alta,Winter,Alta Ski Resort 1979
1,1980-01-13,8.76,33.0,-5.0,63.0,48.142857,38.0,1980,Alta,Winter,Alta Ski Resort 1979
2,1980-01-20,4.68,40.0,15.0,34.0,63.285714,25.0,1980,Alta,Winter,Alta Ski Resort 1979
3,1980-01-27,0.0,43.0,10.0,0.0,57.0,33.0,1980,Alta,Winter,Alta Ski Resort 1979
4,1980-02-03,2.58,38.0,-6.0,40.0,70.571429,44.0,1980,Alta,Winter,Alta Ski Resort 1979
