In [1016]:
import pandas as pd
import numpy as np
# Turn off pandas' chained-assignment (SettingWithCopy) warning for every cell that follows.
# NOTE(review): this also hides genuine chained-assignment mistakes; prefer .loc-based
# assignment so the warning can stay at its default.
pd.options.mode.chained_assignment = None  # default='warn'

## For each state/year of raw data, only the 'state' and 'year' variables in the code cell below need to be changed before each run

In [1017]:
# Run configuration: the state and year of the raw weather file to clean.
state = 'WA'
year = '2023'

# The raw input and the cleaned output share one folder and one
# Weather_<state>_<year> file stem; only the suffix differs.
file_stem = 'F:/Libraries/My Documents/UM/Milestone II/Weather_' + state + '_' + year
inpath = file_stem + '.csv'
outpath = file_stem + '_cleaned2.csv'

In [1018]:
# Load the raw CSV in a single pass (low_memory=False) instead of pandas' default
# chunked parsing, which can infer mixed dtypes within a column.
weather_df = pd.read_csv(filepath_or_buffer=inpath, low_memory=False)

In [1019]:
# Get rid of the trailing blank row (the one with the odd character in the station column).
# Note: this discards every row that has a missing value in any column.
weather_df = weather_df.dropna(axis=0, how='any')

In [1020]:
# Columns we do not keep: every '*_ATTRIBUTES' column plus a few measures we don't want.
attribute_columns = [
    'TEMP_ATTRIBUTES', 'DEWP_ATTRIBUTES', 'SLP_ATTRIBUTES', 'STP_ATTRIBUTES',
    'VISIB_ATTRIBUTES', 'WDSP_ATTRIBUTES', 'MAX_ATTRIBUTES', 'MIN_ATTRIBUTES',
    'PRCP_ATTRIBUTES',
]
other_unwanted_columns = ['SLP', 'STP', 'FRSHTT']
weather_df = weather_df.drop(columns=attribute_columns + other_unwanted_columns)

In [1021]:
# A repeated header line shows up as a data row whose STATION value is the literal
# text 'STATION'; keep everything except those rows.
is_repeated_header = weather_df['STATION'] == 'STATION'
weather_df = weather_df[~is_repeated_header]

In [1022]:
# Measurement columns that must be numeric before any comparisons or arithmetic.
to_numeric_list = ['TEMP', 'DEWP', 'VISIB', 'WDSP', 'MXSPD', 'GUST', 'MAX', 'MIN', 'PRCP', 'SNDP']
for column_name in to_numeric_list:
    # pd.to_numeric raises if a value in the column cannot be parsed as a number.
    weather_df[column_name] = pd.to_numeric(weather_df[column_name])

In [1023]:
# Replace the "all 9s" dummy readings with stand-in values.
# column -> (smallest value treated as a dummy, replacement value)
dummy_fills = {
    'TEMP': (999, 50),    # not sure this ever happens...
    'WDSP': (999, 0),
    'MXSPD': (999, 0),
    'GUST': (999, 0),
    'PRCP': (99, 0),
    'SNDP': (999, 0),
    'VISIB': (999, 9.3),  # avg
}
for column_name, (dummy_floor, fill_value) in dummy_fills.items():
    is_dummy = weather_df[column_name] >= dummy_floor
    weather_df.loc[is_dummy, column_name] = fill_value

# DEWP is handled last because its replacement is derived from the row's TEMP,
# which has already been cleaned above: avg in relation to temp.
dewp_is_dummy = weather_df['DEWP'] >= 9999
weather_df.loc[dewp_is_dummy, 'DEWP'] = weather_df['TEMP'] - 15

In [1024]:
# if MAX temp is below average temp, or at/above 999 (must be a dummy value), set to average temp of the day
# if MIN temp is above average temp (must be a dummy value), set to average temp of the day
#
# Done as two vectorised .loc assignments rather than a per-row loop with chained
# indexing (weather_df['MAX'][i] = ...): the chained form is slow, depends on the
# chained-assignment warning being silenced, and does not write through to the
# frame when pandas Copy-on-Write is enabled.
avg_temp = weather_df['TEMP']
max_is_dummy = (weather_df['MAX'] < avg_temp) | (weather_df['MAX'] >= 999)
min_is_dummy = weather_df['MIN'] > avg_temp

# Assign only the matching rows' TEMP values; .loc aligns them on the index.
weather_df.loc[max_is_dummy, 'MAX'] = avg_temp[max_is_dummy]
weather_df.loc[min_is_dummy, 'MIN'] = avg_temp[min_is_dummy]

In [1025]:
# Temperature "extremity": absolute distance of the daily TEMP from a base
# temperature, computed for three candidate bases.
extremity_bases = {
    'TEMPEXT_BASE40': 40,
    'TEMPEXT_BASE45': 45,
    'TEMPEXT_BASE50': 50,
}
for column_name, base_temp in extremity_bases.items():
    weather_df[column_name] = (weather_df['TEMP'] - base_temp).abs()
    

In [1026]:
# 0/1 indicator columns for the daily TEMP reaching each bound (bounds are inclusive).
over_bounds = {'OVER_60': 60, 'OVER_70': 70, 'OVER_80': 80}
under_bounds = {'UNDER_40': 40, 'UNDER_30': 30, 'UNDER_20': 20}

for flag_column, bound in over_bounds.items():
    weather_df[flag_column] = (weather_df['TEMP'] >= bound).astype('int64')
for flag_column, bound in under_bounds.items():
    weather_df[flag_column] = (weather_df['TEMP'] <= bound).astype('int64')

# SUM_OVER_UNDER counts how many of the six indicators are set for the row.
flag_total = weather_df['OVER_60']
for flag_column in ['OVER_70', 'OVER_80', 'UNDER_40', 'UNDER_30', 'UNDER_20']:
    flag_total = flag_total + weather_df[flag_column]
weather_df['SUM_OVER_UNDER'] = flag_total



In [1027]:
# Tag every row with the state being processed; this will help for mapping.
weather_df = weather_df.assign(STATE=state)

In [1028]:
# Save the cleaned frame to the output CSV, leaving out the DataFrame index.
weather_df.to_csv(path_or_buf=outpath, index=False)