### Initializing

In [1]:
import pandas as pd
import os 
import functions

In [2]:
# test for proper access to functions module
functions.temp()

hello


In [3]:
# define path variable for data folder containing concatenated .csv files
path_2 = '../data'

### Data cleaning
Objectives:
- create separate files for wind and solar data from each weather station network
- keep only columns associated with wind power generation
    - wind speed
    - [air density
        - air pressure
        - air temperature
        - relative humidity] -> *These variables were initially included in the interest of representing wind energy potential more accurately, but were later omitted due to inconsistent availability within the dataset. A standard value for air density (1.225 kg/m^3) was used in wind energy calculations.*
- keep only columns associated with solar power generation
    - solar radiation
- standardize column data with common variables (units, format)
    - convert wind speeds measured in km/h to m/s
- remove data with observations outside of a reasonable range 
    - wind speed > 65 m/s
        - highest recorded wind speed in Canada is 233 km/h (64.7222 m/s)
    - wind speed < 0 m/s
        - wind speed cannot be negative
    - solar radiation > 950 W/m^2
        - solar radiation before passing through the atmosphere is ~1360 W/m^2
        - midlatitude regions typically receive ~70% of solar energy compared to the equator (https://earthobservatory.nasa.gov/features/EnergyBalance/page2.php#:~:text=At%20Earth's%20average%20distance%20from,most%20recent%20NASA%20satellite%20missions.)
    - solar radiation < 0 W/m^2
        - solar radiation cannot be negative
- replace "-" with "_" in network_id column values

#### BCH Data Cleaning

In [4]:
BCH_data_clean = functions.to_df('BCH_data_clean.csv')
functions.list_columns(BCH_data_clean)

['windspeed',
 'time',
 'winddirection',
 'station_id',
 'network_id',
 'l_down_corr_avg',
 'hft3_1_avg',
 'l_up_avg',
 'one_day_snow',
 'k_up_avg',
 'l_up_corr_avg',
 'airtemp',
 'l_down_avg',
 'hft3_2_avg',
 'k_down_avg',
 'barometricpressure',
 'soilvolumetricwatercontent',
 'relativehumidity',
 'tsoil_avg',
 'vis',
 'one_day_rain',
 'one_day_precipitation',
 'min_temp',
 'max_temp',
 'snow_we',
 'snow_on_the_ground',
 'netrad']

In [5]:
# only include variables of interest
BCH_data_wind = BCH_data_clean.loc[:,('time', 'windspeed', 'station_id', 'network_id')]
BCH_data_solar = BCH_data_clean.loc[:,('time', 'netrad', 'station_id', 'network_id')]

##### BCH Wind Data Cleaning

In [6]:
# count null values in wind data
BCH_data_wind.isna().sum()

time                0
windspeed     6797851
station_id          0
network_id          0
dtype: int64

In [13]:
# drop rows in wind data that do not have a wind speed value
# wind speed is a key variable that is critical for location-specific energy calculations
BCH_data_wind_clean = BCH_data_wind.dropna(subset=['windspeed'])
BCH_data_wind_clean.isna().sum()

time          0
windspeed     0
station_id    0
network_id    0
dtype: int64

In [14]:
# check for duplicates
duplicate_rows = BCH_data_wind_clean.duplicated()
duplicate_rows[duplicate_rows].index

Index([], dtype='int64')

In [None]:
# check windspeed units
# BCH_variables.csv records state that wind speed is recorded in m/s

In [15]:
# remove values outside of reasonable range
BCH_data_wind_clean = BCH_data_wind_clean[BCH_data_wind_clean['windspeed'] < 65]
BCH_data_wind_clean = BCH_data_wind_clean[BCH_data_wind_clean['windspeed'] >= 0]

In [16]:
# rename columns
BCH_wind = BCH_data_wind_clean.rename(columns={"windspeed": "wind_speed"})
BCH_wind

Unnamed: 0,time,wind_speed,station_id,network_id
0,2017-01-01 01:00:00,5.7,85A,BCH
1,2017-01-01 02:00:00,5.9,85A,BCH
2,2017-01-01 03:00:00,5.8,85A,BCH
3,2017-01-01 04:00:00,4.7,85A,BCH
4,2017-01-01 05:00:00,4.8,85A,BCH
...,...,...,...,...
6280365,2020-06-30 20:00:00,1.6,TEC,BCH
6280366,2020-06-30 21:00:00,1.7,TEC,BCH
6280367,2020-06-30 22:00:00,0.8,TEC,BCH
6280368,2020-06-30 23:00:00,1.7,TEC,BCH


In [None]:
# replace "-" with "_" in station_id values if necessary
# not necessary here

In [17]:
# save to csv
BCH_wind.to_csv('../data/BCH_wind.csv', index=False)

##### BCH Solar Data Cleaning

In [18]:
# count null values in solar data
BCH_data_solar.isna().sum()

time                0
netrad        6928912
station_id          0
network_id          0
dtype: int64

In [19]:
# drop rows in solar data that do not have a NetRad value
BCH_data_solar_clean = BCH_data_solar.dropna(subset=['netrad'])
BCH_data_solar_clean.isna().sum()

time          0
netrad        0
station_id    0
network_id    0
dtype: int64

In [None]:
# check for duplicates
duplicate_rows = BCH_data_solar_clean.duplicated()
duplicate_rows[duplicate_rows].index

Index([], dtype='int64')

In [20]:
# remove values outside reasonable range
BCH_data_solar_clean = BCH_data_solar_clean[BCH_data_solar_clean['netrad'] < 950]
BCH_data_solar_clean = BCH_data_solar_clean[BCH_data_solar_clean['netrad'] >= 0]

In [21]:
# rename columns
BCH_solar = BCH_data_solar_clean.rename(columns={"netrad": "solar_radiation"})
BCH_solar

Unnamed: 0,time,solar_radiation,station_id,network_id
495016,2017-09-29 13:00:00,275.2,ATP,BCH
495017,2017-09-29 14:00:00,222.4,ATP,BCH
495018,2017-09-29 15:00:00,86.3,ATP,BCH
495019,2017-09-29 16:00:00,14.0,ATP,BCH
495020,2017-09-29 17:00:00,1.7,ATP,BCH
...,...,...,...,...
6258056,2020-06-30 15:00:00,296.5,TAB,BCH
6258057,2020-06-30 16:00:00,66.9,TAB,BCH
6258058,2020-06-30 17:00:00,138.1,TAB,BCH
6258059,2020-06-30 18:00:00,31.1,TAB,BCH


In [22]:
# save BCH solar data to csv
BCH_solar.to_csv('../data/BCH_solar.csv', index=False)

#### CRD Data Cleaning

In [23]:
CRD_data = functions.to_df('CRD_data_clean.csv')

In [24]:
# condense intial steps of identifying column names, number of null values, and any duplicate rows into one function
column_names, column_nulls, duplicate_row_index = functions.start_cleaning(CRD_data)

In [25]:
print(f'column names: {column_names}')
print(f'column nulls: {column_nulls}')
print(f'duplicate rows: {duplicate_row_index}')
CRD_data.shape

column names: ['solarradiation', 'airtemperature', 'precipitation', 'rain', 'windspeed', 'snowdepth', 'time', 'winddirection', 'relativehumidity', 'station_id', 'network_id', 'snowwaterequivalent']
column nulls: solarradiation         4090779
airtemperature          180523
precipitation          3328235
rain                    438757
windspeed               655130
snowdepth              3711426
time                         0
winddirection           658523
relativehumidity        591010
station_id                   0
network_id                   0
snowwaterequivalent    4078683
dtype: int64
duplicate rows: Index([], dtype='int64')


(4100571, 12)

In [26]:
# only keep variables of interest
CRD_data_solar = CRD_data.loc[:,('time', 'solarradiation', 'station_id', 'network_id')]
CRD_data_wind = CRD_data.loc[:,('time', 'windspeed', 'station_id', 'network_id')]

In [27]:
# drop rows in solar data that do not have solar radiation values
CRD_data_solar_clean = CRD_data_solar.dropna(subset=['solarradiation'])
CRD_data_solar_clean.isna().sum()

time              0
solarradiation    0
station_id        0
network_id        0
dtype: int64

In [28]:
# remove values outside reasonable range
CRD_data_solar_clean = CRD_data_solar_clean[CRD_data_solar_clean['solarradiation'] < 950]
CRD_data_solar_clean = CRD_data_solar_clean[CRD_data_solar_clean['solarradiation'] >= 0]

In [29]:
# rename columns
CRD_solar = CRD_data_solar_clean.rename(columns={"solarradiation": "solar_radiation"})

In [None]:
# rename station_id values if necessary
# not necessary here

In [30]:
# save to csv
CRD_solar.to_csv(f'{path_2}/CRD_solar.csv', index=False)

In [31]:
# count null values in wind data
CRD_data_wind.isna().sum()

time               0
windspeed     655130
station_id         0
network_id         0
dtype: int64

In [32]:
# drop rows in wind data that do not have wind speed values
CRD_data_wind_clean = CRD_data_wind.dropna(subset=['windspeed'])
CRD_data_wind_clean.isna().sum()

time          0
windspeed     0
station_id    0
network_id    0
dtype: int64

In [33]:
# check wind speed values, units
# CRD station variables state values are recorded in km/h 
CRD_data_wind_clean.head(15)

Unnamed: 0,time,windspeed,station_id,network_id
0,1998-04-17 00:00:00,0.0,FW001,CRD
1,1998-04-17 01:00:00,0.0,FW001,CRD
2,1998-04-17 02:00:00,0.0,FW001,CRD
3,1998-04-17 03:00:00,0.0,FW001,CRD
4,1998-04-17 04:00:00,0.0,FW001,CRD
5,1998-04-17 05:00:00,0.0,FW001,CRD
6,1998-04-17 06:00:00,0.0,FW001,CRD
7,1998-04-17 07:00:00,0.0,FW001,CRD
8,1998-04-17 08:00:00,0.0,FW001,CRD
9,1998-04-17 09:00:00,3.8,FW001,CRD


In [34]:
# convert windspeed values from km/h to m/s by dividing values by 3.6
CRD_data_wind_clean.loc[:,'windspeed'] = CRD_data_wind_clean.loc[:,'windspeed'].div(3.6)
CRD_data_wind_clean.head(15)

Unnamed: 0,time,windspeed,station_id,network_id
0,1998-04-17 00:00:00,0.0,FW001,CRD
1,1998-04-17 01:00:00,0.0,FW001,CRD
2,1998-04-17 02:00:00,0.0,FW001,CRD
3,1998-04-17 03:00:00,0.0,FW001,CRD
4,1998-04-17 04:00:00,0.0,FW001,CRD
5,1998-04-17 05:00:00,0.0,FW001,CRD
6,1998-04-17 06:00:00,0.0,FW001,CRD
7,1998-04-17 07:00:00,0.0,FW001,CRD
8,1998-04-17 08:00:00,0.0,FW001,CRD
9,1998-04-17 09:00:00,1.055556,FW001,CRD


In [35]:
# remove values outside of reasonable range
CRD_data_wind_clean = CRD_data_wind_clean[CRD_data_wind_clean['windspeed'] < 65]
CRD_data_wind_clean = CRD_data_wind_clean[CRD_data_wind_clean['windspeed'] >= 0]

In [36]:
# rename columns
CRD_wind = CRD_data_wind_clean.rename(columns={"windspeed":"wind_speed"})

In [37]:
# save to csv
CRD_wind.to_csv(f'{path_2}/CRD_wind.csv', index=False)

#### EC_raw Data Cleaning

In [38]:
EC_raw_data = functions.to_df('EC_raw_data_clean.csv')

  df_name = pd.read_csv(f'{path_2}/{data_name}.csv')


In [39]:
column_names, column_nulls, duplicate_row_index = functions.start_cleaning(EC_raw_data)

In [40]:
print(f'column names: {column_names}')
print(f'column nulls: {column_nulls}')
print(f'duplicate rows: {duplicate_row_index}')

column names: ['wind_speed', 'mean_sea_level', 'dew_point', 'wind_direction', 'relative_humidity', 'air_temperature', 'time', 'air_temperature_yesterday_low', 'wind_gust_speed', 'air_temperature_yesterday_high', 'tendency_amount', 'station_id', 'network_id', 'total_precipitation', 'snow_amount', 'total_cloud_cover']
column nulls: wind_speed                         1012435
mean_sea_level                     1326710
dew_point                           932842
wind_direction                     1834304
relative_humidity                   935655
air_temperature                     411231
time                                     0
air_temperature_yesterday_low     12391030
wind_gust_speed                   11440570
air_temperature_yesterday_high    12390760
tendency_amount                    2839650
station_id                               0
network_id                               0
total_precipitation               12295547
snow_amount                       12736774
total_cloud_cover      

In [41]:
# only keep variables of interest
# no solar data available from this network
EC_raw_data_wind = EC_raw_data.loc[:,('time', 'wind_speed', 'station_id', 'network_id')]

In [42]:
# count null values in wind data
EC_raw_data_wind.isna().sum()

time                0
wind_speed    1012435
station_id          0
network_id          0
dtype: int64

In [43]:
# drop rows in wind data that do not have values in windspeed column
EC_raw_data_wind_clean = EC_raw_data_wind.dropna(subset=['wind_speed'])
EC_raw_data_wind_clean.isna().sum()

time          0
wind_speed    0
station_id    0
network_id    0
dtype: int64

In [44]:
# check windspeed units
EC_raw_data_wind_clean.head(15)
# station metadata states km/h

Unnamed: 0,time,wind_speed,station_id,network_id
138,2011-10-16 00:00:00,16.9,1012475,EC_raw
139,2011-10-16 01:00:00,16.2,1012475,EC_raw
140,2011-10-16 02:00:00,10.4,1012475,EC_raw
141,2011-10-16 03:00:00,9.4,1012475,EC_raw
142,2011-10-16 04:00:00,13.3,1012475,EC_raw
143,2011-10-16 05:00:00,15.5,1012475,EC_raw
144,2011-10-16 06:00:00,17.3,1012475,EC_raw
145,2011-10-16 07:00:00,16.9,1012475,EC_raw
146,2011-10-16 08:00:00,16.2,1012475,EC_raw
147,2011-10-16 09:00:00,8.3,1012475,EC_raw


In [45]:
# convert windspeed values from km/h to m/s by dividing values by 3.6
EC_raw_data_wind_clean.loc[:,('wind_speed')] = EC_raw_data_wind_clean.loc[:,('wind_speed')].div(3.6)
EC_raw_data_wind_clean.head(15)

Unnamed: 0,time,wind_speed,station_id,network_id
138,2011-10-16 00:00:00,4.694444,1012475,EC_raw
139,2011-10-16 01:00:00,4.5,1012475,EC_raw
140,2011-10-16 02:00:00,2.888889,1012475,EC_raw
141,2011-10-16 03:00:00,2.611111,1012475,EC_raw
142,2011-10-16 04:00:00,3.694444,1012475,EC_raw
143,2011-10-16 05:00:00,4.305556,1012475,EC_raw
144,2011-10-16 06:00:00,4.805556,1012475,EC_raw
145,2011-10-16 07:00:00,4.694444,1012475,EC_raw
146,2011-10-16 08:00:00,4.5,1012475,EC_raw
147,2011-10-16 09:00:00,2.305556,1012475,EC_raw


In [46]:
# remove values outside reasonable range
EC_raw_data_wind_clean = EC_raw_data_wind_clean[EC_raw_data_wind_clean['wind_speed'] < 65]
EC_raw_data_wind_clean = EC_raw_data_wind_clean[EC_raw_data_wind_clean['wind_speed'] >= 0]

In [None]:
# rename columns
# not required

In [None]:
# replace station_id values if necessary
# not necessary

In [47]:
# save to csv
EC_raw_data_wind_clean.to_csv(f'{path_2}/EC_raw_wind.csv', index=False)

#### ENV_AQN Data Cleaning

In [48]:
ENV_AQN_data = functions.to_df('ENV_AQN_data_clean.csv')

  df_name = pd.read_csv(f'{path_2}/{data_name}.csv')


In [49]:
column_names, column_nulls, duplicate_row_index = functions.start_cleaning(ENV_AQN_data)

In [50]:
print(f'column names: {column_names}')
print(f'column nulls: {column_nulls}')
print(f'duplicate rows: {duplicate_row_index}')

column names: ['wdir_vect', 'temp_mean', 'wspd_sclr', 'precip_total', 'avg_rel_hum_pst1hr', 'time', 'humidity', 'avg_air_temp_pst1hr', 'station_id', 'network_id', 'bar_press']
column nulls: wdir_vect               6568691
temp_mean               5413225
wspd_sclr               2296217
precip_total           12783840
avg_rel_hum_pst1hr      8968906
time                          0
humidity                8688631
avg_air_temp_pst1hr     7985418
station_id                    0
network_id                    0
bar_press              12962966
dtype: int64
duplicate rows: Index([], dtype='int64')


In [51]:
# only keep variables of interest
# no solar data available for this network
ENV_AQN_data_wind = ENV_AQN_data.loc[:,('time', 'wspd_sclr', 'station_id', 'network_id')] 

In [52]:
# count null values in wind data
ENV_AQN_data_wind.isna().sum()

time                0
wspd_sclr     2296217
station_id          0
network_id          0
dtype: int64

In [53]:
# drop rows in wind data that do not have windspeed values
ENV_AQN_data_wind_clean = ENV_AQN_data_wind.dropna(subset=['wspd_sclr'])
ENV_AQN_data_wind_clean.isna().sum()

time          0
wspd_sclr     0
station_id    0
network_id    0
dtype: int64

In [54]:
# check windspeed values, units
# no units specified in ENV_AQN station data file
ENV_AQN_data_wind_clean.head(15)

Unnamed: 0,time,wspd_sclr,station_id,network_id
0,1998-03-12 15:00:00,1.044,110031,ENV-AQN
1,1998-03-12 16:00:00,2.518,110031,ENV-AQN
2,1998-03-12 17:00:00,1.028,110031,ENV-AQN
3,1998-03-12 18:00:00,0.704,110031,ENV-AQN
4,1998-03-12 19:00:00,0.208,110031,ENV-AQN
5,1998-03-12 20:00:00,0.14,110031,ENV-AQN
6,1998-03-12 21:00:00,0.534,110031,ENV-AQN
7,1998-03-12 22:00:00,0.457,110031,ENV-AQN
8,1998-03-12 23:00:00,0.916,110031,ENV-AQN
9,1998-03-13 00:00:00,0.303,110031,ENV-AQN


In [55]:
# check windspeed max values
ENV_AQN_data_wind_clean.nlargest(50, 'wspd_sclr')


Unnamed: 0,time,wspd_sclr,station_id,network_id
5579171,2009-01-03 00:00:00,299.872,E257415,ENV-AQN
5579165,2009-01-02 18:00:00,299.676,E257415,ENV-AQN
5579168,2009-01-02 21:00:00,299.667,E257415,ENV-AQN
5579169,2009-01-02 22:00:00,299.666,E257415,ENV-AQN
5579170,2009-01-02 23:00:00,299.666,E257415,ENV-AQN
5579167,2009-01-02 20:00:00,299.66,E257415,ENV-AQN
5579166,2009-01-02 19:00:00,299.659,E257415,ENV-AQN
5579151,2009-01-02 04:00:00,299.542,E257415,ENV-AQN
5580311,2009-02-19 12:00:00,299.351,E257415,ENV-AQN
5580052,2009-02-08 17:00:00,298.838,E257415,ENV-AQN


In [56]:
# check windspeed median value
ENV_AQN_data_wind_clean.median(numeric_only=True)

wspd_sclr    1.84
dtype: float64

In [None]:
# based on median value, assumed units were m/s
# assumed that max values were errors/outliers

In [58]:
# remove values outside of reasonable range
ENV_AQN_data_wind_clean = ENV_AQN_data_wind_clean[ENV_AQN_data_wind_clean['wspd_sclr'] < 65]
ENV_AQN_data_wind_clean = ENV_AQN_data_wind_clean[ENV_AQN_data_wind_clean['wspd_sclr'] >= 0]

In [59]:
# rename columns
ENV_AQN_wind = ENV_AQN_data_wind_clean.rename(columns={"wspd_sclr": "wind_speed"})

In [60]:
# replace "-" in network_id values with "_"
ENV_AQN_wind.loc[:,("network_id")] = ENV_AQN_wind.loc[:,("network_id")].replace("-","_", regex=True)
# reset index
ENV_AQN_wind = ENV_AQN_wind.reset_index(drop=True)
ENV_AQN_wind

Unnamed: 0,time,wind_speed,station_id,network_id
0,1998-03-12 15:00:00,1.044,110031,ENV_AQN
1,1998-03-12 16:00:00,2.518,110031,ENV_AQN
2,1998-03-12 17:00:00,1.028,110031,ENV_AQN
3,1998-03-12 18:00:00,0.704,110031,ENV_AQN
4,1998-03-12 19:00:00,0.208,110031,ENV_AQN
...,...,...,...,...
10772977,1999-10-31 19:00:00,10.490,M120001,ENV_AQN
10772978,1999-10-31 20:00:00,9.610,M120001,ENV_AQN
10772979,1999-10-31 21:00:00,13.410,M120001,ENV_AQN
10772980,1999-10-31 22:00:00,11.520,M120001,ENV_AQN


In [61]:
# save to csv
ENV_AQN_wind.to_csv(f'{path_2}/ENV_AQN_wind.csv', index=False)

#### FLNRO_FERN Data Cleaning

In [62]:
FLNRO_FERN_data = functions.to_df('FLNRO_FERN_data.csv')

  df_name = pd.read_csv(f'{path_2}/{data_name}.csv')


In [63]:
column_names, column_nulls, duplicate_row_index = functions.start_cleaning(FLNRO_FERN_data)

In [64]:
print(f'column names: {column_names}')
print(f'column nulls: {column_nulls}')
print(f'duplicate rows: {duplicate_row_index}')

column names: ['tempc', 'solarradiationwm', 'rh', 'windspeedms', 'pressurembar', 'rainmm', 'gustspeedms', 'time', 'winddirection', 'dewptc', 'station_id', 'network_id', 'wetness', 'tx', 'tn', 'rhx', 'tm', 'rhn']
column nulls: tempc                 78077
solarradiationwm     155467
rh                    76028
windspeedms          159515
pressurembar         143098
rainmm               163976
gustspeedms          171360
time                      0
winddirection        398990
dewptc               187998
station_id                0
network_id                0
wetness             1441326
tx                  2310484
tn                  2310515
rhx                 2331262
tm                  2319798
rhn                 2331284
dtype: int64
duplicate rows: Index([], dtype='int64')


In [65]:
# only keep variables of interest
FLNRO_FERN_data_solar = FLNRO_FERN_data.loc[:,('time', 'solarradiationwm', 'station_id', 'network_id')]
FLNRO_FERN_data_wind = FLNRO_FERN_data.loc[:,('time', 'windspeedms', 'station_id', 'network_id')]

In [66]:
# drop rows in solar data that do not have solar radiation values
FLNRO_FERN_data_solar_clean = FLNRO_FERN_data_solar.dropna(subset=['solarradiationwm'])
FLNRO_FERN_data_solar_clean.isna().sum()

time                0
solarradiationwm    0
station_id          0
network_id          0
dtype: int64

In [67]:
# remove values outside reasonable range
FLNRO_FERN_data_solar_clean = FLNRO_FERN_data_solar_clean[FLNRO_FERN_data_solar_clean['solarradiationwm'] < 950]
FLNRO_FERN_data_solar_clean = FLNRO_FERN_data_solar_clean[FLNRO_FERN_data_solar_clean['solarradiationwm'] >= 0]

In [68]:
# rename columns
FLNRO_FERN_solar = FLNRO_FERN_data_solar_clean.rename(columns={"solarradiationwm": "solar_radiation"})

In [69]:
# replace "-" in network_id values with "_"
FLNRO_FERN_solar.loc[:,("network_id")] = FLNRO_FERN_solar.loc[:,("network_id")].replace("-","_", regex=True)
# reset index
FLNRO_FERN_solar = FLNRO_FERN_solar.reset_index(drop=True)
FLNRO_FERN_solar

Unnamed: 0,time,solar_radiation,station_id,network_id
0,2007-07-31 15:00:00,703.1,1095439,FLNRO_FERN
1,2007-07-31 16:00:00,298.1,1095439,FLNRO_FERN
2,2007-07-31 17:00:00,125.6,1095439,FLNRO_FERN
3,2007-07-31 18:00:00,114.4,1095439,FLNRO_FERN
4,2007-07-31 19:00:00,20.6,1095439,FLNRO_FERN
...,...,...,...,...
2165045,2018-10-31 07:00:00,3.1,McBridePeak,FLNRO_FERN
2165046,2018-10-31 08:00:00,29.4,McBridePeak,FLNRO_FERN
2165047,2018-10-31 09:00:00,54.4,McBridePeak,FLNRO_FERN
2165048,2018-10-31 10:00:00,100.6,McBridePeak,FLNRO_FERN


In [70]:
FLNRO_FERN_solar.to_csv(f'{path_2}/FLNRO_FERN_solar.csv', index=False)

In [71]:
# count null values in wind data
FLNRO_FERN_data_wind.isna().sum()

time                0
windspeedms    159515
station_id          0
network_id          0
dtype: int64

In [72]:
# drop rows in wind data that do not have WindSpeed values
FLNRO_FERN_data_wind_clean = FLNRO_FERN_data_wind.dropna(subset=['windspeedms'])
FLNRO_FERN_data_wind_clean.isna().sum()

time           0
windspeedms    0
station_id     0
network_id     0
dtype: int64

In [None]:
# check windspeed units
# station data file states units are m/s
# no conversion necessary

In [74]:
# remove values outside reasonable range
FLNRO_FERN_data_wind_clean = FLNRO_FERN_data_wind_clean[FLNRO_FERN_data_wind_clean['windspeedms'] < 65]
FLNRO_FERN_data_wind_clean = FLNRO_FERN_data_wind_clean[FLNRO_FERN_data_wind_clean['windspeedms'] >= 0]

In [77]:
# rename columns
FLNRO_FERN_wind = FLNRO_FERN_data_wind_clean.rename(columns={"windspeedms": "wind_speed"})

In [None]:
# replace "-" in network_id values with "_"
FLNRO_FERN_wind.loc[:,("network_id")] = FLNRO_FERN_wind.loc[:,("network_id")].replace("-","_", regex=True)
# reset index
FLNRO_FERN_wind = FLNRO_FERN_wind.reset_index(drop=True)
FLNRO_FERN_wind

In [78]:
# save to csv
FLNRO_FERN_wind.to_csv(f'{path_2}/FLNRO_FERN_wind.csv', index=False)

#### FLNRO_WMB Data Cleaning

*Note: memory error occurred when trying to import data. Will import directly to pgAdmin and perform cleaning steps there*

#### MoTIe Data Cleaning

In [79]:
MoTIe_col_names = pd.read_csv(f'{path_2}/MoTIe_data_clean.csv', index_col=0, nrows=0)

In [80]:
list(MoTIe_col_names)

['standard_snow',
 'minimum_air_temperature',
 'max_wnd_spd_10m_pst1hr',
 'measured_wind_speed1',
 'measured_wind_direction1',
 'dwpt_temp',
 'min_air_temp_snc_last_reset',
 'dew_point',
 'atmospheric_pressure',
 'avg_wnd_dir_10m_pst10mts',
 'actual_wind_direction',
 'maximum_air_temperature',
 'snw_dpth',
 'maximum_measured_wind_speed1',
 'height_of_snow',
 'pcpn_amt_pst24hrs',
 'current_air_temperature2',
 'current_air_temperature1',
 'precipitation_new',
 'max_air_temp_snc_last_reset',
 'precip_detector_ratio',
 'actual_wind_speed',
 'wind_direction_std_deviation1',
 'avg_wnd_spd_10m_pst10mts',
 'air_temp',
 'mslp',
 'pcpn_amt_pst1hr',
 'snwfl_amt_pst1hr',
 'rel_hum',
 'time',
 'stn_pres',
 'station_id',
 'network_id']

In [None]:
# # Note: MemoryError/kernel crashed when attempting to import entire csv at once. Only import columns of interest.
# MoTIe_data = functions.to_df('MoTIe_data_clean.csv')

In [81]:
# limit which columns are included in the read_csv function in order to combat memory error
MoTIe_data = pd.read_csv(f'{path_2}/MoTIe_data_clean.csv', usecols=['time', 'actual_wind_speed', 'measured_wind_speed1', 'station_id', 'network_id'])

In [82]:
column_names, column_nulls, duplicate_row_index = functions.start_cleaning(MoTIe_data)

In [83]:
print(f'column names: {column_names}')
print(f'column nulls: {column_nulls}')
print(f'duplicate rows: {duplicate_row_index}')
MoTIe_data.shape

column names: ['measured_wind_speed1', 'actual_wind_speed', 'time', 'station_id', 'network_id']
column nulls: measured_wind_speed1     7597565
actual_wind_speed       17764947
time                           0
station_id                     0
network_id                     0
dtype: int64
duplicate rows: Index([], dtype='int64')


(23833427, 5)

In [84]:
# drop rows in wind data that do not have any wind speed values
MoTIe_data_wind = MoTIe_data.dropna(subset=['actual_wind_speed', 'measured_wind_speed1'], how='all')
MoTIe_data_wind.isna().sum()

measured_wind_speed1        5861
actual_wind_speed       10173243
time                           0
station_id                     0
network_id                     0
dtype: int64

Compare values between columns that are measuring the same variable

In [87]:
# check whether columns measuring the same value have any overlap

# wind speed
# create a mask where column with the fewest nulls has no null values
not_null_value = MoTIe_data_wind['measured_wind_speed1'].notnull()
not_null_value_df = MoTIe_data_wind[not_null_value]

# check for rows where both columns contain a value
print(not_null_value_df['actual_wind_speed'].notnull().sum())

6062619


In [88]:
# narrow dataset down to one column for wind speed
# rationale: use variable with least number of nulls
MoTIe_data_wind_clean = MoTIe_data_wind.loc[:, ('time', 'measured_wind_speed1', 'station_id', 'network_id')]

In [89]:
# check values and units of wind speed columns
# station data file states values are in km/h
MoTIe_data_wind_clean.head(15)

Unnamed: 0,time,measured_wind_speed1,station_id,network_id
0,2001-01-22 13:00:00,7.0,11091,MoTIe
1,2001-01-22 14:00:00,8.0,11091,MoTIe
2,2001-01-22 15:00:00,8.0,11091,MoTIe
3,2001-01-22 16:00:00,8.0,11091,MoTIe
4,2001-01-22 17:00:00,10.0,11091,MoTIe
5,2001-01-22 18:00:00,9.0,11091,MoTIe
6,2001-01-22 19:00:00,8.0,11091,MoTIe
7,2001-01-22 20:00:00,9.0,11091,MoTIe
8,2001-01-22 21:00:00,9.0,11091,MoTIe
9,2001-01-22 22:00:00,10.0,11091,MoTIe


In [90]:
# convert windspeed values from km/h to m/s by dividing values by 3.6
MoTIe_data_wind_clean.loc[:,('measured_wind_speed1')] = MoTIe_data_wind_clean.loc[:,('measured_wind_speed1')].div(3.6)
MoTIe_data_wind_clean.head(15)

Unnamed: 0,time,measured_wind_speed1,station_id,network_id
0,2001-01-22 13:00:00,1.944444,11091,MoTIe
1,2001-01-22 14:00:00,2.222222,11091,MoTIe
2,2001-01-22 15:00:00,2.222222,11091,MoTIe
3,2001-01-22 16:00:00,2.222222,11091,MoTIe
4,2001-01-22 17:00:00,2.777778,11091,MoTIe
5,2001-01-22 18:00:00,2.5,11091,MoTIe
6,2001-01-22 19:00:00,2.222222,11091,MoTIe
7,2001-01-22 20:00:00,2.5,11091,MoTIe
8,2001-01-22 21:00:00,2.5,11091,MoTIe
9,2001-01-22 22:00:00,2.777778,11091,MoTIe


In [91]:
# remove values outside reasonable range
MoTIe_data_wind_clean = MoTIe_data_wind_clean[MoTIe_data_wind_clean['measured_wind_speed1'] < 65]
MoTIe_data_wind_clean = MoTIe_data_wind_clean[MoTIe_data_wind_clean['measured_wind_speed1'] >= 0]

In [92]:
# rename columns
MoTIe_wind = MoTIe_data_wind_clean.rename(columns={"measured_wind_speed1": "wind_speed"})

In [None]:
# replace station_id values if necessary
# not necessary

In [93]:
MoTIe_wind.head(10)

Unnamed: 0,time,wind_speed,station_id,network_id
0,2001-01-22 13:00:00,1.944444,11091,MoTIe
1,2001-01-22 14:00:00,2.222222,11091,MoTIe
2,2001-01-22 15:00:00,2.222222,11091,MoTIe
3,2001-01-22 16:00:00,2.222222,11091,MoTIe
4,2001-01-22 17:00:00,2.777778,11091,MoTIe
5,2001-01-22 18:00:00,2.5,11091,MoTIe
6,2001-01-22 19:00:00,2.222222,11091,MoTIe
7,2001-01-22 20:00:00,2.5,11091,MoTIe
8,2001-01-22 21:00:00,2.5,11091,MoTIe
9,2001-01-22 22:00:00,2.777778,11091,MoTIe


In [94]:
# save to csv
MoTIe_wind.to_csv(f'{path_2}/MoTIe_wind.csv', index=False)

#### MoTIm Data Cleaning

In [96]:
MoTIm_data = functions.to_df('MoTIm_data_clean.csv')

In [97]:
column_names, column_nulls, duplicate_row_index = functions.start_cleaning(MoTIm_data)

In [98]:
print(f'column names: {column_names}')
print(f'column nulls: {column_nulls}')
print(f'duplicate rows: {duplicate_row_index}')
MoTIm_data.shape

column names: ['current_air_temperature1', 'relative_humidity1', 'storm_snow', 'standard_snow', 'minimum_air_temperature', 'maximum_air_temperature', 'time', 'precipitation_new', 'height_of_snow', 'station_id', 'network_id', 'interval_snow', 'measured_wind_speed1', 'measured_wind_direction1']
column nulls: current_air_temperature1      9742
relative_humidity1          409244
storm_snow                  466916
standard_snow               208950
minimum_air_temperature      32973
maximum_air_temperature      33700
time                             0
precipitation_new           432471
height_of_snow              106445
station_id                       0
network_id                       0
interval_snow               685431
measured_wind_speed1        746336
measured_wind_direction1    763853
dtype: int64
duplicate rows: Index([], dtype='int64')


(769279, 14)

In [99]:
# only keep variables of interest
# no solar data available from this network
MoTIm_data_wind = MoTIm_data.loc[:,('time', 'measured_wind_speed1', 'station_id', 'network_id')] 

In [109]:
# count null values in wind data
MoTIm_data_wind.isna().sum()

time                         0
measured_wind_speed1    746336
station_id                   0
network_id                   0
dtype: int64

In [110]:
# drop rows in wind data that do not have wind speed values
MoTIm_data_wind_clean = MoTIm_data_wind.dropna(subset=['measured_wind_speed1'])
MoTIm_data_wind_clean.isna().sum()

time                    0
measured_wind_speed1    0
station_id              0
network_id              0
dtype: int64

In [111]:
# check wind speed values, units
# no units recorded in station data file
MoTIm_data_wind_clean['measured_wind_speed1'].head(15)

5667     5.0
6865     5.0
6867     5.0
6872     1.0
6875     1.0
6881     5.0
6889     5.0
6893     5.0
7318     5.0
7326     2.0
7327     2.0
7329    12.0
7340    30.0
7343     2.0
7347     5.0
Name: measured_wind_speed1, dtype: float64

In [112]:
# check wind speed max values
MoTIm_data_wind_clean.nlargest(3, 'measured_wind_speed1')


Unnamed: 0,time,measured_wind_speed1,station_id,network_id
515430,1986-12-11 05:30:00,80.0,43101,MoTIm
662478,1985-11-09 07:00:00,80.0,53001,MoTIm
392116,1984-12-29 06:00:00,70.0,38001,MoTIm


In [113]:
# check windspeed median value
MoTIm_data_wind_clean.median(numeric_only=True)

measured_wind_speed1        0.0
station_id              26101.0
dtype: float64

In [114]:
# check windspeed mean value
MoTIm_data_wind_clean.mean(numeric_only=True)

measured_wind_speed1        5.128405
station_id              26449.262520
dtype: float64

In [None]:
# based on max outside of reasonable range, and over half of the values recorded = 0 (indicated by median = 0), assumed wind speed units were km/h

In [115]:
# convert windspeed values from km/h to m/s by dividing values by 3.6
MoTIm_data_wind_clean.loc[:,'measured_wind_speed1'] = MoTIm_data_wind_clean.loc[:,'measured_wind_speed1'].div(3.6)
MoTIm_data_wind_clean.head(5)

Unnamed: 0,time,measured_wind_speed1,station_id,network_id
5667,1980-01-30 05:30:00,1.388889,11101,MoTIm
6865,1986-03-09 16:00:00,1.388889,11101,MoTIm
6867,1986-03-10 16:00:00,1.388889,11101,MoTIm
6872,1986-03-13 06:00:00,0.277778,11101,MoTIm
6875,1986-03-14 16:00:00,0.277778,11101,MoTIm


In [116]:
# remove values outside of reasonable range
MoTIm_data_wind_clean = MoTIm_data_wind_clean[MoTIm_data_wind_clean['measured_wind_speed1'] < 65]
MoTIm_data_wind_clean = MoTIm_data_wind_clean[MoTIm_data_wind_clean['measured_wind_speed1'] >= 0]

In [117]:
# rename columns
MoTIm_wind = MoTIm_data_wind_clean.rename(columns={"measured_wind_speed1": "wind_speed"})

In [None]:
# replace station_id values if necessary
# not necessary

In [118]:
# save to csv
MoTIm_wind.to_csv(f'{path_2}/MoTIm_wind.csv', index=False)

#### MVan Data Cleaning

In [119]:
MVan_data = functions.to_df('MVan_data_clean.csv')

In [120]:
column_names, column_nulls, duplicate_row_index = functions.start_cleaning(MVan_data)

In [121]:
print(f'column names: {column_names}')
print(f'column nulls: {column_nulls}')
print(f'duplicate rows: {duplicate_row_index}')
MVan_data.shape

column names: ['wdir_vect', 'time', 'wspd_sclr', 'humidity', 'station_id', 'network_id']
column nulls: wdir_vect       6022
time               0
wspd_sclr     157014
humidity      104506
station_id         0
network_id         0
dtype: int64
duplicate rows: Index([], dtype='int64')


(283907, 6)

In [122]:
# only keep variables of interest
# no solar data available from this network
MVan_data_wind = MVan_data.loc[:,('time', 'wspd_sclr', 'station_id', 'network_id')]

In [123]:
# count null values in wind data
MVan_data_wind.isna().sum()

time               0
wspd_sclr     157014
station_id         0
network_id         0
dtype: int64

In [124]:
# drop rows in wind data that do not have WindSpeed values
MVan_data_wind_clean = MVan_data_wind.dropna(subset=['wspd_sclr'])
MVan_data_wind_clean.isna().sum()

time          0
wspd_sclr     0
station_id    0
network_id    0
dtype: int64

In [125]:
# check wind speed values, units
# station data file indicates units are km/h
MVan_data_wind_clean.head(5)

Unnamed: 0,time,wspd_sclr,station_id,network_id
0,2000-01-01 00:00:00,1.38889,T12,MVan
1,2000-01-01 01:00:00,3.33334,T12,MVan
2,2000-01-01 02:00:00,1.66667,T12,MVan
3,2000-01-01 03:00:00,2.22222,T12,MVan
4,2000-01-01 04:00:00,1.94445,T12,MVan


In [126]:
# convert windspeed values from km/h to m/s by dividing values by 3.6
MVan_data_wind_clean.loc[:,'wspd_sclr'] = MVan_data_wind_clean.loc[:,'wspd_sclr'].div(3.6)
MVan_data_wind_clean.head(5)

Unnamed: 0,time,wspd_sclr,station_id,network_id
0,2000-01-01 00:00:00,0.385803,T12,MVan
1,2000-01-01 01:00:00,0.925928,T12,MVan
2,2000-01-01 02:00:00,0.462964,T12,MVan
3,2000-01-01 03:00:00,0.617283,T12,MVan
4,2000-01-01 04:00:00,0.540125,T12,MVan


In [127]:
# remove values outside reasonable range
MVan_data_wind_clean = MVan_data_wind_clean[MVan_data_wind_clean['wspd_sclr'] < 65]
MVan_data_wind_clean = MVan_data_wind_clean[MVan_data_wind_clean['wspd_sclr'] >= 0]

In [128]:
# rename columns
MVan_wind = MVan_data_wind_clean.rename(columns={"wspd_sclr": "wind_speed"})

In [None]:
# replace station_id values if necessary
# not necessary

In [129]:
# save to csv
MVan_wind.to_csv(f'{path_2}/MVan_wind.csv', index=False)

#### UNBC_CAM Data Cleaning

In [130]:
UNBC_CAM_data = functions.to_df('UNBC_CAM_data_clean.csv')

In [131]:
column_names, column_nulls, duplicate_row_index = functions.start_cleaning(UNBC_CAM_data)

In [132]:
print(f'column names: {column_names}')
print(f'column nulls: {column_nulls}')
print(f'duplicate rows: {duplicate_row_index}')
UNBC_CAM_data.shape

column names: ['rain_tot', 'solar_avg', 'solar_std', 'ws_std', 'airtc', 'time', 'rh', 'winddir_avg', 'dbtcdt', 'station_id', 'network_id', 'pressure', 'winddir_std', 'ws_avg', 'k_dn_avg', 'l_dn_avg', 'l_up_avg', 'k_up_avg']
column nulls: rain_tot        693656
solar_avg      1949847
solar_std      2097928
ws_std          274907
airtc           178388
time                 0
rh              145556
winddir_avg     780710
dbtcdt          307674
station_id           0
network_id           0
pressure        811623
winddir_std    2392832
ws_avg         2045467
k_dn_avg       2305685
l_dn_avg       2305685
l_up_avg       2305685
k_up_avg       2305685
dtype: int64
duplicate rows: Index([], dtype='int64')


(2634998, 18)

In [133]:
# only keep variables of interest
UNBC_CAM_data_solar = UNBC_CAM_data.loc[:,('time', 'solar_avg', 'station_id', 'network_id')]
UNBC_CAM_data_wind = UNBC_CAM_data.loc[:,('time', 'ws_avg', 'station_id', 'network_id')]

In [134]:
# count null values in solar data
UNBC_CAM_data_solar.isna().sum()


time                0
solar_avg     1949847
station_id          0
network_id          0
dtype: int64

In [135]:
# drop rows in solar data that do not have solar radiation values
UNBC_CAM_data_solar_clean = UNBC_CAM_data_solar.dropna(subset=['solar_avg'])
UNBC_CAM_data_solar_clean.isna().sum()

time          0
solar_avg     0
station_id    0
network_id    0
dtype: int64

In [136]:
# remove values outside of reasonable range
UNBC_CAM_data_solar_clean = UNBC_CAM_data_solar_clean[UNBC_CAM_data_solar_clean['solar_avg'] < 950]
UNBC_CAM_data_solar_clean = UNBC_CAM_data_solar_clean[UNBC_CAM_data_solar_clean['solar_avg'] >= 0]

In [137]:
# rename columns
UNBC_CAM_solar = UNBC_CAM_data_solar_clean.rename(columns={"solar_avg": "solar_radiation"})
UNBC_CAM_solar

Unnamed: 0,time,solar_radiation,station_id,network_id
0,2010-01-01 00:00:00,0.00,ancifore,UNBC_CAM
1,2010-01-01 00:15:00,0.00,ancifore,UNBC_CAM
2,2010-01-01 00:29:59,0.00,ancifore,UNBC_CAM
3,2010-01-01 00:45:00,0.00,ancifore,UNBC_CAM
4,2010-01-01 01:00:00,0.00,ancifore,UNBC_CAM
...,...,...,...,...
2279813,2017-11-17 13:45:00,17.93,tatulake,UNBC_CAM
2279814,2017-11-17 14:00:00,20.04,tatulake,UNBC_CAM
2279815,2017-11-17 14:15:00,19.51,tatulake,UNBC_CAM
2279816,2017-11-17 14:30:00,26.36,tatulake,UNBC_CAM


In [None]:
# replace network_id column values if necessary
# not necessary

In [138]:
UNBC_CAM_solar.to_csv(f'{path_2}/UNBC_CAM_solar.csv', index=False)

In [139]:
# count null values in wind data
UNBC_CAM_data_wind.isna().sum()

time                0
ws_avg        2045467
station_id          0
network_id          0
dtype: int64

In [140]:
# drop rows in wind data that do not have WindSpeed values
UNBC_CAM_data_wind_clean = UNBC_CAM_data_wind.dropna(subset=['ws_avg'])
UNBC_CAM_data_wind_clean.isna().sum()

time          0
ws_avg        0
station_id    0
network_id    0
dtype: int64

In [142]:
# check wind speed values, units
# station data file states wind speed units are m/s
UNBC_CAM_data_wind_clean.nlargest(3, 'ws_avg')

Unnamed: 0,time,ws_avg,station_id,network_id
809545,2011-11-27 18:45:00,13.02,lowecast,UNBC_CAM
808114,2011-11-12 21:00:00,12.7,lowecast,UNBC_CAM
878341,2013-12-17 22:30:00,12.58,lowecast,UNBC_CAM


In [143]:
# remove values outside reasonable range
UNBC_CAM_data_wind_clean = UNBC_CAM_data_wind_clean[UNBC_CAM_data_wind_clean['ws_avg'] < 65]
UNBC_CAM_data_wind_clean = UNBC_CAM_data_wind_clean[UNBC_CAM_data_wind_clean['ws_avg'] >= 0]

In [144]:
# rename columns
UNBC_CAM_wind = UNBC_CAM_data_wind_clean.rename(columns={"ws_avg": "wind_speed"})
UNBC_CAM_wind

Unnamed: 0,time,wind_speed,station_id,network_id
748750,2010-01-01 00:00:00,6.214,lowecast,UNBC_CAM
748751,2010-01-01 00:15:00,5.291,lowecast,UNBC_CAM
748752,2010-01-01 00:29:59,5.138,lowecast,UNBC_CAM
748753,2010-01-01 00:45:00,6.429,lowecast,UNBC_CAM
748754,2010-01-01 01:00:00,6.987,lowecast,UNBC_CAM
...,...,...,...,...
1860321,2017-12-31 22:45:00,0.000,qrrc,UNBC_CAM
1860322,2017-12-31 23:00:00,0.000,qrrc,UNBC_CAM
1860323,2017-12-31 23:15:00,0.000,qrrc,UNBC_CAM
1860324,2017-12-31 23:30:00,0.000,qrrc,UNBC_CAM


In [None]:
# replace network_id values if necessary
# not necessary

In [145]:
# save to csv
UNBC_CAM_wind.to_csv(f'{path_2}/UNBC_CAM_wind.csv', index=False)

### Confirm that all column names are standardized

In [146]:
# Create list of wind and solar filenames

keywords = ['wind', 'solar']
# create list to store wind records
wind_files = []
# create list to store solar records
solar_files = []
# create list of files in data folder
file_names = os.listdir(f"{path_2}")
# iterate through files in data folder
for file_name in file_names:
    # match against keywords
    if keywords[0] in file_name:
        wind_files.append(file_name)
    elif keywords[1] in file_name:
        solar_files.append(file_name)
    else: continue

print(f"solar files: {solar_files}")
print(f"wind files: {wind_files}")

solar files: ['BCH_solar.csv', 'CRD_solar.csv', 'FLNRO_FERN_solar.csv', 'UNBC_CAM_solar.csv']
wind files: ['BCH_wind.csv', 'CRD_wind.csv', 'EC_raw_wind.csv', 'ENV_AQN_wind.csv', 'FLNRO_FERN_wind.csv', 'MoTIe_wind.csv', 'MoTIm_wind.csv', 'MVan_wind.csv', 'UNBC_CAM_wind.csv']


In [147]:
# print list of column names for each solar file
for file in solar_files:
    print(f"{file}")
    print(list(pd.read_csv(f'{path_2}/{file}', index_col=0, nrows=0)))


BCH_solar.csv
['solar_radiation', 'station_id', 'network_id']
CRD_solar.csv
['solar_radiation', 'station_id', 'network_id']
FLNRO_FERN_solar.csv
['solar_radiation', 'station_id', 'network_id']
UNBC_CAM_solar.csv
['solar_radiation', 'station_id', 'network_id']


In [148]:
# print list of column names for each wind file
for file in wind_files:
    print(f"{file}")
    print(list(pd.read_csv(f'{path_2}/{file}', index_col=0, nrows=0)))

BCH_wind.csv
['wind_speed', 'station_id', 'network_id']
CRD_wind.csv
['wind_speed', 'station_id', 'network_id']
EC_raw_wind.csv
['wind_speed', 'station_id', 'network_id']
ENV_AQN_wind.csv
['wind_speed', 'station_id', 'network_id']
FLNRO_FERN_wind.csv
['wind_speed', 'station_id', 'network_id']
MoTIe_wind.csv
['wind_speed', 'station_id', 'network_id']
MoTIm_wind.csv
['wind_speed', 'station_id', 'network_id']
MVan_wind.csv
['wind_speed', 'station_id', 'network_id']
UNBC_CAM_wind.csv
['wind_speed', 'station_id', 'network_id']
