### Initializing

In [2]:
import pandas as pd
import os 
import functions

In [3]:
# test for proper access to functions module
functions.temp()

hello


In [4]:
# define path variable for data folder containing concatenated .csv files
path_2 = '../data'

### Data cleaning
Objectives:
- remove irrelevant/null columns
- keep columns variables associated with wind power generation
    - wind speed
    - air density
        - air pressure
        - air temperature
        - relative humidity
- keep column variables associated with solar power generation
    - solar radiation
    - cloud cover fraction?
    - sunlight duration?
- standardize column data with common variables (units, format)

#### BCH Data Cleaning

In [6]:
BCH_data_clean = functions.to_df('BCH_data_clean.csv')
functions.list_columns(BCH_data_clean)

['windspeed',
 'time',
 'winddirection',
 'station_id',
 'network_id',
 'l_down_corr_avg',
 'hft3_1_avg',
 'l_up_avg',
 'one_day_snow',
 'k_up_avg',
 'l_up_corr_avg',
 'airtemp',
 'l_down_avg',
 'hft3_2_avg',
 'k_down_avg',
 'barometricpressure',
 'soilvolumetricwatercontent',
 'relativehumidity',
 'tsoil_avg',
 'vis',
 'one_day_rain',
 'one_day_precipitation',
 'min_temp',
 'max_temp',
 'snow_we',
 'snow_on_the_ground',
 'netrad']

In [8]:
# only include variables of interest
BCH_data_wind = BCH_data_clean.loc[:,('time', 'windspeed', 'barometricpressure', 'airtemp', 'relativehumidity', 'station_id', 'network_id')]
BCH_data_solar = BCH_data_clean.loc[:,('time', 'netrad', 'station_id', 'network_id')]

##### BCH Wind Data Cleaning

In [9]:
# count null values in wind data
BCH_data_wind.isna().sum()

time                        0
windspeed             6797851
barometricpressure    6833815
airtemp               1306287
relativehumidity      6827106
station_id                  0
network_id                  0
dtype: int64

In [10]:
# drop rows in wind data that do not have a wind speed value
# wind speed is a key variable that is critical for location-specific energy calculations
BCH_data_wind_clean = BCH_data_wind.dropna(subset=['windspeed'])
BCH_data_wind_clean.isna().sum()

time                      0
windspeed                 0
barometricpressure    36272
airtemp               29564
relativehumidity      29567
station_id                0
network_id                0
dtype: int64

In [11]:
# check whether number of remaining nulls are the same as the number of rows
BCH_data_wind_clean.shape

(218988, 7)

In [12]:
# check for duplicates
duplicate_rows = BCH_data_wind_clean.duplicated()
duplicate_rows[duplicate_rows].index

Index([], dtype='int64')

In [13]:
# rename columns
BCH_wind = BCH_data_wind_clean.rename(columns={"windspeed": "wind_speed", "barometricpressure": "air_pressure", "airtemp": "air_temperature", "relativehumidity": "relative_humidity"})
BCH_wind

Unnamed: 0,time,wind_speed,air_pressure,air_temperature,relative_humidity,station_id,network_id
0,2017-01-01 01:00:00,5.7,,,,85A,BCH
1,2017-01-01 02:00:00,5.9,,,,85A,BCH
2,2017-01-01 03:00:00,5.8,,,,85A,BCH
3,2017-01-01 04:00:00,4.7,,,,85A,BCH
4,2017-01-01 05:00:00,4.8,,,,85A,BCH
...,...,...,...,...,...,...,...
6280365,2020-06-30 20:00:00,1.6,1011.0,14.2,82.6,TEC,BCH
6280366,2020-06-30 21:00:00,1.7,1011.0,14.3,86.2,TEC,BCH
6280367,2020-06-30 22:00:00,0.8,1011.0,13.7,88.7,TEC,BCH
6280368,2020-06-30 23:00:00,1.7,1011.0,13.6,89.4,TEC,BCH


In [14]:
# save to csv
BCH_wind.to_csv('../data/BCH_wind.csv', index=False)

Plan for handling remaining nulls:
- use values from next nearest weather station
- use averages from data as a whole if necessary

##### BCH Solar Data Cleaning

In [15]:
# count null values in solar data
BCH_data_solar.isna().sum()

time                0
netrad        6928912
station_id          0
network_id          0
dtype: int64

In [16]:
# drop rows in solar data that do not have a NetRad value
BCH_data_solar_clean = BCH_data_solar.dropna(subset=['netrad'])
BCH_data_solar_clean.isna().sum()

time          0
netrad        0
station_id    0
network_id    0
dtype: int64

In [17]:
# check for duplicates
duplicate_rows = BCH_data_solar_clean.duplicated()
duplicate_rows[duplicate_rows].index

Index([], dtype='int64')

In [36]:
BCH_solar = BCH_data_solar_clean.rename(columns={"netrad": "solar_radiation"})
BCH_solar

Unnamed: 0,time,solar_radiation,station_id,network_id
495016,2017-09-29 13:00:00,275.2,ATP,BCH
495017,2017-09-29 14:00:00,222.4,ATP,BCH
495018,2017-09-29 15:00:00,86.3,ATP,BCH
495019,2017-09-29 16:00:00,14.0,ATP,BCH
495020,2017-09-29 17:00:00,1.7,ATP,BCH
...,...,...,...,...
6258061,2020-06-30 20:00:00,-16.2,TAB,BCH
6258062,2020-06-30 21:00:00,-20.3,TAB,BCH
6258063,2020-06-30 22:00:00,-15.2,TAB,BCH
6258064,2020-06-30 23:00:00,-13.3,TAB,BCH


In [37]:
# save BCH solar data to csv
BCH_solar.to_csv('../data/BCH_solar.csv', index=False)

#### CRD Data Cleaning

In [19]:
CRD_data = functions.to_df('CRD_data_clean.csv')

In [20]:
column_names, column_nulls, duplicate_row_index = functions.start_cleaning(CRD_data)

In [21]:
print(f'column names: {column_names}')
print(f'column nulls: {column_nulls}')
print(f'duplicate rows: {duplicate_row_index}')
CRD_data.shape

column names: ['solarradiation', 'airtemperature', 'precipitation', 'rain', 'windspeed', 'snowdepth', 'time', 'winddirection', 'relativehumidity', 'station_id', 'network_id', 'snowwaterequivalent']
column nulls: solarradiation         4090779
airtemperature          180523
precipitation          3328235
rain                    438757
windspeed               655130
snowdepth              3711426
time                         0
winddirection           658523
relativehumidity        591010
station_id                   0
network_id                   0
snowwaterequivalent    4078683
dtype: int64
duplicate rows: Index([], dtype='int64')


(4100571, 12)

In [22]:
# only keep variables of interest
CRD_data_solar = CRD_data.loc[:,('time', 'solarradiation', 'station_id', 'network_id')]
CRD_data_wind = CRD_data.loc[:,('time', 'windspeed', 'airtemperature', 'relativehumidity', 'station_id', 'network_id')] 
    # note that no air pressure value is available

In [23]:
# drop rows in solar data that do not have SolarRadiation values
CRD_data_solar_clean = CRD_data_solar.dropna(subset=['solarradiation'])
CRD_data_solar_clean.isna().sum()

time              0
solarradiation    0
station_id        0
network_id        0
dtype: int64

In [38]:
CRD_solar = CRD_data_solar_clean.rename(columns={"solarradiation": "solar_radiation"})

In [39]:
CRD_solar.to_csv(f'{path_2}/CRD_solar.csv', index=False)

In [40]:
# count null values in wind data
CRD_data_wind.isna().sum()

time                     0
windspeed           655130
airtemperature      180523
relativehumidity    591010
station_id               0
network_id               0
dtype: int64

In [28]:
# drop rows in wind data that do not have WindSpeed values
CRD_data_wind_clean = CRD_data_wind.dropna(subset=['windspeed'])
CRD_data_wind_clean.isna().sum()

time                     0
windspeed                0
airtemperature      179949
relativehumidity      1181
station_id               0
network_id               0
dtype: int64

In [29]:
# check windspeed values 
CRD_data_wind_clean.head(15)

Unnamed: 0,time,windspeed,airtemperature,relativehumidity,station_id,network_id
0,1998-04-17 00:00:00,0.0,0.9,100.0,FW001,CRD
1,1998-04-17 01:00:00,0.0,1.2,100.0,FW001,CRD
2,1998-04-17 02:00:00,0.0,0.7,100.0,FW001,CRD
3,1998-04-17 03:00:00,0.0,0.9,100.0,FW001,CRD
4,1998-04-17 04:00:00,0.0,0.1,100.0,FW001,CRD
5,1998-04-17 05:00:00,0.0,0.1,100.0,FW001,CRD
6,1998-04-17 06:00:00,0.0,0.4,100.0,FW001,CRD
7,1998-04-17 07:00:00,0.0,2.7,100.0,FW001,CRD
8,1998-04-17 08:00:00,0.0,8.2,74.0,FW001,CRD
9,1998-04-17 09:00:00,3.8,9.4,64.0,FW001,CRD


In [30]:
# convert windspeed values from km/h to m/s by dividing values by 3.6
CRD_data_wind_clean.loc[:,'windspeed'] = CRD_data_wind_clean.loc[:,'windspeed'].div(3.6)
CRD_data_wind_clean.head(15)

Unnamed: 0,time,windspeed,airtemperature,relativehumidity,station_id,network_id
0,1998-04-17 00:00:00,0.0,0.9,100.0,FW001,CRD
1,1998-04-17 01:00:00,0.0,1.2,100.0,FW001,CRD
2,1998-04-17 02:00:00,0.0,0.7,100.0,FW001,CRD
3,1998-04-17 03:00:00,0.0,0.9,100.0,FW001,CRD
4,1998-04-17 04:00:00,0.0,0.1,100.0,FW001,CRD
5,1998-04-17 05:00:00,0.0,0.1,100.0,FW001,CRD
6,1998-04-17 06:00:00,0.0,0.4,100.0,FW001,CRD
7,1998-04-17 07:00:00,0.0,2.7,100.0,FW001,CRD
8,1998-04-17 08:00:00,0.0,8.2,74.0,FW001,CRD
9,1998-04-17 09:00:00,1.055556,9.4,64.0,FW001,CRD


In [41]:
# rename columns
CRD_wind = CRD_data_wind_clean.rename(columns={"windspeed":"wind_speed", "airtemperature": "air_temperature", "relativehumidity":"relative_humidity"})

In [42]:
# save to csv
CRD_wind.to_csv(f'{path_2}/CRD_wind.csv', index=False)

#### EC_raw Data Cleaning

In [33]:
EC_raw_data = functions.to_df('EC_raw_data_clean.csv')

  df_name = pd.read_csv(f'{path_2}/{data_name}.csv')


In [34]:
column_names, column_nulls, duplicate_row_index = functions.start_cleaning(EC_raw_data)

In [35]:
print(f'column names: {column_names}')
print(f'column nulls: {column_nulls}')
print(f'duplicate rows: {duplicate_row_index}')

column names: ['wind_speed', 'mean_sea_level', 'dew_point', 'wind_direction', 'relative_humidity', 'air_temperature', 'time', 'air_temperature_yesterday_low', 'wind_gust_speed', 'air_temperature_yesterday_high', 'tendency_amount', 'station_id', 'network_id', 'total_precipitation', 'snow_amount', 'total_cloud_cover']
column nulls: wind_speed                         1012435
mean_sea_level                     1326710
dew_point                           932842
wind_direction                     1834304
relative_humidity                   935655
air_temperature                     411231
time                                     0
air_temperature_yesterday_low     12391030
wind_gust_speed                   11440570
air_temperature_yesterday_high    12390760
tendency_amount                    2839650
station_id                               0
network_id                               0
total_precipitation               12295547
snow_amount                       12736774
total_cloud_cover      

In [43]:
# only keep variables of interest
# no solar data available from this network
EC_raw_data_wind = EC_raw_data.loc[:,('time', 'wind_speed', 'air_temperature', 'relative_humidity', 'station_id', 'network_id')] 
    # note that no air pressure column is available

In [44]:
# count null values in wind data
EC_raw_data_wind.isna().sum()

time                       0
wind_speed           1012435
air_temperature       411231
relative_humidity     935655
station_id                 0
network_id                 0
dtype: int64

In [45]:
# drop rows in wind data that do not have values in windspeed column
EC_raw_data_wind_clean = EC_raw_data_wind.dropna(subset=['wind_speed'])
EC_raw_data_wind_clean.isna().sum()

time                      0
wind_speed                0
air_temperature      331537
relative_humidity    802170
station_id                0
network_id                0
dtype: int64

In [47]:
# check windspeed units
EC_raw_data_wind_clean.head(15)
# station metadata states km/h

Unnamed: 0,time,wind_speed,air_temperature,relative_humidity,station_id,network_id
138,2011-10-16 00:00:00,16.9,9.9,78.0,1012475,EC_raw
139,2011-10-16 01:00:00,16.2,9.9,78.0,1012475,EC_raw
140,2011-10-16 02:00:00,10.4,9.7,82.0,1012475,EC_raw
141,2011-10-16 03:00:00,9.4,9.3,83.0,1012475,EC_raw
142,2011-10-16 04:00:00,13.3,9.1,83.0,1012475,EC_raw
143,2011-10-16 05:00:00,15.5,9.1,84.0,1012475,EC_raw
144,2011-10-16 06:00:00,17.3,8.8,84.0,1012475,EC_raw
145,2011-10-16 07:00:00,16.9,8.8,83.0,1012475,EC_raw
146,2011-10-16 08:00:00,16.2,8.4,81.0,1012475,EC_raw
147,2011-10-16 09:00:00,8.3,7.7,83.0,1012475,EC_raw


In [48]:
# convert windspeed values from km/h to m/s by dividing values by 3.6
EC_raw_data_wind_clean.loc[:,('wind_speed')] = EC_raw_data_wind_clean.loc[:,('wind_speed')].div(3.6)
EC_raw_data_wind_clean.head(15)

Unnamed: 0,time,wind_speed,air_temperature,relative_humidity,station_id,network_id
138,2011-10-16 00:00:00,4.694444,9.9,78.0,1012475,EC_raw
139,2011-10-16 01:00:00,4.5,9.9,78.0,1012475,EC_raw
140,2011-10-16 02:00:00,2.888889,9.7,82.0,1012475,EC_raw
141,2011-10-16 03:00:00,2.611111,9.3,83.0,1012475,EC_raw
142,2011-10-16 04:00:00,3.694444,9.1,83.0,1012475,EC_raw
143,2011-10-16 05:00:00,4.305556,9.1,84.0,1012475,EC_raw
144,2011-10-16 06:00:00,4.805556,8.8,84.0,1012475,EC_raw
145,2011-10-16 07:00:00,4.694444,8.8,83.0,1012475,EC_raw
146,2011-10-16 08:00:00,4.5,8.4,81.0,1012475,EC_raw
147,2011-10-16 09:00:00,2.305556,7.7,83.0,1012475,EC_raw


In [None]:
# rename columns
# not required

In [65]:
# save to csv
EC_raw_data_wind_clean.to_csv(f'{path_2}/EC_raw_wind.csv', index=False)

#### ENV_AQN Data Cleaning

In [51]:
ENV_AQN_data = functions.to_df('ENV_AQN_data_clean.csv')

  df_name = pd.read_csv(f'{path_2}/{data_name}.csv')


In [52]:
column_names, column_nulls, duplicate_row_index = functions.start_cleaning(ENV_AQN_data)

In [53]:
print(f'column names: {column_names}')
print(f'column nulls: {column_nulls}')
print(f'duplicate rows: {duplicate_row_index}')

column names: ['wdir_vect', 'temp_mean', 'wspd_sclr', 'precip_total', 'avg_rel_hum_pst1hr', 'time', 'humidity', 'avg_air_temp_pst1hr', 'station_id', 'network_id', 'bar_press']
column nulls: wdir_vect               6568691
temp_mean               5413225
wspd_sclr               2296217
precip_total           12783840
avg_rel_hum_pst1hr      8968906
time                          0
humidity                8688631
avg_air_temp_pst1hr     7985418
station_id                    0
network_id                    0
bar_press              12962966
dtype: int64
duplicate rows: Index([], dtype='int64')


In [54]:
# only keep variables of interest
# no solar data available for this network
ENV_AQN_data_wind = ENV_AQN_data.loc[:,('time', 'wspd_sclr', 'temp_mean', 'humidity', 'bar_press', 'station_id', 'network_id')] 

In [55]:
# count null values in wind data
ENV_AQN_data_wind.isna().sum()

time                 0
wspd_sclr      2296217
temp_mean      5413225
humidity       8688631
bar_press     12962966
station_id           0
network_id           0
dtype: int64

In [56]:
# drop rows in wind data that do not have windspeed values
ENV_AQN_data_wind_clean = ENV_AQN_data_wind.dropna(subset=['wspd_sclr'])
ENV_AQN_data_wind_clean.isna().sum()

time                 0
wspd_sclr            0
temp_mean      3757129
humidity       7892770
bar_press     10684548
station_id           0
network_id           0
dtype: int64

In [38]:
# check windspeed values
ENV_AQN_data_wind_clean['wspd_sclr'].head(15)

0     1.044
1     2.518
2     1.028
3     0.704
4     0.208
5     0.140
6     0.534
7     0.457
8     0.916
9     0.303
10    0.403
11    0.378
12    0.750
13    0.576
14    0.224
Name:  WSPD_SCLR, dtype: float64

In [61]:
# check windspeed max values
ENV_AQN_data_wind_clean.nlargest(50, 'wspd_sclr')


Unnamed: 0,time,wspd_sclr,temp_mean,humidity,bar_press,station_id,network_id
5579171,2009-01-03 00:00:00,299.872,,,,E257415,ENV-AQN
5579165,2009-01-02 18:00:00,299.676,,,,E257415,ENV-AQN
5579168,2009-01-02 21:00:00,299.667,,,,E257415,ENV-AQN
5579169,2009-01-02 22:00:00,299.666,,,,E257415,ENV-AQN
5579170,2009-01-02 23:00:00,299.666,,,,E257415,ENV-AQN
5579167,2009-01-02 20:00:00,299.66,,,,E257415,ENV-AQN
5579166,2009-01-02 19:00:00,299.659,,,,E257415,ENV-AQN
5579151,2009-01-02 04:00:00,299.542,,,,E257415,ENV-AQN
5580311,2009-02-19 12:00:00,299.351,,,,E257415,ENV-AQN
5580052,2009-02-08 17:00:00,298.838,,,,E257415,ENV-AQN


*Note: no units were provided for wind speed column. Based on maximum values, assumed that units were km/h.*

In [62]:
# convert windspeed values from km/h to m/s by dividing values by 3.6
ENV_AQN_data_wind_clean.loc[:,'wspd_sclr'] = ENV_AQN_data_wind_clean.loc[:,'wspd_sclr'].div(3.6)
ENV_AQN_data_wind_clean.head(15)

Unnamed: 0,time,wspd_sclr,temp_mean,humidity,bar_press,station_id,network_id
0,1998-03-12 15:00:00,0.29,12.17,,,110031,ENV-AQN
1,1998-03-12 16:00:00,0.699444,11.8,,,110031,ENV-AQN
2,1998-03-12 17:00:00,0.285556,11.29,,,110031,ENV-AQN
3,1998-03-12 18:00:00,0.195556,10.7,,,110031,ENV-AQN
4,1998-03-12 19:00:00,0.057778,10.28,,,110031,ENV-AQN
5,1998-03-12 20:00:00,0.038889,10.15,,,110031,ENV-AQN
6,1998-03-12 21:00:00,0.148333,9.88,,,110031,ENV-AQN
7,1998-03-12 22:00:00,0.126944,9.8,,,110031,ENV-AQN
8,1998-03-12 23:00:00,0.254444,9.94,,,110031,ENV-AQN
9,1998-03-13 00:00:00,0.084167,9.85,,,110031,ENV-AQN


In [63]:
# rename columns
ENV_AQN_wind = ENV_AQN_data_wind_clean.rename(columns={"wspd_sclr": "wind_speed", "temp_mean": "air_temperature", "humidity": "relative_humidity", "bar_press": "air_pressure"})

In [66]:
# save to csv
ENV_AQN_wind.to_csv(f'{path_2}/ENV_AQN_wind.csv', index=False)

#### FLNRO_FERN Data Cleaning

In [67]:
FLNRO_FERN_data = functions.to_df('FLNRO_FERN_data.csv')

  df_name = pd.read_csv(f'{path_2}/{data_name}.csv')


In [68]:
column_names, column_nulls, duplicate_row_index = functions.start_cleaning(FLNRO_FERN_data)

In [69]:
print(f'column names: {column_names}')
print(f'column nulls: {column_nulls}')
print(f'duplicate rows: {duplicate_row_index}')

column names: ['tempc', 'solarradiationwm', 'rh', 'windspeedms', 'pressurembar', 'rainmm', 'gustspeedms', 'time', 'winddirection', 'dewptc', 'station_id', 'network_id', 'wetness', 'tx', 'tn', 'rhx', 'tm', 'rhn']
column nulls: tempc                 78077
solarradiationwm     155467
rh                    76028
windspeedms          159515
pressurembar         143098
rainmm               163976
gustspeedms          171360
time                      0
winddirection        398990
dewptc               187998
station_id                0
network_id                0
wetness             1441326
tx                  2310484
tn                  2310515
rhx                 2331262
tm                  2319798
rhn                 2331284
dtype: int64
duplicate rows: Index([], dtype='int64')


In [70]:
# only keep variables of interest
FLNRO_FERN_data_solar = FLNRO_FERN_data.loc[:,('time', 'solarradiationwm', 'station_id', 'network_id')]
FLNRO_FERN_data_wind = FLNRO_FERN_data.loc[:,('time', 'windspeedms', 'tempc', 'rh', 'pressurembar', 'station_id', 'network_id')]

In [71]:
# drop rows in solar data that do not have SolarRadiation values
FLNRO_FERN_data_solar_clean = FLNRO_FERN_data_solar.dropna(subset=['solarradiationwm'])
FLNRO_FERN_data_solar_clean.isna().sum()

time                0
solarradiationwm    0
station_id          0
network_id          0
dtype: int64

In [72]:
# rename columns
FLNRO_FERN_solar = FLNRO_FERN_data_solar_clean.rename(columns={"solarradiationwm": "solar_radiation"})

In [73]:
FLNRO_FERN_solar.to_csv(f'{path_2}/FLNRO_FERN_solar.csv', index=False)

In [74]:
# count null values in wind data
FLNRO_FERN_data_wind.isna().sum()

time                 0
windspeedms     159515
tempc            78077
rh               76028
pressurembar    143098
station_id           0
network_id           0
dtype: int64

In [75]:
# drop rows in wind data that do not have WindSpeed values
FLNRO_FERN_data_wind_clean = FLNRO_FERN_data_wind.dropna(subset=['windspeedms'])
FLNRO_FERN_data_wind_clean.isna().sum()

time                0
windspeedms         0
tempc           53919
rh              51069
pressurembar    53561
station_id          0
network_id          0
dtype: int64

In [76]:
# rename columns
FLNRO_FERN_wind = FLNRO_FERN_data_wind_clean.rename(columns={"windspeedms": "wind_speed", "tempc": "air_temperature", "rh": "relative_humidity", "pressurembar": "air_pressure"})

In [77]:
# save to csv
FLNRO_FERN_wind.to_csv(f'{path_2}/FLNRO_FERN_wind.csv', index=False)

#### FLNRO_WMB Data Cleaning

*Note: memory error occurred when trying to import data. Will either exclude from analysis or clean data in pgAdmin*

In [None]:
# FLNRO_WMB_data = functions.to_df('FLNRO_WMB_data.csv')

In [None]:
# column_names, column_nulls, duplicate_row_index = functions.start_cleaning(FLNRO_WMB_data)

In [None]:
# print(f'column names: {column_names}')
# print(f'column nulls: {column_nulls}')
# print(f'duplicate rows: {duplicate_row_index}')

In [None]:
# # only keep variables of interest
# FLNRO_WMB_data_solar = FLNRO_WMB_data.loc[:,('time', 'solarradiationwm', 'station_id', 'network_id')]
# FLNRO_WMB_data_wind = FLNRO_WMB_data.loc[:,('time', 'windspeedms', 'tempc', 'rh', 'pressurembar', 'station_id', 'network_id')]

In [None]:
# # drop rows in solar data that do not have SolarRadiation values
# FLNRO_WMB_data_solar_clean = FLNRO_WMB_data_solar.dropna(subset=['solarradiationwm'])
# FLNRO_WMB_data_solar_clean.isna().sum()

In [None]:
# # rename columns
# FLNRO_WMB_solar = FLNRO_WMB_data_solar_clean.rename(columns={"solarradiationwm": "solar_radiation"})

In [None]:
# FLNRO_WMB_solar.to_csv(f'{path_2}/FLNRO_WMB_solar.csv', index=False)

In [None]:
# # count null values in wind data
# FLNRO_WMB_data_wind.isna().sum()

In [None]:
# # drop rows in wind data that do not have WindSpeed values
# FLNRO_WMB_data_wind_clean = FLNRO_WMB_data_wind.dropna(subset=['windspeedms'])
# FLNRO_WMB_data_wind_clean.isna().sum()

In [None]:
# # rename columns
# FLNRO_WMB_wind = FLNRO_WMB_data_wind_clean.rename(columns={"windspeedms": "wind_speed", "tempc": "air_temperature", "rh": "relative_humidity", "pressurembar": "air_pressure"})

In [None]:
# # save to csv
# FLNRO_WMB_wind.to_csv(f'{path_2}/FLNRO_WMB_wind.csv', index=False)

#### MoTIe Data Cleaning

In [5]:
MoTIe_col_names = pd.read_csv(f'{path_2}/MoTIe_data_clean.csv', index_col=0, nrows=0)

In [6]:
list(MoTIe_col_names)

['standard_snow',
 'minimum_air_temperature',
 'max_wnd_spd_10m_pst1hr',
 'measured_wind_speed1',
 'measured_wind_direction1',
 'dwpt_temp',
 'min_air_temp_snc_last_reset',
 'dew_point',
 'atmospheric_pressure',
 'avg_wnd_dir_10m_pst10mts',
 'actual_wind_direction',
 'maximum_air_temperature',
 'snw_dpth',
 'maximum_measured_wind_speed1',
 'height_of_snow',
 'pcpn_amt_pst24hrs',
 'current_air_temperature2',
 'current_air_temperature1',
 'precipitation_new',
 'max_air_temp_snc_last_reset',
 'precip_detector_ratio',
 'actual_wind_speed',
 'wind_direction_std_deviation1',
 'avg_wnd_spd_10m_pst10mts',
 'air_temp',
 'mslp',
 'pcpn_amt_pst1hr',
 'snwfl_amt_pst1hr',
 'rel_hum',
 'time',
 'stn_pres',
 'station_id',
 'network_id']

In [None]:
# # Note: MemoryError/kernel crashed when attempting to import entire csv at once. Modify approach to limit how much of the csv is imported.
# MoTIe_data = functions.to_df('MoTIe_data_clean.csv')

In [8]:
# limit which columns are included in the read_csv function in order to combat memory error
MoTIe_data = pd.read_csv(f'{path_2}/MoTIe_data_clean.csv', usecols=['time', 'actual_wind_speed', 'measured_wind_speed1', 'current_air_temperature1', 'current_air_temperature2', 'air_temp', 'rel_hum', 'relative_humidity1', 'atmospheric_pressure', 'stn_pres', 'station_id', 'network_id'])

In [9]:
column_names, column_nulls, duplicate_row_index = functions.start_cleaning(MoTIe_data)

In [10]:
print(f'column names: {column_names}')
print(f'column nulls: {column_nulls}')
print(f'duplicate rows: {duplicate_row_index}')
MoTIe_data.shape

column names: ['relative_humidity1', 'measured_wind_speed1', 'atmospheric_pressure', 'current_air_temperature2', 'current_air_temperature1', 'actual_wind_speed', 'air_temp', 'rel_hum', 'time', 'stn_pres', 'station_id', 'network_id']
column nulls: relative_humidity1           6415149
measured_wind_speed1         7597565
atmospheric_pressure        11195679
current_air_temperature2    17585743
current_air_temperature1     1047258
actual_wind_speed           17764947
air_temp                    19147710
rel_hum                     19712282
time                               0
stn_pres                    20692645
station_id                         0
network_id                         0
dtype: int64
duplicate rows: Index([], dtype='int64')


(23833427, 12)

In [11]:
# only keep variables of interest
# no solar data available from this network
# columns were already filtered when reading csv, rename df for naming consistency
MoTIe_data_wind = MoTIe_data.loc[:,('relative_humidity1', 'measured_wind_speed1', 'atmospheric_pressure', 'current_air_temperature2', 'current_air_temperature1', 'actual_wind_speed', 'air_temp', 'rel_hum', 'time', 'stn_pres', 'station_id', 'network_id')]

In [12]:
# count null values in wind data
MoTIe_data_wind.isna().sum()

relative_humidity1           6415149
measured_wind_speed1         7597565
atmospheric_pressure        11195679
current_air_temperature2    17585743
current_air_temperature1     1047258
actual_wind_speed           17764947
air_temp                    19147710
rel_hum                     19712282
time                               0
stn_pres                    20692645
station_id                         0
network_id                         0
dtype: int64

In [71]:
# drop rows in wind data that do not have WindSpeed values
MoTIe_data_wind_ws_filter = MoTIe_data_wind.dropna(subset=['actual_wind_speed', 'measured_wind_speed1'], how='all')
MoTIe_data_wind_ws_filter.isna().sum()

relative_humidity1           1658426
measured_wind_speed1            5861
atmospheric_pressure         5535077
current_air_temperature2    10958332
current_air_temperature1      121686
actual_wind_speed           10173243
air_temp                    13311538
rel_hum                     13492143
time                               0
stn_pres                    13944879
station_id                         0
network_id                         0
dtype: int64

Compare values between columns that are measuring the same variable

In [87]:
# check whether columns measuring the same value have any overlap

# wind speed
# create a mask where column with the fewest nulls has no null values
not_null_value = MoTIe_data_wind_ws_filter['measured_wind_speed1'].notnull()
not_null_value_df = MoTIe_data_wind_ws_filter[not_null_value]

# check for rows where both columns contain a value
print(not_null_value_df['actual_wind_speed'].notnull().sum())

6062619


In [76]:
# air temperature pt 1
# create a mask where column with the fewest nulls has no null values
not_null_value = MoTIe_data_wind_ws_filter['current_air_temperature1'].notnull()
not_null_value_df = MoTIe_data_wind_ws_filter[not_null_value]

# check for rows where both columns contain a value
print(not_null_value_df['current_air_temperature2'].notnull().sum())
print(not_null_value_df['air_temp'].notnull().sum())



5262748
2909536


In [77]:
# air temperature pt 2
# create a mask where column with the fewest nulls has no null values
not_null_value = MoTIe_data_wind_ws_filter['current_air_temperature2'].notnull()
not_null_value_df = MoTIe_data_wind_ws_filter[not_null_value]

# check for rows where both columns contain a value
print(not_null_value_df['current_air_temperature1'].notnull().sum())
print(not_null_value_df['air_temp'].notnull().sum())



5262748
0


In [86]:
# relative humidity
# create a mask where column with the fewest nulls has no null values
not_null_value = MoTIe_data_wind_ws_filter['relative_humidity1'].notnull()
not_null_value_df = MoTIe_data_wind_ws_filter[not_null_value]

# check for rows where both columns contain a value
print(not_null_value_df['rel_hum'].notnull().sum())



2554275


In [88]:
# air pressure
# create a mask where column with the fewest nulls has no null values
not_null_value = MoTIe_data_wind_ws_filter['atmospheric_pressure'].notnull()
not_null_value_df = MoTIe_data_wind_ws_filter[not_null_value]

# check for rows where both columns contain a value
print(not_null_value_df['stn_pres'].notnull().sum())



2043249


In [89]:
# narrow dataset down to one column for each variable
MoTIe_data_wind_clean = MoTIe_data_wind_ws_filter.loc[:, ('time', "actual_wind_speed", "current_air_temperature1", "relative_humidity1", "atmospheric_pressure", 'station_id', 'network_id')]
# rationale:
#   actual_wind_speed is a point measurement, measured_wind_speed1 is an average measurement
#   all other variables had overlap
#   kept the variable that contained the fewest null values

In [90]:
# check values and units of wind speed columns
MoTIe_data_wind_clean.head(15)

Unnamed: 0,time,actual_wind_speed,current_air_temperature1,relative_humidity1,atmospheric_pressure,station_id,network_id
0,2001-01-22 13:00:00,,2.2,76.0,963.0,11091,MoTIe
1,2001-01-22 14:00:00,,1.9,76.0,960.0,11091,MoTIe
2,2001-01-22 15:00:00,,1.4,77.0,960.0,11091,MoTIe
3,2001-01-22 16:00:00,,0.4,82.0,960.0,11091,MoTIe
4,2001-01-22 17:00:00,,-0.5,88.0,960.0,11091,MoTIe
5,2001-01-22 18:00:00,,-1.1,93.0,960.0,11091,MoTIe
6,2001-01-22 19:00:00,,-1.2,94.0,960.0,11091,MoTIe
7,2001-01-22 20:00:00,,-1.6,95.0,960.0,11091,MoTIe
8,2001-01-22 21:00:00,,-1.8,95.0,960.0,11091,MoTIe
9,2001-01-22 22:00:00,,-1.9,95.0,960.0,11091,MoTIe


In [91]:
# convert windspeed values from km/h to m/s by dividing values by 3.6
MoTIe_data_wind_clean.loc[:,('actual_wind_speed')] = MoTIe_data_wind_clean.loc[:,('actual_wind_speed')].div(3.6)
MoTIe_data_wind_clean.head(15)

Unnamed: 0,time,actual_wind_speed,current_air_temperature1,relative_humidity1,atmospheric_pressure,station_id,network_id
0,2001-01-22 13:00:00,,2.2,76.0,963.0,11091,MoTIe
1,2001-01-22 14:00:00,,1.9,76.0,960.0,11091,MoTIe
2,2001-01-22 15:00:00,,1.4,77.0,960.0,11091,MoTIe
3,2001-01-22 16:00:00,,0.4,82.0,960.0,11091,MoTIe
4,2001-01-22 17:00:00,,-0.5,88.0,960.0,11091,MoTIe
5,2001-01-22 18:00:00,,-1.1,93.0,960.0,11091,MoTIe
6,2001-01-22 19:00:00,,-1.2,94.0,960.0,11091,MoTIe
7,2001-01-22 20:00:00,,-1.6,95.0,960.0,11091,MoTIe
8,2001-01-22 21:00:00,,-1.8,95.0,960.0,11091,MoTIe
9,2001-01-22 22:00:00,,-1.9,95.0,960.0,11091,MoTIe


In [92]:
# rename columns
MoTIe_wind = MoTIe_data_wind_clean.rename(columns={"actual_wind_speed": "wind_speed", "current_air_temperature1": "air_temperature", "atmospheric_pressure": "air_pressure", "relative_humidity1": "relative_humidity"})

In [93]:
MoTIe_wind.head(10)

Unnamed: 0,time,wind_speed,air_temperature,relative_humidity,air_pressure,station_id,network_id
0,2001-01-22 13:00:00,,2.2,76.0,963.0,11091,MoTIe
1,2001-01-22 14:00:00,,1.9,76.0,960.0,11091,MoTIe
2,2001-01-22 15:00:00,,1.4,77.0,960.0,11091,MoTIe
3,2001-01-22 16:00:00,,0.4,82.0,960.0,11091,MoTIe
4,2001-01-22 17:00:00,,-0.5,88.0,960.0,11091,MoTIe
5,2001-01-22 18:00:00,,-1.1,93.0,960.0,11091,MoTIe
6,2001-01-22 19:00:00,,-1.2,94.0,960.0,11091,MoTIe
7,2001-01-22 20:00:00,,-1.6,95.0,960.0,11091,MoTIe
8,2001-01-22 21:00:00,,-1.8,95.0,960.0,11091,MoTIe
9,2001-01-22 22:00:00,,-1.9,95.0,960.0,11091,MoTIe


In [94]:
# save to csv
MoTIe_wind.to_csv(f'{path_2}/MoTIe_wind.csv', index=False)

#### MoTIm Data Cleaning

In [24]:
MoTIm_data = functions.to_df('MoTIm_data_clean.csv')

In [25]:
column_names, column_nulls, duplicate_row_index = functions.start_cleaning(MoTIm_data)

In [26]:
print(f'column names: {column_names}')
print(f'column nulls: {column_nulls}')
print(f'duplicate rows: {duplicate_row_index}')
MoTIm_data.shape

column names: ['current_air_temperature1', 'relative_humidity1', 'storm_snow', 'standard_snow', 'minimum_air_temperature', 'maximum_air_temperature', 'time', 'precipitation_new', 'height_of_snow', 'station_id', 'network_id', 'interval_snow', 'measured_wind_speed1', 'measured_wind_direction1']
column nulls: current_air_temperature1      9742
relative_humidity1          409244
storm_snow                  466916
standard_snow               208950
minimum_air_temperature      32973
maximum_air_temperature      33700
time                             0
precipitation_new           432471
height_of_snow              106445
station_id                       0
network_id                       0
interval_snow               685431
measured_wind_speed1        746336
measured_wind_direction1    763853
dtype: int64
duplicate rows: Index([], dtype='int64')


(769279, 14)

In [27]:
# only keep variables of interest
# no solar data available from this network
MoTIm_data_wind = MoTIm_data.loc[:,('time', 'measured_wind_speed1', 'current_air_temperature1', 'relative_humidity1', 'station_id', 'network_id')] 
    # note that no air pressure value is available

In [28]:
# count null values in wind data
MoTIm_data_wind.isna().sum()

time                             0
measured_wind_speed1        746336
current_air_temperature1      9742
relative_humidity1          409244
station_id                       0
network_id                       0
dtype: int64

In [29]:
# drop rows in wind data that do not have WindSpeed values
MoTIm_data_wind_clean = MoTIm_data_wind.dropna(subset=['measured_wind_speed1'])
MoTIm_data_wind_clean.isna().sum()

time                          0
measured_wind_speed1          0
current_air_temperature1    111
relative_humidity1          392
station_id                    0
network_id                    0
dtype: int64

In [31]:
MoTIm_data_wind_clean['measured_wind_speed1'].head(15)

5667     5.0
6865     5.0
6867     5.0
6872     1.0
6875     1.0
6881     5.0
6889     5.0
6893     5.0
7318     5.0
7326     2.0
7327     2.0
7329    12.0
7340    30.0
7343     2.0
7347     5.0
Name: measured_wind_speed1, dtype: float64

In [33]:
# convert windspeed values from km/h to m/s by dividing values by 3.6
MoTIm_data_wind_clean.loc[:,'measured_wind_speed1'] = MoTIm_data_wind_clean.loc[:,'measured_wind_speed1'].div(3.6)
MoTIm_data_wind_clean.head(5)

Unnamed: 0,time,measured_wind_speed1,current_air_temperature1,relative_humidity1,station_id,network_id
5667,1980-01-30 05:30:00,0.385802,11.5,75.0,11101,MoTIm
6865,1986-03-09 16:00:00,0.385802,10.0,63.0,11101,MoTIm
6867,1986-03-10 16:00:00,0.385802,11.0,61.0,11101,MoTIm
6872,1986-03-13 06:00:00,0.07716,5.0,75.0,11101,MoTIm
6875,1986-03-14 16:00:00,0.07716,8.5,60.0,11101,MoTIm


In [35]:
# rename columns
MoTIm_wind = MoTIm_data_wind_clean.rename(columns={"measured_wind_speed1": "wind_speed", "current_air_temperature1": "air_temperature", "relative_humidity1": "relative_humidity", })

In [36]:
# save to csv
MoTIm_wind.to_csv(f'{path_2}/MoTIm_wind.csv', index=False)

#### MVan Data Cleaning

In [38]:
MVan_data = functions.to_df('MVan_data_clean.csv')

In [39]:
column_names, column_nulls, duplicate_row_index = functions.start_cleaning(MVan_data)

In [40]:
print(f'column names: {column_names}')
print(f'column nulls: {column_nulls}')
print(f'duplicate rows: {duplicate_row_index}')
MVan_data.shape

column names: ['wdir_vect', 'time', 'wspd_sclr', 'humidity', 'station_id', 'network_id']
column nulls: wdir_vect       6022
time               0
wspd_sclr     157014
humidity      104506
station_id         0
network_id         0
dtype: int64
duplicate rows: Index([], dtype='int64')


(283907, 6)

In [41]:
# only keep variables of interest
# no solar data available from this network
MVan_data_wind = MVan_data.loc[:,('time', 'wspd_sclr', 'humidity', 'station_id', 'network_id')] 
    # note that no air pressure or air temperature values are available

In [42]:
# count null values in wind data
MVan_data_wind.isna().sum()

time               0
wspd_sclr     157014
humidity      104506
station_id         0
network_id         0
dtype: int64

In [43]:
# drop rows in wind data that do not have WindSpeed values
MVan_data_wind_clean = MVan_data_wind.dropna(subset=['wspd_sclr'])
MVan_data_wind_clean.isna().sum()

time              0
wspd_sclr         0
humidity      56876
station_id        0
network_id        0
dtype: int64

In [44]:
MVan_data_wind_clean['wspd_sclr'].head(5)

0    1.38889
1    3.33334
2    1.66667
3    2.22222
4    1.94445
Name: wspd_sclr, dtype: float64

In [45]:
# convert windspeed values from km/h to m/s by dividing values by 3.6
MVan_data_wind_clean.loc[:,'wspd_sclr'] = MVan_data_wind_clean.loc[:,'wspd_sclr'].div(3.6)
MVan_data_wind_clean.head(5)

Unnamed: 0,time,wspd_sclr,humidity,station_id,network_id
0,2000-01-01 00:00:00,0.385803,,T12,MVan
1,2000-01-01 01:00:00,0.925928,,T12,MVan
2,2000-01-01 02:00:00,0.462964,,T12,MVan
3,2000-01-01 03:00:00,0.617283,,T12,MVan
4,2000-01-01 04:00:00,0.540125,,T12,MVan


In [46]:
# rename colums
MVan_wind = MVan_data_wind_clean.rename(columns={"wspd_sclr": "wind_speed", "humidity": "relative_humidity"})

In [47]:
# save to csv
MVan_wind.to_csv(f'{path_2}/MVan_wind.csv', index=False)

#### UNBC_CAM Data Cleaning

In [48]:
UNBC_CAM_data = functions.to_df('UNBC_CAM_data_clean.csv')

In [49]:
column_names, column_nulls, duplicate_row_index = functions.start_cleaning(UNBC_CAM_data)

In [50]:
print(f'column names: {column_names}')
print(f'column nulls: {column_nulls}')
print(f'duplicate rows: {duplicate_row_index}')
UNBC_CAM_data.shape

column names: ['rain_tot', 'solar_avg', 'solar_std', 'ws_std', 'airtc', 'time', 'rh', 'winddir_avg', 'dbtcdt', 'station_id', 'network_id', 'pressure', 'winddir_std', 'ws_avg', 'k_dn_avg', 'l_dn_avg', 'l_up_avg', 'k_up_avg']
column nulls: rain_tot        693656
solar_avg      1949847
solar_std      2097928
ws_std          274907
airtc           178388
time                 0
rh              145556
winddir_avg     780710
dbtcdt          307674
station_id           0
network_id           0
pressure        811623
winddir_std    2392832
ws_avg         2045467
k_dn_avg       2305685
l_dn_avg       2305685
l_up_avg       2305685
k_up_avg       2305685
dtype: int64
duplicate rows: Index([], dtype='int64')


(2634998, 18)

In [51]:
# only keep variables of interest
UNBC_CAM_data_solar = UNBC_CAM_data.loc[:,('time', 'solar_avg', 'station_id', 'network_id')]
UNBC_CAM_data_wind = UNBC_CAM_data.loc[:,('time', 'ws_avg', 'airtc', 'rh', 'pressure', 'station_id', 'network_id')]

In [52]:
# count null values in solar data
UNBC_CAM_data_solar.isna().sum()


time                0
solar_avg     1949847
station_id          0
network_id          0
dtype: int64

In [53]:
# compare with overall size of solar data
UNBC_CAM_data_solar.shape

(2634998, 4)

In [55]:
# drop rows in solar data that do not have solar radiation values
UNBC_CAM_data_solar_clean = UNBC_CAM_data_solar.dropna(subset=['solar_avg'])
UNBC_CAM_data_solar_clean.isna().sum()

time          0
solar_avg     0
station_id    0
network_id    0
dtype: int64

In [57]:
# rename columns
UNBC_CAM_solar = UNBC_CAM_data_solar_clean.rename(columns={"solar_avg": "solar_radiation"})
UNBC_CAM_solar

Unnamed: 0,time,solar_radiation,station_id,network_id
0,2010-01-01 00:00:00,0.00,ancifore,UNBC_CAM
1,2010-01-01 00:15:00,0.00,ancifore,UNBC_CAM
2,2010-01-01 00:29:59,0.00,ancifore,UNBC_CAM
3,2010-01-01 00:45:00,0.00,ancifore,UNBC_CAM
4,2010-01-01 01:00:00,0.00,ancifore,UNBC_CAM
...,...,...,...,...
2279813,2017-11-17 13:45:00,17.93,tatulake,UNBC_CAM
2279814,2017-11-17 14:00:00,20.04,tatulake,UNBC_CAM
2279815,2017-11-17 14:15:00,19.51,tatulake,UNBC_CAM
2279816,2017-11-17 14:30:00,26.36,tatulake,UNBC_CAM


In [58]:
UNBC_CAM_solar.to_csv(f'{path_2}/UNBC_CAM_solar.csv', index=False)

In [59]:
# count null values in wind data
UNBC_CAM_data_wind.isna().sum()

time                0
ws_avg        2045467
airtc          178388
rh             145556
pressure       811623
station_id          0
network_id          0
dtype: int64

In [60]:
# drop rows in wind data that do not have WindSpeed values
UNBC_CAM_data_wind_clean = UNBC_CAM_data_wind.dropna(subset=['ws_avg'])
UNBC_CAM_data_wind_clean.isna().sum()

time              0
ws_avg            0
airtc         17648
rh                0
pressure      18052
station_id        0
network_id        0
dtype: int64

In [61]:
# rename columns
UNBC_CAM_wind = UNBC_CAM_data_wind_clean.rename(columns={"ws_avg": "wind_speed", "airtc": "air_temperature", "rh": "relative_humidity", "pressure": "air_pressure"})
UNBC_CAM_wind

Unnamed: 0,time,wind_speed,air_temperature,relative_humidity,air_pressure,station_id,network_id
748750,2010-01-01 00:00:00,6.214,-8.130,94.3,805.188,lowecast,UNBC_CAM
748751,2010-01-01 00:15:00,5.291,-8.140,93.6,805.183,lowecast,UNBC_CAM
748752,2010-01-01 00:29:59,5.138,-7.990,92.8,804.634,lowecast,UNBC_CAM
748753,2010-01-01 00:45:00,6.429,-7.940,93.5,804.569,lowecast,UNBC_CAM
748754,2010-01-01 01:00:00,6.987,-7.771,92.3,804.291,lowecast,UNBC_CAM
...,...,...,...,...,...,...,...
1860321,2017-12-31 22:45:00,0.000,-15.730,90.5,943.047,qrrc,UNBC_CAM
1860322,2017-12-31 23:00:00,0.000,-15.780,90.8,943.026,qrrc,UNBC_CAM
1860323,2017-12-31 23:15:00,0.000,-15.660,90.8,943.167,qrrc,UNBC_CAM
1860324,2017-12-31 23:30:00,0.000,-15.430,91.3,943.272,qrrc,UNBC_CAM


In [62]:
# save to csv
UNBC_CAM_wind.to_csv(f'{path_2}/UNBC_CAM_wind.csv', index=False)

### Confirm that all column names are standardized

In [63]:
# Create list of wind and solar filenames

keywords = ['wind', 'solar']
# create list to store wind records
wind_files = []
# create list to store solar records
solar_files = []
# create list of files in data folder
file_names = os.listdir(f"{path_2}")
# iterate through files in data folder
for file_name in file_names:
    # match against keywords
    if keywords[0] in file_name:
        wind_files.append(file_name)
    elif keywords[1] in file_name:
        solar_files.append(file_name)
    else: continue

print(f"solar files: {solar_files}")
print(f"wind files: {wind_files}")

solar files: ['BCH_solar.csv', 'CRD_solar.csv', 'FLNRO_FERN_solar.csv', 'UNBC_CAM_solar.csv']
wind files: ['BCH_wind.csv', 'CRD_wind.csv', 'EC_raw_wind.csv', 'ENV_AQN_wind.csv', 'FLNRO_FERN_wind.csv', 'MoTIe_wind.csv', 'MoTIm_wind.csv', 'MVan_wind.csv', 'UNBC_CAM_wind.csv']


In [64]:
# print list of column names for each file
for file in solar_files:
    print(f"{file}")
    print(list(pd.read_csv(f'{path_2}/{file}', index_col=0, nrows=0)))


BCH_solar.csv
['solar_radiation', 'station_id', 'network_id']
CRD_solar.csv
['solar_radiation', 'station_id', 'network_id']
FLNRO_FERN_solar.csv
['solar_radiation', 'station_id', 'network_id']
UNBC_CAM_solar.csv
['solar_radiation', 'station_id', 'network_id']


In [95]:
# print list of column names for each file
for file in wind_files:
    print(f"{file}")
    print(list(pd.read_csv(f'{path_2}/{file}', index_col=0, nrows=0)))

BCH_wind.csv
['wind_speed', 'air_pressure', 'air_temperature', 'relative_humidity', 'station_id', 'network_id']
CRD_wind.csv
['wind_speed', 'air_temperature', 'relative_humidity', 'station_id', 'network_id']
EC_raw_wind.csv
['wind_speed', 'air_temperature', 'relative_humidity', 'station_id', 'network_id']
ENV_AQN_wind.csv
['wind_speed', 'air_temperature', 'relative_humidity', 'air_pressure', 'station_id', 'network_id']
FLNRO_FERN_wind.csv
['wind_speed', 'air_temperature', 'relative_humidity', 'air_pressure', 'station_id', 'network_id']
MoTIe_wind.csv
['wind_speed', 'air_temperature', 'relative_humidity', 'air_pressure', 'station_id', 'network_id']
MoTIm_wind.csv
['wind_speed', 'air_temperature', 'relative_humidity', 'station_id', 'network_id']
MVan_wind.csv
['wind_speed', 'relative_humidity', 'station_id', 'network_id']
UNBC_CAM_wind.csv
['wind_speed', 'air_temperature', 'relative_humidity', 'air_pressure', 'station_id', 'network_id']
