### Initializing

In [1]:
import pandas as pd
import os 
import functions

In [2]:
# test for proper access to functions module
functions.temp()

hello


In [3]:
# define path variable for data folder containing concatenated .csv files
path_2 = '../data'

### Data cleaning
Objectives:
- remove irrelevant/null columns
- keep columns variables associated with wind power generation
    - wind speed
    - air density
        - air pressure
        - air temperature
        - relative humidity
- keep column variables associated with solar power generation
    - solar radiation
    - cloud cover fraction?
    - sunlight duration?
- standardize column data with common variables (units, format)

#### BCH Data Cleaning

In [4]:
functions.list_columns('BCH_data.csv')

  df_name = pd.read_csv(f'{path_2}/{data_name}.csv', index_col=0)


['WindSpeed',
 ' time',
 ' WindDirection',
 'filename',
 'L_down_corr_Avg',
 ' HFT3_1_Avg',
 ' L_up_Avg',
 ' WindSpeed',
 ' ONE_DAY_SNOW',
 ' K_up_Avg',
 ' L_up_corr_Avg',
 ' AirTemp',
 ' L_down_Avg',
 ' HFT3_2_Avg',
 ' K_down_Avg',
 ' BarometricPressure',
 ' SoilVolumetricWaterContent',
 ' RelativeHumidity',
 ' TSoil_Avg',
 ' Vis',
 ' ONE_DAY_RAIN',
 'AirTemp',
 ' ONE_DAY_PRECIPITATION',
 ' MIN_TEMP',
 ' MAX_TEMP',
 ' Snow_WE',
 ' SNOW_ON_THE_GROUND',
 'HFT3_1_Avg',
 ' NetRad',
 'time']

In [28]:
# only include variables of interest
BCH_data_wind = BCH_data.loc[:,('time', ' time', 'WindSpeed', ' BarometricPressure', 'AirTemp', ' RelativeHumidity', 'filename')]
BCH_data_solar = BCH_data.loc[:,('time', ' time', ' NetRad', 'filename')]

##### BCH Wind Data Cleaning

In [16]:
# count null values in wind data
BCH_data_wind.isna().sum()

time                   6813622
 time                   203217
WindSpeed              6988625
 BarometricPressure    6833815
AirTemp                1496022
 RelativeHumidity      6827106
filename                     0
dtype: int64

In [29]:
# drop rows in wind data that do not have a WindSpeed value
# WindSpeed is a key variable that is important to location-specific results
BCH_data_wind_clean = BCH_data_wind.dropna(subset=['WindSpeed'])
BCH_data_wind_clean.isna().sum()

time                   28214
 time                      0
WindSpeed                  0
 BarometricPressure    28214
AirTemp                28214
 RelativeHumidity      28214
filename                   0
dtype: int64

In [36]:
# check whether number of remaining nulls are the same as the number of rows
BCH_data_wind_clean.shape

(28214, 7)

In [35]:
# check for duplicates
duplicate_rows = BCH_data_wind_clean.duplicated()
duplicate_rows[duplicate_rows].index

Int64Index([], dtype='int64')

In [38]:
# save to csv
BCH_data_wind_clean.to_csv('../data/BCH_data_wind_clean.csv')

Plan for handling remaining nulls:
- use values from next nearest weather station
- use averages from data as a whole if necessary

##### BCH Solar Data Cleaning

In [17]:
# count null values in solar data
BCH_data_solar.isna().sum()

time        6813622
 time        203217
 NetRad     6928912
filename          0
dtype: int64

In [18]:
# drop rows in solar data that do not have a NetRad value
BCH_data_solar_clean = BCH_data_solar.dropna(subset=[' NetRad'])
BCH_data_solar_clean.isna().sum()

time        87927
 time           0
 NetRad         0
filename        0
dtype: int64

In [24]:
# check whether number of null 'time' values is the same as number of rows
BCH_data_solar_clean.shape

(87927, 4)

In [39]:
# check for duplicates
duplicate_rows = BCH_data_solar_clean.duplicated()
duplicate_rows[duplicate_rows].index

Int64Index([], dtype='int64')

In [25]:
# drop 'time'
BCH_data_solar_clean.drop(columns=['time'], inplace=True)
BCH_data_solar_clean.shape

(87927, 3)

In [26]:
# save BCH solar data to csv
BCH_data_solar_clean.to_csv('../data/BCH_data_solar_clean.csv')

#### CRD Data Cleaning

In [7]:
CRD_data = functions.to_df('CRD_data.csv')

In [8]:
column_names, column_nulls, duplicate_row_index = functions.start_cleaning(CRD_data)

In [9]:
print(f'column names: {column_names}')
print(f'column nulls: {column_nulls}')
print(f'duplicate rows: {duplicate_row_index}')
CRD_data.shape

column names: ['SolarRadiation', ' AirTemperature', ' Precipitation', ' Rain', ' WindSpeed', ' SnowDepth', ' time', ' WindDirection', ' RelativeHumidity', 'filename', 'AirTemperature', ' SnowWaterEquivalent', 'RelativeHumidity']
column nulls: SolarRadiation          4090779
 AirTemperature         2812394
 Precipitation          3328235
 Rain                    438757
 WindSpeed               655130
 SnowDepth              3711426
 time                         0
 WindDirection           658523
 RelativeHumidity        652576
filename                      0
AirTemperature          1468700
 SnowWaterEquivalent    4078683
RelativeHumidity        4039005
dtype: int64
duplicate rows: Int64Index([], dtype='int64')


In [10]:
# only keep variables of interest
CRD_data_solar = CRD_data.loc[:,(' time', 'SolarRadiation', 'filename')]
CRD_data_wind = CRD_data.loc[:,(' time', ' WindSpeed', 'AirTemperature', ' AirTemperature', 'RelativeHumidity', ' RelativeHumidity', 'filename')] 
    # note that no air pressure value is available

In [11]:
# drop rows in solar data that do not have SolarRadiation values
CRD_data_solar_clean = CRD_data_solar.dropna(subset=['SolarRadiation'])
CRD_data_solar_clean.isna().sum()

 time             0
SolarRadiation    0
filename          0
dtype: int64

In [9]:
CRD_data_solar_clean.to_csv(f'{path_2}/CRD_data_solar_clean.csv')

In [12]:
# count null values in wind data
CRD_data_wind.isna().sum()

 time                      0
 WindSpeed            655130
AirTemperature       1468700
 AirTemperature      2812394
RelativeHumidity     4039005
 RelativeHumidity     652576
filename                   0
dtype: int64

In [21]:
# drop rows in wind data that do not have WindSpeed values
CRD_data_wind_clean = CRD_data_wind.dropna(subset=[' WindSpeed'])
CRD_data_wind_clean.isna().sum()

 time                      0
 WindSpeed                 0
AirTemperature        817039
 AirTemperature      2808351
RelativeHumidity     3445441
 RelativeHumidity       1181
filename                   0
dtype: int64

In [22]:
CRD_data_wind_clean[' WindSpeed'].head(15)

0      0.0
1      0.0
2      0.0
3      0.0
4      0.0
5      0.0
6      0.0
7      0.0
8      0.0
9      3.8
10     3.9
11     0.0
12     6.6
13    10.9
14     8.4
Name:  WindSpeed, dtype: float64

In [23]:
# convert windspeed values from km/h to m/s by dividing values by 3.6
CRD_data_wind_clean.loc[:,' WindSpeed'] = CRD_data_wind_clean.loc[:,' WindSpeed'].div(3.6)
CRD_data_wind_clean.head(15)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  CRD_data_wind_clean.loc[:,' WindSpeed'] = CRD_data_wind_clean.loc[:,' WindSpeed'].div(3.6)


Unnamed: 0,time,WindSpeed,AirTemperature,AirTemperature.1,RelativeHumidity,RelativeHumidity.1,filename
0,1998-04-17 00:00:00,0.0,,0.9,,100.0,FW001.csv
1,1998-04-17 01:00:00,0.0,,1.2,,100.0,FW001.csv
2,1998-04-17 02:00:00,0.0,,0.7,,100.0,FW001.csv
3,1998-04-17 03:00:00,0.0,,0.9,,100.0,FW001.csv
4,1998-04-17 04:00:00,0.0,,0.1,,100.0,FW001.csv
5,1998-04-17 05:00:00,0.0,,0.1,,100.0,FW001.csv
6,1998-04-17 06:00:00,0.0,,0.4,,100.0,FW001.csv
7,1998-04-17 07:00:00,0.0,,2.7,,100.0,FW001.csv
8,1998-04-17 08:00:00,0.0,,8.2,,74.0,FW001.csv
9,1998-04-17 09:00:00,1.055556,,9.4,,64.0,FW001.csv


In [24]:
# save to csv
CRD_data_wind_clean.to_csv(f'{path_2}/CRD_data_wind_clean.csv')

#### EC_raw Data Cleaning

In [3]:
EC_raw_data = functions.to_df('EC_raw_data.csv')

In [4]:
column_names, column_nulls, duplicate_row_index = functions.start_cleaning(EC_raw_data)

In [5]:
print(f'column names: {column_names}')
print(f'column nulls: {column_nulls}')
print(f'duplicate rows: {duplicate_row_index}')

column names: ['wind_speed', ' mean_sea_level', ' dew_point', ' wind_direction', ' relative_humidity', ' air_temperature', ' time', ' air_temperature_yesterday_low', ' wind_gust_speed', ' air_temperature_yesterday_high', ' tendency_amount', 'filename', ' total_precipitation', 'mean_sea_level', 'wind_direction', ' wind_speed', ' snow_amount', ' total_cloud_cover', 'relative_humidity', 'dew_point']
column nulls: wind_speed                          6006263
 mean_sea_level                     1709535
 dew_point                          1035783
 wind_direction                     6420851
 relative_humidity                  1027746
 air_temperature                     411231
 time                                     0
 air_temperature_yesterday_low     12391030
 wind_gust_speed                   11440570
 air_temperature_yesterday_high    12390760
 tendency_amount                    2839650
filename                                  0
 total_precipitation               12295547
mean_sea_level

In [25]:
# only keep variables of interest
# no solar data available from this network
EC_raw_data_wind = EC_raw_data.loc[:,(' time', 'wind_speed', ' wind_speed', ' air_temperature', 'relative_humidity', ' relative_humidity', 'filename')] 
    # note that no air pressure column is available

In [26]:
# count null values in wind data
EC_raw_data_wind.isna().sum()

 time                        0
wind_speed             6006263
 wind_speed            7785805
 air_temperature        411231
relative_humidity     12687542
 relative_humidity     1027746
filename                     0
dtype: int64

In [27]:
# drop rows in wind data that do not have values in either windspeed column
EC_raw_data_wind_clean = EC_raw_data_wind.dropna(subset=['wind_speed', ' wind_speed'], how='all')
EC_raw_data_wind_clean.isna().sum()

 time                        0
wind_speed             4993828
 wind_speed            6773370
 air_temperature        331537
relative_humidity     11767198
 relative_humidity      802170
filename                     0
dtype: int64

In [29]:
EC_raw_data_wind_clean.head(15)

Unnamed: 0,time,wind_speed,wind_speed.1,air_temperature,relative_humidity,relative_humidity.1,filename
138,2011-10-16 00:00:00,16.9,,9.9,,78.0,1012475.csv
139,2011-10-16 01:00:00,16.2,,9.9,,78.0,1012475.csv
140,2011-10-16 02:00:00,10.4,,9.7,,82.0,1012475.csv
141,2011-10-16 03:00:00,9.4,,9.3,,83.0,1012475.csv
142,2011-10-16 04:00:00,13.3,,9.1,,83.0,1012475.csv
143,2011-10-16 05:00:00,15.5,,9.1,,84.0,1012475.csv
144,2011-10-16 06:00:00,17.3,,8.8,,84.0,1012475.csv
145,2011-10-16 07:00:00,16.9,,8.8,,83.0,1012475.csv
146,2011-10-16 08:00:00,16.2,,8.4,,81.0,1012475.csv
147,2011-10-16 09:00:00,8.3,,7.7,,83.0,1012475.csv


In [30]:
# convert windspeed values from km/h to m/s by dividing values by 3.6
EC_raw_data_wind_clean.loc[:,('wind_speed', ' wind_speed')] = EC_raw_data_wind_clean.loc[:,('wind_speed', ' wind_speed')].div(3.6)
EC_raw_data_wind_clean.head(15)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  EC_raw_data_wind_clean.loc[:,('wind_speed', ' wind_speed')] = EC_raw_data_wind_clean.loc[:,('wind_speed', ' wind_speed')].div(3.6)


Unnamed: 0,time,wind_speed,wind_speed.1,air_temperature,relative_humidity,relative_humidity.1,filename
138,2011-10-16 00:00:00,4.694444,,9.9,,78.0,1012475.csv
139,2011-10-16 01:00:00,4.5,,9.9,,78.0,1012475.csv
140,2011-10-16 02:00:00,2.888889,,9.7,,82.0,1012475.csv
141,2011-10-16 03:00:00,2.611111,,9.3,,83.0,1012475.csv
142,2011-10-16 04:00:00,3.694444,,9.1,,83.0,1012475.csv
143,2011-10-16 05:00:00,4.305556,,9.1,,84.0,1012475.csv
144,2011-10-16 06:00:00,4.805556,,8.8,,84.0,1012475.csv
145,2011-10-16 07:00:00,4.694444,,8.8,,83.0,1012475.csv
146,2011-10-16 08:00:00,4.5,,8.4,,81.0,1012475.csv
147,2011-10-16 09:00:00,2.305556,,7.7,,83.0,1012475.csv


In [31]:
# save to csv
EC_raw_data_wind_clean.to_csv(f'{path_2}/EC_raw_data_wind_clean.csv')

#### ENV_AQN Data Cleaning

In [32]:
ENV_AQN_data = functions.to_df('ENV_AQN_data.csv')

  df_name = pd.read_csv(f'{path_2}/{data_name}.csv', index_col=0)


In [33]:
column_names, column_nulls, duplicate_row_index = functions.start_cleaning(ENV_AQN_data)

In [34]:
print(f'column names: {column_names}')
print(f'column nulls: {column_nulls}')
print(f'duplicate rows: {duplicate_row_index}')

column names: ['WDIR_VECT', ' TEMP_MEAN', ' WSPD_SCLR', ' PRECIP_TOTAL', ' avg_rel_hum_pst1hr', ' time', ' HUMIDITY', ' avg_air_temp_pst1hr', 'filename', ' BAR_PRESS', 'time', 'TEMP_MEAN', 'BAR_PRESS']
column nulls: WDIR_VECT                6568691
 TEMP_MEAN               5436399
 WSPD_SCLR               2296217
 PRECIP_TOTAL           12783840
 avg_rel_hum_pst1hr      8968906
 time                     906219
 HUMIDITY                8688631
 avg_air_temp_pst1hr     7985418
filename                       0
 BAR_PRESS              12978729
time                    12163662
TEMP_MEAN               13046707
BAR_PRESS               13054118
dtype: int64
duplicate rows: Int64Index([], dtype='int64')


In [35]:
# only keep variables of interest
# no solar data available for this network
ENV_AQN_data_wind = ENV_AQN_data.loc[:,(' time', 'time', ' WSPD_SCLR', ' TEMP_MEAN', ' HUMIDITY', ' BAR_PRESS', 'filename')] 

In [36]:
# count null values in wind data
ENV_AQN_data_wind.isna().sum()

 time           906219
time          12163662
 WSPD_SCLR     2296217
 TEMP_MEAN     5436399
 HUMIDITY      8688631
 BAR_PRESS    12978729
filename             0
dtype: int64

In [37]:
# drop rows in wind data that do not have WindSpeed values
ENV_AQN_data_wind_clean = ENV_AQN_data_wind.dropna(subset=[' WSPD_SCLR'])
ENV_AQN_data_wind_clean.isna().sum()

 time           827218
time           9946446
 WSPD_SCLR           0
 TEMP_MEAN     3776367
 HUMIDITY      7892770
 BAR_PRESS    10696845
filename             0
dtype: int64

In [38]:
ENV_AQN_data_wind_clean[' WSPD_SCLR'].head(15)

0     1.044
1     2.518
2     1.028
3     0.704
4     0.208
5     0.140
6     0.534
7     0.457
8     0.916
9     0.303
10    0.403
11    0.378
12    0.750
13    0.576
14    0.224
Name:  WSPD_SCLR, dtype: float64

*Note: no units were provided for wind speed column. Based on range of values, assumed that units were already m/s.*

*Only use next code block if data appears to be incorrect during analysis stage.*

In [None]:
# # convert windspeed values from km/h to m/s by dividing values by 3.6
# ENV_AQN_data_wind_clean.loc[:,' WSPD_SCLR'] = ENV_AQN_data_wind_clean.loc[:,' WSPD_SCLR'].div(3.6)
# ENV_AQN_data_wind_clean.head(15)

In [40]:
# save to csv
ENV_AQN_data_wind_clean.to_csv(f'{path_2}/ENV_AQN_data_wind_clean.csv')

#### FLNRO_FERN Data Cleaning

In [41]:
FLNRO_FERN_data = functions.to_df('FLNRO_FERN_data.csv')

In [42]:
column_names, column_nulls, duplicate_row_index = functions.start_cleaning(FLNRO_FERN_data)

In [43]:
print(f'column names: {column_names}')
print(f'column nulls: {column_nulls}')
print(f'duplicate rows: {duplicate_row_index}')

column names: ['TempC', ' SolarRadiationWm', ' RH', ' WindSpeedms', ' Pressurembar', ' Rainmm', ' GustSpeedms', ' time', ' WindDirection', ' DewPtC', 'filename', ' Wetness', ' Tx', ' Tn', 'RHx', ' Tm', ' RHn', 'Tx']
column nulls: TempC                  78077
 SolarRadiationWm     155467
 RH                    76028
 WindSpeedms          159515
 Pressurembar         143098
 Rainmm               163976
 GustSpeedms          171360
 time                      0
 WindDirection        398990
 DewPtC               187998
filename                   0
 Wetness             1441326
 Tx                  2318932
 Tn                  2310515
RHx                  2331262
 Tm                  2319798
 RHn                 2331284
Tx                   2332311
dtype: int64
duplicate rows: Int64Index([], dtype='int64')


In [45]:
# only keep variables of interest
FLNRO_FERN_data_solar = FLNRO_FERN_data.loc[:,(' time', ' SolarRadiationWm', 'filename')]
FLNRO_FERN_data_wind = FLNRO_FERN_data.loc[:,(' time', ' WindSpeedms', 'TempC', ' RH', ' Pressurembar', 'filename')]

In [46]:
# drop rows in solar data that do not have SolarRadiation values
FLNRO_FERN_data_solar_clean = FLNRO_FERN_data_solar.dropna(subset=[' SolarRadiationWm'])
FLNRO_FERN_data_solar_clean.isna().sum()

 time                0
 SolarRadiationWm    0
filename             0
dtype: int64

In [47]:
FLNRO_FERN_data_solar_clean.to_csv(f'{path_2}/FLNRO_FERN_data_solar_clean.csv')

In [48]:
# count null values in wind data
FLNRO_FERN_data_wind.isna().sum()

 time                 0
 WindSpeedms     159515
TempC             78077
 RH               76028
 Pressurembar    143098
filename              0
dtype: int64

In [49]:
# drop rows in wind data that do not have WindSpeed values
FLNRO_FERN_data_wind_clean = FLNRO_FERN_data_wind.dropna(subset=[' WindSpeedms'])
FLNRO_FERN_data_wind_clean.isna().sum()

 time                0
 WindSpeedms         0
TempC            53919
 RH              51069
 Pressurembar    53561
filename             0
dtype: int64

In [50]:
# save to csv
FLNRO_FERN_data_wind_clean.to_csv(f'{path_2}/FLNRO_FERN_data_wind_clean.csv')

#### MoTIe Data Cleaning

In [55]:
MoTIe_col_names = pd.read_csv(f'{path_2}/MoTIe_data.csv', index_col=0, nrows=0)

In [57]:
list(MoTIe_col_names)

['RELATIVE_HUMIDITY1',
 ' STANDARD_SNOW',
 ' MINIMUM_AIR_TEMPERATURE',
 ' max_wnd_spd_10m_pst1hr',
 ' MEASURED_WIND_SPEED1',
 ' MEASURED_WIND_DIRECTION1',
 ' dwpt_temp',
 ' min_air_temp_snc_last_reset',
 ' DEW_POINT',
 ' ATMOSPHERIC_PRESSURE',
 ' avg_wnd_dir_10m_pst10mts',
 ' ACTUAL_WIND_DIRECTION',
 ' MAXIMUM_AIR_TEMPERATURE',
 ' snw_dpth',
 ' MAXIMUM_MEASURED_WIND_SPEED1',
 ' HEIGHT_OF_SNOW',
 ' pcpn_amt_pst24hrs',
 ' CURRENT_AIR_TEMPERATURE2',
 ' CURRENT_AIR_TEMPERATURE1',
 ' PRECIPITATION_NEW',
 ' max_air_temp_snc_last_reset',
 ' PRECIP_DETECTOR_RATIO',
 ' ACTUAL_WIND_SPEED',
 ' WIND_DIRECTION_STD_DEVIATION1',
 ' avg_wnd_spd_10m_pst10mts',
 ' air_temp',
 ' mslp',
 ' pcpn_amt_pst1hr',
 ' snwfl_amt_pst1hr',
 ' rel_hum',
 ' time',
 ' stn_pres',
 'filename',
 'pcpn_amt_pst24hrs',
 'CURRENT_AIR_TEMPERATURE2',
 ' RELATIVE_HUMIDITY1',
 'air_temp',
 'CURRENT_AIR_TEMPERATURE1',
 'max_air_temp_snc_last_reset']

In [51]:
# Note: MemoryError when attempting to import entire csv at once. Modify approach to limit how much of the csv is imported.
#MoTIe_data = functions.to_df('MoTIe_data.csv')

MemoryError: Unable to allocate 128. KiB for an array with shape (16384,) and data type float64

In [13]:
# limit which columns are included in the read_csv function in order to combat memory error
MoTIe_data = pd.read_csv(f'{path_2}/MoTIe_data.csv', usecols=[' time', ' ACTUAL_WIND_SPEED', ' MEASURED_WIND_SPEED1', ' CURRENT_AIR_TEMPERATURE1', ' CURRENT_AIR_TEMPERATURE2', 'air_temp', ' rel_hum', ' RELATIVE_HUMIDITY1', ' ATMOSPHERIC_PRESSURE', ' stn_pres', 'filename'])

In [14]:
column_names, column_nulls, duplicate_row_index = functions.start_cleaning(MoTIe_data)

In [15]:
print(f'column names: {column_names}')
print(f'column nulls: {column_nulls}')
print(f'duplicate rows: {duplicate_row_index}')
MoTIe_data.shape

column names: [' MEASURED_WIND_SPEED1', ' ATMOSPHERIC_PRESSURE', ' CURRENT_AIR_TEMPERATURE2', ' CURRENT_AIR_TEMPERATURE1', ' ACTUAL_WIND_SPEED', ' rel_hum', ' time', ' stn_pres', 'filename', ' RELATIVE_HUMIDITY1', 'air_temp']
column nulls:  MEASURED_WIND_SPEED1         7597565
 ATMOSPHERIC_PRESSURE        11195679
 CURRENT_AIR_TEMPERATURE2    18406996
 CURRENT_AIR_TEMPERATURE1     1894433
 ACTUAL_WIND_SPEED           17764947
 rel_hum                     19712282
 time                               0
 stn_pres                    20692645
filename                            0
 RELATIVE_HUMIDITY1          18890888
air_temp                     23679158
dtype: int64
duplicate rows: Int64Index([], dtype='int64')


(23833427, 11)

In [17]:
# only keep variables of interest
# no solar data available from this network
# columns were already filtered when reading csv, rename df for naming consistency
MoTIe_data_wind = MoTIe_data.loc[:,(' time', ' ACTUAL_WIND_SPEED', ' MEASURED_WIND_SPEED1', ' CURRENT_AIR_TEMPERATURE1', ' CURRENT_AIR_TEMPERATURE2', 'air_temp', ' rel_hum', ' RELATIVE_HUMIDITY1', ' ATMOSPHERIC_PRESSURE', ' stn_pres', 'filename')]

In [18]:
# count null values in wind data
MoTIe_data_wind.isna().sum()

 time                               0
 ACTUAL_WIND_SPEED           17764947
 MEASURED_WIND_SPEED1         7597565
 CURRENT_AIR_TEMPERATURE1     1894433
 CURRENT_AIR_TEMPERATURE2    18406996
air_temp                     23679158
 rel_hum                     19712282
 RELATIVE_HUMIDITY1          18890888
 ATMOSPHERIC_PRESSURE        11195679
 stn_pres                    20692645
filename                            0
dtype: int64

In [22]:
# drop rows in wind data that do not have WindSpeed values
MoTIe_data_wind_clean = MoTIe_data_wind.dropna(subset=[' ACTUAL_WIND_SPEED', ' MEASURED_WIND_SPEED1'], how='all')
MoTIe_data_wind_clean.isna().sum()

 time                               0
 ACTUAL_WIND_SPEED           10173243
 MEASURED_WIND_SPEED1            5861
 CURRENT_AIR_TEMPERATURE1      659463
 CURRENT_AIR_TEMPERATURE2    11587283
air_temp                     16241723
 rel_hum                     13492143
 RELATIVE_HUMIDITY1          13186268
 ATMOSPHERIC_PRESSURE         5535077
 stn_pres                    13944879
filename                            0
dtype: int64

In [23]:
# check values and units of wind speed columns
MoTIe_data_wind_clean.head(15)

Unnamed: 0,time,ACTUAL_WIND_SPEED,MEASURED_WIND_SPEED1,CURRENT_AIR_TEMPERATURE1,CURRENT_AIR_TEMPERATURE2,air_temp,rel_hum,RELATIVE_HUMIDITY1,ATMOSPHERIC_PRESSURE,stn_pres,filename
0,2001-01-22 13:00:00,,7.0,2.2,2.0,,,,963.0,,11091.csv
1,2001-01-22 14:00:00,,8.0,1.9,,,,,960.0,,11091.csv
2,2001-01-22 15:00:00,,8.0,1.4,,,,,960.0,,11091.csv
3,2001-01-22 16:00:00,,8.0,0.4,-1.0,,,,960.0,,11091.csv
4,2001-01-22 17:00:00,,10.0,-0.5,,,,,960.0,,11091.csv
5,2001-01-22 18:00:00,,9.0,-1.1,-2.5,,,,960.0,,11091.csv
6,2001-01-22 19:00:00,,8.0,-1.2,-2.5,,,,960.0,,11091.csv
7,2001-01-22 20:00:00,,9.0,-1.6,,,,,960.0,,11091.csv
8,2001-01-22 21:00:00,,9.0,-1.8,-3.0,,,,960.0,,11091.csv
9,2001-01-22 22:00:00,,10.0,-1.9,-3.0,,,,960.0,,11091.csv


In [24]:
# convert windspeed values from km/h to m/s by dividing values by 3.6
MoTIe_data_wind_clean.loc[:,(' ACTUAL_WIND_SPEED', ' MEASURED_WIND_SPEED1')] = MoTIe_data_wind_clean.loc[:,(' ACTUAL_WIND_SPEED', ' MEASURED_WIND_SPEED1')].div(3.6)
MoTIe_data_wind_clean.head(15)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  MoTIe_data_wind_clean.loc[:,(' ACTUAL_WIND_SPEED', ' MEASURED_WIND_SPEED1')] = MoTIe_data_wind_clean.loc[:,(' ACTUAL_WIND_SPEED', ' MEASURED_WIND_SPEED1')].div(3.6)


Unnamed: 0,time,ACTUAL_WIND_SPEED,MEASURED_WIND_SPEED1,CURRENT_AIR_TEMPERATURE1,CURRENT_AIR_TEMPERATURE2,air_temp,rel_hum,RELATIVE_HUMIDITY1,ATMOSPHERIC_PRESSURE,stn_pres,filename
0,2001-01-22 13:00:00,,1.944444,2.2,2.0,,,,963.0,,11091.csv
1,2001-01-22 14:00:00,,2.222222,1.9,,,,,960.0,,11091.csv
2,2001-01-22 15:00:00,,2.222222,1.4,,,,,960.0,,11091.csv
3,2001-01-22 16:00:00,,2.222222,0.4,-1.0,,,,960.0,,11091.csv
4,2001-01-22 17:00:00,,2.777778,-0.5,,,,,960.0,,11091.csv
5,2001-01-22 18:00:00,,2.5,-1.1,-2.5,,,,960.0,,11091.csv
6,2001-01-22 19:00:00,,2.222222,-1.2,-2.5,,,,960.0,,11091.csv
7,2001-01-22 20:00:00,,2.5,-1.6,,,,,960.0,,11091.csv
8,2001-01-22 21:00:00,,2.5,-1.8,-3.0,,,,960.0,,11091.csv
9,2001-01-22 22:00:00,,2.777778,-1.9,-3.0,,,,960.0,,11091.csv


In [25]:
# save to csv
MoTIe_data_wind_clean.to_csv(f'{path_2}/MoTIe_data_wind_clean.csv')

#### MoTIm Data Cleaning

In [26]:
MoTIm_data = functions.to_df('MoTIm_data.csv')

In [27]:
column_names, column_nulls, duplicate_row_index = functions.start_cleaning(MoTIm_data)

In [28]:
print(f'column names: {column_names}')
print(f'column nulls: {column_nulls}')
print(f'duplicate rows: {duplicate_row_index}')
MoTIm_data.shape

column names: ['CURRENT_AIR_TEMPERATURE1', ' RELATIVE_HUMIDITY1', ' STORM_SNOW', ' STANDARD_SNOW', ' MINIMUM_AIR_TEMPERATURE', ' MAXIMUM_AIR_TEMPERATURE', ' time', ' PRECIPITATION_NEW', ' HEIGHT_OF_SNOW', 'filename', ' INTERVAL_SNOW', ' MEASURED_WIND_SPEED1', ' MEASURED_WIND_DIRECTION1']
column nulls: CURRENT_AIR_TEMPERATURE1       9742
 RELATIVE_HUMIDITY1          409244
 STORM_SNOW                  466916
 STANDARD_SNOW               208950
 MINIMUM_AIR_TEMPERATURE      32973
 MAXIMUM_AIR_TEMPERATURE      33700
 time                             0
 PRECIPITATION_NEW           432471
 HEIGHT_OF_SNOW              106445
filename                          0
 INTERVAL_SNOW               685431
 MEASURED_WIND_SPEED1        746336
 MEASURED_WIND_DIRECTION1    763853
dtype: int64
duplicate rows: Int64Index([], dtype='int64')


(769279, 13)

In [29]:
# only keep variables of interest
# no solar data available from this network
MoTIm_data_wind = MoTIm_data.loc[:,(' time', ' MEASURED_WIND_SPEED1', 'CURRENT_AIR_TEMPERATURE1', ' RELATIVE_HUMIDITY1', 'filename')] 
    # note that no air pressure value is available

In [30]:
# count null values in wind data
MoTIm_data_wind.isna().sum()

 time                            0
 MEASURED_WIND_SPEED1       746336
CURRENT_AIR_TEMPERATURE1      9742
 RELATIVE_HUMIDITY1         409244
filename                         0
dtype: int64

In [31]:
# drop rows in wind data that do not have WindSpeed values
MoTIm_data_wind_clean = MoTIm_data_wind.dropna(subset=[' MEASURED_WIND_SPEED1'])
MoTIm_data_wind_clean.isna().sum()

 time                         0
 MEASURED_WIND_SPEED1         0
CURRENT_AIR_TEMPERATURE1    111
 RELATIVE_HUMIDITY1         392
filename                      0
dtype: int64

In [33]:
MoTIm_data_wind_clean[' MEASURED_WIND_SPEED1'].head(15)

5667     5.0
6865     5.0
6867     5.0
6872     1.0
6875     1.0
6881     5.0
6889     5.0
6893     5.0
7318     5.0
7326     2.0
7327     2.0
7329    12.0
7340    30.0
7343     2.0
7347     5.0
Name:  MEASURED_WIND_SPEED1, dtype: float64

In [34]:
# convert windspeed values from km/h to m/s by dividing values by 3.6
MoTIm_data_wind_clean.loc[:,' MEASURED_WIND_SPEED1'] = MoTIm_data_wind_clean.loc[:,' MEASURED_WIND_SPEED1'].div(3.6)
MoTIm_data_wind_clean.head(15)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  MoTIm_data_wind_clean.loc[:,' MEASURED_WIND_SPEED1'] = MoTIm_data_wind_clean.loc[:,' MEASURED_WIND_SPEED1'].div(3.6)


Unnamed: 0,time,MEASURED_WIND_SPEED1,CURRENT_AIR_TEMPERATURE1,RELATIVE_HUMIDITY1,filename
5667,1980-01-30 05:30:00,1.388889,11.5,75.0,11101.csv
6865,1986-03-09 16:00:00,1.388889,10.0,63.0,11101.csv
6867,1986-03-10 16:00:00,1.388889,11.0,61.0,11101.csv
6872,1986-03-13 06:00:00,0.277778,5.0,75.0,11101.csv
6875,1986-03-14 16:00:00,0.277778,8.5,60.0,11101.csv
6881,1986-03-17 16:45:00,1.388889,9.0,61.0,11101.csv
6889,1986-03-21 16:30:00,1.388889,10.5,65.0,11101.csv
6893,1986-03-23 18:00:00,1.388889,6.5,94.0,11101.csv
7318,1987-10-30 06:00:00,1.388889,11.5,95.0,11101.csv
7326,1987-11-03 17:00:00,0.555556,9.0,98.0,11101.csv


In [35]:
# save to csv
MoTIm_data_wind_clean.to_csv(f'{path_2}/MoTIm_data_wind_clean.csv')

#### MVan Data Cleaning

In [36]:
MVan_data = functions.to_df('MVan_data.csv')

In [37]:
column_names, column_nulls, duplicate_row_index = functions.start_cleaning(MVan_data)

In [38]:
print(f'column names: {column_names}')
print(f'column nulls: {column_nulls}')
print(f'duplicate rows: {duplicate_row_index}')
MVan_data.shape

column names: ['WDIR_VECT', ' time', ' WSPD_SCLR', ' HUMIDITY', 'filename']
column nulls: WDIR_VECT       6022
 time              0
 WSPD_SCLR    157014
 HUMIDITY     104506
filename           0
dtype: int64
duplicate rows: Int64Index([], dtype='int64')


(283907, 5)

In [39]:
# only keep variables of interest
# no solar data available from this network
MVan_data_wind = MVan_data.loc[:,(' time', ' WSPD_SCLR', ' HUMIDITY', 'filename')] 
    # note that no air pressure or air temperature values are available

In [40]:
# count null values in wind data
MVan_data_wind.isna().sum()

 time              0
 WSPD_SCLR    157014
 HUMIDITY     104506
filename           0
dtype: int64

In [41]:
# drop rows in wind data that do not have WindSpeed values
MVan_data_wind_clean = MVan_data_wind.dropna(subset=[' WSPD_SCLR'])
MVan_data_wind_clean.isna().sum()

 time             0
 WSPD_SCLR        0
 HUMIDITY     56876
filename          0
dtype: int64

In [43]:
MVan_data_wind_clean[' WSPD_SCLR'].head(15)

0     1.38889
1     3.33334
2     1.66667
3     2.22222
4     1.94445
5     1.38889
6     2.22222
7     1.38889
8     1.66667
9     1.38889
10    2.22222
11    3.33334
12    3.61111
13    4.16667
14    3.05556
Name:  WSPD_SCLR, dtype: float64

In [45]:
# convert windspeed values from km/h to m/s by dividing values by 3.6
MVan_data_wind_clean.loc[:,' WSPD_SCLR'] = MVan_data_wind_clean.loc[:,' WSPD_SCLR'].div(3.6)
MVan_data_wind_clean.head(15)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  MVan_data_wind_clean.loc[:,' WSPD_SCLR'] = MVan_data_wind_clean.loc[:,' WSPD_SCLR'].div(3.6)


Unnamed: 0,time,WSPD_SCLR,HUMIDITY,filename
0,2000-01-01 00:00:00,0.385803,,T12.csv
1,2000-01-01 01:00:00,0.925928,,T12.csv
2,2000-01-01 02:00:00,0.462964,,T12.csv
3,2000-01-01 03:00:00,0.617283,,T12.csv
4,2000-01-01 04:00:00,0.540125,,T12.csv
5,2000-01-01 05:00:00,0.385803,,T12.csv
6,2000-01-01 06:00:00,0.617283,,T12.csv
7,2000-01-01 07:00:00,0.385803,,T12.csv
8,2000-01-01 08:00:00,0.462964,,T12.csv
9,2000-01-01 09:00:00,0.385803,,T12.csv


In [46]:
# save to csv
MVan_data_wind_clean.to_csv(f'{path_2}/MVan_data_wind_clean.csv')

#### UNBC_CAM Data Cleaning

In [47]:
UNBC_CAM_data = functions.to_df('UNBC_CAM_data.csv')

In [48]:
column_names, column_nulls, duplicate_row_index = functions.start_cleaning(UNBC_CAM_data)

In [49]:
print(f'column names: {column_names}')
print(f'column nulls: {column_nulls}')
print(f'duplicate rows: {duplicate_row_index}')
UNBC_CAM_data.shape

column names: ['Rain_Tot', ' Solar_Avg', ' Solar_Std', ' WS_Std', ' AirTC', ' time', ' RH', ' WindDir_Avg', ' DBTCDT', 'filename', ' Pressure', 'WS_Std', ' WindDir_Std', ' WS_Avg', 'Solar_Avg', 'K_Dn_Avg', ' Rain_Tot', ' L_Dn_Avg', ' L_Up_Avg', ' K_Up_Avg']
column nulls: Rain_Tot        1022965
 Solar_Avg      1988163
 Solar_Std      2097928
 WS_Std          482661
 AirTC           178388
 time                 0
 RH              145556
 WindDir_Avg     780710
 DBTCDT          307674
filename              0
 Pressure        811623
WS_Std          2427244
 WindDir_Std    2392832
 WS_Avg         2045467
Solar_Avg       2596682
K_Dn_Avg        2305685
 Rain_Tot       2305689
 L_Dn_Avg       2305685
 L_Up_Avg       2305685
 K_Up_Avg       2305685
dtype: int64
duplicate rows: Int64Index([], dtype='int64')


(2634998, 20)

In [50]:
# only keep variables of interest
UNBC_CAM_data_solar = UNBC_CAM_data.loc[:,(' time', 'Solar_Avg', ' Solar_Avg', 'filename')]
UNBC_CAM_data_wind = UNBC_CAM_data.loc[:,(' time', ' WS_Avg', ' AirTC', ' RH', ' Pressure', 'filename')]

In [53]:
# count null values in solar data
UNBC_CAM_data_solar.isna().sum()


 time               0
Solar_Avg     2596682
 Solar_Avg    1988163
filename            0
dtype: int64

In [54]:
# compare with overall size of solar data
UNBC_CAM_data_solar.shape

(2634998, 4)

In [55]:
# drop rows in solar data that do not have SolarRadiation values
UNBC_CAM_data_solar_clean = UNBC_CAM_data_solar.dropna(subset=['Solar_Avg', ' Solar_Avg'], how='all')
UNBC_CAM_data_solar_clean.isna().sum()

 time              0
Solar_Avg     646835
 Solar_Avg     38316
filename           0
dtype: int64

In [56]:
UNBC_CAM_data_solar_clean.to_csv(f'{path_2}/UNBC_CAM_data_solar_clean.csv')

In [57]:
# count null values in wind data
UNBC_CAM_data_wind.isna().sum()

 time              0
 WS_Avg      2045467
 AirTC        178388
 RH           145556
 Pressure     811623
filename           0
dtype: int64

In [58]:
# drop rows in wind data that do not have WindSpeed values
UNBC_CAM_data_wind_clean = UNBC_CAM_data_wind.dropna(subset=[' WS_Avg'])
UNBC_CAM_data_wind_clean.isna().sum()

 time            0
 WS_Avg          0
 AirTC       17648
 RH              0
 Pressure    18052
filename         0
dtype: int64

In [62]:
# save to csv
UNBC_CAM_data_wind_clean.to_csv(f'{path_2}/UNBC_CAM_data_wind_clean.csv')