### Initializing

In [1]:
import pandas as pd
import os 
import functions

In [2]:
# test for proper access to functions module
functions.temp()

hello


In [2]:
# define path variable for data folder containing concatenated .csv files
path_2 = '../data'

### Data cleaning
Objectives:
- remove irrelevant/null columns
- keep columns variables associated with wind power generation
    - wind speed
    - air density
        - air pressure
        - air temperature
        - relative humidity
- keep column variables associated with solar power generation
    - solar radiation
    - cloud cover fraction?
    - sunlight duration?
- standardize column data with common variables (units, format)

#### BCH Data Cleaning

In [4]:
functions.list_columns('BCH_data.csv')

  df_name = pd.read_csv(f'{path_2}/{data_name}.csv', index_col=0)


['WindSpeed',
 ' time',
 ' WindDirection',
 'filename',
 'L_down_corr_Avg',
 ' HFT3_1_Avg',
 ' L_up_Avg',
 ' WindSpeed',
 ' ONE_DAY_SNOW',
 ' K_up_Avg',
 ' L_up_corr_Avg',
 ' AirTemp',
 ' L_down_Avg',
 ' HFT3_2_Avg',
 ' K_down_Avg',
 ' BarometricPressure',
 ' SoilVolumetricWaterContent',
 ' RelativeHumidity',
 ' TSoil_Avg',
 ' Vis',
 ' ONE_DAY_RAIN',
 'AirTemp',
 ' ONE_DAY_PRECIPITATION',
 ' MIN_TEMP',
 ' MAX_TEMP',
 ' Snow_WE',
 ' SNOW_ON_THE_GROUND',
 'HFT3_1_Avg',
 ' NetRad',
 'time']

In [28]:
# only include variables of interest
BCH_data_wind = BCH_data.loc[:,('time', ' time', 'WindSpeed', ' BarometricPressure', 'AirTemp', ' RelativeHumidity', 'filename')]
BCH_data_solar = BCH_data.loc[:,('time', ' time', ' NetRad', 'filename')]

##### BCH Wind Data Cleaning

In [16]:
# count null values in wind data
BCH_data_wind.isna().sum()

time                   6813622
 time                   203217
WindSpeed              6988625
 BarometricPressure    6833815
AirTemp                1496022
 RelativeHumidity      6827106
filename                     0
dtype: int64

In [29]:
# drop rows in wind data that do not have a WindSpeed value
# WindSpeed is a key variable that is important to location-specific results
BCH_data_wind_clean = BCH_data_wind.dropna(subset=['WindSpeed'])
BCH_data_wind_clean.isna().sum()

time                   28214
 time                      0
WindSpeed                  0
 BarometricPressure    28214
AirTemp                28214
 RelativeHumidity      28214
filename                   0
dtype: int64

In [36]:
# check whether number of remaining nulls are the same as the number of rows
BCH_data_wind_clean.shape

(28214, 7)

In [35]:
# check for duplicates
duplicate_rows = BCH_data_wind_clean.duplicated()
duplicate_rows[duplicate_rows].index

Int64Index([], dtype='int64')

In [38]:
# save to csv
BCH_data_wind_clean.to_csv('../data/BCH_data_wind_clean.csv')

Plan for handling remaining nulls:
- use values from next nearest weather station
- use averages from data as a whole if necessary

##### BCH Solar Data Cleaning

In [17]:
# count null values in solar data
BCH_data_solar.isna().sum()

time        6813622
 time        203217
 NetRad     6928912
filename          0
dtype: int64

In [18]:
# drop rows in solar data that do not have a NetRad value
BCH_data_solar_clean = BCH_data_solar.dropna(subset=[' NetRad'])
BCH_data_solar_clean.isna().sum()

time        87927
 time           0
 NetRad         0
filename        0
dtype: int64

In [24]:
# check whether number of null 'time' values is the same as number of rows
BCH_data_solar_clean.shape

(87927, 4)

In [39]:
# check for duplicates
duplicate_rows = BCH_data_solar_clean.duplicated()
duplicate_rows[duplicate_rows].index

Int64Index([], dtype='int64')

In [25]:
# drop 'time'
BCH_data_solar_clean.drop(columns=['time'], inplace=True)
BCH_data_solar_clean.shape

(87927, 3)

In [26]:
# save BCH solar data to csv
BCH_data_solar_clean.to_csv('../data/BCH_data_solar_clean.csv')

#### CRD Data Cleaning

In [7]:
CRD_data = functions.to_df('CRD_data.csv')

In [8]:
column_names, column_nulls, duplicate_row_index = functions.start_cleaning(CRD_data)


In [9]:
print(f'column names: {column_names}')
print(f'column nulls: {column_nulls}')
print(f'duplicate rows: {duplicate_row_index}')

column names: ['SolarRadiation', ' AirTemperature', ' Precipitation', ' Rain', ' WindSpeed', ' SnowDepth', ' time', ' WindDirection', ' RelativeHumidity', 'filename', 'AirTemperature', ' SnowWaterEquivalent', 'RelativeHumidity']
column nulls: SolarRadiation          4090779
 AirTemperature         2812394
 Precipitation          3328235
 Rain                    438757
 WindSpeed               655130
 SnowDepth              3711426
 time                         0
 WindDirection           658523
 RelativeHumidity        652576
filename                      0
AirTemperature          1468700
 SnowWaterEquivalent    4078683
RelativeHumidity        4039005
dtype: int64
duplicate rows: Int64Index([], dtype='int64')


In [10]:
# only keep variables of interest
CRD_data_solar = CRD_data.loc[:,(' time', 'SolarRadiation', 'filename')]
CRD_data_wind = CRD_data.loc[:,(' time', ' WindSpeed', 'AirTemperature', ' AirTemperature', 'RelativeHumidity', ' RelativeHumidity', 'filename')] 
    # note that no air pressure value is available

In [11]:
# drop rows in solar data that do not have SolarRadiation values
CRD_data_solar_clean = CRD_data_solar.dropna(subset=['SolarRadiation'])
CRD_data_solar_clean.isna().sum()

 time             0
SolarRadiation    0
filename          0
dtype: int64

In [9]:
CRD_data_solar_clean.to_csv(f'{path_2}/CRD_data_solar_clean.csv')

In [12]:
# count null values in wind data
CRD_data_wind.isna().sum()

 time                      0
 WindSpeed            655130
AirTemperature       1468700
 AirTemperature      2812394
RelativeHumidity     4039005
 RelativeHumidity     652576
filename                   0
dtype: int64

In [21]:
# drop rows in wind data that do not have WindSpeed values
CRD_data_wind_clean = CRD_data_wind.dropna(subset=[' WindSpeed'])
CRD_data_wind_clean.isna().sum()

 time                      0
 WindSpeed                 0
AirTemperature        817039
 AirTemperature      2808351
RelativeHumidity     3445441
 RelativeHumidity       1181
filename                   0
dtype: int64

In [22]:
CRD_data_wind_clean[' WindSpeed'].head(15)

0      0.0
1      0.0
2      0.0
3      0.0
4      0.0
5      0.0
6      0.0
7      0.0
8      0.0
9      3.8
10     3.9
11     0.0
12     6.6
13    10.9
14     8.4
Name:  WindSpeed, dtype: float64

In [23]:
# convert windspeed values from km/h to m/s by dividing values by 3.6
CRD_data_wind_clean.loc[:,' WindSpeed'] = CRD_data_wind_clean.loc[:,' WindSpeed'].div(3.6)
CRD_data_wind_clean.head(15)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  CRD_data_wind_clean.loc[:,' WindSpeed'] = CRD_data_wind_clean.loc[:,' WindSpeed'].div(3.6)


Unnamed: 0,time,WindSpeed,AirTemperature,AirTemperature.1,RelativeHumidity,RelativeHumidity.1,filename
0,1998-04-17 00:00:00,0.0,,0.9,,100.0,FW001.csv
1,1998-04-17 01:00:00,0.0,,1.2,,100.0,FW001.csv
2,1998-04-17 02:00:00,0.0,,0.7,,100.0,FW001.csv
3,1998-04-17 03:00:00,0.0,,0.9,,100.0,FW001.csv
4,1998-04-17 04:00:00,0.0,,0.1,,100.0,FW001.csv
5,1998-04-17 05:00:00,0.0,,0.1,,100.0,FW001.csv
6,1998-04-17 06:00:00,0.0,,0.4,,100.0,FW001.csv
7,1998-04-17 07:00:00,0.0,,2.7,,100.0,FW001.csv
8,1998-04-17 08:00:00,0.0,,8.2,,74.0,FW001.csv
9,1998-04-17 09:00:00,1.055556,,9.4,,64.0,FW001.csv


In [24]:
# save to csv
CRD_data_wind_clean.to_csv(f'{path_2}/CRD_data_wind_clean.csv')

#### EC_raw Data Cleaning

In [3]:
EC_raw_data = functions.to_df('EC_raw_data.csv')

In [4]:
column_names, column_nulls, duplicate_row_index = functions.start_cleaning(EC_raw_data)

In [5]:
print(f'column names: {column_names}')
print(f'column nulls: {column_nulls}')
print(f'duplicate rows: {duplicate_row_index}')

column names: ['wind_speed', ' mean_sea_level', ' dew_point', ' wind_direction', ' relative_humidity', ' air_temperature', ' time', ' air_temperature_yesterday_low', ' wind_gust_speed', ' air_temperature_yesterday_high', ' tendency_amount', 'filename', ' total_precipitation', 'mean_sea_level', 'wind_direction', ' wind_speed', ' snow_amount', ' total_cloud_cover', 'relative_humidity', 'dew_point']
column nulls: wind_speed                          6006263
 mean_sea_level                     1709535
 dew_point                          1035783
 wind_direction                     6420851
 relative_humidity                  1027746
 air_temperature                     411231
 time                                     0
 air_temperature_yesterday_low     12391030
 wind_gust_speed                   11440570
 air_temperature_yesterday_high    12390760
 tendency_amount                    2839650
filename                                  0
 total_precipitation               12295547
mean_sea_level

In [25]:
# only keep variables of interest
# no solar data available from this network
EC_raw_data_wind = EC_raw_data.loc[:,(' time', 'wind_speed', ' wind_speed', ' air_temperature', 'relative_humidity', ' relative_humidity', 'filename')] 
    # note that no air pressure column is available

In [26]:
# count null values in wind data
EC_raw_data_wind.isna().sum()

 time                        0
wind_speed             6006263
 wind_speed            7785805
 air_temperature        411231
relative_humidity     12687542
 relative_humidity     1027746
filename                     0
dtype: int64

In [27]:
# drop rows in wind data that do not have values in either windspeed column
EC_raw_data_wind_clean = EC_raw_data_wind.dropna(subset=['wind_speed', ' wind_speed'], how='all')
EC_raw_data_wind_clean.isna().sum()

 time                        0
wind_speed             4993828
 wind_speed            6773370
 air_temperature        331537
relative_humidity     11767198
 relative_humidity      802170
filename                     0
dtype: int64

In [29]:
EC_raw_data_wind_clean.head(15)

Unnamed: 0,time,wind_speed,wind_speed.1,air_temperature,relative_humidity,relative_humidity.1,filename
138,2011-10-16 00:00:00,16.9,,9.9,,78.0,1012475.csv
139,2011-10-16 01:00:00,16.2,,9.9,,78.0,1012475.csv
140,2011-10-16 02:00:00,10.4,,9.7,,82.0,1012475.csv
141,2011-10-16 03:00:00,9.4,,9.3,,83.0,1012475.csv
142,2011-10-16 04:00:00,13.3,,9.1,,83.0,1012475.csv
143,2011-10-16 05:00:00,15.5,,9.1,,84.0,1012475.csv
144,2011-10-16 06:00:00,17.3,,8.8,,84.0,1012475.csv
145,2011-10-16 07:00:00,16.9,,8.8,,83.0,1012475.csv
146,2011-10-16 08:00:00,16.2,,8.4,,81.0,1012475.csv
147,2011-10-16 09:00:00,8.3,,7.7,,83.0,1012475.csv


In [30]:
# convert windspeed values from km/h to m/s by dividing values by 3.6
EC_raw_data_wind_clean.loc[:,('wind_speed', ' wind_speed')] = EC_raw_data_wind_clean.loc[:,('wind_speed', ' wind_speed')].div(3.6)
EC_raw_data_wind_clean.head(15)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  EC_raw_data_wind_clean.loc[:,('wind_speed', ' wind_speed')] = EC_raw_data_wind_clean.loc[:,('wind_speed', ' wind_speed')].div(3.6)


Unnamed: 0,time,wind_speed,wind_speed.1,air_temperature,relative_humidity,relative_humidity.1,filename
138,2011-10-16 00:00:00,4.694444,,9.9,,78.0,1012475.csv
139,2011-10-16 01:00:00,4.5,,9.9,,78.0,1012475.csv
140,2011-10-16 02:00:00,2.888889,,9.7,,82.0,1012475.csv
141,2011-10-16 03:00:00,2.611111,,9.3,,83.0,1012475.csv
142,2011-10-16 04:00:00,3.694444,,9.1,,83.0,1012475.csv
143,2011-10-16 05:00:00,4.305556,,9.1,,84.0,1012475.csv
144,2011-10-16 06:00:00,4.805556,,8.8,,84.0,1012475.csv
145,2011-10-16 07:00:00,4.694444,,8.8,,83.0,1012475.csv
146,2011-10-16 08:00:00,4.5,,8.4,,81.0,1012475.csv
147,2011-10-16 09:00:00,2.305556,,7.7,,83.0,1012475.csv


In [31]:
# save to csv
EC_raw_data_wind_clean.to_csv(f'{path_2}/EC_raw_data_wind_clean.csv')