### Initializing

In [1]:
import pandas as pd
import os 
import functions

In [2]:
# test for proper access to functions module
# (smoke test only: a successful call shows the local `functions` module was imported;
#  the recorded output of this call is the text "hello")
functions.temp()

hello


In [8]:
# define path variable for data folder containing concatenated .csv files
# (relative path; reused further down when the cleaned CRD files are written out)
path_2 = '../data'

### Data cleaning
Objectives:
- remove irrelevant/null columns
- keep column variables associated with wind power generation
    - wind speed
    - air density
        - air pressure
        - air temperature
        - relative humidity
- keep column variables associated with solar power generation
    - solar radiation
    - cloud cover fraction?
    - sunlight duration?
- standardize column data with common variables (units, format)

#### BCH Data Cleaning

In [4]:
# list every column name in the concatenated BCH file to decide which ones to keep
# NOTE(review): the output below echoes a helper line that builds the path as
# f'{path_2}/{data_name}.csv' — confirm the helper copes with an argument that
# already ends in '.csv'
functions.list_columns('BCH_data.csv')

  df_name = pd.read_csv(f'{path_2}/{data_name}.csv', index_col=0)


['WindSpeed',
 ' time',
 ' WindDirection',
 'filename',
 'L_down_corr_Avg',
 ' HFT3_1_Avg',
 ' L_up_Avg',
 ' WindSpeed',
 ' ONE_DAY_SNOW',
 ' K_up_Avg',
 ' L_up_corr_Avg',
 ' AirTemp',
 ' L_down_Avg',
 ' HFT3_2_Avg',
 ' K_down_Avg',
 ' BarometricPressure',
 ' SoilVolumetricWaterContent',
 ' RelativeHumidity',
 ' TSoil_Avg',
 ' Vis',
 ' ONE_DAY_RAIN',
 'AirTemp',
 ' ONE_DAY_PRECIPITATION',
 ' MIN_TEMP',
 ' MAX_TEMP',
 ' Snow_WE',
 ' SNOW_ON_THE_GROUND',
 'HFT3_1_Avg',
 ' NetRad',
 'time']

In [28]:
# Load the concatenated BCH data here so the cell also works on a fresh kernel:
# no other cell in this notebook assigns BCH_data. Same helper call pattern as
# the CRD section (functions.to_df('CRD_data.csv')).
BCH_data = functions.to_df('BCH_data.csv')

# only include variables of interest
# Column labels are passed to .loc as lists: a list is the unambiguous way to
# select several labels (a tuple can be read as a single MultiIndex key).
# Both 'time' and ' time' are kept because the source files use both spellings.
# NOTE(review): the column listing above also contains ' WindSpeed' and ' AirTemp'
# (leading space), which are not selected here — confirm whether they should be
# merged with 'WindSpeed' / 'AirTemp' before filtering.
BCH_wind_columns = ['time', ' time', 'WindSpeed', ' BarometricPressure', 'AirTemp', ' RelativeHumidity', 'filename']
BCH_solar_columns = ['time', ' time', ' NetRad', 'filename']

BCH_data_wind = BCH_data.loc[:, BCH_wind_columns]
BCH_data_solar = BCH_data.loc[:, BCH_solar_columns]

##### BCH Wind Data Cleaning

In [16]:
# count null values per column in wind data (before any rows are dropped)
BCH_data_wind.isna().sum()

time                   6813622
 time                   203217
WindSpeed              6988625
 BarometricPressure    6833815
AirTemp                1496022
 RelativeHumidity      6827106
filename                     0
dtype: int64

In [29]:
# drop rows in wind data that do not have a WindSpeed value
# WindSpeed is a key variable that is important to location-specific results
# NOTE(review): in the null counts shown below, 'time', ' BarometricPressure',
# 'AirTemp' and ' RelativeHumidity' are each null 28214 times, which is the full
# row count reported by the next cell — i.e. the surviving rows carry WindSpeed
# and ' time' only. Confirm this is intended.
BCH_data_wind_clean = BCH_data_wind.dropna(subset=['WindSpeed'])
# re-count nulls per column after the filter
BCH_data_wind_clean.isna().sum()

time                   28214
 time                      0
WindSpeed                  0
 BarometricPressure    28214
AirTemp                28214
 RelativeHumidity      28214
filename                   0
dtype: int64

In [36]:
# check whether the number of remaining nulls per column is the same as the number of rows
# (shape is (rows, columns); compare the row count with the null counts above)
BCH_data_wind_clean.shape

(28214, 7)

In [35]:
# check for duplicates: flag every row that repeats an earlier row,
# then show the index labels of the flagged rows (empty index = no duplicates)
duplicate_rows = BCH_data_wind_clean.duplicated()
duplicate_rows.index[duplicate_rows.to_numpy()]

Int64Index([], dtype='int64')

In [38]:
# save to csv
# Build the target from path_2 (the shared data-folder variable) instead of
# repeating the '../data' literal, matching how the CRD files are saved.
BCH_data_wind_clean.to_csv(f'{path_2}/BCH_data_wind_clean.csv')

Plan for handling remaining nulls:
- use values from next nearest weather station
- use averages from data as a whole if necessary

##### BCH Solar Data Cleaning

In [17]:
# count null values per column in solar data (before any rows are dropped)
BCH_data_solar.isna().sum()

time        6813622
 time        203217
 NetRad     6928912
filename          0
dtype: int64

In [18]:
# drop rows in solar data that do not have a NetRad value
# (the column label is ' NetRad', with a leading space, as it appears in the file)
BCH_data_solar_clean = BCH_data_solar.dropna(subset=[' NetRad'])
# re-count nulls per column after the filter
BCH_data_solar_clean.isna().sum()

time        87927
 time           0
 NetRad         0
filename        0
dtype: int64

In [24]:
# check whether the number of null 'time' values is the same as the number of rows
# (if so, the 'time' column is empty for every remaining row)
BCH_data_solar_clean.shape

(87927, 4)

In [39]:
# check for duplicates: flag every row that repeats an earlier row,
# then show the index labels of the flagged rows (empty index = no duplicates)
duplicate_rows = BCH_data_solar_clean.duplicated()
duplicate_rows.index[duplicate_rows.to_numpy()]

Int64Index([], dtype='int64')

In [25]:
# drop 'time'
# The null count for 'time' above (87927) equals the row count, so the column is
# empty for every remaining row; ' time' (leading space) holds the timestamps.
# Reassign rather than using inplace=True so the frame produced by dropna is not
# mutated, and ignore a missing column so re-running this cell does not raise
# a KeyError once 'time' has already been removed.
BCH_data_solar_clean = BCH_data_solar_clean.drop(columns=['time'], errors='ignore')
BCH_data_solar_clean.shape

(87927, 3)

In [26]:
# save BCH solar data to csv
# Build the target from path_2 (the shared data-folder variable) instead of
# repeating the '../data' literal, matching how the CRD files are saved.
BCH_data_solar_clean.to_csv(f'{path_2}/BCH_data_solar_clean.csv')

#### CRD Data Cleaning

In [3]:
# load the concatenated CRD data through the project helper; the result is used
# as a DataFrame in the cells below
CRD_data = functions.to_df('CRD_data.csv')

In [4]:
# run the project helper's first-pass summary on the CRD data; it returns three
# values, unpacked here as column names, per-column null counts and the index of
# duplicate rows (printed in the next cell)
column_names, column_nulls, duplicate_row_index = functions.start_cleaning(CRD_data)


In [5]:
# show the three summaries returned by functions.start_cleaning
# (the column-name listing is right-aligned, so leading spaces in labels are
#  easier to see in the null-count listing)
print(f'column names: {column_names}')
print(f'column nulls: {column_nulls}')
print(f'duplicate rows: {duplicate_row_index}')

column names: 0           SolarRadiation
1           AirTemperature
2            Precipitation
3                     Rain
4                WindSpeed
5                SnowDepth
6                     time
7            WindDirection
8         RelativeHumidity
9                 filename
10          AirTemperature
11     SnowWaterEquivalent
12        RelativeHumidity
dtype: object
column nulls: SolarRadiation          4090779
 AirTemperature         2812394
 Precipitation          3328235
 Rain                    438757
 WindSpeed               655130
 SnowDepth              3711426
 time                         0
 WindDirection           658523
 RelativeHumidity        652576
filename                      0
AirTemperature          1468700
 SnowWaterEquivalent    4078683
RelativeHumidity        4039005
dtype: int64
duplicate rows: Int64Index([], dtype='int64')


In [6]:
# only keep variables of interest
# Column labels are passed to .loc as lists: a list is the unambiguous way to
# select several labels (a tuple can be read as a single MultiIndex key).
CRD_solar_columns = [' time', 'SolarRadiation', 'filename']
# Both spellings of AirTemperature and RelativeHumidity (with and without a
# leading space) are kept, since the source files use both.
# note that no air pressure value is available
CRD_wind_columns = [' time', ' WindSpeed', 'AirTemperature', ' AirTemperature', 'RelativeHumidity', ' RelativeHumidity', 'filename']

CRD_data_solar = CRD_data.loc[:, CRD_solar_columns]
CRD_data_wind = CRD_data.loc[:, CRD_wind_columns]

In [7]:
# drop rows in solar data that do not have SolarRadiation values
CRD_data_solar_clean = CRD_data_solar.dropna(subset=['SolarRadiation'])
# re-count nulls per column after the filter
CRD_data_solar_clean.isna().sum()

 time             0
SolarRadiation    0
filename          0
dtype: int64

In [9]:
# save cleaned CRD solar data to csv in the data folder (path_2)
CRD_data_solar_clean.to_csv(f'{path_2}/CRD_data_solar_clean.csv')

In [10]:
# count null values per column in wind data (before any rows are dropped)
CRD_data_wind.isna().sum()

 time                      0
 WindSpeed            655130
AirTemperature       1468700
 AirTemperature      2812394
RelativeHumidity     4039005
 RelativeHumidity     652576
filename                   0
dtype: int64

In [12]:
# drop rows in wind data that do not have WindSpeed values
# (the column label is ' WindSpeed', with a leading space, as it appears in the file)
CRD_data_wind_clean = CRD_data_wind.dropna(subset=[' WindSpeed'])
# re-count nulls per column after the filter; the temperature and humidity
# columns still contain nulls, as the output below shows
CRD_data_wind_clean.isna().sum()

 time                      0
 WindSpeed                 0
AirTemperature        817039
 AirTemperature      2808351
RelativeHumidity     3445441
 RelativeHumidity       1181
filename                   0
dtype: int64

In [13]:
# save cleaned CRD wind data to csv in the data folder (path_2)
CRD_data_wind_clean.to_csv(f'{path_2}/CRD_data_wind_clean.csv')