# Notebook for doing feature engineering


In [34]:
## Reading in the data
# `%store -r NAME` restores a variable that was previously persisted with
# `%store NAME` into this kernel's namespace.
# NOTE(review): presumably an upstream notebook stored these frames — confirm
# which one, since this notebook cannot run on a fresh store without it.

%store -r data_A
%store -r data_B
%store -r data_C

# Self-assignments: no-ops at runtime when the restore above succeeded.
# NOTE(review): presumably kept so linters see the names as defined; they also
# raise NameError right here if a variable was not actually restored — confirm intent.
data_A = data_A
data_B = data_B
data_C = data_C

# Same restore pattern for the three X_test_estimated_*_corr frames.
%store -r X_test_estimated_a_corr 
%store -r X_test_estimated_b_corr 
%store -r X_test_estimated_c_corr

# Same no-op self-assignments as for data_A/B/C above.
X_test_estimated_a_corr = X_test_estimated_a_corr 
X_test_estimated_b_corr = X_test_estimated_b_corr
X_test_estimated_c_corr = X_test_estimated_c_corr


## Making months its own feature 

Idea: The month of the year affects the weather trend and therefore the power generation.


In [35]:

# Add a "month" column to each of the six frames, taken from the month
# component of its "date_forecast" timestamp column.
for frame in (
    data_A,
    data_B,
    data_C,
    X_test_estimated_a_corr,
    X_test_estimated_b_corr,
    X_test_estimated_c_corr,
):
    months = frame["date_forecast"].dt.month
    frame["month"] = months



## Making time of day its own feature

Idea: the time of day determines whether the sun is up, and therefore how much power can be generated.

In [42]:

# Add an "hours" column to each of the six frames, taken from the hour
# component of its "date_forecast" timestamp column.
for frame in (
    data_A,
    data_B,
    data_C,
    X_test_estimated_a_corr,
    X_test_estimated_b_corr,
    X_test_estimated_c_corr,
):
    hours = frame["date_forecast"].dt.hour
    frame["hours"] = hours


## Making the sum of radiation its own feature 

Idea: we know from the feature scores that radiation is one of the most important features. We therefore add a feature that is the sum of the radiation columns, to emphasise its contribution.

In [38]:

# Add a "sum_rad:W" column to each of the six frames: the row-wise sum of the
# clear-sky, diffuse and direct radiation columns (a missing value in any of
# the three makes the sum missing as well).
for frame in (
    data_A,
    data_B,
    data_C,
    X_test_estimated_a_corr,
    X_test_estimated_b_corr,
    X_test_estimated_c_corr,
):
    frame["sum_rad:W"] = (
        frame["clear_sky_rad:W"] + frame["diffuse_rad:W"] + frame["direct_rad:W"]
    )


## Making the ratio between cloud cover and radiation its own feature
Idea: we know that cloud coverage affects the amount of power generated by solar cells. Looking at the ratio between radiation and cloud cover might therefore give insight into their relation.

Also, the values of sum_rad are much bigger than the total_cloud_cover values, so we use the second power of the cloud coverage in the denominator. We add 1 to the denominator so that the ratio stays defined when the cloud cover is 0.

In [39]:
# Add a "rad_cloud_ratio" column to each of the six frames:
#     sum_rad:W / (total_cloud_cover:p ** 2 + 1)
# Requires the "sum_rad:W" column to exist already. The +1 keeps the
# denominator non-zero when the cloud cover is 0.
for frame in (
    data_A,
    data_B,
    data_C,
    X_test_estimated_a_corr,
    X_test_estimated_b_corr,
    X_test_estimated_c_corr,
):
    cloud_cover = frame["total_cloud_cover:p"]
    frame["rad_cloud_ratio"] = frame["sum_rad:W"] / ((cloud_cover * cloud_cover) + 1)


### Running the store block will store the datasets from this notebook, with their engineered features, to be used in other notebooks


In [40]:
# Persist the data_A/B/C frames, including the feature columns added in this
# notebook, so other notebooks can load them with `%store -r`.
# This overwrites whatever was previously stored under the same names.
%store data_A
%store data_B
%store data_C

# Persist the X_test_estimated_*_corr frames the same way.
%store X_test_estimated_a_corr
%store X_test_estimated_b_corr
%store X_test_estimated_c_corr

Stored 'data_A' (DataFrame)
Stored 'data_B' (DataFrame)
Stored 'data_C' (DataFrame)
Stored 'X_test_estimated_a_corr' (DataFrame)
Stored 'X_test_estimated_b_corr' (DataFrame)
Stored 'X_test_estimated_c_corr' (DataFrame)
