# Weather Observation EDA

In [1]:
# IPython autoreload in mode 2: imported modules are re-imported before each cell executes,
# so edits to the project's helper modules take effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import datetime
import math
import warnings

# Switch off pandas' chained-assignment check for the whole notebook.
pd.options.mode.chained_assignment = None


def warn(*args, **kwargs):
    """Accept any arguments and do nothing; installed below in place of ``warnings.warn``."""
    return None


# Every later call to warnings.warn lands on the no-op above, so no warning is emitted.
warnings.warn = warn

from preprocess.functions.date_inspector import load_files_legacy

In [None]:
# Absolute path to ../data/raw/kma, resolved against the current working directory.
raw_kma_path = os.path.join(os.getcwd(), '..', 'data', 'raw', 'kma')
data_dir = os.path.abspath(raw_kma_path)
print(data_dir)

In [None]:
# Pickle file names handed to the loader: one entry per year (2017, 2018) for each source.
observation_list = [
    'df_kma_obs_Chungcheongnam-do_Seosan-si_Suseok-dong_2017.pkl',
    'df_kma_obs_Chungcheongnam-do_Seosan-si_Suseok-dong_2018.pkl',
]
forecast_list = [
    'df_kma_forecast_Chungcheongnam-do_Dangjin-si_Seongmun-myeon_2017.pkl',
    'df_kma_forecast_Chungcheongnam-do_Dangjin-si_Seongmun-myeon_2018.pkl',
]
solar_list = [
    'df_kma_solar_Chungcheongnam-do_Seosan-si_Suseok-dong_2017.pkl',
    'df_kma_solar_Chungcheongnam-do_Seosan-si_Suseok-dong_2018.pkl',
]

# Power-generation pickle for the plant analysed first.
filename_power = 'df_ewp_a.pkl'

# The loader returns four frames: observation, forecast, solar and power.
loaded_frames = load_files_legacy(observation_list, forecast_list, solar_list, filename_power)
df_observation, df_forecast, df_solar, df_power = loaded_frames

# Relationship between columns

In [None]:
# Build the analysis frame: add calendar features taken from the timestamp, then drop the
# raw timestamp/date/location columns. assign() returns a new frame, so df_observation
# itself is left untouched.
obs_time = df_observation['datetime'].dt
df = df_observation.assign(
    month=obs_time.month,
    day=obs_time.day,
    hour=obs_time.hour,
    dayofyear=obs_time.dayofyear,
).drop(columns=['datetime', 'date', 'location'])

# English labels for the Korean column headers.
english_labels = {
    '기온(°C)': 'Celsius(°C)',
    '강수량(mm)': 'Rainfall(mm)',
    '풍속(m/s)': 'Wind Speed(m/s)',
    '풍향(16방위)': 'Wind Direction(16)',
    '습도(%)': 'Humidity(%)',
    '일사(MJ/m2)': 'Solar Radiation(MJ/m^2)',
    '적설(cm)': 'Snowfall(cm)',
    '전운량(10분위)': 'Cloud',
}
df = df.rename(columns=english_labels)

# Preview the first and last rows.
for preview in (df.head(), df.tail()):
    display(preview)

In [None]:
# Per-column tally of missing (True) versus present (False) values in the raw observations.
na_flags = df_observation.isna()
for col_name in na_flags.columns:
    print(na_flags[col_name].value_counts())

## Correlation

In [None]:
# show_correlation is a helper from the project's functions_eda module; what it renders
# is defined there. It is given the renamed observation frame, calendar columns included.
from functions_eda import show_correlation
show_correlation(df)

## Normalized Mutual Information

In [None]:
# show_normalized_mutual_information is a helper from the project's functions_eda module;
# presumably it reports normalized mutual information between the columns of df — confirm there.
from functions_eda import show_normalized_mutual_information
show_normalized_mutual_information(df)

## Distribution plot of each column

In [None]:
# Distribution of every column, three panels per figure row.
# NOTE(review): sns.distplot is deprecated in newer seaborn releases; histplot/displot are
# the suggested replacements when the environment is upgraded.
last_idx = len(df.columns) - 1
for idx, column in enumerate(df.columns):
    panel = idx % 3
    if panel == 0:
        # Start a fresh figure for each row of three panels.
        plt.figure(figsize=(15, 3))
    plt.subplot(1, 3, panel + 1)
    plt.title(column)
    # A few NA values exist, so fill them by linear interpolation before plotting.
    sns.distplot(df[column].interpolate(method='linear'))
    # Show the figure when the row is full, and also after the final column so that a
    # trailing partial row (column count not a multiple of 3) is shown as well.
    if panel == 2 or idx == last_idx:
        plt.show()

## Solar Radiation versus each column (Scatterplot)

In [None]:
# Scatter of solar radiation (y) against every column (x), three panels per figure row.
last_idx = len(df.columns) - 1
for idx, column in enumerate(df.columns):
    panel = idx % 3
    if panel == 0:
        # Start a fresh figure for each row of three panels.
        plt.figure(figsize=(15, 3))
    plt.subplot(1, 3, panel + 1)
    plt.title(column)
    # x and y are passed by keyword: current seaborn accepts them only as keyword
    # arguments, and older releases accept the keyword form too.
    sns.scatterplot(
        x=df[column].interpolate(method='linear'),  # x values have NA gaps filled linearly
        y=df['Solar Radiation(MJ/m^2)'],
        s=10,
        alpha=0.3,
    )
    # Show the figure when the row is full, and also after the final column so that a
    # trailing partial row (column count not a multiple of 3) is shown as well.
    if panel == 2 or idx == last_idx:
        plt.show()

## Solar Radiation versus each column (Relative Density Plot)

In [None]:
# show_relative_density_plot is a helper from the project's functions_eda module; presumably
# it plots each column of df against the target column — confirm in functions_eda.
from functions_eda import show_relative_density_plot
target = 'Solar Radiation(MJ/m^2)'  # renamed solar-radiation column of df
show_relative_density_plot(df, target)

## Power Generation versus each column (Relative Density Plot)

In [None]:
def merge_obs_power(df_obs, df_power):
    """Join weather observations with power output, keeping only rows that have power data.

    Parameters
    ----------
    df_obs : pandas.DataFrame
        Observation frame. Must contain 'datetime', 'date' (merge keys) and 'location'.
    df_power : pandas.DataFrame
        Power frame. Must contain 'datetime', 'date', 'location', '일사량(MJ/m^2)' and
        '발전량(kW)'. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Outer merge of both frames on ['datetime', 'date'], restricted to rows whose
        '발전량(kW)' is not null, without the 'datetime'/'date' columns, with
        month/day/hour/dayofyear columns added and the Korean headers renamed to English.
    """
    # Work on a copy so the caller's frame does not silently gain the calendar columns.
    power = df_power.copy()
    stamps = power['datetime'].dt
    power['month'] = stamps.month
    power['day'] = stamps.day
    power['hour'] = stamps.hour
    power['dayofyear'] = stamps.dayofyear

    # Use the df_obs argument (not a notebook-level global) so the function depends only
    # on what it is given.
    merged = pd.merge(
        df_obs.drop(['location'], axis=1),
        power.drop(['location', '일사량(MJ/m^2)'], axis=1),
        how='outer',
        on=['datetime', 'date'],
    )

    # The outer merge keeps observation rows without power data; discard those.
    merged = merged[merged['발전량(kW)'].notnull()]
    merged = merged.drop(['datetime', 'date'], axis=1)

    # NOTE(review): the solar label here is 'Solar Radiation(MJ/m2)', while the
    # observation-only frame in this notebook uses 'Solar Radiation(MJ/m^2)'.
    return merged.rename(columns={'기온(°C)':'Celsius(°C)',
                                  '강수량(mm)':'Rainfall(mm)',
                                  '풍속(m/s)':'Wind Speed(m/s)',
                                  '풍향(16방위)':'Wind Direction(16)',
                                  '습도(%)':'Humidity(%)',
                                  '일사(MJ/m2)':'Solar Radiation(MJ/m2)',
                                  '적설(cm)':'Snowfall(cm)',
                                  '전운량(10분위)':'Cloud',
                                  '발전량(kW)':'Power Generation(kW)'})
    

### Plant A

In [None]:
# Plant A: relative density of each merged column against power generation.
target = 'Power Generation(kW)'
df_merged = merge_obs_power(df_observation, df_power)
show_relative_density_plot(df_merged, target)

### Plant B

In [None]:
# Plant B: reload the datasets with plant B's power file, then plot relative density
# against power generation.
# The loader imported by this notebook is load_files_legacy; the bare name load_files is
# never imported here and would raise NameError on a fresh kernel.
filename_power = 'df_ewp_b.pkl'
df_observation, df_forecast, df_solar, df_power = load_files_legacy(observation_list, forecast_list, solar_list, filename_power)
df_merged = merge_obs_power(df_observation, df_power)
target = 'Power Generation(kW)'
show_relative_density_plot(df_merged, target)

### Plant C

In [None]:
# Plant C: reload the datasets with plant C's power file, then plot relative density
# against power generation.
# The loader imported by this notebook is load_files_legacy; the bare name load_files is
# never imported here and would raise NameError on a fresh kernel.
filename_power = 'df_ewp_c.pkl'
df_observation, df_forecast, df_solar, df_power = load_files_legacy(observation_list, forecast_list, solar_list, filename_power)
df_merged = merge_obs_power(df_observation, df_power)
target = 'Power Generation(kW)'
show_relative_density_plot(df_merged, target)

## Correlation, NMI with Actual Power Generation

### Plant A

In [None]:
# Plant A: reload the datasets with plant A's power file and rebuild the merged frame.
# The loader imported by this notebook is load_files_legacy; the bare name load_files is
# never imported here and would raise NameError on a fresh kernel.
filename_power = 'df_ewp_a.pkl'
df_observation, df_forecast, df_solar, df_power = load_files_legacy(observation_list, forecast_list, solar_list, filename_power)
df_merged = merge_obs_power(df_observation, df_power)

In [None]:
# Correlation helper applied to the merged observation + power frame built in the previous cell.
show_correlation(df_merged)

In [None]:
# NMI helper applied to the merged frame; how_many=5 presumably limits how many entries
# are reported — confirm in functions_eda.
show_normalized_mutual_information(df_merged, how_many=5)

### Plant B

In [None]:
# Plant B: reload the datasets with plant B's power file and rebuild the merged frame.
# The loader imported by this notebook is load_files_legacy; the bare name load_files is
# never imported here and would raise NameError on a fresh kernel.
filename_power = 'df_ewp_b.pkl'
df_observation, df_forecast, df_solar, df_power = load_files_legacy(observation_list, forecast_list, solar_list, filename_power)
df_merged = merge_obs_power(df_observation, df_power)

In [None]:
# Correlation helper applied to the merged observation + power frame built in the previous cell.
show_correlation(df_merged)

In [None]:
# NMI helper applied to the merged frame; how_many=5 presumably limits how many entries
# are reported — confirm in functions_eda.
show_normalized_mutual_information(df_merged, how_many=5)

### Plant C

In [None]:
# Plant C: reload the datasets with plant C's power file and rebuild the merged frame.
# The loader imported by this notebook is load_files_legacy; the bare name load_files is
# never imported here and would raise NameError on a fresh kernel.
filename_power = 'df_ewp_c.pkl'
df_observation, df_forecast, df_solar, df_power = load_files_legacy(observation_list, forecast_list, solar_list, filename_power)
df_merged = merge_obs_power(df_observation, df_power)

In [None]:
# NMI helper applied to the merged frame; how_many=5 presumably limits how many entries
# are reported — confirm in functions_eda.
show_normalized_mutual_information(df_merged, how_many=5)