# 전남 해남 기상관측치 EDA

In [1]:
%load_ext autoreload
%autoreload 2

In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import datetime
import math
pd.options.mode.chained_assignment = None

import warnings


def warn(*args, **kwargs):
    """Accept any arguments and do nothing (stand-in for ``warnings.warn``)."""
    return None


# Route every later ``warnings.warn(...)`` lookup to the no-op above so that
# library warnings do not clutter the notebook output.
warnings.warn = warn

from preprocess.functions.date_inspector import load_files
from functions.eda import show_correlation, show_normalized_mutual_information, show_relative_density_plot

In [4]:
# Absolute path of ../data/raw/kma, resolved against the current working
# directory (abspath anchors a relative path to os.getcwd()).
data_dir = os.path.abspath(os.path.join(os.pardir, 'data', 'raw', 'kma'))
print(data_dir)

/home/jeon/Desktop/kpx/data/raw/kma


In [7]:
# One pickle per year for each KMA data set; only the year in the name varies.
years = (2017, 2018, 2019)
observation_template = 'df_kma_obs_Jeollanam-do_Haenam-gun_Haenam-eup_{}.pkl'
forecast_template = 'df_kma_forecast_Jeollanam-do_Haenam-gun_Hwangsan-myeon_{}.pkl'

observation_list = [observation_template.format(year) for year in years]
forecast_list = [forecast_template.format(year) for year in years]

# Single pickle holding the solar power generation records.
filename_power = 'df_kpx_solar.pkl'

# Load the data through the project helper; its three return values are
# unpacked as the observation, forecast and power frames.
# NOTE(review): only file names are passed, so load_files presumably resolves
# the data directory on its own -- confirm against preprocess.functions.date_inspector.
df_observation, df_forecast, df_power = load_files(observation_list, forecast_list, filename_power)

In [None]:
# Peek at the first and last rows of the raw observations.
for preview in (df_observation.head, df_observation.tail):
    display(preview())

## Relationship between columns

In [None]:
# Work on a copy so the loaded observations stay untouched.
df = df_observation.copy()

# Expand the timestamp into calendar features, then drop the columns that are
# no longer needed.
for part in ('month', 'day', 'hour', 'dayofyear'):
    df[part] = getattr(df['datetime'].dt, part)
df = df.drop(columns=['datetime', 'date', 'location'])

# Korean measurement names -> English labels.
english_names = {
    '기온(°C)': 'Celsius(°C)',
    '강수량(mm)': 'Rainfall(mm)',
    '풍속(m/s)': 'Wind Speed(m/s)',
    '풍향(16방위)': 'Wind Direction(16)',
    '습도(%)': 'Humidity(%)',
    '일사(MJ/m2)': 'Solar Radiation(MJ/m^2)',
    '적설(cm)': 'Snowfall(cm)',
    '전운량(10분위)': 'Cloud',
}
df = df.rename(columns=english_names)

for preview in (df.head, df.tail):
    display(preview())

In [None]:
def merge_obs_power(df_obs, df_power):
    """Join weather observations with power generation on their timestamps.

    Calendar features (month, day, hour, dayofyear) are derived from the power
    frame's ``datetime`` column, the Korean column names are translated to
    English, and only rows that have both a power reading and a rainfall
    observation are kept.

    Args:
        df_obs: Observation frame with ``datetime``, ``date`` and ``location``
            columns plus the Korean-named measurement columns.
        df_power: Power frame with ``datetime``, ``date``, ``location``,
            ``일사량(MJ/m^2)`` and ``발전량(kW)`` columns.

    Returns:
        A new DataFrame without the ``datetime``/``date``/``location`` columns
        and with ``Power Generation(kW)`` as float64. Neither input frame is
        modified.
    """
    # Work on copies: the caller's frames must not gain columns or change
    # dtype as a side effect of calling this function.
    df_obs = df_obs.copy()
    df_power = df_power.copy()

    # Make the merge keys datetime-typed on both sides so they compare equal.
    for frame in (df_obs, df_power):
        frame['datetime'] = pd.to_datetime(frame['datetime'])
        frame['date'] = pd.to_datetime(frame['date'])

    timestamps = df_power['datetime'].dt
    df_power['month'] = timestamps.month
    df_power['day'] = timestamps.day
    df_power['hour'] = timestamps.hour
    df_power['dayofyear'] = timestamps.dayofyear
    df_power = df_power.drop(['일사량(MJ/m^2)', 'location'], axis=1)

    # Merge the frame that was passed in (not a notebook-level global).
    df_merged = pd.merge(
        df_obs.drop(['location'], axis=1),
        df_power,
        how='outer',
        on=['datetime', 'date'],
    )

    # Discard observation-only timestamps (no power reading).
    df_merged = df_merged[df_merged['발전량(kW)'].notnull()]
    df_merged = df_merged.drop(['datetime', 'date'], axis=1)
    df_merged['발전량(kW)'] = df_merged['발전량(kW)'].astype('float64')

    df_merged = df_merged.rename(columns={
        '기온(°C)': 'Celsius(°C)',
        '강수량(mm)': 'Rainfall(mm)',
        '풍속(m/s)': 'Wind Speed(m/s)',
        '풍향(16방위)': 'Wind Direction(16)',
        '습도(%)': 'Humidity(%)',
        '일사(MJ/m2)': 'Solar Radiation(MJ/m^2)',
        '적설(cm)': 'Snowfall(cm)',
        '전운량(10분위)': 'Cloud',
        '발전량(kW)': 'Power Generation(kW)',
    })

    # Discard rows without a rainfall value. After the outer merge this also
    # removes power readings that had no matching observation row at all.
    df_merged = df_merged[df_merged['Rainfall(mm)'].notnull()]
    return df_merged

In [None]:
# Build the merged observation/power frame and inspect both ends plus dtypes.
df_merged = merge_obs_power(df_observation, df_power)
for preview in (df_merged.tail, df_merged.head):
    display(preview())
df_merged.info()

## Correlation

In [None]:
# Project helper from functions.eda, applied to the merged frame.
# NOTE(review): presumably renders the column-wise correlation -- confirm in functions/eda.
show_correlation(df_merged)

## Normalized Mutual Information

In [None]:
# Project helper from functions.eda, applied to the merged frame.
# NOTE(review): presumably renders normalized mutual information between columns -- confirm in functions/eda.
show_normalized_mutual_information(df_merged)

## Distribution plot of each column

In [None]:
# Draw one distribution plot per column, three panels per figure.
for idx, column in enumerate(df_merged.columns):
    if idx % 3 == 0:
        plt.figure(figsize=(15, 3))
    plt.subplot(1, 3, (idx % 3) + 1)
    plt.title(column)
    try:
        # Interpolate first because a few NA values remain in the columns.
        # NOTE(review): distplot is deprecated in recent seaborn releases;
        # kept so the rendered plots do not change.
        sns.distplot(df_merged[column].interpolate(method='linear'))
    except Exception as e:
        # Best effort: report the failure and keep plotting the other columns.
        print(e)
    if idx % 3 == 2:
        plt.show()

# Show the last figure when the column count is not a multiple of three;
# otherwise its final, partially filled row would never be displayed.
if len(df_merged.columns) % 3:
    plt.show()

## Power Generation versus each column (Scatterplot)

In [None]:
# Scatter power generation against every column, three panels per figure.
for idx, column in enumerate(df_merged.columns):
    if idx % 3 == 0:
        plt.figure(figsize=(15, 3))
    plt.subplot(1, 3, (idx % 3) + 1)
    plt.title(column)
    # x and y are passed by keyword: newer seaborn releases accept only
    # ``data`` positionally, so the positional form raises a TypeError there.
    sns.scatterplot(
        x=df_merged[column].interpolate(method='linear'),
        y=df_merged['Power Generation(kW)'],
        s=10,
        alpha=0.3,
    )
    if idx % 3 == 2:
        plt.show()

# Show the last figure when the column count is not a multiple of three;
# otherwise its final, partially filled row would never be displayed.
if len(df_merged.columns) % 3:
    plt.show()

## Power Generation versus each column (Relative Density Plot)

In [None]:
# Column produced by merge_obs_power's rename step, passed to the helper as the target.
target = 'Power Generation(kW)'
# Project helper from functions.eda.
# NOTE(review): presumably plots each column's density relative to the target -- confirm in functions/eda.
show_relative_density_plot(df_merged, target)