## LOAD LIBRARIES

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import matplotlib.dates as mdates

## LOAD DATA

In [2]:
# Read the weather-station CSV export into the raw frame. The file name suggests it
# covers 2024-09-01 through 2024-09-30. The path is relative to the notebook's
# working directory.
df_ori = pd.read_csv("data/w771dz_sangamura_20240901-20240930.csv")
# Preview the first rows to check that the columns were parsed as expected.
df_ori.head()

Unnamed: 0,date,time,temperature,humidity,light,rainfall_5min,rainfall_1hour,wind_speed,wind_direction,atmospheric_pressure
0,2024/09/01,00:04,22.2,90.0,,0.0,,0.0,northwest,972
1,2024/09/01,00:09,22.1,90.0,,0.0,,0.0,northwest,972
2,2024/09/01,00:14,22.1,90.0,,0.0,,0.0,northwest,972
3,2024/09/01,00:19,22.0,90.0,,0.0,,0.0,northwest,972
4,2024/09/01,00:24,21.9,90.0,,0.0,,0.0,northwest,972


In [5]:
# Print each column's dtype and non-null count to spot columns with missing values.
df_ori.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8046 entries, 0 to 8045
Data columns (total 10 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   date                  8046 non-null   object 
 1   time                  8046 non-null   object 
 2   temperature           8046 non-null   float64
 3   humidity              8046 non-null   float64
 4   light                 4612 non-null   float64
 5   rainfall_5min         8046 non-null   float64
 6   rainfall_1hour        719 non-null    float64
 7   wind_speed            8046 non-null   float64
 8   wind_direction        8046 non-null   object 
 9   atmospheric_pressure  8046 non-null   int64  
dtypes: float64(6), int64(1), object(3)
memory usage: 628.7+ KB


In [7]:
# Count the missing entries in every column of the raw frame.
df_ori.isna().sum()

date                       0
time                       0
temperature                0
humidity                   0
light                   3434
rainfall_5min              0
rainfall_1hour          7327
wind_speed                 0
wind_direction             0
atmospheric_pressure       0
dtype: int64

## DATA PREPROCESSING

### Combine date and time into a datetime index

In [None]:
# Join the separate 'date' and 'time' text columns into one "YYYY/MM/DD HH:MM" string
# per row, then parse it with an explicit format and store the result on the raw frame.
timestamp_text = df_ori['date'].str.cat(df_ori['time'], sep=' ')
df_ori['datetime'] = pd.to_datetime(timestamp_text, format='%Y/%m/%d %H:%M')

In [10]:
# Index the readings by timestamp. set_index returns a new frame and moves 'datetime'
# out of the columns, so df_ori itself is left unchanged by this cell.
df_clean = df_ori.set_index('datetime')
# Preview the re-indexed frame.
df_clean.head()

Unnamed: 0_level_0,date,time,temperature,humidity,light,rainfall_5min,rainfall_1hour,wind_speed,wind_direction,atmospheric_pressure
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2024-09-01 00:04:00,2024/09/01,00:04,22.2,90.0,,0.0,,0.0,northwest,972
2024-09-01 00:09:00,2024/09/01,00:09,22.1,90.0,,0.0,,0.0,northwest,972
2024-09-01 00:14:00,2024/09/01,00:14,22.1,90.0,,0.0,,0.0,northwest,972
2024-09-01 00:19:00,2024/09/01,00:19,22.0,90.0,,0.0,,0.0,northwest,972
2024-09-01 00:24:00,2024/09/01,00:24,21.9,90.0,,0.0,,0.0,northwest,972


### Convert column types

In [12]:
# Treat empty strings as missing values so they cannot block numeric parsing.
df_clean = df_clean.replace('', np.nan)

# Columns that are meant to stay textual and must not be coerced to numbers.
text_columns = ['date', 'time', 'wind_direction']
object_columns = [
    name for name in df_clean.columns
    if df_clean[name].dtype == 'object' and name not in text_columns
]

# Coerce every remaining object-typed column; unparseable entries become NaN.
for name in object_columns:
    df_clean[name] = pd.to_numeric(df_clean[name], errors='coerce')

# Re-check dtypes and non-null counts after the conversion.
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 8046 entries, 2024-09-01 00:04:00 to 2024-09-30 23:59:00
Data columns (total 10 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   date                  8046 non-null   object 
 1   time                  8046 non-null   object 
 2   temperature           8046 non-null   float64
 3   humidity              8046 non-null   float64
 4   light                 4612 non-null   float64
 5   rainfall_5min         8046 non-null   float64
 6   rainfall_1hour        719 non-null    float64
 7   wind_speed            8046 non-null   float64
 8   wind_direction        8046 non-null   object 
 9   atmospheric_pressure  8046 non-null   int64  
dtypes: float64(6), int64(1), object(3)
memory usage: 691.5+ KB


In [13]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
df_clean.describe()

Unnamed: 0,temperature,humidity,light,rainfall_5min,rainfall_1hour,wind_speed,atmospheric_pressure
count,8046.0,8046.0,4612.0,8046.0,719.0,8046.0,8046.0
mean,23.291847,69.044308,10757.831526,0.001243,0.013352,0.000472,978.790082
std,4.033439,31.621529,18374.117175,0.022263,0.137789,0.009961,3.574449
min,12.9,10.0,0.0,0.0,0.0,0.0,965.0
25%,21.0,55.7,523.0,0.0,0.0,0.0,976.0
50%,23.0,90.0,3528.0,0.0,0.0,0.0,979.0
75%,25.4,90.0,12022.0,0.0,0.0,0.0,982.0
max,37.9,90.0,93436.0,0.4,2.8,0.5,986.0


### Hourly resampling

In [25]:
# Show the datetime-indexed frame again as the starting point for resampling
# (this repeats the preview from the cell that created df_clean).
df_clean.head()

Unnamed: 0_level_0,date,time,temperature,humidity,light,rainfall_5min,rainfall_1hour,wind_speed,wind_direction,atmospheric_pressure
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2024-09-01 00:04:00,2024/09/01,00:04,22.2,90.0,,0.0,,0.0,northwest,972
2024-09-01 00:09:00,2024/09/01,00:09,22.1,90.0,,0.0,,0.0,northwest,972
2024-09-01 00:14:00,2024/09/01,00:14,22.1,90.0,,0.0,,0.0,northwest,972
2024-09-01 00:19:00,2024/09/01,00:19,22.0,90.0,,0.0,,0.0,northwest,972
2024-09-01 00:24:00,2024/09/01,00:24,21.9,90.0,,0.0,,0.0,northwest,972


In [None]:
# Aggregate the readings into one row per clock hour.
# resample() on its own only returns a lazy Resampler, so the aggregation is applied
# here to make df_hourly an actual DataFrame. numeric_only=True skips the text columns
# (date, time, wind_direction); in pandas 2.x a plain mean() raises on them.
# NOTE(review): averaging may not be the right aggregate for the rainfall columns
# (an hourly sum could be what is wanted) - confirm before using them.
df_hourly = df_clean.resample('h').mean(numeric_only=True)
df_hourly.head()

## EXPLORATION & VISUALIZATION

## MODEL