In [213]:
import numpy as np 
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from datetime import datetime

In [214]:
# Training set: the data the model will be fitted on.
train = pd.read_csv("train.csv")
# Test set: the data used to test the trained model.
test = pd.read_csv("test.csv")

In [215]:
# Preview the first four and last four rows of the training data in one table.
first_rows = train.iloc[:4]
last_rows = train.iloc[-4:]
show = pd.concat([first_rows, last_rows])
show

Unnamed: 0,ID,date,cluster_id,electricity_consumption,temperature_2m_max,temperature_2m_min,apparent_temperature_max,apparent_temperature_min,sunshine_duration,daylight_duration,wind_speed_10m_max,wind_gusts_10m_max,wind_direction_10m_dominant,shortwave_radiation_sum,et0_fao_evapotranspiration
0,cluster_1_2014-01-01,2014-01-01,cluster_1,358.032,10.8,4.2,5.5,0.4,53.003333,29787.533333,40.6,79.2,186.0,1.176667,0.483333
1,cluster_2_2014-01-01,2014-01-01,cluster_2,548.247,12.2,4.3,8.6,-0.4,8195.656667,30650.35,36.0,81.7,170.591118,3.383333,0.736667
2,cluster_3_2014-01-01,2014-01-01,cluster_3,758.303,12.9,-0.8,10.1,-4.9,16305.26,31547.686667,20.9,44.3,159.467752,3.88,0.716667
3,cluster_4_2014-01-01,2014-01-01,cluster_4,1072.077,10.8,4.7,6.7,0.6,9224.803333,30769.22,34.7,82.8,184.339753,3.153333,0.676667
11684,cluster_1_2021-12-31,2021-12-31,cluster_1,354.565,14.7,7.7,13.4,5.2,19148.153333,29727.2,20.7,37.4,201.332279,3.51,0.513333
11685,cluster_2_2021-12-31,2021-12-31,cluster_2,507.51,16.2,7.0,16.3,4.5,20570.92,30594.28,23.3,40.7,186.250597,5.006667,0.733333
11686,cluster_3_2021-12-31,2021-12-31,cluster_3,705.999,15.9,5.9,15.9,3.2,26860.35,31496.173333,15.1,27.7,196.223902,5.783333,0.72
11687,cluster_4_2021-12-31,2021-12-31,cluster_4,992.349,14.6,5.5,13.3,3.3,23063.0,30713.893333,16.4,28.8,192.980343,4.816667,0.56


# 4 cluster dengan rentang waktu 2014–2021
## Tujuan: membuat model prediksi konsumsi listrik harian (dalam satuan GWh)

- electricity_consumption     =  konsumsi listrik harian (GWh)
- temperature_2m_max          =  suhu maksimum 2 meter
- temperature_2m_min          =  suhu minimum 2 meter
- apparent_temperature_max    =  suhu terasa maksimum
- apparent_temperature_min    =  suhu terasa minimum
- sunshine_duration           =  durasi sinar matahari
- daylight_duration           =  durasi siang hari
- wind_speed_10m_max          =  kecepatan angin maksimum di 10m
- wind_gusts_10m_max          =  hembusan angin maksimum di 10m
- wind_direction_10m_dominant =  arah angin dominan di 10m
- shortwave_radiation_sum     =  total radiasi gelombang pendek
- et0_fao_evapotranspiration  =  evapotranspirasi referensi FAO

In [None]:
# Count rows that are exact duplicates of an earlier row.
train.duplicated(keep='first').sum()

0

In [None]:
# Total number of missing values across every column of the training data.
train.isna().sum().sum()

0

In [None]:
# Inspect the dtype of every column in the training data.
train.dtypes

ID                              object
date                            object
cluster_id                      object
electricity_consumption        float64
temperature_2m_max             float64
temperature_2m_min             float64
apparent_temperature_max       float64
apparent_temperature_min       float64
sunshine_duration              float64
daylight_duration              float64
wind_speed_10m_max             float64
wind_gusts_10m_max             float64
wind_direction_10m_dominant    float64
shortwave_radiation_sum        float64
et0_fao_evapotranspiration     float64
dtype: object

In [None]:
# Encode cluster_id ("cluster_3" -> 3) as an integer so it can be fed to the model.
# Going through astype(str) keeps this cell re-runnable: on a second run the
# column is already int, and calling `.str` on it directly would raise.
train['cluster_id'] = (
    train['cluster_id']
    .astype(str)
    .str.replace('cluster_', '', regex=False)  # literal prefix, not a regex
    .astype(int)
)

# The guard lets the cell run again after 'date' has been dropped further down.
if 'date' in train.columns:
    # Parse the date once, then derive calendar features from it.
    train['date'] = pd.to_datetime(train['date'])
    train['year'] = train['date'].dt.year
    train['month'] = train['date'].dt.month
    train['day'] = train['date'].dt.day
    # NOTE(review): `df` is not referenced anywhere in the visible notebook;
    # it is kept only so that any cell outside this view that expects it still works.
    df = train.drop('date', axis=1)

In [220]:
# Summary statistics (count, mean, min, quartiles, max, std) of the training frame.
train.describe()

Unnamed: 0,date,cluster_id,electricity_consumption,temperature_2m_max,temperature_2m_min,apparent_temperature_max,apparent_temperature_min,sunshine_duration,daylight_duration,wind_speed_10m_max,wind_gusts_10m_max,wind_direction_10m_dominant,shortwave_radiation_sum,et0_fao_evapotranspiration,year,month,day
count,11688,11688.0,11688.0,11688.0,11688.0,11688.0,11688.0,11688.0,11688.0,11688.0,11688.0,11688.0,11688.0,11688.0,11688.0,11688.0,11688.0
mean,2017-12-31 12:00:00,2.5,642.313328,18.733385,7.174786,17.215315,4.784608,29302.656241,44070.779592,23.273058,46.420115,202.335554,12.742257,2.428585,2017.500342,6.52293,15.729637
min,2014-01-01 00:00:00,1.0,207.472,-1.9,-10.8,-8.1,-15.4,0.0,29447.55,6.8,13.3,0.002855,0.333333,0.173333,2014.0,1.0,1.0
25%,2016-01-01 00:00:00,1.75,414.376,12.8,2.5,10.0,-1.1,19104.170833,35150.501667,17.4,35.3,134.384708,5.6,0.99,2016.0,4.0,8.0
50%,2017-12-31 12:00:00,2.5,602.672,18.2,7.1,16.4,4.4,29917.36,44185.056667,22.0,43.9,219.0,12.14,2.156667,2017.5,7.0,16.0
75%,2020-01-01 00:00:00,3.25,836.0345,24.3,12.1,24.0,10.8,40697.756667,52997.815833,28.0,54.7,281.609519,18.924167,3.616667,2020.0,10.0,23.0
max,2021-12-31 00:00:00,4.0,1658.348,40.9,23.2,42.6,23.8,53498.91,58501.42,63.9,126.7,360.0,29.983333,8.133333,2021.0,12.0,31.0
std,,1.118082,287.673473,7.504161,5.960263,9.085303,7.260336,13600.113712,9283.839471,7.993389,15.226372,100.697741,7.588077,1.607845,2.291236,3.448851,8.800469


In [221]:
# Boxplot outlier check (result: outliers ARE present)
#for i in train.select_dtypes(include='number').columns:
#    plt.figure(figsize=(6,4))
#    sns.boxplot(x=train[i])
#    plt.title(f'Boxplot of {i}')
#    plt.show()

In [None]:
# Tame outliers in the columns listed below: any value outside the
# [Q1 - 1.5*IQR, Q3 + 1.5*IQR] range is pulled back to the nearest bound
# (values are clipped to those bounds, not replaced by Q1/Q3 themselves).
cols_with_outliers = [
    "temperature_2m_max",
    "temperature_2m_min",
    "apparent_temperature_max",
    "apparent_temperature_min",
    "sunshine_duration",
    "wind_speed_10m_max",
]

for col in cols_with_outliers:
    # Both quartiles in a single quantile call.
    Q1, Q3 = train[col].quantile([0.25, 0.75])
    IQR = Q3 - Q1
    lower, upper = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR
    train[col] = train[col].clip(lower=lower, upper=upper)

In [223]:
# Boxplot outlier check (after clipping: already clean)
#for i in train.select_dtypes(include='number').columns:
#    plt.figure(figsize=(6,4))
#    sns.boxplot(x=train[i])
#    plt.title(f'Boxplot of {i}')
#    plt.show()

In [224]:
# Flag whether each day is a weekend day: 1 for Saturday/Sunday, 0 otherwise.
from datetime import date

train['date'] = pd.to_datetime(train['date'])
# dt.dayofweek numbers Monday as 0, so 5 and 6 are Saturday and Sunday.
train['isWeekend'] = train['date'].dt.dayofweek.ge(5).astype(int)


def isWeekend(date):
  """Return True when `date` falls on a Saturday or Sunday.

  `date` must provide a `weekday()` method that numbers Monday as 0 and
  Sunday as 6, as `datetime.date` and `datetime.datetime` do.
  """
  day_index = date.weekday()
  return not day_index < 5


def season(d):
  """Map a calendar date to an integer season code.

  The boundaries are fixed calendar days, the same in every year:
    2 (spring): 20 March     to 20 June
    3 (summer): 21 June      to 22 September
    4 (autumn): 23 September to 20 December
    1 (winter): 21 December  to 19 March

  Args:
    d: any object exposing `month` and `day` attributes, for example
      `datetime.date`, `datetime.datetime` or `pandas.Timestamp`.

  Returns:
    int: the season code, 1 to 4.
  """
  # Only month and day decide the season, so compare (month, day) tuples
  # instead of building four date objects per call. This also accepts plain
  # `datetime.date` values, which have no `.date()` method.
  month_day = (d.month, d.day)

  if (3, 20) <= month_day < (6, 21):
    return 2 # Spring
  elif (6, 21) <= month_day < (9, 23):
    return 3 # Summer
  elif (9, 23) <= month_day < (12, 21):
    return 4 # Autumn
  else:
    return 1 # Winter
  
# Tag every row with its season code; the raw date column is dropped afterwards
# because its information now lives in year/month/day/isWeekend/season.
train['season'] = train['date'].map(season)
train = train.drop(columns=['date'])

In [None]:
# Mean electricity_consumption per cluster (used to answer question 4).
# Despite its name, `cluster_1` holds the mean of every cluster, indexed by cluster_id.
consumption = train['electricity_consumption']
cluster_1 = consumption.groupby(train['cluster_id']).mean()

In [225]:
# Look at the first five rows after feature engineering.
train.head(5)

Unnamed: 0,ID,cluster_id,electricity_consumption,temperature_2m_max,temperature_2m_min,apparent_temperature_max,apparent_temperature_min,sunshine_duration,daylight_duration,wind_speed_10m_max,wind_gusts_10m_max,wind_direction_10m_dominant,shortwave_radiation_sum,et0_fao_evapotranspiration,year,month,day,isWeekend,season
0,cluster_1_2014-01-01,1,358.032,10.8,4.2,5.5,0.4,53.003333,29787.533333,40.6,79.2,186.0,1.176667,0.483333,2014,1,1,0,1
1,cluster_2_2014-01-01,2,548.247,12.2,4.3,8.6,-0.4,8195.656667,30650.35,36.0,81.7,170.591118,3.383333,0.736667,2014,1,1,0,1
2,cluster_3_2014-01-01,3,758.303,12.9,-0.8,10.1,-4.9,16305.26,31547.686667,20.9,44.3,159.467752,3.88,0.716667,2014,1,1,0,1
3,cluster_4_2014-01-01,4,1072.077,10.8,4.7,6.7,0.6,9224.803333,30769.22,34.7,82.8,184.339753,3.153333,0.676667,2014,1,1,0,1
4,cluster_1_2014-01-02,1,386.908,10.7,7.0,6.6,3.3,22372.0,29850.226667,34.3,75.2,210.333465,3.64,0.78,2014,1,2,0,1


In [226]:
# Separate the target from the predictors. The 'ID' string column is left out
# of the features along with the target itself.
y_train = train['electricity_consumption']
x_train = train.drop(['electricity_consumption', 'ID'], axis=1)

In [227]:
# Hold out 20% of the rows for validation; the seed is fixed for reproducibility.
# NOTE(review): x_train / y_train are overwritten here, so re-running only this
# cell splits the already reduced training part again.
# NOTE(review): the split is random across dated rows, not chronological —
# confirm this is the intended validation scheme.
x_train, x_val, y_train, y_val = train_test_split(
    x_train,
    y_train,
    random_state=42,
    test_size=0.2,
)

In [228]:
# Fit a 100-tree random forest on the training part; the seed is fixed for reproducibility.
model = RandomForestRegressor(random_state=42, n_estimators=100)
model.fit(X=x_train, y=y_train)

In [229]:
# Score the model on the validation part: RMSE is the square root of the MSE.
y_pred = model.predict(x_val)
mse = mean_squared_error(y_true=y_val, y_pred=y_pred)
rmse = np.sqrt(mse)

print("RMSE di validation set:", rmse)

RMSE di validation set: 31.41633473353737


In [230]:
import joblib

# Write the fitted model to disk with joblib.
cobaLAGI = 'cobaLAGI.joblib'
joblib.dump(model, filename=cobaLAGI)

['cobaLAGI.joblib']