In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

import lightgbm as lgb

In [2]:
df = pd.read_csv("공공/훈련데이터셋.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 181408 entries, 0 to 181407
Data columns (total 9 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   지역코드           181408 non-null  int64  
 1   최저기온(°C)       181408 non-null  float64
 2   3.0m 지중온도(°C)  181408 non-null  float64
 3   평균 현지기압(hPa)   181408 non-null  float64
 4   가조시간(hr)       181408 non-null  float64
 5   평균 상대습도(%)     181408 non-null  float64
 6   풍정합(100m)      181408 non-null  float64
 7   합계 소형증발량(mm)   181408 non-null  float64
 8   파워             181408 non-null  float64
dtypes: float64(8), int64(1)
memory usage: 12.5 MB


In [3]:
target = "파워"
y = df[target]

X = df.drop(columns=[target])
X = X.select_dtypes(include="number")

In [4]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42
)

In [5]:
model = lgb.LGBMRegressor(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=-1,
    num_leaves=31,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1
)

In [6]:
model.fit(X_train, y_train)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002619 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1488
[LightGBM] [Info] Number of data points in the train set: 145126, number of used features: 8
[LightGBM] [Info] Start training from score 137895.361588


In [7]:
y_pred = model.predict(X_test)

rmse = mean_squared_error(y_test, y_pred, squared=False)
r2 = r2_score(y_test, y_pred)

print(f"RMSE: {rmse}")
print(f"R2: {r2}")

RMSE: 262027.507511188
R2: 0.4370066402370477


In [8]:
importance = pd.Series(
    model.feature_importances_,
    index=X.columns
).sort_values(ascending=False)

importance.head(10)

지역코드             4333
최저기온(°C)         1215
3.0m 지중온도(°C)     757
평균 현지기압(hPa)      727
풍정합(100m)         634
가조시간(hr)          547
평균 상대습도(%)        456
합계 소형증발량(mm)      331
dtype: int32

In [9]:
region_mean = (
    df.groupby("지역코드")["파워"]
      .mean()
      .rename("region_avg_power")
)

In [10]:
df = df.merge(
    region_mean,
    on="지역코드",
    how="left"
)

In [11]:
df[["지역코드", "파워", "region_avg_power"]].head()

Unnamed: 0,지역코드,파워,region_avg_power
0,1111010100,146294.6135,138564.825764
1,1111010100,175633.827,138564.825764
2,1111010100,156084.191,138564.825764
3,1111010100,177018.842,138564.825764
4,1111010100,158467.169,138564.825764


In [12]:
df["power_deviation"] = df["파워"] - df["region_avg_power"]

In [13]:
df[["지역코드", "파워", "region_avg_power","power_deviation"]].head()

Unnamed: 0,지역코드,파워,region_avg_power,power_deviation
0,1111010100,146294.6135,138564.825764,7729.787736
1,1111010100,175633.827,138564.825764,37069.001236
2,1111010100,156084.191,138564.825764,17519.365236
3,1111010100,177018.842,138564.825764,38454.016236
4,1111010100,158467.169,138564.825764,19902.343236


In [14]:
target = "power_deviation"
y = df[target]

X = df.drop(columns=["파워", "region_avg_power", target])
X = X.select_dtypes(include="number")

In [15]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42
)

In [16]:
import lightgbm as lgb
from sklearn.metrics import mean_squared_error, r2_score

model = lgb.LGBMRegressor(
    n_estimators=400,
    learning_rate=0.05,
    num_leaves=31,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1
)

model.fit(X_train, y_train)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002348 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1488
[LightGBM] [Info] Number of data points in the train set: 145126, number of used features: 8
[LightGBM] [Info] Start training from score 206.658275


In [17]:
y_pred = model.predict(X_test)

rmse = mean_squared_error(y_test, y_pred, squared=False)
r2 = r2_score(y_test, y_pred)

print(f"RMSE (deviation): {rmse}")
print(f"R2 (deviation): {r2}")

RMSE (deviation): 196915.1774563123
R2 (deviation): 0.15752390891630752
