## 자전거대여량 예측모델

#### 0. 라이브러리 

In [1]:
# 데이터 처리 및 분석
import numpy as np
import pandas as pd

# 시각화
import matplotlib.pyplot as plt
import seaborn as sns

# 전처리 및 피처 엔지니어링
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder

# 머신러닝 모델
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

# 평가 지표
from sklearn.metrics import mean_squared_log_error, mean_squared_error, r2_score

# Miscellaneous
import warnings
# NOTE(review): suppresses every warning category for the rest of the session,
# including deprecation and convergence warnings from the libraries imported above.
warnings.filterwarnings("ignore")

# Font used for Korean text in plots (assumes Windows, where 'Malgun Gothic' is available)
plt.rcParams['font.family'] = 'Malgun Gothic'

  import pkg_resources


### 1. 데이터 불러오기

In [None]:
# Load the labelled training data and the unlabelled test data from ./data
train_df = pd.read_csv('./data/train.csv')
test_df = pd.read_csv('./data/test.csv')

# Only the last expression of a cell is rendered automatically, so the train
# preview has to be displayed explicitly; otherwise it is silently discarded.
# `display` is provided by the IPython kernel, no import is needed in a notebook.
display(train_df.head(10))
test_df.head(10)




Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,5,27,32
3,2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0,3,10,13
4,2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0,0,1,1
5,2011-01-01 05:00:00,1,0,0,2,9.84,12.88,75,6.0032,0,1,1
6,2011-01-01 06:00:00,1,0,0,1,9.02,13.635,80,0.0,2,0,2
7,2011-01-01 07:00:00,1,0,0,1,8.2,12.88,86,0.0,1,2,3
8,2011-01-01 08:00:00,1,0,0,1,9.84,14.395,75,0.0,1,7,8
9,2011-01-01 09:00:00,1,0,0,1,13.12,17.425,76,0.0,8,6,14


In [16]:
# Column names of the training frame as a plain Python list
list(train_df.columns)

['datetime',
 'season',
 'holiday',
 'workingday',
 'weather',
 'temp',
 'atemp',
 'humidity',
 'windspeed',
 'casual',
 'registered',
 'count']

In [17]:
# Column names of the test frame as a plain Python list
list(test_df.columns)

['datetime',
 'season',
 'holiday',
 'workingday',
 'weather',
 'temp',
 'atemp',
 'humidity',
 'windspeed']

In [18]:
# Dtype and non-null count per column of the training frame
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10886 entries, 0 to 10885
Data columns (total 12 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   datetime    10886 non-null  object 
 1   season      10886 non-null  int64  
 2   holiday     10886 non-null  int64  
 3   workingday  10886 non-null  int64  
 4   weather     10886 non-null  int64  
 5   temp        10886 non-null  float64
 6   atemp       10886 non-null  float64
 7   humidity    10886 non-null  int64  
 8   windspeed   10886 non-null  float64
 9   casual      10886 non-null  int64  
 10  registered  10886 non-null  int64  
 11  count       10886 non-null  int64  
dtypes: float64(3), int64(8), object(1)
memory usage: 1020.7+ KB


In [19]:
# Dtype and non-null count per column of the test frame
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6493 entries, 0 to 6492
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   datetime    6493 non-null   object 
 1   season      6493 non-null   int64  
 2   holiday     6493 non-null   int64  
 3   workingday  6493 non-null   int64  
 4   weather     6493 non-null   int64  
 5   temp        6493 non-null   float64
 6   atemp       6493 non-null   float64
 7   humidity    6493 non-null   int64  
 8   windspeed   6493 non-null   float64
dtypes: float64(3), int64(5), object(1)
memory usage: 456.7+ KB


In [20]:
# Summary statistics (count, mean, std, quartiles, min/max) of the training columns
train_df.describe()

Unnamed: 0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
count,10886.0,10886.0,10886.0,10886.0,10886.0,10886.0,10886.0,10886.0,10886.0,10886.0,10886.0
mean,2.506614,0.028569,0.680875,1.418427,20.23086,23.655084,61.88646,12.799395,36.021955,155.552177,191.574132
std,1.116174,0.166599,0.466159,0.633839,7.79159,8.474601,19.245033,8.164537,49.960477,151.039033,181.144454
min,1.0,0.0,0.0,1.0,0.82,0.76,0.0,0.0,0.0,0.0,1.0
25%,2.0,0.0,0.0,1.0,13.94,16.665,47.0,7.0015,4.0,36.0,42.0
50%,3.0,0.0,1.0,1.0,20.5,24.24,62.0,12.998,17.0,118.0,145.0
75%,4.0,0.0,1.0,2.0,26.24,31.06,77.0,16.9979,49.0,222.0,284.0
max,4.0,1.0,1.0,4.0,41.0,45.455,100.0,56.9969,367.0,886.0,977.0


In [22]:
# Summary statistics (count, mean, std, quartiles, min/max) of the test columns
test_df.describe()

Unnamed: 0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed
count,6493.0,6493.0,6493.0,6493.0,6493.0,6493.0,6493.0,6493.0
mean,2.4933,0.029108,0.685815,1.436778,20.620607,24.012865,64.125212,12.631157
std,1.091258,0.168123,0.464226,0.64839,8.059583,8.782741,19.293391,8.250151
min,1.0,0.0,0.0,1.0,0.82,0.0,16.0,0.0
25%,2.0,0.0,0.0,1.0,13.94,16.665,49.0,7.0015
50%,3.0,0.0,1.0,1.0,21.32,25.0,65.0,11.0014
75%,3.0,0.0,1.0,2.0,27.06,31.06,81.0,16.9979
max,4.0,1.0,1.0,4.0,40.18,50.0,100.0,55.9986


### 결측치 확인

In [25]:
# Number of missing values per column in the training frame
train_df.isna().sum()

datetime      0
season        0
holiday       0
workingday    0
weather       0
temp          0
atemp         0
humidity      0
windspeed     0
casual        0
registered    0
count         0
dtype: int64

In [26]:
# Number of missing values per column in the test frame
test_df.isna().sum()

datetime      0
season        0
holiday       0
workingday    0
weather       0
temp          0
atemp         0
humidity      0
windspeed     0
dtype: int64

#### 범주형, 날짜 등 주요 컬럼 분포 확인

In [28]:
# For each categorical column, print its distinct values and their frequencies
categorical_columns = ('season', 'holiday', 'workingday', 'weather')
for col in categorical_columns:
    column_values = train_df[col]
    distinct_values = column_values.unique()
    frequencies = column_values.value_counts()
    print(f"\n[{col} unique]:", distinct_values)
    print(f"[{col} value counts]:\n", frequencies)


[season unique]: [1 2 3 4]
[season value counts]:
 season
4    2734
2    2733
3    2733
1    2686
Name: count, dtype: int64

[holiday unique]: [0 1]
[holiday value counts]:
 holiday
0    10575
1      311
Name: count, dtype: int64

[workingday unique]: [0 1]
[workingday value counts]:
 workingday
1    7412
0    3474
Name: count, dtype: int64

[weather unique]: [1 2 3 4]
[weather value counts]:
 weather
1    7192
2    2834
3     859
4       1
Name: count, dtype: int64


## 2. 데이터 전처리

#### 날짜/시간 파생 정보 생성

In [29]:
# Calendar components derived from the timestamp, in the order the columns are added
date_parts = ('year', 'month', 'day', 'hour', 'weekday')

for df in (train_df, test_df):
    # Parse the raw 'datetime' strings into real timestamps (in place on each frame)
    df['datetime'] = pd.to_datetime(df['datetime'])
    stamps = df['datetime'].dt
    # Add one integer column per calendar component
    for part in date_parts:
        df[part] = getattr(stamps, part)

#### 불필요한 컬럼제거

- train: 'casual', 'registered', 'datetime' 제거
- test: 'datetime' 제거

이유:

1. 'casual', 'registered'는 train 데이터에만 존재하고 test 데이터에는 아예 없음 → 예측 시 사용 불가
2. train에서 casual, registered를 feature로 사용해 학습하면 모델이 test 데이터에 없는 정보에 의존하게 되어, test set 예측 시 에러가 발생함
3. 또한 count = casual + registered 이므로(예: 첫 행 3 + 13 = 16) 타깃 값이 feature에 그대로 노출되는 "데이터 누수(data leakage)" 문제가 생김
4. 'datetime'은 위에서 연/월/일/시간/요일 컬럼으로 분해했으므로 원본 컬럼은 제거함

결론: 실제 예측에 쓸 수 없는 정보이므로 반드시 제외해야 함

In [30]:
# Columns that must not be used as model inputs
drop_cols = ['casual', 'registered', 'datetime']

# Training features: everything except the excluded columns and the target
X = train_df.drop(columns=drop_cols + ['count'])
# Training target: the rental count
y = train_df['count']
# Test features: the test frame without the raw timestamp
X_test = test_df.drop(columns=['datetime'])

#### 범주형 변수 인코딩

In [31]:
# Label encoding (preferred for tree-based models / few categories)
from sklearn.preprocessing import LabelEncoder

cat_cols = ['season', 'weather', 'holiday', 'workingday', 'month', 'hour', 'weekday']
for col in cat_cols:
    # Learn the category -> integer mapping from the training column only
    le = LabelEncoder().fit(X[col])
    X[col] = le.transform(X[col])
    # Reuse the same mapping for the test column; the categories must match
    # those seen in training
    X_test[col] = le.transform(X_test[col])

#### 타겟 로그 변환

- 타깃(count) 분포의 오른쪽으로 긴 꼬리가 줄어들어 정규분포(가우시안)에 더 가까워짐

In [32]:
# Log-transform the target: log(1 + y), inverted later with np.expm1
y_log = y.pipe(np.log1p)

## 3. 학습/검증 데이터 분리 (train 데이터를 8:2로 분할)

In [33]:
from sklearn.model_selection import train_test_split

# The split below reuses the name `X_test`, which up to this point holds the
# encoded features of test.csv. Keep those reachable under a separate name,
# otherwise they are lost and no predictions can be made for the real test set.
# The length guard keeps the cell re-runnable: on a second run `X_test` is
# already the hold-out split and must not overwrite the preserved frame.
if len(X_test) == len(test_df):
    X_test_unlabeled = X_test.copy()

# 80/20 split of the labelled training data; the 20% part is the hold-out set
# that the evaluation cells refer to as X_test / y_test_log.
X_train, X_test, y_train_log, y_test_log = train_test_split(
    X, y_log, test_size=0.2, random_state=42
)

## 4. 모델 선택 및 학습

In [39]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error

# `models` was not defined in any remaining cell, so this cell raised NameError
# on a fresh kernel. Define and fit the two models named in the recorded output.
# NOTE(review): the original hyperparameters are unknown; library defaults with a
# fixed seed are used, so the scores may differ from the previously recorded ones.
models = {
    'RandomForest': RandomForestRegressor(random_state=42, n_jobs=-1),
    'GradientBoosting': GradientBoostingRegressor(random_state=42),
}
for name, model in models.items():
    # Models are trained on the log1p-transformed target
    model.fit(X_train.values, y_train_log.values.ravel())

# Hold-out target on the original scale (invert log1p); identical for every model
y_true = np.expm1(y_test_log.values)

for name, model in models.items():
    # Predict on the log scale
    pred_log = model.predict(X_test.values)
    # Convert back to the original scale
    pred = np.expm1(pred_log)
    # RMSE on the original scale
    rmse = np.sqrt(mean_squared_error(y_true, pred))
    print(f'{name} RMSE: {rmse:.5f}')

RandomForest RMSE: 40.19271
GradientBoosting RMSE: 75.78957


In [38]:
from sklearn.metrics import r2_score

# Hold-out features as a plain array, shared by every model
holdout_features = X_test.values

# R² on the original scale for each entry of the `models` dict
for name, model in models.items():
    pred = np.expm1(model.predict(holdout_features))
    y_true = np.expm1(y_test_log.values)
    r2 = r2_score(y_true, pred)
    print(f'{name} R2 Score: {r2:.5f}')

RandomForest R2 Score: 0.95106
GradientBoosting R2 Score: 0.82597


In [40]:
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error

# Plain arrays shared by both models
train_features = X_train.values
train_target_log = y_train_log.values.ravel()
holdout_features = X_test.values

# Hold-out target on the original scale (invert log1p)
y_true = np.expm1(y_test_log.values)

# XGBoost: fit on the log target, predict, invert, score
xgb = XGBRegressor(random_state=42, n_jobs=-1)
xgb.fit(train_features, train_target_log)
xgb_pred_log = xgb.predict(holdout_features)
xgb_pred = np.expm1(xgb_pred_log)
xgb_rmse = np.sqrt(mean_squared_error(y_true, xgb_pred))
print(f'XGBoost RMSE: {xgb_rmse:.5f}')

# LightGBM: same procedure
lgbm = LGBMRegressor(random_state=42, n_jobs=-1)
lgbm.fit(train_features, train_target_log)
lgbm_pred_log = lgbm.predict(holdout_features)
lgbm_pred = np.expm1(lgbm_pred_log)
lgbm_rmse = np.sqrt(mean_squared_error(y_true, lgbm_pred))
print(f'LightGBM RMSE: {lgbm_rmse:.5f}')

XGBoost RMSE: 38.98941
LightGBM RMSE: 41.08071


In [41]:
from sklearn.metrics import r2_score

# Hold-out inputs and target (original scale) shared by both models
holdout_features = X_test.values
y_true = np.expm1(y_test_log.values)

# Log-scale predictions from the already fitted XGBoost and LightGBM models
xgb_pred_log, lgbm_pred_log = (est.predict(holdout_features) for est in (xgb, lgbm))

# Back to the original scale
xgb_pred = np.expm1(xgb_pred_log)
lgbm_pred = np.expm1(lgbm_pred_log)

# R² for each model
xgb_r2 = r2_score(y_true, xgb_pred)
print(f'XGBoost R²: {xgb_r2:.5f}')
lgbm_r2 = r2_score(y_true, lgbm_pred)
print(f'LightGBM R²: {lgbm_r2:.5f}')

XGBoost R²: 0.95394
LightGBM R²: 0.94887


**TODO:** 다음 단계 — 하이퍼파라미터 튜닝 (예: GridSearchCV)