In [4]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [5]:
data = pd.read_csv('./data/otto_train.csv')
data.head()

Unnamed: 0,id,feat_1,feat_2,feat_3,feat_4,feat_5,feat_6,feat_7,feat_8,feat_9,...,feat_85,feat_86,feat_87,feat_88,feat_89,feat_90,feat_91,feat_92,feat_93,target
0,1,1,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,Class_1
1,2,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,Class_1
2,3,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,Class_1
3,4,1,0,0,1,6,1,5,0,0,...,0,1,2,0,0,0,0,0,0,Class_1
4,5,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,1,0,0,0,Class_1


In [6]:
"""
id: 고유 아이디
feat_1 ~ 93 : 설명변수
target : 타겟변수 (Class 1-9)
"""

'\nid: 고유 아이디\nfeat_1 ~ 93 : 설명변수\ntarget : 타겟변수 (Class 1-9)\n'

In [7]:
nCar = data.shape[0]
nVar = data.shape[1]
print(nCar)
print(nVar)

61878
95


### 무의하다고 판단되는 변수 제거

In [8]:
data = data.drop(['id'], axis=1)

### 타겟 변수의 문자열을 숫자로 변환

In [9]:
mapping_dict = {
    'Class_1':1,
    'Class_2':2, 
    'Class_3':3, 
    'Class_4':4,
    'Class_5':5,
    'Class_6':6, 
    'Class_7':7, 
    'Class_8':8,
    'Class_9':9
}

In [10]:
after_mapping_target = data['target'].apply(lambda x: mapping_dict[x])

### 설명변수와 타겟변수 분리, 학습/평가데이터 분리

In [11]:
feature_columns = list(data.columns.difference(['target']))
X = data[feature_columns]
y = after_mapping_target

train_x, test_x, train_y, test_y = train_test_split(X, y, test_size=0.2, random_state=42)
print(train_x.shape, test_x.shape, train_y.shape, test_y.shape)

(49502, 93) (12376, 93) (49502,) (12376,)


### XGBoost 모형 적합 후 검증

In [2]:
!pip install xgboost

Collecting xgboost
  Downloading xgboost-1.2.1-py3-none-win_amd64.whl (86.5 MB)
Installing collected packages: xgboost
Successfully installed xgboost-1.2.1


In [17]:
from sklearn.metrics import accuracy_score
import xgboost as xgb
import time

start = time.time() # 처리시간 측정

# 학습/평가데이터를 XGBoost 모델에 맞게 변환
xgb_dtrain = xgb.DMatrix(data = train_x, label = train_y)
xgb_dtest = xgb.DMatrix(data = test_x)

xgb_param = {'max_depth': 10, 
             'learning_rate': 0.01,
             'n_estimators': 100, 
             'objective': 'multi:softmax',
             'num_class': len(set(train_y))+1}

In [19]:
xgb_model = xgb.train(params = xgb_param, dtrain = xgb_dtrain)
xgb_model_predict = xgb_model.predict(xgb_dtest)

print("Accuracy: %.2f" % (accuracy_score(test_y, xgb_model_predict) * 100), "%")
print("Time: %.2f" % (time.time() - start), "seconds")

Parameters: { n_estimators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Accuracy: 76.67 %
Time: 13.55 seconds


In [20]:
xgb_model_predict

array([5., 3., 6., ..., 9., 2., 7.], dtype=float32)

### LightGBM

In [21]:
!pip install lightgbm

Collecting lightgbm
  Downloading lightgbm-3.0.0-py2.py3-none-win_amd64.whl (737 kB)
Installing collected packages: lightgbm
Successfully installed lightgbm-3.0.0


In [24]:
from sklearn.metrics import accuracy_score
import lightgbm as lgb
import time

start = time.time() # 처리시간 측정

# 학습/평가데이터를 XGBoost 모델에 맞게 변환
lgb_dtrain = lgb.Dataset(data = train_x, label = train_y)

lgb_param = {'max_depth': 10, 
             'learning_rate': 0.01,
             'n_estimators': 100, 
             'objective': 'multiclass',
             'num_class': len(set(train_y))+1}

In [25]:
lgb_model = lgb.train(params = lgb_param, train_set = lgb_dtrain)
lgb_model_predict = np.argmax(lgb_model.predict(test_x), axis=1)

print("Accuracy: %.2f" % (accuracy_score(test_y, lgb_model_predict) * 100), "%")
print("Time: %.2f" % (time.time() - start), "seconds")



You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3110
[LightGBM] [Info] Number of data points in the train set: 49502, number of used features: 93
[LightGBM] [Info] Start training from score -34.538776
[LightGBM] [Info] Start training from score -3.476745
[LightGBM] [Info] Start training from score -1.341381
[LightGBM] [Info] Start training from score -2.039019
[LightGBM] [Info] Start training from score -3.135151
[LightGBM] [Info] Start training from score -3.125444
[LightGBM] [Info] Start training from score -1.481556
[LightGBM] [Info] Start training from score -3.074772
[LightGBM] [Info] Start training from score -1.986562
[LightGBM] [Info] Start training from score -2.533374
Accuracy: 76.28 %
Time: 6.36 seconds


In [26]:
lgb_model_predict

array([2, 7, 6, ..., 9, 2, 7], dtype=int64)

### LightGBM 이용해서 집값 예측해보기

In [38]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import lightgbm as lgb
import time
from sklearn.metrics import mean_squared_error, r2_score
from math import sqrt

In [39]:
# 데이터 불러오기
data = pd.read_csv('./data/kc_house_data.csv')
data = data.drop(['id', 'date', 'zipcode', 'lat', 'long'], axis=1)
data.head()

Unnamed: 0,price,bedrooms,bathrooms,floors,waterfront,condition,grade,yr_built,yr_renovated
0,221900.0,3,1.0,1.0,0,3,7,1955,0
1,538000.0,3,2.25,2.0,0,3,7,1951,1991
2,180000.0,2,1.0,1.0,0,3,6,1933,0
3,604000.0,4,3.0,1.0,0,5,7,1965,0
4,510000.0,3,2.0,1.0,0,3,8,1987,0


In [40]:
feature_columns = list(data.columns.difference(['price']))

X = data[feature_columns]
y = data['price'] # -> Regression 문제

train_x, test_x, train_y, test_y = train_test_split(X, y, test_size=0.3, random_state=42) # 7:3 으로 학습/평가데이터 분리
print(train_x.shape, test_x.shape, train_y.shape, test_y.shape) #3 데이터 차원 확인

(15129, 8) (6484, 8) (15129,) (6484,)


In [49]:
start = time.time() # 처리시간 측정

# 학습/평가데이터를 XGBoost 모델에 맞게 변환
lgb_dtrain = lgb.Dataset(data = train_x, label = train_y)

lgb_param = {'max_depth': 10, 
             'learning_rate': 0.01,
             'n_estimators': 100, 
             'objective': 'regression'} #  Regression 문제이므로 목적함수 변경

lgb_model = lgb.train(params = lgb_param, train_set = lgb_dtrain)

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 237
[LightGBM] [Info] Number of data points in the train set: 15129, number of used features: 8
[LightGBM] [Info] Start training from score 537729.263666




In [50]:
lgb_model_predict = lgb_model.predict(test_x)

In [51]:
lgb_model_predict = lgb_model.predict(test_x)

print("MSE: %.2f" % sqrt(mean_squared_error(lgb_model_predict, test_y))) # RMSE 값
print("Time: %.2f" % (time.time() - start), "seconds")

MSE: 249111.20
Time: 0.73 seconds


In [None]:
"""
=> Bagging 으로 집값 예측한 결과와 비교해보면 LightBGM 으로 예측한 결과가 훨씬 성능이 좋은것을 확인할 수 있음
""" 