In [1]:
# Mount Google Drive so the dataset stored under MyDrive can be read from /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [33]:
# !pip install catboost
# !pip install pycaret
# !pip install optuna # hyperparameter tuning

In [3]:
# for "2. Data Loading"
import pandas as pd

# for "3-1. Feature Generation"
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler

# for "4. Modeling with Pycaret"
from pycaret.regression import *

# for "5. Modeling with CatBoostRegressor"
from catboost import CatBoostRegressor
import optuna
from optuna import Trial
from optuna.samplers import TPESampler
from sklearn.metrics import mean_absolute_error

## 데이터 전처리

In [4]:
# Location of the per-district infrastructure / youth-population table on the mounted Drive.
path = '/content/drive/MyDrive/데청캠 프로젝트/dataset/시군구별_인프라수_청년수.csv'

# Load the cp949-encoded CSV, drop the leftover index column, turn the
# comma-formatted youth-population strings into floats, then remove three rows.
# NOTE(review): why row labels 22, 0 and 1 are excluded is not visible here — confirm.
df = (
    pd.read_csv(path, encoding='cp949')
    .drop(columns=['Unnamed: 0'])
    .assign(**{
        '청년인구': lambda frame: frame['청년인구'].str.replace(",", "").astype('float64')
    })
    .drop(index=[22, 0, 1])
)

In [5]:
# Inspect the column labels available after loading.
df.columns

Index(['시군구', '청년인구', '병원', '사회스포츠', '대학교', '초중고', '공항', '소방서', '경찰서',
       '소형상업시설(음식점_카페_학원_편의점)', '대형상업시설', '숙박시설', '기차역', '행정기관', '법원, 교도소',
       '문화시설_영화/공연/미술관박물관', '버스터미널'],
      dtype='object')

In [6]:
# Predictor columns, spelled exactly as they appear in the raw table.
need_list = ['경찰서', '법원, 교도소', '병원', '숙박시설', '초중고']

# The same columns in the same order with the ", " separator removed; these are
# the labels used once the modeling frame has been renamed.
need_list2 = [column.replace(', ', '') for column in need_list]

In [7]:
# Target: count of small commercial facilities; predictors: the columns in need_list.
y = df['소형상업시설(음식점_카페_학원_편의점)']
X = df[need_list]

# Modeling frame with the target as the first column, followed by the predictors.
df_data = pd.concat([y, X], axis='columns')

In [8]:
# Standardize the predictor columns (zero mean, unit variance).
# NOTE(review): the scaler is fitted on every row before any cross-validation
# split is made, so fold statistics leak into the validation folds — confirm
# this is acceptable for the reported CV scores.
std_scaler = StandardScaler()

# Assign with plain column indexing instead of `.loc[:, cols] = ...`.
# `.loc` tries to write into the existing columns in place; if those count
# columns are integer-typed (presumably — confirm), storing the float z-scores
# relies on an implicit upcast that recent pandas warns about and plans to
# reject. Column assignment simply replaces the columns with the float result.
df_data[need_list] = std_scaler.fit_transform(df_data[need_list])

# Rename to short labels: the target first, then the comma-free predictor
# names kept in need_list2 (same order as need_list).
df_data.columns = ['소형상업'] + need_list2

X = df_data[need_list2]
y = df_data['소형상업']

## pycaret

In [9]:
# Every column except the target is passed to pycaret as a numeric feature
# (author's note: all variables are meaningful as numbers).
feature_columns = df_data.columns.drop('소형상업').tolist()

reg = setup(
    df_data,
    # Author's note: with True, pycaret runs its own extra feature engineering,
    # which makes predicting on the raw frame impossible.
    preprocess=False,
    # Author's note: the goal is to train on essentially all of the data, hence 0.999.
    train_size=0.999,
    target='소형상업',
    # Author's note: avoids having to press Enter at pycaret's prompt.
    silent=True,
    # Author's note: enable if a GPU is available (speeds up CatBoost).
    use_gpu=False,
    numeric_features=feature_columns,
    session_id=2021,
    fold_shuffle=True,
)

Unnamed: 0,Description,Value
0,session_id,2021
1,Target,소형상업
2,Original Data,"(72, 6)"
3,Missing Values,False
4,Numeric Features,5
5,Categorical Features,0
6,Transformed Train Set,"(71, 5)"
7,Transformed Test Set,"(1, 5)"
8,Shuffle Train-Test,True
9,Stratify Train-Test,False


In [10]:
# Rank pycaret's candidate regressors by MAE and keep the five best as a list.
top5 = compare_models(n_select = 5, sort = 'MAE')

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
et,Extra Trees Regressor,608.8419,623032.2,757.015,0.7646,0.1851,0.1546,0.365
huber,Huber Regressor,613.3929,682166.2,785.2898,0.7157,0.209,0.1787,0.02
ridge,Ridge Regression,615.689,693215.5,794.532,0.7149,0.2079,0.1768,0.013
br,Bayesian Ridge,615.8554,695602.9,795.9322,0.7143,0.2085,0.177,0.014
lasso,Lasso Regression,616.4326,691066.3,792.7912,0.7136,0.2066,0.1764,0.014
lar,Least Angle Regression,616.4484,691399.0,792.9321,0.7134,0.2067,0.1765,0.013
lr,Linear Regression,616.4485,691399.1,792.9321,0.7134,0.2067,0.1765,0.294
llar,Lasso Least Angle Regression,616.6437,690754.6,792.9357,0.7142,0.2064,0.1758,0.015
ada,AdaBoost Regressor,648.1806,829228.2,841.6489,0.6939,0.2078,0.1667,0.078
par,Passive Aggressive Regressor,648.2456,764658.7,827.1824,0.6893,0.2208,0.1893,0.015


In [11]:
# Tune each of the five selected models for MAE (10 search iterations each);
# choose_better=True asks pycaret to keep whichever of the original / tuned
# model scores better.
models = [
    tune_model(candidate, optimize='MAE', choose_better=True, n_iter=10)
    for candidate in top5
]

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,876.2598,1304785.0,1142.272,0.2794,0.2207,0.1756
1,531.7742,388742.3,623.492,0.9093,0.1295,0.1111
2,571.2045,516827.6,718.9072,0.7942,0.1267,0.1066
3,582.5824,650623.1,806.6121,0.6951,0.1031,0.0822
4,555.3464,421378.0,649.1364,0.9422,0.2427,0.1923
5,354.9653,172742.4,415.6229,0.9192,0.1357,0.111
6,486.615,320708.7,566.3115,0.4538,0.1461,0.1302
7,862.5338,1525651.0,1235.1724,0.6315,0.1812,0.1279
8,652.3373,813568.1,901.9801,0.7179,0.5229,0.5042
9,589.8383,614234.9,783.7314,0.8822,0.245,0.1851


In [12]:
# 1) Blend the tuned models into a single ensemble, selected on MAE.
blended = blend_models(models, optimize='MAE')

# 2) Tune the ensemble itself for MAE (10 iterations), keeping the better of
#    the tuned / untuned versions.
tuned_blend = tune_model(blended, n_iter=10, optimize='MAE', choose_better=True)

# 3) Finalize the ensemble for prediction.
voting = finalize_model(tuned_blend)

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,842.2175,1206854.0,1098.569,0.3334,0.2122,0.1675
1,515.7951,344858.9,587.2468,0.9196,0.1174,0.1024
2,559.312,407026.1,637.9859,0.8379,0.1144,0.1042
3,481.5217,393808.8,627.5418,0.8154,0.0851,0.0704
4,563.5134,421674.3,649.3645,0.9422,0.2339,0.1934
5,353.6466,176333.9,419.9213,0.9175,0.1229,0.1034
6,468.5659,292925.6,541.226,0.5011,0.1416,0.1258
7,818.5914,1429104.0,1195.4513,0.6548,0.1737,0.119
8,627.1063,727913.7,853.1786,0.7476,0.4901,0.4509
9,557.2065,679160.4,824.1119,0.8697,0.2649,0.1967


In [13]:
# Show the per-model weights of the finalized blended estimator.
voting.weights

[0.76, 0.48000000000000004, 0.47000000000000003, 0.53, 0.21000000000000002]

In [14]:
# Predict on the very rows the model was built from (all columns but the target),
# then round to whole numbers since the target is a facility count.
features = df_data.drop(columns=['소형상업'])
layer1_pred = voting.predict(features)
pred = np.round(layer1_pred, 0)
pred

array([3930., 4312., 5040., 5085., 4732., 5478., 4051., 3698., 6714.,
       5975., 4285., 6854., 5456., 7702., 5523., 3600., 6766., 4712.,
       6135., 8739., 9226., 6540., 1958., 1984., 2202., 2110., 7241.,
       4805., 4111., 4195., 7220., 4761., 3996., 2721., 3713., 3490.,
       3432., 3284., 3843., 5924., 3268., 2813., 6830., 7521., 8998.,
       4151., 3667., 1449., 5749., 5292., 7422., 6940., 4197., 6940.,
       2915., 2015., 3205., 6338., 4046., 8071., 7020., 4268., 4839.,
       9064., 6998., 3401., 3613., 7095., 2945., 3519., 4588., 5943.])

In [15]:
# Independent NumPy copy of the target values, used for the hand-computed errors below.
y_set = y.to_numpy(copy=True)
y_set

array([5056, 4562, 5609, 4902, 4410, 5371, 3806, 3429, 5218, 5616, 4739,
       8638, 4716, 7638, 5565, 3845, 7384, 4143, 6461, 8169, 8807, 6343,
       2058, 1663, 1921, 1906, 6963, 4488, 4163, 3471, 7325, 4644, 3960,
       2951, 3591, 3796, 3051, 3563, 4209, 6397, 3395, 3078, 7442, 7919,
       8913, 4055, 4362,  990, 5748, 5676, 7520, 6709, 3803, 7403, 1998,
        733, 3261, 6949, 3509, 7582, 7557, 4039, 4330, 9410, 7758, 3603,
       3757, 7460, 3107, 3492, 4586, 5922])

In [16]:
# Score the finalized ensemble on X / y.
# NOTE(review): X and y are the same rows that were handed to pycaret's setup,
# so this is presumably an in-sample score rather than a generalization estimate.
voting.score(X,y)

0.9386304491313779

In [17]:
# Mean squared error between the actual targets and the rounded ensemble predictions.
squared_errors = np.square(y_set - pred)
MSE = squared_errors.sum() / len(y_set)
MSE

252488.11111111112

In [18]:
# Root of the MSE above, i.e. the error expressed in the target's own units.
RMSE = np.sqrt(MSE)
RMSE

502.48195103019486

## 교차검증을 사용한 모델평가

In [19]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of the ensemble configuration. cross_val_score clones
# the estimator and refits the clone on each training fold, so the already
# fitted `voting` object itself is not what gets scored.
# With an integer cv and a regressor the folds are consecutive, unshuffled blocks.
scores = cross_val_score(voting, X, y, scoring="neg_mean_squared_error", cv=5)

# The scorer reports negated MSE; flip the sign and take the root for per-fold RMSE.
rmse_scores = np.sqrt(np.negative(scores))

In [20]:
# Average RMSE across the cross-validation folds.
rmse_scores.mean()

762.483862455383

In [21]:
def display_scores(scores):
    """Print the per-fold scores followed by their mean and standard deviation.

    Args:
        scores: object exposing ``mean()`` and ``std()`` (e.g. a NumPy array
            of per-fold scores).

    Returns:
        None. The three lines are written to standard output.
    """
    report = (
        ("점수:", scores),
        ("평균:", scores.mean()),
        ("표준 편차:", scores.std()),
    )
    for label, value in report:
        print(label, value)

In [22]:
# Print the per-fold RMSE values with their mean and standard deviation.
display_scores(rmse_scores)

점수: [1115.98334466  621.261761    460.02375274 1057.21171641  557.93873746]
평균: 762.483862455383
표준 편차: 270.21876318477007


## **Bayesian Ridge**

In [23]:
from sklearn.linear_model import BayesianRidge, LinearRegression

In [24]:
# Fit a stand-alone Bayesian ridge regressor on the same X / y for comparison
# with the pycaret ensemble. (`clf` holds a regressor despite the name.)
clf = BayesianRidge(compute_score=True)
clf.fit(X, y)

BayesianRidge(alpha_1=1e-06, alpha_2=1e-06, alpha_init=None, compute_score=True,
              copy_X=True, fit_intercept=True, lambda_1=1e-06, lambda_2=1e-06,
              lambda_init=None, n_iter=300, normalize=False, tol=0.001,
              verbose=False)

In [25]:
# Predictions of the Bayesian ridge model on the rows it was fitted on.
# NOTE: this rebinds `pred`, which previously held the rounded ensemble predictions.
pred = clf.predict(X)

In [26]:
# Mean squared error of the Bayesian ridge predictions against the actual targets.
residuals = y_set - pred
MSE = (residuals ** 2).sum() / len(y_set)
MSE

530476.3538140272

In [27]:
# Root of the Bayesian ridge MSE above, in the target's own units.
RMSE = np.sqrt(MSE)
RMSE

728.3380765922012

In [28]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of the Bayesian ridge model. The estimator is cloned
# and refitted per fold, so the fitted `clf` itself is not what gets scored.
cv_options = {"scoring": "neg_mean_squared_error", "cv": 5}
scores = cross_val_score(clf, X, y, **cv_options)

# Negated MSE per fold -> RMSE per fold.
rmse_scores = np.sqrt(-1 * scores)

In [29]:
# Average cross-validated RMSE of the Bayesian ridge model.
rmse_scores.mean()

857.4901461630667

In [31]:
# Score of the Bayesian ridge model on the same X / y it was fitted on (in-sample).
clf.score(X,y)

0.8710591516062062

요약 — pycaret 블렌딩 모델: 학습 데이터 RMSE 약 502, 5-fold 교차검증 RMSE 약 762 / Bayesian Ridge: 학습 데이터 RMSE 약 728, 5-fold 교차검증 RMSE 약 857

In [34]:
  %%shell
 # Export this notebook to an HTML file.
 jupyter nbconvert --to html /content/소형상업시설예측.ipynb

[NbConvertApp] Converting notebook /content/소형상업시설예측.ipynb to html
[NbConvertApp] Writing 342727 bytes to /content/소형상업시설예측.html


