### 1. 범주형 데이터

In [1]:
# 데이터 가져오기"
import pandas as pd
mydata = pd.read_csv('https://stats.idre.ucla.edu/stat/data/binary.csv')
mydata

Unnamed: 0,admit,gre,gpa,rank
0,0,380,3.61,3
1,1,660,3.67,3
2,1,800,4.00,1
3,1,640,3.19,4
4,0,520,2.93,4
...,...,...,...,...
395,0,620,4.00,2
396,0,560,3.04,3
397,0,460,2.63,2
398,0,700,3.65,2


In [2]:
# 데이터 분할
X = mydata.iloc[:, 1:]
y = mydata.iloc[:, 0]

In [3]:
# 데이터 표준화
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_scale = scaler.fit_transform(X)

In [4]:
import multiprocessing
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV
X_train, X_test, y_train, y_test = train_test_split(X_scale, y, test_size = 0.25)  # 평가데이터 25%
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(300, 3) (100, 3) (300,) (100,)


In [5]:
# bagging & cross-validation
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import confusion_matrix

model_bag = BaggingClassifier(n_estimators = 500, max_samples = 100, bootstrap = True)
model_bag.fit(X_train, y_train)

# 5-fold cross validation
cross_validate(
    estimator = model_bag,
    X = X, y = y,
    cv = 5,
    n_jobs = multiprocessing.cpu_count(),
    verbose = True
)

bag_y_pred = model_bag.predict(X_test)

bag_confmat = confusion_matrix(y_test, bag_y_pred)

[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   5 out of   5 | elapsed:   10.0s finished


In [6]:
# gradient boosting & cross-validation
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import confusion_matrix

model_gbm = GradientBoostingClassifier(n_estimators = 500, max_depth = 2)
model_gbm.fit(X_train, y_train)

# 5-fold cross validation
cross_validate(
    estimator = model_gbm,
    X = X, y = y,
    cv = 5,
    n_jobs = multiprocessing.cpu_count(),
    verbose = True
)
gbm_y_pred = model_gbm.predict(X_test)

gbm_confmat = confusion_matrix(y_test, gbm_y_pred)

[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   5 out of   5 | elapsed:    3.3s finished


In [7]:
# adaboost & cross-validation
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import confusion_matrix

model_ada = AdaBoostClassifier(DecisionTreeClassifier(max_depth = 2), n_estimators = 500)
model_ada.fit(X_train, y_train)

# 5-fold cross validation
cross_validate(
    estimator = model_ada,
    X = X, y = y,
    cv = 5,
    n_jobs = multiprocessing.cpu_count(),
    verbose = True
)
ada_y_pred = model_ada.predict(X_test)

ada_confmat = confusion_matrix(y_test, ada_y_pred)

[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   5 out of   5 | elapsed:    7.5s finished


In [8]:
# random forest & cross-validation
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix

model_rf = RandomForestClassifier(n_estimators = 500)
model_rf.fit(X_train, y_train)

# 5-fold cross validation
cross_validate(
    estimator = model_rf,
    X = X, y = y,
    cv = 5,
    n_jobs = multiprocessing.cpu_count(),
    verbose = True
)
rf_y_pred = model_rf.predict(X_test)

rf_confmat = confusion_matrix(y_test, rf_y_pred)

[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   5 out of   5 | elapsed:    6.4s finished


In [9]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

print("bag accuracy : {}".format(accuracy_score(y_test, bag_y_pred)))
print("gbm accuracy : {}".format(accuracy_score(y_test, gbm_y_pred)))
print("ada accuracy : {}".format(accuracy_score(y_test, ada_y_pred)))
print("rf accuracy : {}".format(accuracy_score(y_test, rf_y_pred)))

print("bag confmat: \n{}".format(bag_confmat))
print("gbm confmat: \n{}".format(gbm_confmat))
print("ada confmat: \n{}".format(ada_confmat))
print("rf confmat: \n{}".format(rf_confmat))

bag accuracy : 0.71
gbm accuracy : 0.68
ada accuracy : 0.68
rf accuracy : 0.71
bag confmat: 
[[60  8]
 [21 11]]
gbm confmat: 
[[56 12]
 [20 12]]
ada confmat: 
[[53 15]
 [17 15]]
rf confmat: 
[[58 10]
 [19 13]]


### 2. 연속형 데이터

In [10]:
# 데이터 가져오기
import numpy as np
import pandas as pd
regdata = pd.read_csv('https://stats.idre.ucla.edu/wp-content/uploads/2019/02/elemapi2v2.csv')
regdata

Unnamed: 0,snum,dnum,api00,api99,growth,meals,ell,yr_rnd,mobility,acs_k3,...,col_grad,grad_sch,avg_ed,full,emer,enroll,mealcat,collcat,abv_hsg,lgenroll
1,906,41,693,600,93,67,9,0,11.0,16.0,...,0,0,,76,24,247,2,1,100,2.392697
2,889,41,570,501,69,92,21,0,33.0,15.0,...,0,0,,79,19,463,3,1,100,2.665581
3,887,41,546,472,74,97,29,0,36.0,17.0,...,0,0,,68,29,395,3,1,100,2.596597
4,876,41,571,487,84,90,27,0,27.0,20.0,...,9,0,1.91,87,11,418,3,1,64,2.621176
5,888,41,478,425,53,89,30,0,44.0,18.0,...,0,0,1.50,87,13,520,3,1,50,2.716003
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
396,482,796,745,736,9,27,12,0,11.0,18.0,...,37,7,3.12,95,5,266,1,2,95,2.424882
397,489,796,720,678,42,34,8,0,20.0,19.0,...,26,6,2.88,85,7,461,1,3,92,2.663701
398,504,796,802,787,15,26,10,0,21.0,19.0,...,37,12,3.29,91,5,360,1,2,97,2.556303
399,488,796,539,424,115,98,12,0,18.0,20.0,...,1,0,2.06,93,7,301,3,1,90,2.478566


In [11]:
# 필요한 데이터 외에는 삭제
regdata = regdata.loc[:, ["api00", "enroll", "full", "col_grad", "emer"]]
regdata

Unnamed: 0,api00,enroll,full,col_grad,emer
1,693,247,76,0,24
2,570,463,79,0,19
3,546,395,68,0,29
4,571,418,87,9,11
5,478,520,87,0,13
...,...,...,...,...,...
396,745,266,95,37,5
397,720,461,85,26,7
398,802,360,91,37,5
399,539,301,93,1,7


In [12]:
# 데이터 분할
X = regdata.iloc[:, 1:]
y = regdata.iloc[:, 0]

In [13]:
# 데이터 표준화
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_scale = scaler.fit_transform(X)

In [14]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25)  # 평가데이터 25%
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(300, 4) (100, 4) (300,) (100,)


In [15]:
# bagging & cross-validation
from sklearn.ensemble import BaggingRegressor

model_bagr = BaggingRegressor(n_estimators = 500, max_samples = 100, bootstrap = True)
model_bagr.fit(X_train, y_train)

# 5-fold cross validation
cross_validate(
    estimator = model_bagr,
    X = X, y = y,
    cv = 5,
    n_jobs = multiprocessing.cpu_count(),
    verbose = True
)
bagr_y_pred = model_bagr.predict(X_test)

[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   5 out of   5 | elapsed:    3.6s finished


In [16]:
# gradient boosting & cross-validation
from sklearn.ensemble import GradientBoostingRegressor

model_gbmr = GradientBoostingRegressor(n_estimators = 500, max_depth = 2)
model_gbmr.fit(X_train, y_train)

# 5-fold cross validation
cross_validate(
    estimator = model_gbmr,
    X = X, y = y,
    cv = 5,
    n_jobs = multiprocessing.cpu_count(),
    verbose = True
)
gbmr_y_pred = model_gbmr.predict(X_test)

[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   5 out of   5 | elapsed:    1.0s finished


In [17]:
# adaboost & cross-validation
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier

model_adar = AdaBoostClassifier(DecisionTreeClassifier(max_depth = 2), n_estimators = 500)
model_adar.fit(X_train, y_train)

# 5-fold cross validation
cross_validate(
    estimator = model_adar,
    X = X, y = y,
    cv = 5,
    n_jobs = multiprocessing.cpu_count(),
    verbose = True
)
adar_y_pred = model_adar.predict(X_test)

[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   5 out of   5 | elapsed:   12.8s finished


In [18]:
# random forest & cross-validation
from sklearn.ensemble import RandomForestRegressor

model_rfr = RandomForestClassifier(n_estimators = 500)
model_rfr.fit(X_train, y_train)

# 5-fold cross validation
cross_validate(
    estimator = model_rfr,
    X = X, y = y,
    cv = 5,
    n_jobs = multiprocessing.cpu_count(),
    verbose = True
)
rfr_y_pred = model_rfr.predict(X_test)

[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   5 out of   5 | elapsed:   10.6s finished


In [19]:
# RMSE
from sklearn.metrics import mean_squared_error
print("bag RMSE : {}".format(np.sqrt(mean_squared_error(y_test, bagr_y_pred))))
print("gbm RMSE : {}".format(np.sqrt(mean_squared_error(y_test, gbmr_y_pred))))
print("ada RMSE : {}".format(np.sqrt(mean_squared_error(y_test, adar_y_pred))))
print("rf RMSE : {}".format(np.sqrt(mean_squared_error(y_test, rfr_y_pred))))

bag RMSE : 75.95855817430977
gbm RMSE : 84.03933153642593
ada RMSE : 143.92758595905096
rf RMSE : 106.93741160136615


In [20]:
# MAE
from sklearn.metrics import mean_absolute_error
print("bag MAE : {}".format(mean_absolute_error(y_test, bagr_y_pred)))
print("gbm MAE : {}".format(mean_absolute_error(y_test, gbmr_y_pred)))
print("ada MAE : {}".format(mean_absolute_error(y_test, adar_y_pred)))
print("rf MAE : {}".format(mean_absolute_error(y_test, rfr_y_pred)))

bag MAE : 57.263239999999996
gbm MAE : 64.69063571783472
ada MAE : 113.85
rf MAE : 80.23


In [21]:
# MPE
def MPE(y_test, y_pred):
	return np.mean((y_test - y_pred) / y_test) * 100 
    
print("bag MPE : {}".format(MPE(y_test, bagr_y_pred)))
print("gbm MPE : {}".format(MPE(y_test, gbmr_y_pred)))
print("ada MPE : {}".format(MPE(y_test, adar_y_pred)))
print("rf MPE : {}".format(MPE(y_test, rfr_y_pred)))

bag MPE : -1.6423529841494657
gbm MPE : -0.01675337772971681
ada MPE : 10.039872150646753
rf MPE : -2.2710397312464554


In [22]:
# MAPE
def MAPE(y_test, y_pred):
	return np.mean(np.abs((y_test - y_pred) / y_test)) * 100 
    
print("bag MAPE : {}".format(MAPE(y_test, bagr_y_pred)))
print("gbm MAPE : {}".format(MAPE(y_test, gbmr_y_pred)))
print("ada MAPE : {}".format(MAPE(y_test, adar_y_pred)))
print("rf MAPE : {}".format(MAPE(y_test, rfr_y_pred)))

bag MAPE : 9.327557699132495
gbm MAPE : 10.391253484575385
ada MAPE : 16.622095934501072
rf MAPE : 13.06339480151376
