### 교차 검증 단순화
- scikit-learn의 model_selection 모듈이 제공하는 모델 검증 관련 기능 활용
- 교차 검증 데이터 기반의 검증 결과 처리

[1] 모듈 로딩 및 데이터 준비

In [9]:
# 모듈 로딩
import pandas as pd

# Load the fish and iris datasets from the sibling data directory
fishDF, irisDF = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/fish.csv', '../data/iris.csv')
)

In [10]:
# Display the fish DataFrame as the cell output
fishDF

Unnamed: 0,Species,Weight,Length,Diagonal,Height,Width
0,Bream,242.0,25.4,30.0,11.5200,4.0200
1,Bream,290.0,26.3,31.2,12.4800,4.3056
2,Bream,340.0,26.5,31.1,12.3778,4.6961
3,Bream,363.0,29.0,33.5,12.7300,4.4555
4,Bream,430.0,29.0,34.0,12.4440,5.1340
...,...,...,...,...,...,...
154,Smelt,12.2,12.2,13.4,2.0904,1.3936
155,Smelt,13.4,12.4,13.5,2.4300,1.2690
156,Smelt,12.2,13.0,13.8,2.2770,1.2558
157,Smelt,19.7,14.3,15.2,2.8728,2.0672


In [11]:
# Display the iris DataFrame as the cell output
irisDF

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica


[2] 데이터 준비 => 피쳐 & 타겟 분리

In [12]:
# Target: Weight (column position 1)
# Features: Length, Diagonal, Height, Width (column positions 2 onward)
fish_target = fishDF.iloc[:, 1]
fish_feature = fishDF.iloc[:, 2:]

In [13]:
# Target: column position 4; features: column positions 0 through 3
iris_target = irisDF.iloc[:, 4]
iris_feature = irisDF.iloc[:, :4]

[3] 데이터 전처리
- 학습용/테스트용 데이터셋 분리 및 피쳐 스케일링

In [14]:
# 학습용 테스트용 데이터 분리
from sklearn.model_selection import train_test_split

# Both datasets use the same hold-out ratio and random seed
split_options = {'test_size': 0.2, 'random_state': 5}

# Fish data => training set / test set (regression task)
fish_X_train, fish_X_test, fish_y_train, fish_y_test = train_test_split(
    fish_feature, fish_target, **split_options
)

# Iris data => training set / test set (classification task),
# stratified on the target labels
iris_X_train, iris_X_test, iris_y_train, iris_y_test = train_test_split(
    iris_feature, iris_target, stratify=iris_target, **split_options
)

In [16]:
from sklearn.preprocessing import StandardScaler

# Feature scaling for the fish dataset: fit the scaler on the training
# features only, then apply the same transform to both splits
fish_scaler = StandardScaler().fit(fish_X_train)

scaled_fish_X_train, scaled_fish_X_test = (
    fish_scaler.transform(split) for split in (fish_X_train, fish_X_test)
)

In [17]:
# Feature scaling for the iris dataset: fit the scaler on the training
# features only, then apply the same transform to both splits
iris_scaler = StandardScaler().fit(iris_X_train)

scaled_iris_X_train, scaled_iris_X_test = (
    iris_scaler.transform(split) for split in (iris_X_train, iris_X_test)
)

[4] 학습

[4-1] 생선 무게 예측 모델

In [20]:
# 모듈 로딩
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.model_selection import cross_validate, cross_val_score, cross_val_predict

In [21]:
# Train via cross-validation
# => Prerequisites: a model instance, training feature data, training label data
lr_model = LinearRegression()

In [40]:
# Cross-validate lr_model and keep every available result:
# two scoring metrics, the training scores, and the fitted per-fold estimators
cv_metrics = ('r2', 'neg_mean_squared_error')
result = cross_validate(
    lr_model,
    scaled_fish_X_train,
    fish_y_train,
    scoring=cv_metrics,
    return_train_score=True,
    return_estimator=True,
)

result

{'fit_time': array([0.00199127, 0.00099516, 0.0010612 , 0.00116205, 0.        ]),
 'score_time': array([0.00099659, 0.00099659, 0.00099659, 0.00114989, 0.00108147]),
 'estimator': [LinearRegression(),
  LinearRegression(),
  LinearRegression(),
  LinearRegression(),
  LinearRegression()],
 'test_r2': array([0.92104683, 0.84385378, 0.88592423, 0.64671954, 0.79031905]),
 'train_r2': array([0.87426416, 0.88779401, 0.88061108, 0.90297504, 0.89833592]),
 'test_neg_mean_squared_error': array([ -8767.84902315, -17815.75093903, -12344.87825138, -22006.47049028,
        -39450.52608702]),
 'train_neg_mean_squared_error': array([-16078.44783606, -13972.57866943, -15268.42472495, -13223.98109532,
        -10586.01039978])}

In [41]:
# Tabulate the cross_validate result dict as a DataFrame and display it
resultDF = pd.DataFrame(result)
resultDF

Unnamed: 0,fit_time,score_time,estimator,test_r2,train_r2,test_neg_mean_squared_error,train_neg_mean_squared_error
0,0.001991,0.000997,LinearRegression(),0.921047,0.874264,-8767.849023,-16078.447836
1,0.000995,0.000997,LinearRegression(),0.843854,0.887794,-17815.750939,-13972.578669
2,0.001061,0.000997,LinearRegression(),0.885924,0.880611,-12344.878251,-15268.424725
3,0.001162,0.00115,LinearRegression(),0.64672,0.902975,-22006.47049,-13223.981095
4,0.0,0.001081,LinearRegression(),0.790319,0.898336,-39450.526087,-10586.0104


In [42]:
# Select the estimator from the fold with the highest validation R^2 rather
# than a hard-coded row position, so the choice stays correct if the fold
# results change (different split, seed, or data).
best_fold = resultDF['test_r2'].idxmax()
best_model = resultDF.loc[best_fold, 'estimator']

# Show the learned coefficients and intercept of the selected estimator
best_model.coef_, best_model.intercept_

(array([ 373.98470744, -159.77931033,   90.53431501,   50.22123874]),
 408.52250924970195)

In [43]:
# Extract only the scores from cross-validation
fold_scores = cross_val_score(lr_model, scaled_fish_X_train, fish_y_train)
print(fold_scores)

[0.92104683 0.84385378 0.88592423 0.64671954 0.79031905]


In [44]:
# Extract only the predictions from cross-validation
fold_predictions = cross_val_predict(lr_model, scaled_fish_X_train, fish_y_train)
print(fold_predictions)

[ 9.09792517e+01  9.85612151e+01  3.87029719e+02  1.13011547e+02
  6.81676563e+02  2.82456988e+02  5.34379642e+02  3.61848302e+02
  6.12934598e+02  1.70756130e+02  5.53222970e+02  1.69433076e+01
 -2.53895688e+01  8.14926155e+02  6.97225129e+01  3.38157931e+02
  4.76306355e+02  7.67659158e+02  6.55686457e+02  1.80300946e+02
  8.45315559e+02  2.92145322e+02  6.08539351e+02  9.02782406e+02
  6.99788981e+02  9.40316876e+02  7.47628344e+02  3.28419355e+02
  7.89622699e+02  9.09130831e+02 -1.98986854e+02  1.81089559e+02
  6.36731679e+02 -1.09209894e+02  3.57087822e+02  7.88250361e+02
  3.25180589e+02  6.56473977e+02 -2.37032025e+02  4.55882834e+01
  9.57130255e+01 -2.10830505e+02  1.28969696e+02 -2.21199132e+02
 -1.10282630e+02  6.39911566e+02  2.12288357e+02  2.41098815e+02
  2.61932359e+02 -2.58301758e+02  2.93250859e+01  8.87950700e+02
  2.46460034e+02  5.55564851e+02  6.71006008e+02  7.04637891e+02
  2.29677895e+02  8.49746634e+02  7.24031103e+02 -5.70994192e+01
  2.22728797e+02  9.41173

### 교차 검증과 하이퍼파라미터 튜닝을 한꺼번에 진행
- 단점 : 시간이 오래 걸림

In [46]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression

In [57]:
# Estimator instance and the hyperparameter grid to search over
est = LogisticRegression(solver='liblinear', max_iter=10000)
params = dict(penalty=['l1', 'l2'])

In [58]:
# Configure the grid search (training scores are recorded as well) ...
gscv = GridSearchCV(
    estimator=est,
    param_grid=params,
    return_train_score=True,
)

# ... and run it on the scaled iris training data
gscv.fit(scaled_iris_X_train, iris_y_train)

In [59]:
# Put the grid search's cv_results_ into a DataFrame for inspection and display it
cv_resultsDF = pd.DataFrame(gscv.cv_results_)
cv_resultsDF

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_penalty,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
0,0.0012,0.000404,0.000597,0.000488,l1,{'penalty': 'l1'},0.875,1.0,0.958333,0.958333,...,0.941667,0.042492,1,0.9375,0.927083,0.9375,0.947917,0.9375,0.9375,0.006588
1,0.000999,7e-06,0.0,0.0,l2,{'penalty': 'l2'},0.875,0.958333,0.958333,0.958333,...,0.908333,0.066667,2,0.90625,0.916667,0.916667,0.927083,0.90625,0.914583,0.007795


In [63]:
# Display the grid search's best parameters, best index, best score, and best estimator
gscv.best_params_, gscv.best_index_, gscv.best_score_, gscv.best_estimator_

({'penalty': 'l1'},
 0,
 0.9416666666666668,
 LogisticRegression(max_iter=10000, penalty='l1', solver='liblinear'))

### 데이터에 적합한 모델 찾기

In [64]:
from sklearn.utils.discovery import all_estimators

In [69]:
# Collect the classifiers known to scikit-learn as (name, class) pairs
models = all_estimators(type_filter='classifier')

# Try to fit each classifier, constructed with its default arguments, on the
# scaled iris training data.
for model_name, model in models:
    try:
        print(model().fit(scaled_iris_X_train, iris_y_train))
    except Exception as e:
        # Deliberately broad: this is a best-effort sweep, and some estimators
        # fail here (required constructor arguments, inputs they reject).
        # Report which estimator failed so the message can be traced to it.
        print(f'[{model_name}] {type(e).__name__}: {e}')

AdaBoostClassifier()
BaggingClassifier()
BernoulliNB()
CalibratedClassifierCV()
Negative values in data passed to CategoricalNB (input X)
__init__() missing 1 required positional argument: 'base_estimator'
Negative values in data passed to ComplementNB (input X)
DecisionTreeClassifier()
DummyClassifier()
ExtraTreeClassifier()
ExtraTreesClassifier()
GaussianNB()
GaussianProcessClassifier()
GradientBoostingClassifier()
HistGradientBoostingClassifier()
KNeighborsClassifier()
LabelPropagation()
LabelSpreading()
LinearDiscriminantAnalysis()
LinearSVC()
LogisticRegression()
LogisticRegressionCV()




MLPClassifier()
__init__() missing 1 required positional argument: 'estimator'
Negative values in data passed to MultinomialNB (input X)
NearestCentroid()
NuSVC()
__init__() missing 1 required positional argument: 'estimator'
__init__() missing 1 required positional argument: 'estimator'
__init__() missing 1 required positional argument: 'estimator'
PassiveAggressiveClassifier()
Perceptron()
QuadraticDiscriminantAnalysis()
RadiusNeighborsClassifier()
RandomForestClassifier()
RidgeClassifier()
RidgeClassifierCV()
SGDClassifier()
SVC()
__init__() missing 1 required positional argument: 'estimators'
__init__() missing 1 required positional argument: 'estimators'


