In [13]:
### Bagging+KNN 进行UCI glass数据集分类

import numpy as np
import pandas as pd
from time import time
from sklearn.utils import shuffle                                   # 打乱dataframe顺序专用
from sklearn.ensemble import BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.metrics import accuracy_score

In [2]:
### 读取数据 ###

data = pd.read_csv("./datasets/glass-classification/glass.csv")
data.head()

Unnamed: 0,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe,Type
0,1.52101,13.64,4.49,1.1,71.78,0.06,8.75,0.0,0.0,1
1,1.51761,13.89,3.6,1.36,72.73,0.48,7.83,0.0,0.0,1
2,1.51618,13.53,3.55,1.54,72.99,0.39,7.78,0.0,0.0,1
3,1.51766,13.21,3.69,1.29,72.61,0.57,8.22,0.0,0.0,1
4,1.51742,13.27,3.62,1.24,73.08,0.55,8.07,0.0,0.0,1


In [3]:
### 打乱数据顺序 ####

data = shuffle(data)
data.head()

Unnamed: 0,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe,Type
113,1.51892,13.46,3.83,1.26,72.55,0.57,8.21,0.0,0.14,2
150,1.51665,13.14,3.45,1.76,72.48,0.6,8.38,0.0,0.17,3
187,1.52315,13.44,3.34,1.23,72.38,0.6,8.83,0.0,0.0,7
151,1.52127,14.32,3.9,0.83,71.5,0.0,9.49,0.0,0.0,3
64,1.52172,13.48,3.74,0.9,72.01,0.18,9.61,0.0,0.07,1


In [4]:
### 查看数据集情况 ###

data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 214 entries, 113 to 212
Data columns (total 10 columns):
RI      214 non-null float64
Na      214 non-null float64
Mg      214 non-null float64
Al      214 non-null float64
Si      214 non-null float64
K       214 non-null float64
Ca      214 non-null float64
Ba      214 non-null float64
Fe      214 non-null float64
Type    214 non-null int64
dtypes: float64(9), int64(1)
memory usage: 18.4 KB


In [5]:
### 查看分类的个数 ###
data['Type'].value_counts()

2    76
1    70
7    29
3    17
5    13
6     9
Name: Type, dtype: int64

In [6]:
### 分割数据集为训练集与测试集 ###

data_array = np.array(data)
data_x = data_array[:, 0:9]
data_y = data_array[:, 9]
x_train, x_test, y_train, y_test = train_test_split(data_x, data_y)

In [7]:
### 定义模型起始时间 ### 
start_time = time()
print(start_time)

1537255321.421


In [8]:
### Bagging调参 ###

params = {"n_estimators":range(5,50), "max_samples":np.arange(0.5, 1, 0.1), "max_features":np.arange(0.5, 1, 0.1)}

grid = GridSearchCV(BaggingClassifier(KNeighborsClassifier(n_neighbors=6)), param_grid=params, cv=5)

grid.fit(x_train, y_train)

end_time = time()
print("模型训练时间"+str(end_time-start_time))

模型训练时间596.7219998836517


In [10]:
### 输出最佳分类器 ###

grid.best_estimator_

BaggingClassifier(base_estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=6, p=2,
           weights='uniform'),
         bootstrap=True, bootstrap_features=False, max_features=0.5,
         max_samples=0.89999999999999991, n_estimators=28, n_jobs=1,
         oob_score=False, random_state=None, verbose=0, warm_start=False)

In [17]:
### 查看该分类器的预测分值 ###

predict = grid.predict(x_test)

accuracy_score(y_test, predict)                          ## 准确率不佳，可能原因是数据集太少

0.62962962962962965