In [24]:
import pandas as pd

cols = ['sex','l','d','h','ww','sw','vw','slw','r']
labs = ['1','2','3','4','5']

data = pd.read_csv('abalone.data', header = None, names = cols)
print(data)

data = pd.DataFrame(data)

data = data.drop('sex', axis = 1)

data['r'] = pd.qcut(data['r'], q=[0, 0.2, 0.4, 0.6, 0.8, 1.0], labels=labs)

print(data)

     sex      l      d      h      ww      sw      vw     slw   r
0      M  0.455  0.365  0.095  0.5140  0.2245  0.1010  0.1500  15
1      M  0.350  0.265  0.090  0.2255  0.0995  0.0485  0.0700   7
2      F  0.530  0.420  0.135  0.6770  0.2565  0.1415  0.2100   9
3      M  0.440  0.365  0.125  0.5160  0.2155  0.1140  0.1550  10
4      I  0.330  0.255  0.080  0.2050  0.0895  0.0395  0.0550   7
...   ..    ...    ...    ...     ...     ...     ...     ...  ..
4172   F  0.565  0.450  0.165  0.8870  0.3700  0.2390  0.2490  11
4173   M  0.590  0.440  0.135  0.9660  0.4390  0.2145  0.2605  10
4174   M  0.600  0.475  0.205  1.1760  0.5255  0.2875  0.3080   9
4175   F  0.625  0.485  0.150  1.0945  0.5310  0.2610  0.2960  10
4176   M  0.710  0.555  0.195  1.9485  0.9455  0.3765  0.4950  12

[4177 rows x 9 columns]
          l      d      h      ww      sw      vw     slw  r
0     0.455  0.365  0.095  0.5140  0.2245  0.1010  0.1500  5
1     0.350  0.265  0.090  0.2255  0.0995  0.0485  0.0700  1


In [25]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
import numpy as np
from sklearn.model_selection import cross_val_score

X = data[['l','d','h','ww','sw','vw','slw']]
y = np.ravel(data[['r']])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 120123)

k_values = [3, 5, 7, 9, 11]
for k in k_values:
    knn_classifier = KNeighborsClassifier(n_neighbors=k)
    scores = cross_val_score(knn_classifier, X_train_scaled, y_train, cv=5)
    print(f'k={k}, Mean Accuracy: {scores.mean()}')

k=3, Mean Accuracy: 0.4450708448573705
k=5, Mean Accuracy: 0.46392595973971334
k=7, Mean Accuracy: 0.4797937756773448
k=9, Mean Accuracy: 0.49176221547935517
k=11, Mean Accuracy: 0.49027102745182277


In [26]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [28]:
from sklearn.model_selection import GridSearchCV

grid = {'n_neighbors': [3, 5, 7, 9, 11], 'weights': ['uniform', 'distance']}
search = GridSearchCV(KNeighborsClassifier(), grid, cv = 5)
search.fit(X_train_scaled, y_train)

params = search.best_params_
knn = search.best_estimator_

print(params)
print(knn)

{'n_neighbors': 9, 'weights': 'uniform'}
KNeighborsClassifier(n_neighbors=9)


In [30]:
from sklearn.ensemble import BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

knn_classifier = KNeighborsClassifier(n_neighbors = 9)

bagging_classifier = BaggingClassifier(estimator = knn_classifier, n_estimators = 10, random_state = 120123)

bagging_classifier.fit(X_train, y_train)

y_pred = bagging_classifier.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"Bagging Accuracy: {accuracy * 100:.2f}%")

Bagging Accuracy: 49.52%
