# KNN - clustering to predict the age of the abalone

In [1]:
import pandas as pd 
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the dataset
df = pd.read_csv("../data/train.csv", index_col='id')
df.head()

Unnamed: 0_level_0,Sex,Length,Diameter,Height,Whole weight,Whole weight.1,Whole weight.2,Shell weight,Rings
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,F,0.55,0.43,0.15,0.7715,0.3285,0.1465,0.24,11
1,F,0.63,0.49,0.145,1.13,0.458,0.2765,0.32,11
2,I,0.16,0.11,0.025,0.021,0.0055,0.003,0.005,6
3,M,0.595,0.475,0.15,0.9145,0.3755,0.2055,0.25,10
4,I,0.555,0.425,0.13,0.782,0.3695,0.16,0.1975,9


In [3]:
df.columns

Index(['Sex', 'Length', 'Diameter', 'Height', 'Whole weight', 'Whole weight.1',
       'Whole weight.2', 'Shell weight', 'Rings'],
      dtype='object')

In [4]:
# standardize the features and lebel the "Sex" features
df['Sex'] = df['Sex'].map({'F': 1, 'I': 2, 'M': 0})

# standardize the features
scaler = StandardScaler()
scaler.fit(df[['Length', 'Diameter', 'Height', 'Whole weight', 'Whole weight.1', 'Whole weight.2', 'Shell weight']])
df[['Length', 'Diameter', 'Height', 'Whole weight', 'Whole weight.1', 'Whole weight.2', 'Shell weight']] = scaler.transform(df[['Length', 'Diameter', 'Height', 'Whole weight', 'Whole weight.1', 'Whole weight.2', 'Shell weight']])

df.head()

Unnamed: 0_level_0,Sex,Length,Diameter,Height,Whole weight,Whole weight.1,Whole weight.2,Shell weight,Rings
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,1,0.278317,0.288912,0.382451,-0.038314,-0.060061,-0.227155,0.108309,11
1,1,0.955044,0.900996,0.250897,0.745005,0.573416,1.061143,0.722736,11
2,2,-3.020727,-2.975535,-2.906386,-1.678148,-1.640084,-1.649238,-1.69657,6
3,0,0.658976,0.747975,0.382451,0.27414,0.16985,0.357534,0.185113,10
4,2,0.320613,0.237905,-0.143763,-0.015371,0.140499,-0.09337,-0.218105,9


In [5]:
df['Rings'].unique().shape

(28,)

In [6]:
# spliting the dataset
X_train, X_test, y_train, y_test = train_test_split(df[['Sex','Length', 'Diameter', 'Height', 'Whole weight', 'Whole weight.1', 'Whole weight.2', 'Shell weight']], df['Rings'], test_size=0.2, random_state=42)

In [7]:
# now let's train the classifier model
knn = KNeighborsClassifier(n_neighbors=28)

knn.fit(df[['Sex','Length', 'Diameter', 'Height', 'Whole weight', 'Whole weight.1', 'Whole weight.2', 'Shell weight']], df['Rings'])

y_pred = knn.predict(X_test)


accuracy = accuracy_score(y_test, y_pred)
print("Accuracy score:", accuracy)
print("Classification report:\n", classification_report(y_test, y_pred))

Accuracy score: 0.4031893174419246
Classification report:
               precision    recall  f1-score   support

           1       0.00      0.00      0.00         6
           2       0.00      0.00      0.00         8
           3       0.33      0.01      0.03        77
           4       0.53      0.55      0.54       284
           5       0.47      0.56      0.51       596
           6       0.47      0.46      0.46      1088
           7       0.46      0.53      0.49      1781
           8       0.49      0.50      0.50      2947
           9       0.41      0.61      0.49      3482
          10       0.32      0.34      0.33      2454
          11       0.35      0.35      0.35      1636
          12       0.22      0.07      0.10       965
          13       0.23      0.21      0.22       786
          14       0.22      0.08      0.12       519
          15       0.14      0.04      0.06       416
          16       0.24      0.13      0.16       287
          17       0.3

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [8]:
# let's store this model for future use
import pickle 

with open("../trained_models/knn_clustering2.pkl", "wb") as f:
    pickle.dump(knn, f)

In [9]:
# # let's see the gradientboosting model accuracy
# import pickle
# with open("../trained_models/gradientboosting1.pkl", "rb") as f:
#     gbm = pickle.load(f)

In [10]:
# y_pred = gbm.predict(X_test)


# accuracy = accuracy_score(y_test, y_pred)
# print("Accuracy score:", accuracy)
# print("Classification report:\n", classification_report(y_test, y_pred))

In [11]:
# let's submit knn result to the kaggle competition

test_df = pd.read_csv("../data/test.csv", index_col="id")
test_df['Sex'] = test_df['Sex'].map({'F': 1, 'I': 2, 'M': 0})

test_df[['Length', 'Diameter', 'Height', 'Whole weight', 'Whole weight.1', 'Whole weight.2', 'Shell weight']] = scaler.transform(test_df[['Length', 'Diameter', 'Height', 'Whole weight', 'Whole weight.1', 'Whole weight.2', 'Shell weight']])

test_df['Rings'] = knn.predict(test_df[['Sex','Length', 'Diameter', 'Height', 'Whole weight', 'Whole weight.1', 'Whole weight.2', 'Shell weight']])

sub = pd.DataFrame({'id': test_df.index, 'Rings': test_df['Rings']})
sub.to_csv("../submission/knn_clustering_submission.csv", index=False)

In [12]:
# submitting to kaggle

!kaggle competitions submit -c playground-series-s4e4 -f ../submission/knn_clustering_submission.csv -m "knn clustering"

Successfully submitted to Regression with an Abalone Dataset



  0%|          | 0.00/600k [00:00<?, ?B/s]
  1%|▏         | 8.00k/600k [00:00<01:01, 9.79kB/s]
100%|██████████| 600k/600k [00:03<00:00, 168kB/s]  


In [13]:
# # let's measure the features importance
# from sklearn.inspection import permutation_importance


# # train 
# knn = KNeighborsClassifier()
# knn.fit(X_train, y_train)

# # Compute permutation feature importances
# result = permutation_importance(knn, X_test, y_test, n_repeats=10, random_state=42)

# # Get feature importances
# feature_importances = result.importances_mean

# # Plot feature importances
# import matplotlib.pyplot as plt

# plt.figure(figsize=(10, 6))
# plt.bar(range(len(feature_importances)), feature_importances)
# plt.xticks(range(len(feature_importances)), X_test.columns, rotation=45)
# plt.xlabel('Feature')
# plt.ylabel('Importance')
# plt.title('Permutation Feature Importances for KNN')
# plt.show()
