In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier



In [2]:
# Read the pumpkin-seed workbook into a DataFrame and preview its first rows.
DATASET_PATH = 'data/Pumpkin_Seeds_Dataset.xlsx'
df = pd.read_excel(DATASET_PATH)
df.head()

Unnamed: 0,Area,Perimeter,Major_Axis_Length,Minor_Axis_Length,Convex_Area,Equiv_Diameter,Eccentricity,Solidity,Extent,Roundness,Aspect_Ration,Compactness,Class
0,56276,888.242,326.1485,220.2388,56831,267.6805,0.7376,0.9902,0.7453,0.8963,1.4809,0.8207,Çerçevelik
1,76631,1068.146,417.1932,234.2289,77280,312.3614,0.8275,0.9916,0.7151,0.844,1.7811,0.7487,Çerçevelik
2,71623,1082.987,435.8328,211.0457,72663,301.9822,0.8749,0.9857,0.74,0.7674,2.0651,0.6929,Çerçevelik
3,66458,992.051,381.5638,222.5322,67118,290.8899,0.8123,0.9902,0.7396,0.8486,1.7146,0.7624,Çerçevelik
4,66107,998.146,383.8883,220.4545,67117,290.1207,0.8187,0.985,0.6752,0.8338,1.7413,0.7557,Çerçevelik


In [3]:
# Store the target labels as a pandas categorical column.
df['Class'] = df['Class'].astype('category')

In [4]:
# Show the distinct labels present in the target column.
class_labels = df['Class'].unique()
class_labels

['Çerçevelik', 'Ürgüp Sivrisi']
Categories (2, object): ['Çerçevelik', 'Ürgüp Sivrisi']

In [5]:
# Separate the feature columns from the target label.
X, y = df.drop(columns=['Class']), df['Class']

# Hold out 20% of the rows for evaluation. The fixed seed keeps the split, and
# every metric computed from it, reproducible across kernel restarts. Only one
# split is performed: a later unseeded split would silently replace this one
# and make the reported numbers change from run to run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [6]:
# Baseline: k-nearest-neighbours trained on every feature column.
# NOTE(review): the neighbour count is set to the number of classes. The two
# quantities are unrelated, and an even count with two classes allows tied
# votes -- confirm this is intended.
# NOTE(review): features are passed unscaled; the preview above shows Area in
# the tens of thousands next to ratios below 1, which presumably lets the
# large-valued columns dominate the distance -- consider scaling.
n_classes = len(df['Class'].unique())
k = n_classes

# `fit` returns the estimator itself, so construction and training are chained.
knn = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)

y_pred = knn.predict(X_test)



In [7]:

# Fraction of held-out rows whose predicted label matches the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')



Accuracy: 64.40%


In [8]:
# Cross-tabulate true labels (rows) against predicted labels (columns).
confusion_matrix = pd.crosstab(
    index=y_test,
    columns=y_pred,
    rownames=['Actual'],
    colnames=['Predicted'],
)
confusion_matrix

Predicted,Çerçevelik,Ürgüp Sivrisi
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
Çerçevelik,219,32
Ürgüp Sivrisi,146,103


In [9]:
# Per-class precision, recall and F1 together with the overall accuracy.
# `classification_report` is imported with the other dependencies in the first cell.
print(classification_report(y_test, y_pred))


               precision    recall  f1-score   support

   Çerçevelik       0.60      0.87      0.71       251
Ürgüp Sivrisi       0.76      0.41      0.54       249

     accuracy                           0.64       500
    macro avg       0.68      0.64      0.62       500
 weighted avg       0.68      0.64      0.62       500



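### The accuracy is not that good. I suspect that it is because we pass all columns to the model.
KNN is distance-based, and the columns are on very different scales (`Area` is in the tens of thousands while `Solidity` is below 1), so the large-valued columns likely dominate the distance.
We need to find the subset of columns that yields the best accuracy. That's where SequentialFeatureSelector comes into play.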



In [10]:
# Forward sequential feature selection wrapped around a KNN classifier.
# All names used here are imported in the first cell.

# Re-create the seeded 80/20 split so this cell does not depend on which
# split happened to run last.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

knn = KNeighborsClassifier(n_neighbors=2)

# Starting from no features, greedily add one column at a time until 5 are
# kept. Only the training rows are used, so the test rows stay unseen.
sfs = SequentialFeatureSelector(knn, n_features_to_select=5, direction='forward')
sfs.fit(X_train, y_train)

# Positional indices of the chosen columns; later cells map them back to names.
selected_features = sfs.get_support(indices=True)

# Refit on the selected columns only and evaluate on the held-out rows.
knn.fit(X_train.iloc[:, selected_features], y_train)
y_pred = knn.predict(X_test.iloc[:, selected_features])

print("Accuracy with selected features:", accuracy_score(y_test, y_pred))

Accuracy with selected features: 0.82


In [21]:
# Map the selected positional indices back to column names, keep them for the
# following cells, and print them.
selected_features_name = X.columns[selected_features]
print(selected_features_name)


Index(['Eccentricity', 'Solidity', 'Extent', 'Roundness', 'Compactness'], dtype='object')


['Eccentricity', 'Solidity', 'Extent', 'Roundness', 'Compactness']

### Try again with the selected features above

In [22]:

# Rebuild the modelling frame with only the selected feature columns plus the target.
# The column subset and the dtype conversion are chained so that no column is
# assigned onto a frame produced by `df[...]`; that pattern can raise
# SettingWithCopyWarning on pandas versions without copy-on-write.
feature_columns = selected_features_name.to_list()
df = (
    pd.read_excel('data/Pumpkin_Seeds_Dataset.xlsx')
    .loc[:, feature_columns + ['Class']]
    .astype({'Class': 'category'})
)

X, y = df.drop(columns=['Class']), df['Class']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Neighbour count derived the same way as in the baseline cell, so the two
# runs differ only in the feature set.
k = len(df['Class'].unique())
knn = KNeighborsClassifier(n_neighbors=k)

knn.fit(X_train, y_train)

y_pred = knn.predict(X_test)



In [23]:

# Fraction of held-out rows classified correctly by the reduced-feature model.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')




Accuracy: 82.00%


## Accuracy goes up from 64.40% to 82.00%

In [24]:
# Cross-tabulate true labels (rows) against predicted labels (columns)
# for the reduced-feature model.
confusion_matrix = pd.crosstab(
    index=y_test,
    columns=y_pred,
    rownames=['Actual'],
    colnames=['Predicted'],
)
confusion_matrix

Predicted,Çerçevelik,Ürgüp Sivrisi
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
Çerçevelik,232,19
Ürgüp Sivrisi,71,178


In [25]:
# Per-class precision, recall and F1 together with the overall accuracy for
# the reduced-feature model.
# `classification_report` is imported with the other dependencies in the first cell.
print(classification_report(y_test, y_pred))


               precision    recall  f1-score   support

   Çerçevelik       0.77      0.92      0.84       251
Ürgüp Sivrisi       0.90      0.71      0.80       249

     accuracy                           0.82       500
    macro avg       0.83      0.82      0.82       500
 weighted avg       0.83      0.82      0.82       500

