In [1]:
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
# Render matplotlib figures at 300 DPI.
plt.rcParams.update({'figure.dpi': 300})

# Read the CSV into `golf_dataset`, then work on a separate copy named `df`.
golf_dataset = pd.read_csv("golf_dataset_mini_original_with_testset.csv")
df = golf_dataset.copy()

df

Unnamed: 0,Outlook_sunny,Outlook_overcast,Outlook_rain,Temperature,Humidity,Wind,Play
0,1,0,0,85.0,85.0,0,0
1,1,0,0,80.0,90.0,1,0
2,0,1,0,83.0,78.0,0,1
3,0,0,1,70.0,96.0,0,1
4,0,0,1,68.0,80.0,0,1
5,0,0,1,65.0,70.0,1,0
6,0,1,0,64.0,65.0,1,1
7,1,0,0,72.0,95.0,0,0
8,1,0,0,69.0,70.0,0,1
9,0,0,1,75.0,80.0,0,1


In [3]:
# Standardize every float64 column of `df` in place (zero mean, unit variance).
from sklearn.preprocessing import StandardScaler

float_cols = df.select_dtypes(include=['float64']).columns

# NOTE(review): the scaler is fit on all rows of `df`, i.e. before the
# train/test split performed in a later cell, so test-row statistics feed
# into the scaling. Consider fitting on the training rows only.
scaler = StandardScaler()
scaler.fit(df[float_cols])
df[float_cols] = scaler.transform(df[float_cols])

df.head(15)

Unnamed: 0,Outlook_sunny,Outlook_overcast,Outlook_rain,Temperature,Humidity,Wind,Play
0,1,0,0,1.409219,0.432042,0,0
1,1,0,0,0.650409,0.923797,1,0
2,0,1,0,1.105695,-0.256415,0,1
3,0,0,1,-0.867212,1.513903,0,1
4,0,0,1,-1.170736,-0.059713,0,1
5,0,0,1,-1.626022,-1.043223,1,0
6,0,1,0,-1.777784,-1.534978,1,1
7,1,0,0,-0.563688,1.415552,0,0
8,1,0,0,-1.018974,-1.043223,0,1
9,0,0,1,-0.108401,-0.059713,0,1


In [4]:
from sklearn.model_selection import train_test_split

# Target vector is the 'Play' column; the feature matrix is every other column
y = df['Play']
X = df.drop(columns='Play')

# Unshuffled 50/50 split: row order is preserved, the first half becomes the
# training set and the remainder the test set
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.5,
    shuffle=False,
)

In [7]:
# This builds a k-NN classifier with k = 3
from sklearn.neighbors import KNeighborsClassifier

## Select the Number of Neighbors ('k')
# `k` is passed as `n_neighbors` when the classifier is constructed in a later cell.
k = 3

In [8]:
## Choose a Distance Metric
distance_metric = 'euclidean'

## Sanity check: Euclidean distance between training rows ID 0 and ID 1,
## computed by hand as the L2 norm of their difference
row_id_0 = X_train.loc[0].values
row_id_1 = X_train.loc[1].values
print(np.linalg.norm(row_id_0 - row_id_1).round(3))

1.348


In [9]:
# Initialize the k-NN Classifier with the neighbor count `k` and the
# `distance_metric` chosen in the previous cells
knn_clf = KNeighborsClassifier(n_neighbors=k, metric=distance_metric)

# "Train" the k-NN classifier (although no real training happens)
# The fitted estimator is the cell's last expression, so it is displayed as output.
knn_clf.fit(X_train, y_train)

0,1,2
,n_neighbors,3
,weights,'uniform'
,algorithm,'auto'
,leaf_size,30
,p,2
,metric,'euclidean'
,metric_params,
,n_jobs,


In [10]:
from scipy.spatial import distance

# Euclidean distance from the first test row (kept 2-D, shape 1 x n_features)
# to every row of the training set
first_test_row = X_test.iloc[[0]]
distances = distance.cdist(first_test_row, X_train, metric='euclidean')

# One row per training sample, indexed by its ID, distance rounded to 2 decimals
distance_table = {'Train_ID': X_train.index, 'Distance': np.round(distances[0], 2)}
distance_df = pd.DataFrame(distance_table).set_index('Train_ID')

# Nearest training rows first
distance_df.sort_values('Distance')

Unnamed: 0_level_0,Distance
Train_ID,Unnamed: 1_level_1
1,0.25
0,1.21
7,1.83
11,1.98
10,1.99
2,2.01
9,2.11
12,2.15
13,2.22
3,2.53


In [11]:
# Use the k-NN Classifier to make predictions on the held-out test rows
y_pred = knn_clf.predict(X_test)

# `.tolist()` converts to plain Python ints; `list(y_pred)` would keep NumPy
# scalars, which print as `np.int64(...)` and make the two lines hard to compare.
print("Label     :", y_test.tolist())
print("Prediction:", y_pred.tolist())

Label     : [0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1]
Prediction: [np.int64(0), np.int64(1), np.int64(1), np.int64(0), np.int64(0), np.int64(1), np.int64(1), np.int64(1), np.int64(1), np.int64(1), np.int64(1), np.int64(1), np.int64(1), np.int64(1)]


In [13]:
# Evaluation Phase: score the predictions against the test labels and
# report the result as a percentage rounded to 4 decimals
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: {}%".format(round(accuracy * 100, 4)))

Accuracy: 85.7143%


In [14]:
# Compare test-set predictions and accuracy of k-NN for several neighbor counts.
#
# The loop deliberately uses its own names (`n_neighbors`, `clf`, `preds`) so
# that the notebook-level `k`, `knn_clf` and `y_pred` set by earlier cells are
# not overwritten; re-running those cells afterwards still uses k = 3.
labels, predictions, accuracies = list(y_test), [], []

k_list = [3, 5, 7]
for n_neighbors in k_list:
    # Same distance metric as the single-model cells, passed explicitly
    clf = KNeighborsClassifier(n_neighbors=n_neighbors, metric=distance_metric)
    clf.fit(X_train, y_train)
    preds = clf.predict(X_test)
    predictions.append(preds.tolist())
    # Accuracy as a percentage, rounded to 4 decimals
    accuracies.append(round(accuracy_score(y_test, preds) * 100, 4))

# One prediction column per k, placed after the true labels
column_names = [f'k = {n_neighbors}' for n_neighbors in k_list]
df_predictions = pd.DataFrame({'Label': labels, **dict(zip(column_names, predictions))})

# Single-row table: one accuracy value per k
df_accuracies = pd.DataFrame({'Accuracy ': accuracies}, index=column_names).T

print(df_predictions)
print(df_accuracies)

    Label  k = 3  k = 5  k = 7
0       0      0      0      1
1       1      1      1      1
2       1      1      1      1
3       0      0      1      1
4       0      0      0      1
5       0      1      1      1
6       1      1      1      1
7       1      1      1      1
8       1      1      1      1
9       1      1      1      1
10      1      1      1      1
11      1      1      0      1
12      0      1      1      1
13      1      1      1      1
             k = 3    k = 5    k = 7
Accuracy   85.7143  71.4286  64.2857


In [15]:
# Recap of the whole pipeline in one cell:
# encode -> standardize -> split -> fit k-NN -> score.
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

# Start from a copy of the frame loaded in the first cell rather than from
# `df`, which an earlier cell has already standardized in place. This keeps
# the cell idempotent and avoids standardizing the same columns twice.
df = pd.get_dummies(golf_dataset.copy())

# Standardize the float64 columns.
# NOTE(review): the scaler is fit on all rows, before the split below, so
# test-row statistics feed into the scaling. Kept as-is to match the earlier
# cells; consider fitting on the training rows only.
scaler = StandardScaler()
float_cols = df.select_dtypes(include=['float64']).columns
df[float_cols] = scaler.fit_transform(df[float_cols])

# Features / target, then an unshuffled 50/50 split (first half = training set)
X, y = df.drop('Play', axis=1), df['Play']
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.5, shuffle=False)

# 3-nearest-neighbors classifier with Euclidean distance
knn_clf = KNeighborsClassifier(n_neighbors=3, metric='euclidean')
knn_clf.fit(X_train, y_train)

# Fraction of test rows predicted correctly
y_pred = knn_clf.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")

Accuracy: 0.8571428571428571
