## Step 1: Prepare the Data

In [1]:
import matplotlib.pyplot as plt
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

###  Step 1: Load the MyOpia dataset

In [2]:
# Loading the preprocessed MyOpia CSV file
file_path = Path("Resource/myopia.csv")
df = pd.read_csv(file_path)
df.head()

Unnamed: 0,AGE,SPHEQ,AL,ACD,LT,VCD,SPORTHR,READHR,COMPHR,STUDYHR,TVHR,DIOPTERHR,MOMMY,DADMY,MYOPIC
0,6,-0.052,21.889999,3.69,3.498,14.7,45,8,0,0,10,34,1,1,1
1,6,0.608,22.379999,3.702,3.392,15.29,4,0,1,1,7,12,1,1,0
2,6,1.179,22.49,3.462,3.514,15.52,14,0,2,0,10,14,0,0,0
3,6,0.525,22.200001,3.862,3.612,14.73,18,11,0,0,4,37,0,1,1
4,5,0.697,23.290001,3.676,3.454,16.16,14,0,0,0,4,4,1,0,0


In [3]:
# Column names
df.columns

Index(['AGE', 'SPHEQ', 'AL', 'ACD', 'LT', 'VCD', 'SPORTHR', 'READHR', 'COMPHR',
       'STUDYHR', 'TVHR', 'DIOPTERHR', 'MOMMY', 'DADMY', 'MYOPIC'],
      dtype='object')

In [4]:
# There were 81 myopica (1) children samples and 537 (0) non-myopic children samples

df["MYOPIC"].value_counts()

0    537
1     81
Name: MYOPIC, dtype: int64

#### Preprocess the data

In [5]:
# Split the DataFrame into data and target

y = df["MYOPIC"].values
X = df.drop("MYOPIC", axis=1)

In [6]:
# Split the data into two groups, the training and test set

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [7]:
# Create a scaler to standardize the data

scaler = StandardScaler()

In [8]:
# Fit the X_train data to the standard scaler

scaler.fit(X_train)

StandardScaler()

In [9]:
# Transform X_train and X_test data
# Note that the scaler used to transform X_train and X_test was trained on X_train set

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

### Create predictions with KNN

In [11]:
# Instantiate KNN model and make predictions

knn = KNeighborsClassifier(n_neighbors=9)
knn.fit(X_train_scaled, y_train)
y_pred = knn.predict(X_test_scaled)

In [12]:
# Access the accuracy score
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred)

0.8709677419354839