In [1]:
# Import Dependencies
import matplotlib.pyplot as plt
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

#### Data Preparation

In [2]:

file_path = Path("./Resources/myopia.csv")
df = pd.read_csv(file_path)
df.head()

Unnamed: 0,AGE,SPHEQ,AL,ACD,LT,VCD,SPORTHR,READHR,COMPHR,STUDYHR,TVHR,DIOPTERHR,MOMMY,DADMY,MYOPIC
0,6,-0.052,21.889999,3.69,3.498,14.7,45,8,0,0,10,34,1,1,1
1,6,0.608,22.379999,3.702,3.392,15.29,4,0,1,1,7,12,1,1,0
2,6,1.179,22.49,3.462,3.514,15.52,14,0,2,0,10,14,0,0,0
3,6,0.525,22.200001,3.862,3.612,14.73,18,11,0,0,4,37,0,1,1
4,5,0.697,23.290001,3.676,3.454,16.16,14,0,0,0,4,4,1,0,0


In [3]:
## Column names
# df.columns


## Remove all rows with null values
for column in df.columns:
    print(f"Column {column} has {df[column].isnull().sum()} null values")




Column AGE has 0 null values
Column SPHEQ has 0 null values
Column AL has 0 null values
Column ACD has 0 null values
Column LT has 0 null values
Column VCD has 0 null values
Column SPORTHR has 0 null values
Column READHR has 0 null values
Column COMPHR has 0 null values
Column STUDYHR has 0 null values
Column TVHR has 0 null values
Column DIOPTERHR has 0 null values
Column MOMMY has 0 null values
Column DADMY has 0 null values
Column MYOPIC has 0 null values


In [4]:
# Remove any duplicate rows if any exist
# Find duplicate entries
print(f"Duplicate entries: {df.duplicated().sum()}")


Duplicate entries: 0


In [5]:
df.dtypes

AGE            int64
SPHEQ        float64
AL           float64
ACD          float64
LT           float64
VCD          float64
SPORTHR        int64
READHR         int64
COMPHR         int64
STUDYHR        int64
TVHR           int64
DIOPTERHR      int64
MOMMY          int64
DADMY          int64
MYOPIC         int64
dtype: object

#### In order to use unsupervised learning algorithms, all the features should be numeric, and also, on similar scales. Since all of the columns are numeric, there are no transformatoins necessary on the data represented in the data frame.

In [6]:
 #Scale the data
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaled_data = scaler.fit_transform(df[['AGE', 'SPHEQ', 'AL', 'ACD', 'LT', 'VCD',
                                                    'SPORTHR', 'READHR', 'COMPHR', 'STUDYHR', 'TVHR', 'DIOPTERHR', 'MOMMY', 'DADMY']])

In [7]:
df.columns

Index(['AGE', 'SPHEQ', 'AL', 'ACD', 'LT', 'VCD', 'SPORTHR', 'READHR', 'COMPHR',
       'STUDYHR', 'TVHR', 'DIOPTERHR', 'MOMMY', 'DADMY', 'MYOPIC'],
      dtype='object')

In [8]:
# # Create a DataFrame with the transformed data
# new_df = pd.DataFrame(scaled_data, columns=df.columns[1:])
# new_df['MYOPIC'] = df['MYOPIC']
# new_df.head()

In [9]:
# But since we don't actually need the class columns for unsupervised learning (MOMMY, DADMY)... drop 
# Drop the columns
new_df = pd.DataFrame(scaled_data, columns=df.columns[1:])
new_df['MYOPIC'] = df['MYOPIC']
# new_df = df.drop(["MOMMY","DADMY"], axis='columns')
new_df.head(10)


Unnamed: 0,SPHEQ,AL,ACD,LT,VCD,SPORTHR,READHR,COMPHR,STUDYHR,TVHR,DIOPTERHR,MOMMY,DADMY,MYOPIC
0,-0.420219,-1.363917,-0.892861,0.483784,-0.281443,-1.019792,4.150661,1.69745,-0.689311,-0.672996,0.184058,0.498304,0.987138,1
1,-0.420219,-0.308612,-0.17184,0.53591,-0.967997,-0.130763,-0.998898,-0.912062,-0.361875,-0.221409,-0.340932,-0.875088,0.987138,0
2,-0.420219,0.604386,-0.009977,-0.506628,-0.177812,0.215809,0.257092,-0.912062,-0.034439,-0.672996,0.184058,-0.750234,-1.01303,0
3,-0.420219,-0.441325,-0.436703,1.230936,0.456927,-0.974587,0.759488,2.676017,-0.689311,-0.672996,-0.865922,0.685585,-1.01303,1
4,-1.823978,-0.166306,1.167204,0.42297,-0.566427,1.180178,0.257092,-0.912062,-0.689311,-0.672996,-0.865922,-1.374503,0.987138,0
5,-0.420219,1.507791,-0.524993,-1.540479,0.094219,-0.025285,-0.245304,1.045072,-0.034439,-0.221409,1.759029,1.122573,-1.01303,0
6,-0.420219,-0.188691,-0.245412,-1.705547,0.728959,0.170603,0.005894,1.371261,-0.034439,-0.221409,-0.165935,0.623158,-1.01303,0
7,-0.420219,0.753088,-0.157125,0.666229,0.275574,-0.447197,0.005894,-0.912062,-0.689311,-0.672996,-0.165935,-1.124795,-1.01303,0
8,0.98354,0.951358,0.181316,-0.49794,-0.864366,0.54731,-0.998898,-0.912062,0.292997,-0.221409,-1.040919,-0.875088,-1.01303,0
9,-0.420219,0.273404,0.357892,-0.324184,1.00099,0.245945,2.266676,0.718883,-0.361875,-0.672996,0.184058,0.061316,-1.01303,0


In [10]:
# Saving cleaned data
file_path = Path("./Resources/myopic_cleaned.csv")
new_df.to_csv(file_path, index=False)