In [None]:
# Import dependencies
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
import os

In [None]:
# Import the data: read the myopia dataset from a CSV located in the working directory
file_path = Path("myopia.csv")
df = pd.read_csv(file_path)
# Preview the first rows to confirm the file was parsed as expected
df.head()

### Preprocess the data

In [None]:
# Remove the "MYOPIC" label column so that clusters are not provided to the models.
# drop(..., errors="ignore") keeps this cell idempotent: a plain `del` of the column
# raises KeyError when the cell is executed a second time in the same kernel session.
df = df.drop(columns=["MYOPIC"], errors="ignore")

In [None]:
# Display the cleaned DataFrame (features only, "MYOPIC" label column removed)
df.head()

In [None]:
# Standardize the feature columns so each one contributes on a comparable scale
feature_columns = [
    "AGE", "SPHEQ", "AL", "ACD", "LT", "VCD", "SPORTHR",
    "READHR", "COMPHR", "STUDYHR", "TVHR", "DIOPTERHR",
    "MOMMY", "DADMY",
]
scaler = StandardScaler()
scaled_data = scaler.fit_transform(df[feature_columns])

In [None]:
# Create a DataFrame with the transformed data.
# Column labels are taken from the fitted scaler (the columns it was fitted on, in
# order) rather than from df.columns, so the labels cannot drift out of sync with the
# scaled array if df ever holds extra or differently ordered columns.
scaled_df = pd.DataFrame(scaled_data, columns=scaler.feature_names_in_)
scaled_df.head()

### Apply dimensionality reduction

In [None]:
# Initialize PCA model.
# A float n_components between 0 and 1 asks scikit-learn to keep as many principal
# components as are needed to explain that fraction of the variance (here 90%).
pca = PCA(n_components=0.9)

In [None]:
# Fit PCA on the scaled features and project them onto the components that together
# explain 90% of the variance (not 90% of the components).
# NOTE: a later cell adds 'x'/'y' columns to scaled_df, so re-running this cell out of
# order would feed those extra columns into PCA; run the notebook top to bottom.
myopia_pca_df = pca.fit_transform(scaled_df)

In [None]:
# How did the number of features change? Shape is (rows, principal components kept)
myopia_pca_df.shape

In [None]:
# Fetch the explained variance: fraction of total variance captured by each kept component
pca.explained_variance_ratio_

In [None]:
# Initialize t-SNE model.
# t-SNE is stochastic, so a fixed random_state is set to make the embedding (and the
# scatter plot drawn from it) reproducible across kernel restarts.
tsne = TSNE(learning_rate=250, random_state=0)

In [None]:
# Reduce dimensions: run t-SNE on the PCA-transformed data to get a low-dimensional embedding
tsne_features = tsne.fit_transform(myopia_pca_df)

In [None]:
# The dataset now has 2 columns (TSNE was created without n_components, whose default is 2)
tsne_features.shape

In [None]:
# Prepare the scatter-plot coordinates from the t-SNE output:
# 'x' is the first column of the transformed features, 'y' is the second.
# NOTE: from here on scaled_df carries these two plotting columns in addition to the
# scaled features.
scaled_df = scaled_df.assign(x=tsne_features[:, 0], y=tsne_features[:, 1])

In [None]:
# Visualize the t-SNE embedding to look for visually distinct clusters.
# The explicit Axes interface is used so the figure carries its own labels and title
# and can be understood when the notebook is skimmed.
fig, ax = plt.subplots()
ax.scatter(scaled_df['x'], scaled_df['y'])
ax.set_xlabel('t-SNE component 1')
ax.set_ylabel('t-SNE component 2')
ax.set_title('t-SNE embedding of the myopia data')
plt.show()

### Perform cluster analysis with K-means

In [None]:
# Identify the number of clusters
inertia = []

# List k values
k = list(range(1,10))

# Calculate the inertia for each candidate k.
# K-means is fitted on the PCA-reduced features (myopia_pca_df), not on scaled_df: by
# this point scaled_df also holds the unscaled t-SNE 'x'/'y' plotting columns, which
# would dominate the distance computation and distort the inertia curve.
for i in k:
    # n_init is pinned so the result does not depend on the scikit-learn version's default
    km = KMeans(n_clusters=i, n_init=10, random_state=0)
    km.fit(myopia_pca_df)
    inertia.append(km.inertia_)

# Define a DataFrame holding the Elbow Curve data (one row per k) for plotting
elbow_data = {"k": k, "inertia": inertia}
df_elbow = pd.DataFrame(elbow_data)

In [None]:
# Draw the elbow curve (inertia against k) to find the best candidate(s) for k
fig, ax = plt.subplots()
ax.plot(df_elbow['k'], df_elbow['inertia'])
ax.set_xticks(range(1, 10))
ax.set_xlabel('Number of clusters')
ax.set_ylabel('Inertia')
plt.show()

### Recommendation

In [None]:
The elbow curve above suggests that the patients can be clustered into either 3 or 5 groups.
The scatter plot shows 5 distinct clusters, so 5 is the recommended number of groups.