In [None]:
from pathlib import Path
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

In [None]:
# Load the myopia dataset (a CSV in the working directory) and preview the first rows
file = Path("myopia.csv")
myopia = pd.read_csv(filepath_or_buffer=file)
myopia.head(n=5)

In [None]:
# Drop our target (myopic) column in order to not bias the unsupervised learning model
labels = myopia['MYOPIC']
myopia.drop('MYOPIC', axis=1).head()

In [None]:
# Standardize your dataset so that columns that contain larger values do not influence the outcome more than columns with smaller values.
from sklearn.preprocessing import StandardScaler

# standardize our data using scaler
scaler = StandardScaler()
myo_scaled = scaler.fit_transform(myopia)

In [None]:
# Dimensionality reduction with PCA
# A float n_components asks PCA to keep enough components to explain ~90% of the variance
pca = PCA(n_components=0.90)
myo_pca = pca.fit_transform(X=myo_scaled)

# Wrap the PCA output in a DataFrame (columns are the integer component indices)
df_myo_pca = pd.DataFrame(data=myo_pca)

# Total explained-variance ratio of the retained components
pca.explained_variance_ratio_.sum()

In [None]:
# Run t-SNE on the PCA output.
# random_state pins the stochastic embedding so a fresh Restart & Run All reproduces the plot.
# The raw PCA array (myo_pca) is used rather than df_myo_pca: it holds the same values, but
# df_myo_pca gets a 'predicted class' column appended by the K-means cells, which would
# otherwise leak into the embedding if this cell were re-run afterwards.
tsne = TSNE(learning_rate=35, random_state=0)
tsne_features = tsne.fit_transform(myo_pca)

In [None]:
# Inspect the dimensions of the t-SNE output array;
# the plotting cell indexes columns 0 and 1, so at least two columns are expected here
tsne_features.shape

In [None]:
# Plot the first two t-SNE columns, coloring each point by its MYOPIC label
plt.scatter(tsne_features[:, 0], tsne_features[:, 1], c=labels)
# Title and axis labels let the figure stand alone when the notebook is skimmed
plt.title('t-SNE embedding colored by MYOPIC label')
plt.xlabel('t-SNE dimension 1')
plt.ylabel('t-SNE dimension 2')
plt.show()

In [None]:
There appear to be distinct clusters in the t-SNE plot.

# K-means cluster analysis: collect the inertia for every candidate cluster count
k = list(range(1, 11))

# Fit one model per candidate k and keep only its inertia for the elbow curve
inertia = [
    KMeans(n_clusters=n, random_state=0).fit(df_myo_pca).inertia_
    for n in k
]
elbow = dict(k=k, inertia=inertia)
elbow_df = pd.DataFrame(elbow)

# Draw the elbow curve (inertia versus number of clusters)
fig, ax = plt.subplots()
ax.plot(elbow_df['k'], elbow_df['inertia'])
ax.set_xticks(k)
ax.set_xlabel('Number of Clusters')
ax.set_ylabel('Inertia')
plt.show()

In [None]:
# Fit K-means with k=4, the cluster count read off the elbow plot above.
# The model is fit on the raw PCA array (myo_pca) instead of df_myo_pca: this cell appends a
# 'predicted class' column to df_myo_pca, so fitting on the frame would make a re-run
# cluster on its own previous labels (and mix int/str column names).
model = KMeans(n_clusters=4, random_state=0)
model.fit(myo_pca)
pred = model.predict(myo_pca)
# Store the cluster assignment next to the PCA components for plotting
df_myo_pca['predicted class'] = model.labels_
df_myo_pca.head()

In [None]:
# Scatter the first two PCA components, colored by the assignment stored in 'predicted class'
# The cluster count is read from the data so the title always matches what is plotted
cluster_count = df_myo_pca['predicted class'].nunique()
plt.scatter(df_myo_pca[0], df_myo_pca[1], c=df_myo_pca['predicted class'])
plt.title(f'K-means clusters (k={cluster_count}) on the first two principal components')
plt.xlabel('Principal component 1')
plt.ylabel('Principal component 2')
plt.show()

In [None]:
# Refit with k=3: the scatter looks better clustered with three groups, and the elbow
# could be said to be at 3 as well.
# Fit on the raw PCA array (myo_pca), not df_myo_pca: by this point the frame carries the
# 'predicted class' column written by the k=4 cell, and fitting on it would feed those
# earlier assignments back in as a feature (and mix int/str column names).
model = KMeans(n_clusters=3, random_state=0)
model.fit(myo_pca)
pred = model.predict(myo_pca)
# Overwrite the stored assignment with the k=3 labels and plot the first two components
df_myo_pca['predicted class'] = model.labels_
plt.scatter(df_myo_pca[0], df_myo_pca[1], c=df_myo_pca['predicted class'])
plt.title('K-means clusters (k=3) on the first two principal components')
plt.xlabel('Principal component 1')
plt.ylabel('Principal component 2')
plt.show()

##### Yes, the patients can be clustered. Three clusters seem to be the best fit for the model.