In [None]:
# Import dependencies
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans

### Part 1: Prepare the Data

In [None]:
# Load the myopia dataset (path is relative to the notebook's working directory) and preview it
MYOPIA_CSV = "myopia.csv"
myopia_df = pd.read_csv(MYOPIA_CSV)
myopia_df

In [None]:
# Keep the patients' MYOPIC results in their own variable; the column is dropped from the DataFrame afterwards
myopic_column = myopia_df.loc[:, "MYOPIC"]

In [None]:
# Remove the "MYOPIC" column from the dataset so only the feature columns remain.
# Reassigning (rather than inplace=True) together with errors="ignore" keeps this cell
# safe to re-run: a second execution is a no-op instead of raising KeyError.
myopia_df = myopia_df.drop(columns="MYOPIC", errors="ignore")
myopia_df.head()

In [None]:
# Standardize the dataset with StandardScaler so that columns that contain larger values
# do not influence the outcome more than columns with smaller values
scaler = StandardScaler()
myopia_scaled = scaler.fit_transform(myopia_df)

### Part 2: Apply Dimensionality Reduction

In [None]:
"""Perform dimensionality reduction with PCA"""

# Initialize PCA model; a fractional n_components asks scikit-learn to keep as many
# components as are needed to explain that share (here 90%) of the variance
pca = PCA(n_components=0.9)

# Fit PCA on the scaled myopia features and project them onto the retained components
myopia_pca = pca.fit_transform(myopia_scaled)
# print(myopia_pca[0:618])
# Display the shape of the transformed array to see how many components were kept
myopia_pca.shape

<p>

- After performing Principal Component Analysis (keeping 90% of the explained variance), the number of features dropped from 14 columns down to 10 principal components.

</p>

In [None]:
# Transform PCA data to a DataFrame.
# PCA(n_components=0.9) chooses the number of components from the data, so the column
# labels are generated from the array's width instead of hard-coding ten names
# (a different component count would make the DataFrame constructor raise).
pca_column_names = [
    f"principal component {number}"
    for number in range(1, myopia_pca.shape[1] + 1)
]
df_myopia_pca = pd.DataFrame(data=myopia_pca, columns=pca_column_names)
df_myopia_pca.head()

In [None]:
"""Further reduce the dataset dimensions with t-SNE and visually inspect the results"""

# Initialize t-SNE model.
# t-SNE is stochastic; fixing random_state makes the embedding (and the plots built from it)
# reproducible across kernel restarts, consistent with the seeded KMeans models in this notebook.
tsne = TSNE(learning_rate=35, random_state=0)

In [None]:
# Further reduce dimensions and inspect the results.
# The clustering cell adds a "class" column of cluster labels to df_myopia_pca; exclude it
# here so re-running this cell after clustering still embeds only the principal components.
tsne_input = df_myopia_pca.drop(columns="class", errors="ignore")
tsne_features = tsne.fit_transform(tsne_input)

tsne_features.shape

In [None]:
# Plot the two t-SNE output columns against each other to look for visible clusters
tsne_x = tsne_features[:, 0]  # first column of transformed features
tsne_y = tsne_features[:, 1]  # second column of transformed features

plt.scatter(tsne_x, tsne_y)
plt.show()

In [None]:
# Same t-SNE scatter, with each point colored by its stored MYOPIC value
tsne_x, tsne_y = tsne_features[:, 0], tsne_features[:, 1]
plt.scatter(tsne_x, tsne_y, c=myopic_column)
plt.show()

<p>

- After reducing the dimensions down to 10 using PCA and using t-SNE to give each data point a location in a two-dimensional map, there do not appear to be any distinct clusters in our data.

</p>

### Part 3: Perform a Cluster Analysis with K-means

In [None]:
"""Create an elbow plot to identify the best number of clusters"""

# Candidate cluster counts to evaluate
k = list(range(1, 11))

# The clustering cell adds a "class" column of cluster labels to df_myopia_pca; leave it out
# so re-running this cell after clustering still measures inertia on the principal components only.
elbow_features = df_myopia_pca.drop(columns="class", errors="ignore")

# Looking for the best k: fit one seeded K-Means model per candidate and record its inertia
inertia = []
for n_clusters in k:
    km = KMeans(n_clusters=n_clusters, random_state=0)
    km.fit(elbow_features)
    inertia.append(km.inertia_)

# Define a DataFrame holding the results and plot the Elbow Curve with matplotlib
elbow_data = {"k": k, "inertia": inertia}
df_elbow = pd.DataFrame(elbow_data)

plt.plot(df_elbow['k'], df_elbow['inertia'])
plt.xticks(k)  # one tick per evaluated k
plt.xlabel('Number of clusters')
plt.ylabel('Inertia')
plt.show()

In [None]:
# Tabulate inertia per k so the elbow can also be read off numerically (first five rows shown)
elbow_data = dict(k=k, inertia=inertia)
df_elbow = pd.DataFrame(data=elbow_data)
df_elbow.head()

<p>It appears that the elbow of the plot is at k=3</p>

In [None]:
"""Predicting clusters with k=3"""

# Initialize the K-Means model
model = KMeans(n_clusters=3, random_state=0)

# Cluster on the principal-component columns only. This cell writes its labels back into
# df_myopia_pca as "class"; excluding that column keeps a re-run from fitting on the
# previous run's labels.
pca_features = df_myopia_pca.drop(columns="class", errors="ignore")

# Train the model and obtain each row's cluster label in a single call
predictions = model.fit_predict(pca_features)

# Attach the predicted cluster to every row for inspection and plotting
df_myopia_pca["class"] = predictions

df_myopia_pca.head()

In [None]:
# Scatter the first two principal components, colored by the K-Means cluster label
first_pc = df_myopia_pca['principal component 1']
second_pc = df_myopia_pca['principal component 2']
cluster_labels = df_myopia_pca['class']

plt.scatter(first_pc, second_pc, c=cluster_labels)
plt.title('Myopia Clusters')
plt.xlabel('Principal component 1')
plt.ylabel('Principal component 2')
plt.show()

In [None]:
# Plot the clusters for each Principal Component visualized against the other Principal Components
# for i in range(1, 11):
#     for j in range(i+1, 11):
#         if (i != j):
#             plt.scatter(df_myopia_pca[f'principal component {i}'], df_myopia_pca[f'principal component {j}'], c=df_myopia_pca['class'])
#             plt.xlabel(f'Principal component {i}')
#             plt.ylabel(f'Principal component {j}')
#             plt.title('Myopia Clusters')
#             plt.show()

### Part 4: Make a Recommendation

<p>

- Using Principal Component Analysis and K-Means, we can see that there appear to be 3 distinct clusters. These clusters represent certain characteristics that are better at determining whether or not a patient has Myopia. This information could help doctors determine whether their patients have, or are at risk of developing, Myopia (nearsightedness).

</p>