In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Global seaborn look for every figure in this notebook.
sns.set_theme(style="darkgrid", context="notebook")


# Introduction to Unsupervised Learning

## The KMeans Algorithm

In [None]:
from sklearn.datasets import make_blobs

# Synthetic 2-D sample: 300 points around 4 centres, seeded so it is repeatable.
X, y_true = make_blobs(n_samples=300, centers=4, cluster_std=0.6, random_state=0)

# Wrap the feature matrix in a DataFrame with named columns for plotting.
tb_blob = pd.DataFrame(X, columns=["X1", "X2"])
sns.scatterplot(x="X1", y="X2", data=tb_blob)


### Creating Initial Positions

In [None]:
# Lower (25%) and upper (75%) quartile of each coordinate.
x1_low, x1_high = np.quantile(tb_blob["X1"], [0.25, 0.75])
x2_low, x2_high = np.quantile(tb_blob["X2"], [0.25, 0.75])

# The four starting centroids are the corners of the quartile rectangle.
p1 = (x1_low, x2_low)
p2 = (x1_low, x2_high)
p3 = (x1_high, x2_low)
p4 = (x1_high, x2_high)

tb_centroids = pd.DataFrame.from_dict(
    {"p1": p1, "p2": p2, "p3": p3, "p4": p4},
    orient="index",
    columns=["X1", "X2"],
)
tb_centroids


In [None]:
# Data points first, then the starting centroids on the same axes as large
# black markers.
blob_axes = sns.scatterplot(x="X1", y="X2", data=tb_blob)
sns.scatterplot(
    x="X1", y="X2", data=tb_centroids, color="black", s=100, ax=blob_axes
)


### Updating Positions - Step 1

#### I - Calculate Cluster Assignments

In [None]:
# Assignment step: for every point, compute the squared Euclidean distance to
# each centroid (one column per centroid, named after the centroid's index
# label) and pick the closest one.
centroid_names = list(tb_centroids.index)
for centroid_name, centroid in tb_centroids.iterrows():
    tb_blob[centroid_name] = (tb_blob["X1"] - centroid["X1"]) ** 2 + (
        tb_blob["X2"] - centroid["X2"]
    ) ** 2

# Compare only the distance columns that belong to the current centroids,
# instead of a hard-coded list: this keeps the cell correct if the number or
# the names of the centroids change.
tb_blob["cluster"] = tb_blob[centroid_names].idxmin(axis=1)
tb_blob.head()


In [None]:
# Points coloured by their assigned cluster, with the centroids used for that
# assignment drawn on top.
cluster_axes = sns.scatterplot(data=tb_blob, x="X1", y="X2", hue="cluster")
sns.scatterplot(
    data=tb_centroids, x="X1", y="X2", color="black", s=100, ax=cluster_axes
)


#### II - Recalculate Centroids

In [None]:
# Update step: each centroid moves to the mean position of the points that
# were assigned to it.
points_by_cluster = tb_blob.groupby("cluster")
tb_centroids = points_by_cluster[["X1", "X2"]].mean()
tb_centroids


In [None]:
# Same cluster colouring as before, now with the recalculated centroids.
xy_columns = {"x": "X1", "y": "X2"}
moved_axes = sns.scatterplot(data=tb_blob, hue="cluster", **xy_columns)
sns.scatterplot(data=tb_centroids, color="black", s=100, ax=moved_axes, **xy_columns)


### Updating Positions - Step 2

#### I - Calculate Cluster Assignments

In [None]:
# Assignment step, second pass: recompute the squared Euclidean distance from
# every point to each *current* centroid and reassign the points.
centroid_names = list(tb_centroids.index)
for centroid_name, centroid in tb_centroids.iterrows():
    tb_blob[centroid_name] = (tb_blob["X1"] - centroid["X1"]) ** 2 + (
        tb_blob["X2"] - centroid["X2"]
    ) ** 2

# tb_centroids now comes from a groupby, so a cluster that received no points
# has no row here. Restricting idxmin to the columns just refreshed prevents a
# stale distance column from an earlier pass from winning the comparison.
tb_blob["cluster"] = tb_blob[centroid_names].idxmin(axis=1)
tb_blob.head()


In [None]:
# Updated assignments, with the centroids they were measured against.
centroid_marker = {"color": "black", "s": 100}
reassigned_axes = sns.scatterplot(x="X1", y="X2", hue="cluster", data=tb_blob)
sns.scatterplot(
    x="X1", y="X2", data=tb_centroids, ax=reassigned_axes, **centroid_marker
)


#### II - Recalculate Centroids

In [None]:
# Update step, second pass: move each centroid to the mean of its points,
# then redraw points and centroids together.
tb_centroids = tb_blob[["cluster", "X1", "X2"]].groupby("cluster").mean()

step2_axes = sns.scatterplot(data=tb_blob, x="X1", y="X2", hue="cluster")
sns.scatterplot(
    data=tb_centroids, x="X1", y="X2", color="black", s=100, ax=step2_axes
)


# Using scikit-learn's KMeans

In [None]:
from sklearn.cluster import KMeans


In [None]:
from sklearn.datasets import make_blobs

# Regenerate the same seeded sample. Rebinding tb_blob discards the distance
# and cluster columns that the manual walk-through added to it.
X, y_true = make_blobs(n_samples=300, centers=4, cluster_std=0.6, random_state=0)

tb_blob = pd.DataFrame({"X1": X[:, 0], "X2": X[:, 1]})
sns.scatterplot(x="X1", y="X2", data=tb_blob)


In [None]:
# Seed the centroid initialisation so the fitted centres and the numbering of
# the cluster labels are identical on every run. n_init is given explicitly
# because its default is not the same across scikit-learn releases.
km_fit = KMeans(n_clusters=4, n_init=10, random_state=0)
km_fit.fit(tb_blob[["X1", "X2"]])


In [None]:
# Inspect the centroid coordinates stored on the fitted model.
km_fit.cluster_centers_


In [None]:
# Store, for every point, the label of the cluster the fitted model puts it in.
blob_xy = tb_blob[["X1", "X2"]]
tb_blob["cluster_n4"] = km_fit.predict(blob_xy)


In [None]:
# Points coloured by the label KMeans assigned to them.
sns.scatterplot(
    x="X1",
    y="X2",
    hue="cluster_n4",
    palette="Spectral",
    data=tb_blob,
)


# Analyzing Real-World Data

## 2-D Clustering using USA Census Data

In [None]:
# Load the census-tract table (first CSV column becomes the row index) and
# drop every row that has at least one missing value.
tb_census = pd.read_csv(
    "data/acs2017_census_tract_data.csv", index_col=0
).dropna()


In [None]:
# Joint density of the "White" and "Black" columns across tracts.
sns.kdeplot(
    x="White",
    y="Black",
    data=tb_census,
)


In [None]:
# Cluster tracts on the two race-share columns into 3 groups.
# random_state pins the initialisation so the partition and the label
# numbering (and therefore the colours in the next plot) are reproducible;
# n_init is explicit because its default differs between scikit-learn releases.
racial_features = tb_census[["White", "Black"]]
km_fit_census = KMeans(n_clusters=3, n_init=10, random_state=0)
km_fit_census.fit(racial_features)
tb_census["racial_cluster"] = km_fit_census.predict(racial_features)


In [None]:
# One filled density per cluster; common_norm=False normalises each cluster's
# density on its own instead of across all clusters together.
density_style = {"fill": True, "common_norm": False, "palette": "Spectral"}
sns.kdeplot(data=tb_census, x="White", y="Black", hue="racial_cluster", **density_style)


In [None]:
# Same two-column clustering, now with 4 groups; overwrites "racial_cluster".
# random_state pins the initialisation so the partition and the label
# numbering are reproducible; n_init is explicit because its default differs
# between scikit-learn releases.
racial_features = tb_census[["White", "Black"]]
km_fit_census = KMeans(n_clusters=4, n_init=10, random_state=0)
km_fit_census.fit(racial_features)
tb_census["racial_cluster"] = km_fit_census.predict(racial_features)


In [None]:
# Filled per-cluster densities for the 4-cluster solution, each normalised
# independently of the others.
sns.kdeplot(
    x="White",
    y="Black",
    hue="racial_cluster",
    data=tb_census,
    palette="Spectral",
    common_norm=False,
    fill=True,
)


## Clustering on n-Dimensions

In [None]:
# Feature columns for the n-dimensional clustering. The sub-lists only group
# the names by apparent theme for readability; concatenation keeps the
# original column order.
# NOTE(review): "IncomePerCapErr" is used here while "IncomePerCap" is what the
# later boxplots analyse — confirm the error column is the intended feature.
income_vars = ["IncomePerCapErr", "Poverty"]
occupation_vars = ["Professional", "Service", "Office", "Construction", "Production"]
commute_vars = ["OtherTransp", "WorkAtHome", "MeanCommute"]
employment_vars = ["Employed", "PrivateWork", "PublicWork", "SelfEmployed"]

cluster_vars = income_vars + occupation_vars + commute_vars + employment_vars


In [None]:
# Standardise every clustering feature so that no single column dominates the
# Euclidean distances KMeans relies on.
census_scaler = StandardScaler()
census_scaler.fit(tb_census[cluster_vars])

# Keep tb_census's row index on the scaled frame. Without it the frame gets a
# fresh 0..n-1 index that no longer matches tb_census (whose index comes from
# the CSV and has gaps after dropna), so any index-aligned operation between
# the two would silently misalign rows.
tb_sca_census = pd.DataFrame(
    census_scaler.transform(tb_census[cluster_vars]),
    columns=cluster_vars,
    index=tb_census.index,
)
tb_sca_census.head()


In [None]:
# Project the scaled features onto their first 4 principal components.
# random_state makes the projection reproducible when scikit-learn selects a
# randomized SVD solver; it has no effect with the exact solvers.
census_pca = PCA(n_components=4, random_state=0)
census_pca.fit(tb_sca_census)

# Components are named PC0..PC3 (zero-based) and the rows keep the index of
# the frame they were computed from, so both frames stay row-aligned.
tb_pca_census = pd.DataFrame(
    census_pca.transform(tb_sca_census),
    columns=[f"PC{i}" for i in range(census_pca.n_components_)],
    index=tb_sca_census.index,
)


In [None]:
# Plot a random subset to keep the KDE pair plot fast. The sample is seeded so
# the figure is the same on every run, and capped at the table size so the
# cell does not fail on a table with fewer than 1000 rows.
pca_sample = tb_pca_census.sample(n=min(1000, len(tb_pca_census)), random_state=0)
sns.pairplot(data=pca_sample, kind="kde")


In [None]:
# Cluster tracts on all standardised features into 4 groups.
# random_state pins the initialisation so the partition and label numbering are
# reproducible; n_init is explicit because its default differs between
# scikit-learn releases.
scaled_features = tb_sca_census[cluster_vars]
km_fit_census = KMeans(n_clusters=4, n_init=10, random_state=0)
km_fit_census.fit(scaled_features)

# Predict once and reuse the labels. They are a plain array, so they are
# written by row position into both frames (rows are in the same order).
work_cluster_labels = km_fit_census.predict(scaled_features)
tb_census["work_cluster"] = work_cluster_labels
tb_pca_census["work_cluster"] = work_cluster_labels


In [None]:
# Same seeded, size-capped subset approach as for the unlabelled pair plot,
# now coloured by the work cluster of each tract.
pca_cluster_sample = tb_pca_census.sample(
    n=min(1000, len(tb_pca_census)), random_state=0
)
sns.pairplot(data=pca_cluster_sample, hue="work_cluster", kind="kde")


### Analyzing Cluster Results

In [None]:
# List every column currently in tb_census, including the cluster-label
# columns added by the cells above.
tb_census.columns

In [None]:
# Distribution of four tract-level variables within each work cluster,
# one panel per variable.
panel_vars = ["IncomePerCap", "Poverty", "Unemployment", "WorkAtHome"]

fig, ax = plt.subplots(2, 2, figsize=(10, 10))
# ax.flat walks the 2x2 grid row by row, which matches the order of panel_vars.
for panel, var in zip(ax.flat, panel_vars):
    sns.boxplot(data=tb_census, x="work_cluster", y=var, ax=panel)
# Stop the y-axis labels of the right-hand panels overlapping the left ones.
fig.tight_layout()

In [None]:
# Distribution of four race-share columns within each work cluster,
# one panel per column.
panel_vars = ["White", "Black", "Hispanic", "Asian"]

fig, ax = plt.subplots(2, 2, figsize=(10, 10))
# ax.flat walks the 2x2 grid row by row, which matches the order of panel_vars.
for panel, var in zip(ax.flat, panel_vars):
    sns.boxplot(data=tb_census, x="work_cluster", y=var, ax=panel)
# Stop the y-axis labels of the right-hand panels overlapping the left ones.
fig.tight_layout()