In [1]:
import numpy as np
import pandas as pd
import scipy
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import normalize
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA

data_path = ("http://archive.ics.uci.edu/ml/machine-learning-databases/"
             "heart-disease/processed.cleveland.data")
df = pd.read_csv(data_path)

# Make sure the number of rows divides evenly into four samples.
rows = df.shape[0] - df.shape[0] % 4
df = df.iloc[:rows, :]

# Break into a set of features and a variable for the known outcome.
X = df.iloc[:, :13]
y = df.iloc[:, 13]

# Replace some random string values.
X = X.replace(to_replace='?', value=0)

# Binarize y so that 1 means heart disease diagnosis and 0 means no diagnosis.
y = np.where(y > 0, 0, 1)

# Normalize
X_norm = normalize(X)

# Data frame to store features and predicted cluster memberships.
ypred = pd.DataFrame()

# Create the two-feature PCA for graphing purposes.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_norm)

# Split the data into four equally-sized samples. First we break it in half:
X_half1, X_half2, X_pcahalf1, X_pcahalf2 = train_test_split(
    X_norm,
    X_pca,
    test_size=0.5,
    random_state=42)

# Then we halve the halves.
X1, X2, X_pca1, X_pca2 = train_test_split(
    X_half1,
    X_pcahalf1,
    test_size=0.5,
    random_state=42)
X3, X4, X_pca3, X_pca4 = train_test_split(
    X_half2,
    X_pcahalf2,
    test_size=0.5,
    random_state=42)

# Pass a list of tuples and a counter that increments each time we go
# through the loop. The tuples are the data to be used by k-means,
# and the PCA-derived features for graphing. We use k-means to fit a
# model to the data, then store the predicted values and the two-feature
# PCA solution in the data frame.
for counter, data in enumerate([
    (X1, X_pca1),
    (X2, X_pca2),
    (X3, X_pca3),
    (X4, X_pca4)]):
    
    # Put the features into ypred.
    ypred['pca_f1' + '_sample' + str(counter)] = data[1][:, 0]
    ypred['pca_f2' + '_sample' + str(counter)] = data[1][:, 1]
    
    # Generate cluster predictions and store them for clusters 2 to 4.
    for nclust in range(2, 5):
        pred = KMeans(n_clusters=nclust, random_state=42).fit_predict(data[0])
        ypred['clust' + str(nclust) + '_sample' + str(counter)] = pred

In [8]:
ypred

Unnamed: 0,pca_f1_sample0,pca_f2_sample0,clust2_sample0,clust3_sample0,clust4_sample0,pca_f1_sample1,pca_f2_sample1,clust2_sample1,clust3_sample1,clust4_sample1,pca_f1_sample2,pca_f2_sample2,clust2_sample2,clust3_sample2,clust4_sample2,pca_f1_sample3,pca_f2_sample3,clust2_sample3,clust3_sample3,clust4_sample3
0,-0.026304,-0.077226,0,0,0,-0.144763,0.076085,1,1,3,0.037316,-0.064633,0,0,3,-0.027312,0.068027,0,1,3
1,0.035756,0.106037,1,1,3,-0.044465,0.104562,1,2,3,0.020933,0.032752,0,0,0,-0.133810,-0.101523,1,2,2
2,-0.109067,-0.010568,0,0,1,0.072059,-0.015653,0,0,1,-0.054679,-0.003054,1,1,1,0.010307,-0.054540,0,0,0
3,-0.156288,-0.021238,0,0,1,-0.027281,-0.021637,1,2,2,0.004031,0.005068,0,0,0,0.071970,0.021477,0,0,1
4,-0.105195,-0.058385,0,0,1,-0.032298,-0.008465,1,2,2,0.141764,-0.113243,0,0,3,-0.006085,0.098228,0,1,3
5,0.125744,-0.068847,1,2,2,-0.130621,0.117722,1,1,3,0.061804,-0.065955,0,0,3,0.116514,-0.011595,0,0,1
6,-0.023835,-0.104382,0,0,0,-0.168632,0.004150,1,1,0,0.159749,0.063437,0,2,2,0.040499,0.092517,0,1,3
7,-0.017729,-0.043651,0,0,0,0.125544,-0.026501,0,0,1,-0.028240,-0.017344,1,0,0,-0.014216,-0.068456,1,2,0
8,-0.023028,0.130651,0,1,3,-0.063055,-0.065452,1,2,2,0.019842,-0.070600,0,0,3,0.160142,-0.055765,0,0,1
9,-0.062988,-0.071537,0,0,0,0.146824,0.007973,0,0,1,0.084728,-0.037948,0,0,3,0.031248,-0.020142,0,0,0


In [9]:
X_pca1

array([[-0.02630367, -0.07722632],
       [ 0.0357564 ,  0.10603717],
       [-0.10906747, -0.01056768],
       [-0.1562876 , -0.02123791],
       [-0.10519491, -0.05838464],
       [ 0.12574438, -0.06884673],
       [-0.02383501, -0.10438178],
       [-0.01772935, -0.04365055],
       [-0.02302826,  0.13065094],
       [-0.06298767, -0.0715373 ],
       [ 0.17499704, -0.07528085],
       [-0.02989909, -0.01327267],
       [ 0.15061855,  0.04868408],
       [-0.1406802 ,  0.02022343],
       [-0.01427172, -0.03081585],
       [-0.14072786,  0.06703007],
       [-0.14391406,  0.03915615],
       [-0.06177068, -0.01011778],
       [-0.025363  , -0.00193311],
       [ 0.00902662,  0.07985766],
       [-0.05148551,  0.04293145],
       [-0.07643546,  0.090986  ],
       [ 0.06364956,  0.05409371],
       [-0.00197991,  0.13004203],
       [ 0.06715538,  0.03744643],
       [-0.07187599, -0.03528222],
       [ 0.10847561, -0.06877486],
       [ 0.10820223, -0.08225369],
       [ 0.00909905,