# Clustering

Using clustering, we may be able to find clusters of patients in which Alzheimer's disease is more prevalent than in others. Then, based on the differences between those clusters, we might be able to identify a pattern.

In [None]:
import data_reader
import numpy as np
from scipy.stats import mode

# Load the dataset. get_data_dict presumably returns a mapping from column name
# to that column's values (it is indexed by column name below) - TODO confirm.
data = data_reader.get_data_dict('./data/alzheimers_disease_data.csv')
columns = list(data.keys())

# Create a matrix for numerical and for categorical data
num_cols = ['BMI', 'SystolicBP', 'DiastolicBP', 'CholesterolTotal', 'CholesterolLDL',
    'CholesterolHDL', 'CholesterolTriglycerides', 'MMSE', 'FunctionalAssessment',
    'AlcoholConsumption', 'PhysicalActivity', 'DietQuality', 'SleepQuality',
    'ADL']
cat_cols = ['FamilyHistoryAlzheimers', 'CardiovascularDisease', 'Diabetes',
      'Depression', 'Hypertension', 'MemoryComplaints', 'BehavioralProblems',
      'Confusion', 'Disorientation', 'PersonalityChanges', 'DifficultyCompletingTasks',
      'Forgetfulness', 'HeadInjury', 'Smoking', 'Ethnicity', 'Gender',
      'EducationLevel']

# Stack the selected columns and transpose, so that each row holds one
# datapoint and each column holds one feature.
num_matrix = np.vstack(tuple(data[col] for col in num_cols)).T
cat_matrix = np.vstack(tuple(data[col] for col in cat_cols)).T

# Set k to the desired amount of clusters
k = 5

# Select k datapoints at random by row index. Sampling WITHOUT replacement
# guarantees k distinct rows; drawing the same row twice would yield two
# identical prototypes and therefore an empty cluster. The generator is seeded
# so that re-running the notebook reproduces the same clustering.
SEED = 42
rng = np.random.default_rng(SEED)
prototypes = rng.choice(len(num_matrix), size=k, replace=False)
print(f'Selected initial prototypes: {prototypes}')

# From now on, we keep track of prototypes in the following way. Only the first
# prototypes will be actual datapoints, while the later prototypes will simply
# contain the means and modes of the data in a cluster.
prototypes = [(num_matrix[i], cat_matrix[i]) for i in prototypes]

# One (initially empty) list of member datapoints per cluster
clusters = [[] for _ in range(k)]


def dissimilarity_score(v1, v2):
    """Count the positions at which two categorical feature vectors differ.

    The vectors are compared element-wise with `!=` and the number of
    mismatching positions is returned (simple matching dissimilarity).
    """
    return np.sum(v1 != v2)


def euclidian_distance(v1, v2):
    """Return the Euclidean distance between two numerical feature vectors.

    This is the square root of the summed squared element-wise differences.
    """
    delta = v1 - v2
    return np.sqrt(np.sum(np.square(delta)))


def assign_cluster(prototypes, clusters, point):
    """Assign one datapoint to the cluster with the least dissimilar prototype.

    The dissimilarity between the datapoint and a prototype is the Euclidean
    distance between their numerical parts plus the number of mismatching
    entries between their categorical parts.

    Args:
        prototypes: sequence of (numerical_vector, categorical_vector) pairs,
            one per cluster.
        clusters: list of lists, one per prototype; modified in place.
        point: row index of the datapoint in the module-level `num_matrix`
            and `cat_matrix`.

    Returns:
        The same `clusters` list, with the datapoint's
        (numerical_vector, categorical_vector) pair appended to the winning
        cluster. Ties go to the prototype with the lowest index.
    """
    # The datapoint is the same for every prototype, so look it up only once.
    v_n = num_matrix[point]
    v_c = cat_matrix[point]

    dissimilarities = np.array([
        # Numerical parts are compared by distance, categorical parts by
        # mismatch count (the original code had these two swapped).
        euclidian_distance(pt_v_n, v_n) + dissimilarity_score(pt_v_c, v_c)
        for pt_v_n, pt_v_c in prototypes
    ])

    # argmin returns the first index of the minimum
    cluster = int(np.argmin(dissimilarities))
    clusters[cluster].append((v_n, v_c))

    return clusters


def calc_prototype(cluster):
    """Calculate a new prototype for a cluster.

    Args:
        cluster: non-empty sequence of (numerical_vector, categorical_vector)
            pairs, one per datapoint in the cluster.

    Returns:
        A (num_prototype, cat_prototype) tuple of 1-D arrays: the per-feature
        mean of the numerical vectors and the per-feature mode of the
        categorical vectors.

    Raises:
        ValueError: if `cluster` is empty, since no mean or mode exists.
    """
    if len(cluster) == 0:
        raise ValueError('cannot calculate a prototype for an empty cluster')

    # Stack the members row-wise so that axis 0 runs over the datapoints
    num_cluster = np.vstack([point[0] for point in cluster])
    cat_cluster = np.vstack([point[1] for point in cluster])

    num_prototype = np.mean(num_cluster, axis=0)
    # Depending on the SciPy version, mode() may keep the reduced axis and
    # return shape (1, n_features). Flatten it so the prototype always has the
    # same 1-D shape as a categorical datapoint.
    cat_prototype = np.ravel(mode(cat_cluster, axis=0).mode)

    return (num_prototype, cat_prototype)


# Repeat until the prototypes stop changing: assign every datapoint to the
# cluster whose prototype has the smallest dissimilarity, then recalculate each
# prototype from the members of its cluster.
while True:
    for i in range(len(num_matrix)):
        clusters = assign_cluster(prototypes, clusters, i)

    # Calculate new prototypes. A cluster that received no datapoints keeps its
    # previous prototype, because there is nothing to take a mean or mode of.
    new_prototypes = [
        calc_prototype(cluster) if cluster else prototypes[i]
        for i, cluster in enumerate(clusters)
    ]

    # Converged once both the numerical and the categorical part of every
    # prototype are unchanged.
    done = all(
        np.array_equal(old[0], new[0]) and np.array_equal(old[1], new[1])
        for old, new in zip(prototypes, new_prototypes)
    )
    if done:
        break

    prototypes = new_prototypes

    # Start the next round with empty clusters
    clusters = [[] for _ in range(k)]


print(f'---  {k} clusters found  ---')
for i, cluster in enumerate(clusters):
    print(f' - Cluster {i + 1} with size={len(cluster)}')



Selected initial prototypes: [1233 1862  564  962  485]
---  5 clusters found  ---
 - Cluster 1 with size=232
 - Cluster 2 with size=351
 - Cluster 3 with size=185
 - Cluster 4 with size=962
 - Cluster 5 with size=419
