In [67]:
import pandas as pd
import numpy as np

from typing import Tuple, List, Dict, Any, Optional

In [182]:
data = pd.read_csv('data/nursery.csv', skiprows=[1, 2], header=0)
data

Unnamed: 0,parents,has_nurs,form,children,housing,finance,social,health,class
0,usual,proper,complete,1,convenient,convenient,nonprob,recommended,recommend
1,usual,proper,complete,1,convenient,convenient,nonprob,priority,priority
2,usual,proper,complete,1,convenient,convenient,nonprob,not_recom,not_recom
3,usual,proper,complete,1,convenient,convenient,slightly_prob,recommended,recommend
4,usual,proper,complete,1,convenient,convenient,slightly_prob,priority,priority
...,...,...,...,...,...,...,...,...,...
12955,great_pret,very_crit,foster,more,critical,inconv,slightly_prob,priority,spec_prior
12956,great_pret,very_crit,foster,more,critical,inconv,slightly_prob,not_recom,not_recom
12957,great_pret,very_crit,foster,more,critical,inconv,problematic,recommended,spec_prior
12958,great_pret,very_crit,foster,more,critical,inconv,problematic,priority,spec_prior


In [183]:
data['class'].value_counts()

class
not_recom     4320
priority      4266
spec_prior    4044
very_recom     328
recommend        2
Name: count, dtype: int64

In [115]:
def select_initial_centroids(D: pd.DataFrame, k: int, random: bool = False):
    if random:
        centroid_list = D.sample(n=k, replace=False).values.tolist()
        return {i: centroid_list[i] for i in range(k)}

    else:
        centroid_list = {0: D.mean().tolist()}
        for i in range(k):
            c = D.apply(lambda x: dist(x, centroid_list).sum(), axis=1).idxmax()
            centroid_list[i] = D.iloc[c].tolist()
            D = D.drop(c, axis=0)

        return centroid_list

In [116]:
print(select_initial_centroids(dummify(data), 10))

{0: [54.0, 130.0, 603.0, 1.0, 125.0, 1.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0], 1: [52.0, 122.0, 0.0, 0.0, 110.0, 2.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0], 2: [50.0, 160.0, 0.0, 1.0, 110.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0], 3: [74.0, 120.0, 269.0, 0.0, 121.0, 0.2, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0], 4: [32.0, 118.0, 529.0, 0.0, 130.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0], 5: [60.0, 125.0, 0.0, 1.0, 110.0, 0.1, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0], 6: [53.0, 145.0, 518.0, 0.0, 130.0, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0], 7: [72.0, 160.0, 0.0, 0.0, 114.0, 1.6, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0], 8: [62.0, 140.0, 271.0, 0.0, 152.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 

In [113]:
def dist(x: pd.Series, m: Dict[int, List[Any]]) -> pd.Series:
    x_cat = [cat for cat in x if not np.isreal(cat)]
    x_num = [float(num) for num in x if np.isreal(num)]

    n_cat, prop_cat = len(x_cat), len(x_cat) / len(x)
    prop_num = len(x_num) / len(x)

    #m_cat = {i: [cat for cat in m[i] if not np.isreal(cat)] for i in m.keys()}
    m_num = {i: [num for num in m[i] if np.isreal(num)] for i in m.keys()}

    #dists_cat = {i: (pd.Series(x_cat) == pd.Series(m_cat[i])).sum() / n_cat for i in m_cat.keys()}
    dists_num = {i: np.sqrt(((pd.Series(x_num) - pd.Series(m_num[i]))**2).sum()) for i in m_num.keys()}

    #dists = prop_cat * pd.Series(dists_cat) + prop_num * pd.Series(dists_num)

    #return dists
    return pd.Series(dists_num)

In [114]:
def dist(x: pd.Series, m: Dict[int, List[Any]]) -> pd.Series:
    x_num = x.tolist()
    dists = {i: np.sqrt(((pd.Series(x_num) - pd.Series(m[i]))**2).sum()) for i in m.keys()}
    return pd.Series(dists)

In [72]:
dist(data.iloc[1], select_initial_centroids(data, 3))

0     18.890213
1    116.684357
2     67.914379
dtype: float64

In [21]:
np.isreal(pd.DataFrame({1: ['1', '2', '3', '4']}).iloc[0, 0])

False

In [74]:
def dummify(df):
    df_dummy = df.copy()
    cat_cols = [col for col in df.columns if pd.api.types.is_object_dtype(df[col])]

    for col in cat_cols:
        cats = set(df[col])
        for cat in cats:
            df_dummy[col + '_' + str(cat)] = (df[col] == cat) * 1
        df_dummy.drop(col, axis=1, inplace=True)

    return df_dummy

In [117]:
def standardize(df):
    df_std = df.copy()
    num_cols = [col for col in df.columns if pd.api.types.is_numeric_dtype(df[col])]

    for col in num_cols:
        df_std[col] = (df_std[col] - df_std[col].mean()) / df_std[col].std()

    return df_std

In [135]:
dummify(data)

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease,Sex_F,Sex_M,ChestPainType_NAP,...,ChestPainType_ATA,ChestPainType_ASY,RestingECG_LVH,RestingECG_Normal,RestingECG_ST,ExerciseAngina_N,ExerciseAngina_Y,ST_Slope_Down,ST_Slope_Flat,ST_Slope_Up
0,40,140,289,0,172,0.0,0,0,1,0,...,1,0,0,1,0,1,0,0,0,1
1,49,160,180,0,156,1.0,1,1,0,1,...,0,0,0,1,0,1,0,0,1,0
2,37,130,283,0,98,0.0,0,0,1,0,...,1,0,0,0,1,1,0,0,0,1
3,48,138,214,0,108,1.5,1,1,0,0,...,0,1,0,1,0,0,1,0,1,0
4,54,150,195,0,122,0.0,0,0,1,1,...,0,0,0,1,0,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
913,45,110,264,0,132,1.2,1,0,1,0,...,0,0,0,1,0,1,0,0,1,0
914,68,144,193,1,141,3.4,1,0,1,0,...,0,1,0,1,0,1,0,0,1,0
915,57,130,131,0,115,1.2,1,0,1,0,...,0,1,0,1,0,0,1,0,1,0
916,57,130,236,0,174,0.0,1,1,0,0,...,1,0,1,0,0,1,0,0,1,0


In [155]:
def is_stopping_condition(m, m_new, cl, cl_new) -> bool:
    if cl is None:
        return False

    stop1 = all([set(cl[key]) == set(cl_new[key]) for key in cl.keys()])
    stop2 = all([np.allclose(m[key], m_new[key]) for key in m.keys()])

    return stop1 or stop2

In [176]:
def knn(D: pd.DataFrame, k: int, std: bool = False, classvar: Optional[str] = None):
    D_orig = D.iloc[:,:]
    if classvar is not None:
        D = D.drop(classvar, axis=1)
    if std:
        D = standardize(D)
    D = dummify(D)
    m = select_initial_centroids(D, k)
    s, num, cl = {}, {}, None
    cl_new, m_new = {}, {}

    stop = False
    while not stop:
        for j in range(k):
            s[j] = pd.Series([0] * len(D.columns))
            s[j].index = D.columns
            num[j] = 0
            cl_new[j] = []
        for i in range(len(D.index)):
            x = D.iloc[i]
            cluster = dist(x, m).idxmin()
            cl_new[cluster].append(i)
            #TODO: Ask how the fuck this is supposed to work with categorical (below)
            s[cluster] = s[cluster] + x
            num[cluster] += 1
        for j in range(k):
            m_new[j] = (s[j] / num[j]).tolist()
        stop = is_stopping_condition(m, m_new, cl, cl_new)
        cl, m = cl_new.copy(), m_new.copy()
    final_cl = []
    for key in cl.keys():
        cl_i = pd.Series([key]*len(cl[key]))
        cl_i.index = cl[key]
        final_cl.append(cl_i)
    final_cl = pd.concat(final_cl).sort_index()
    D_orig['cluster'] = final_cl
    return D_orig

In [184]:
results = knn(data, 5, std=True, classvar='class')
results

Unnamed: 0,parents,has_nurs,form,children,housing,finance,social,health,class,cluster
0,usual,proper,complete,1,convenient,convenient,nonprob,recommended,recommend,0
1,usual,proper,complete,1,convenient,convenient,nonprob,priority,priority,0
2,usual,proper,complete,1,convenient,convenient,nonprob,not_recom,not_recom,0
3,usual,proper,complete,1,convenient,convenient,slightly_prob,recommended,recommend,0
4,usual,proper,complete,1,convenient,convenient,slightly_prob,priority,priority,0
...,...,...,...,...,...,...,...,...,...,...
12955,great_pret,very_crit,foster,more,critical,inconv,slightly_prob,priority,spec_prior,2
12956,great_pret,very_crit,foster,more,critical,inconv,slightly_prob,not_recom,not_recom,2
12957,great_pret,very_crit,foster,more,critical,inconv,problematic,recommended,spec_prior,2
12958,great_pret,very_crit,foster,more,critical,inconv,problematic,priority,spec_prior,2


In [181]:
pd.crosstab(results['HeartDisease'], results['cluster'])

cluster,0,1
HeartDisease,Unnamed: 1_level_1,Unnamed: 2_level_1
0,338,72
1,100,408
