In [1]:
import pandas_ml as pdml

import pandas as pd

In [2]:
df = pd.read_csv('datasets/raw_anonymized_data.csv')

In [3]:
mf = pdml.ModelFrame(df.to_dict())

mf.head()

Unnamed: 0,ID,cancer,diabetes,heart_disease,belly,ever_smoked,currently_smoke,smoke_often,smoke_rarely,never_smoked,...,DT_FIBER_INSOL,DT_FIBER_SOL,DT_PROT_ANIMAL,DT_PROT_VEGETABLE,DT_NITROGEN,PHYTIC_ACID,OXALIC_ACID,COUMESTROL,BIOCHANIN_A,FORMONONETIN
0,1003,Yes,No,No,Innie,Yes,Yes,Yes,No,No,...,7.38,1.25,75.46,16.0,14.89,365.7,318.11,0.0117,0.0658,0.00324
1,1053,No,Yes,Yes,Outie,Yes,Yes,No,Yes,No,...,9.11,3.37,59.41,18.25,12.51,434.98,112.66,0.0107,0.139,0.00743
2,1006,Yes,Yes,Yes,Innie,No,No,No,No,Yes,...,11.56,4.74,61.49,28.46,14.45,606.43,213.41,0.0965,0.0519,0.00946
3,1166,No,No,No,Innie,No,No,No,No,Yes,...,26.34,10.85,28.71,44.59,12.15,1570.07,334.08,0.283,0.089,0.0126
4,1134,Yes,No,No,Innie,No,No,No,No,Yes,...,16.48,4.8,32.41,28.23,9.8,616.99,422.55,0.163,0.0994,0.0207


In [4]:
mf.shape

(54, 1093)

In [5]:
mf = mf[['ID', 'cancer', 'diabetes', 
         'heart_disease', 'ever_smoked']]

mf.head()

Unnamed: 0,ID,cancer,diabetes,heart_disease,ever_smoked
0,1003,Yes,No,No,Yes
1,1053,No,Yes,Yes,Yes
2,1006,Yes,Yes,Yes,No
3,1166,No,No,No,No
4,1134,Yes,No,No,No


In [6]:
le = mf.preprocessing.LabelEncoder()

In [7]:
mf['cancer'] = le.fit_transform(mf['cancer'])
                                
mf.head()

Unnamed: 0,ID,cancer,diabetes,heart_disease,ever_smoked
0,1003,1,No,No,Yes
1,1053,0,Yes,Yes,Yes
2,1006,1,Yes,Yes,No
3,1166,0,No,No,No
4,1134,1,No,No,No


In [8]:
mf['diabetes'] = le.fit_transform(mf['diabetes'])
mf['heart_disease'] = le.fit_transform(mf['heart_disease'])
mf['ever_smoked'] = le.fit_transform(mf['ever_smoked'])

mf.head()

Unnamed: 0,ID,cancer,diabetes,heart_disease,ever_smoked
0,1003,1,0,0,1
1,1053,0,1,1,1
2,1006,1,1,1,0
3,1166,0,0,0,0
4,1134,1,0,0,0


In [9]:
mf.describe()

Unnamed: 0,ID,cancer,diabetes,heart_disease,ever_smoked
count,54.0,54.0,54.0,54.0,54.0
mean,1082.944444,0.518519,0.277778,0.37037,0.296296
std,54.289725,0.504349,0.452109,0.487438,0.460911
min,1001.0,0.0,0.0,0.0,0.0
25%,1043.25,0.0,0.0,0.0,0.0
50%,1075.5,1.0,0.0,0.0,0.0
75%,1126.75,1.0,1.0,1.0,1.0
max,1192.0,1.0,1.0,1.0,1.0


In [10]:
estimator = mf.cluster.KMeans(n_clusters = 4)

estimator

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=4, n_init=10, n_jobs=None, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [11]:
mf.fit(estimator)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=4, n_init=10, n_jobs=None, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [12]:
clusters = mf.predict(estimator)

In [13]:
clusters.head()

0    0
1    2
2    0
3    3
4    1
dtype: int32

In [14]:
clusters.name = 'cluster_id'

In [15]:
clustered_mf = pd.concat([mf, clusters], 
                         axis = 1)

clustered_mf.head(10)

Unnamed: 0,ID,cancer,diabetes,heart_disease,ever_smoked,cluster_id
0,1003,1,0,0,1,0
1,1053,0,1,1,1,2
2,1006,1,1,1,0,0
3,1166,0,0,0,0,3
4,1134,1,0,0,0,1
5,1014,0,0,0,1,0
6,1074,1,0,0,0,2
7,1151,1,0,1,0,3
8,1001,1,1,1,1,0
9,1048,1,0,0,0,2


In [16]:
mf.describe()

Unnamed: 0,ID,cancer,diabetes,heart_disease,ever_smoked
count,54.0,54.0,54.0,54.0,54.0
mean,1082.944444,0.518519,0.277778,0.37037,0.296296
std,54.289725,0.504349,0.452109,0.487438,0.460911
min,1001.0,0.0,0.0,0.0,0.0
25%,1043.25,0.0,0.0,0.0,0.0
50%,1075.5,1.0,0.0,0.0,0.0
75%,1126.75,1.0,1.0,1.0,1.0
max,1192.0,1.0,1.0,1.0,1.0


In [17]:
cluster_means = clustered_mf.groupby(['cluster_id']).mean()

cluster_means

Unnamed: 0_level_0,ID,cancer,diabetes,heart_disease,ever_smoked
cluster_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,1014.769231,0.692308,0.384615,0.461538,0.384615
1,1116.153846,0.384615,0.076923,0.307692,0.230769
2,1063.5,0.555556,0.333333,0.388889,0.333333
3,1163.4,0.4,0.3,0.3,0.2


In [18]:
cluster_means['counts'] = pd.Series(clustered_mf\
                                    .groupby(['cluster_id'])\
                                    .size())

cluster_means

Unnamed: 0_level_0,ID,cancer,diabetes,heart_disease,ever_smoked,counts
cluster_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,1014.769231,0.692308,0.384615,0.461538,0.384615,13
1,1116.153846,0.384615,0.076923,0.307692,0.230769,13
2,1063.5,0.555556,0.333333,0.388889,0.333333,18
3,1163.4,0.4,0.3,0.3,0.2,10


In [19]:
clustered_mf[clustered_mf['cluster_id'] == 2]

Unnamed: 0,ID,cancer,diabetes,heart_disease,ever_smoked,cluster_id
1,1053,0,1,1,1,2
6,1074,1,0,0,0,2
9,1048,1,0,0,0,2
10,1073,1,0,1,1,2
11,1075,0,1,0,0,2
12,1051,1,0,0,0,2
18,1081,1,1,0,1,2
20,1071,1,1,1,0,2
21,1063,0,0,0,1,2
24,1058,0,1,0,0,2


In [20]:
estimator = mf.cluster.KMeans(n_clusters = 2)

mf.fit(estimator)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=2, n_init=10, n_jobs=None, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [21]:
clusters = mf.predict(estimator)

clusters.head()

0    1
1    1
2    1
3    0
4    0
dtype: int32

In [22]:
clusters.name = 'cluster_id'

clustered_mf = pd.concat([mf, clusters], 
                         axis = 1)

clustered_mf.head(10)

Unnamed: 0,ID,cancer,diabetes,heart_disease,ever_smoked,cluster_id
0,1003,1,0,0,1,1
1,1053,0,1,1,1,1
2,1006,1,1,1,0,1
3,1166,0,0,0,0,0
4,1134,1,0,0,0,0
5,1014,0,0,0,1,1
6,1074,1,0,0,0,1
7,1151,1,0,1,0,0
8,1001,1,1,1,1,1
9,1048,1,0,0,0,1


In [23]:
cluster_means = clustered_mf.groupby(['cluster_id']).mean()

cluster_means

Unnamed: 0_level_0,ID,cancer,diabetes,heart_disease,ever_smoked
cluster_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,1136.695652,0.391304,0.173913,0.304348,0.217391
1,1043.064516,0.612903,0.354839,0.419355,0.354839


In [24]:
cluster_means['counts'] = pd.Series(clustered_mf\
                                    .groupby(['cluster_id'])\
                                    .size())

cluster_means

Unnamed: 0_level_0,ID,cancer,diabetes,heart_disease,ever_smoked,counts
cluster_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,1136.695652,0.391304,0.173913,0.304348,0.217391,23
1,1043.064516,0.612903,0.354839,0.419355,0.354839,31
