In [28]:
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from kmodes.kmodes import KModes
from kmodes.kprototypes import KPrototypes
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score

In [27]:
pip install kmodes

Defaulting to user installation because normal site-packages is not writeable
Collecting kmodes
  Downloading kmodes-0.12.2-py2.py3-none-any.whl (20 kB)
Installing collected packages: kmodes
Successfully installed kmodes-0.12.2
Note: you may need to restart the kernel to use updated packages.


In [2]:
pd.set_option('display.max_columns',None)
pd.set_option('display.width', 1000)

In [3]:
# Importing Dataset
file_path = 'Dataset/adult.data'
column_names = [
    'Age', 'Workclass', 'FinalWeight', 'Education', 'EducationNum', 'MaritalStatus',
    'Occupation', 'Relationship', 'Race', 'Gender', 'CapitalGain', 'CapitalLoss',
    'HoursPerWeek', 'NativeCountry', 'Income'
]

df = pd.read_csv(file_path, header=None, names=column_names)
df

Unnamed: 0,Age,Workclass,FinalWeight,Education,EducationNum,MaritalStatus,Occupation,Relationship,Race,Gender,CapitalGain,CapitalLoss,HoursPerWeek,NativeCountry,Income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


### Dataset Preparation

In [4]:
# Checking for missing values
df.isna().sum()

Age              0
Workclass        0
FinalWeight      0
Education        0
EducationNum     0
MaritalStatus    0
Occupation       0
Relationship     0
Race             0
Gender           0
CapitalGain      0
CapitalLoss      0
HoursPerWeek     0
NativeCountry    0
Income           0
dtype: int64

In [5]:
df.dtypes

Age               int64
Workclass        object
FinalWeight       int64
Education        object
EducationNum      int64
MaritalStatus    object
Occupation       object
Relationship     object
Race             object
Gender           object
CapitalGain       int64
CapitalLoss       int64
HoursPerWeek      int64
NativeCountry    object
Income           object
dtype: object

Dealing With Nominal Values

In [6]:
df['Workclass'].value_counts()

 Private             22696
 Self-emp-not-inc     2541
 Local-gov            2093
 ?                    1836
 State-gov            1298
 Self-emp-inc         1116
 Federal-gov           960
 Without-pay            14
 Never-worked            7
Name: Workclass, dtype: int64

In [7]:
# Droppping rows with '?' in the 'Workclass' column
filt = df['Workclass'].str.strip() != '?'
df = df[filt]
df

Unnamed: 0,Age,Workclass,FinalWeight,Education,EducationNum,MaritalStatus,Occupation,Relationship,Race,Gender,CapitalGain,CapitalLoss,HoursPerWeek,NativeCountry,Income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


In [8]:
# Applying one-hot encoding to the 'Workclass' column
df_encoded = pd.get_dummies(df['Workclass'], prefix='Workclass')

# Concatenating the encoded columns with the original DataFrame
df = pd.concat([df, df_encoded], axis=1)

# Dropping the original 'MaritalStatus' column
df = df.drop('Workclass', axis=1)

df

Unnamed: 0,Age,FinalWeight,Education,EducationNum,MaritalStatus,Occupation,Relationship,Race,Gender,CapitalGain,CapitalLoss,HoursPerWeek,NativeCountry,Income,Workclass_ Federal-gov,Workclass_ Local-gov,Workclass_ Never-worked,Workclass_ Private,Workclass_ Self-emp-inc,Workclass_ Self-emp-not-inc,Workclass_ State-gov,Workclass_ Without-pay
0,39,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K,0,0,0,0,0,0,1,0
1,50,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K,0,0,0,0,0,1,0,0
2,38,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K,0,0,0,1,0,0,0,0
3,53,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K,0,0,0,1,0,0,0,0
4,28,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K,0,0,0,1,0,0,0,0
32557,40,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K,0,0,0,1,0,0,0,0
32558,58,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K,0,0,0,1,0,0,0,0
32559,22,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K,0,0,0,1,0,0,0,0


In [9]:
# Function to strip whitespaces from all values in the DataFrame
def strip_whitespaces(value):
    if isinstance(value, str):
        return value.strip()
    return value

df = df.applymap(strip_whitespaces)

In [10]:
df['Education'].value_counts()

HS-grad         9969
Some-college    6777
Bachelors       5182
Masters         1675
Assoc-voc       1321
11th            1057
Assoc-acdm      1020
10th             833
7th-8th          574
Prof-school      558
9th              463
Doctorate        398
12th             393
5th-6th          303
1st-4th          156
Preschool         46
Name: Education, dtype: int64

In [11]:
# Converting ordinal data into numerical data
replacement = {
    'Preschool': 0,
    '1st-4th': 1,
    '5th-6th': 2,
    '7th-8th': 3,
    '9th': 4,
    '10th': 5,
    '11th': 6,        
    '12th': 7,        
    'HS-grad': 8,
    'Some-college': 9,
    'Assoc-acdm': 10,
    'Assoc-voc': 11,
    'Bachelors': 12,
    'Masters': 13,
    'Doctorate': 14,
    'Prof-school': 15,
}

df['Education'] = df['Education'].map(replacement)
df

Unnamed: 0,Age,FinalWeight,Education,EducationNum,MaritalStatus,Occupation,Relationship,Race,Gender,CapitalGain,CapitalLoss,HoursPerWeek,NativeCountry,Income,Workclass_ Federal-gov,Workclass_ Local-gov,Workclass_ Never-worked,Workclass_ Private,Workclass_ Self-emp-inc,Workclass_ Self-emp-not-inc,Workclass_ State-gov,Workclass_ Without-pay
0,39,77516,12,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K,0,0,0,0,0,0,1,0
1,50,83311,12,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K,0,0,0,0,0,1,0,0
2,38,215646,8,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K,0,0,0,1,0,0,0,0
3,53,234721,6,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K,0,0,0,1,0,0,0,0
4,28,338409,12,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,257302,10,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K,0,0,0,1,0,0,0,0
32557,40,154374,8,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K,0,0,0,1,0,0,0,0
32558,58,151910,8,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K,0,0,0,1,0,0,0,0
32559,22,201490,8,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K,0,0,0,1,0,0,0,0


In [12]:
df['MaritalStatus'].value_counts()

Married-civ-spouse       14340
Never-married             9917
Divorced                  4259
Separated                  959
Widowed                    840
Married-spouse-absent      389
Married-AF-spouse           21
Name: MaritalStatus, dtype: int64

In [13]:
# Applying one-hot encoding to the 'MaritalStatus' column
df_encoded = pd.get_dummies(df['MaritalStatus'], prefix='MaritalStatus')
df = pd.concat([df, df_encoded], axis=1)
df = df.drop('MaritalStatus', axis=1)

df

Unnamed: 0,Age,FinalWeight,Education,EducationNum,Occupation,Relationship,Race,Gender,CapitalGain,CapitalLoss,HoursPerWeek,NativeCountry,Income,Workclass_ Federal-gov,Workclass_ Local-gov,Workclass_ Never-worked,Workclass_ Private,Workclass_ Self-emp-inc,Workclass_ Self-emp-not-inc,Workclass_ State-gov,Workclass_ Without-pay,MaritalStatus_Divorced,MaritalStatus_Married-AF-spouse,MaritalStatus_Married-civ-spouse,MaritalStatus_Married-spouse-absent,MaritalStatus_Never-married,MaritalStatus_Separated,MaritalStatus_Widowed
0,39,77516,12,13,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0
1,50,83311,12,13,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0
2,38,215646,8,9,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0
3,53,234721,6,7,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0
4,28,338409,12,13,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,257302,10,12,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0
32557,40,154374,8,9,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0
32558,58,151910,8,9,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1
32559,22,201490,8,9,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0


In [14]:
df['Occupation'].value_counts()

Prof-specialty       4140
Craft-repair         4099
Exec-managerial      4066
Adm-clerical         3770
Sales                3650
Other-service        3295
Machine-op-inspct    2002
Transport-moving     1597
Handlers-cleaners    1370
Farming-fishing       994
Tech-support          928
Protective-serv       649
Priv-house-serv       149
Armed-Forces            9
?                       7
Name: Occupation, dtype: int64

In [15]:
filt = df['Occupation'].str.strip() != '?'
df = df[filt]
df

Unnamed: 0,Age,FinalWeight,Education,EducationNum,Occupation,Relationship,Race,Gender,CapitalGain,CapitalLoss,HoursPerWeek,NativeCountry,Income,Workclass_ Federal-gov,Workclass_ Local-gov,Workclass_ Never-worked,Workclass_ Private,Workclass_ Self-emp-inc,Workclass_ Self-emp-not-inc,Workclass_ State-gov,Workclass_ Without-pay,MaritalStatus_Divorced,MaritalStatus_Married-AF-spouse,MaritalStatus_Married-civ-spouse,MaritalStatus_Married-spouse-absent,MaritalStatus_Never-married,MaritalStatus_Separated,MaritalStatus_Widowed
0,39,77516,12,13,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0
1,50,83311,12,13,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0
2,38,215646,8,9,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0
3,53,234721,6,7,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0
4,28,338409,12,13,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,257302,10,12,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0
32557,40,154374,8,9,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0
32558,58,151910,8,9,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1
32559,22,201490,8,9,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0


In [16]:
# Applying one-hot encoding to the 'Relationship' column
df_encoded = pd.get_dummies(df['Occupation'], prefix='Occupation')
df = pd.concat([df, df_encoded], axis=1)
df = df.drop('Occupation', axis=1)
df

Unnamed: 0,Age,FinalWeight,Education,EducationNum,Relationship,Race,Gender,CapitalGain,CapitalLoss,HoursPerWeek,NativeCountry,Income,Workclass_ Federal-gov,Workclass_ Local-gov,Workclass_ Never-worked,Workclass_ Private,Workclass_ Self-emp-inc,Workclass_ Self-emp-not-inc,Workclass_ State-gov,Workclass_ Without-pay,MaritalStatus_Divorced,MaritalStatus_Married-AF-spouse,MaritalStatus_Married-civ-spouse,MaritalStatus_Married-spouse-absent,MaritalStatus_Never-married,MaritalStatus_Separated,MaritalStatus_Widowed,Occupation_Adm-clerical,Occupation_Armed-Forces,Occupation_Craft-repair,Occupation_Exec-managerial,Occupation_Farming-fishing,Occupation_Handlers-cleaners,Occupation_Machine-op-inspct,Occupation_Other-service,Occupation_Priv-house-serv,Occupation_Prof-specialty,Occupation_Protective-serv,Occupation_Sales,Occupation_Tech-support,Occupation_Transport-moving
0,39,77516,12,13,Not-in-family,White,Male,2174,0,40,United-States,<=50K,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,50,83311,12,13,Husband,White,Male,0,0,13,United-States,<=50K,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
2,38,215646,8,9,Not-in-family,White,Male,0,0,40,United-States,<=50K,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
3,53,234721,6,7,Husband,Black,Male,0,0,40,United-States,<=50K,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
4,28,338409,12,13,Wife,Black,Female,0,0,40,Cuba,<=50K,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,257302,10,12,Wife,White,Female,0,0,38,United-States,<=50K,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
32557,40,154374,8,9,Husband,White,Male,0,0,40,United-States,>50K,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
32558,58,151910,8,9,Unmarried,White,Female,0,0,40,United-States,<=50K,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
32559,22,201490,8,9,Own-child,White,Male,0,0,20,United-States,<=50K,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0


In [17]:
df['Relationship'].value_counts()

Husband           12704
Not-in-family      7865
Own-child          4525
Unmarried          3271
Wife               1435
Other-relative      918
Name: Relationship, dtype: int64

In [18]:
# Applying one-hot encoding to the 'Relationship' column
df_encoded = pd.get_dummies(df['Relationship'], prefix='Relationship')
df = pd.concat([df, df_encoded], axis=1)
df = df.drop('Relationship', axis=1)
df

Unnamed: 0,Age,FinalWeight,Education,EducationNum,Race,Gender,CapitalGain,CapitalLoss,HoursPerWeek,NativeCountry,Income,Workclass_ Federal-gov,Workclass_ Local-gov,Workclass_ Never-worked,Workclass_ Private,Workclass_ Self-emp-inc,Workclass_ Self-emp-not-inc,Workclass_ State-gov,Workclass_ Without-pay,MaritalStatus_Divorced,MaritalStatus_Married-AF-spouse,MaritalStatus_Married-civ-spouse,MaritalStatus_Married-spouse-absent,MaritalStatus_Never-married,MaritalStatus_Separated,MaritalStatus_Widowed,Occupation_Adm-clerical,Occupation_Armed-Forces,Occupation_Craft-repair,Occupation_Exec-managerial,Occupation_Farming-fishing,Occupation_Handlers-cleaners,Occupation_Machine-op-inspct,Occupation_Other-service,Occupation_Priv-house-serv,Occupation_Prof-specialty,Occupation_Protective-serv,Occupation_Sales,Occupation_Tech-support,Occupation_Transport-moving,Relationship_Husband,Relationship_Not-in-family,Relationship_Other-relative,Relationship_Own-child,Relationship_Unmarried,Relationship_Wife
0,39,77516,12,13,White,Male,2174,0,40,United-States,<=50K,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
1,50,83311,12,13,White,Male,0,0,13,United-States,<=50K,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
2,38,215646,8,9,White,Male,0,0,40,United-States,<=50K,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0
3,53,234721,6,7,Black,Male,0,0,40,United-States,<=50K,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0
4,28,338409,12,13,Black,Female,0,0,40,Cuba,<=50K,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,257302,10,12,White,Female,0,0,38,United-States,<=50K,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1
32557,40,154374,8,9,White,Male,0,0,40,United-States,>50K,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0
32558,58,151910,8,9,White,Female,0,0,40,United-States,<=50K,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
32559,22,201490,8,9,White,Male,0,0,20,United-States,<=50K,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0


In [19]:
df['Race'].value_counts()

White                 26301
Black                  2909
Asian-Pac-Islander      974
Amer-Indian-Eskimo      286
Other                   248
Name: Race, dtype: int64

In [20]:
df['Gender'].value_counts()

Male      20788
Female     9930
Name: Gender, dtype: int64

In [21]:
df['NativeCountry'].value_counts()

United-States                 27504
Mexico                          610
?                               556
Philippines                     188
Germany                         128
Puerto-Rico                     109
Canada                          107
India                           100
El-Salvador                     100
Cuba                             92
England                          86
Jamaica                          80
South                            71
China                            68
Italy                            68
Dominican-Republic               67
Vietnam                          64
Guatemala                        63
Japan                            59
Poland                           56
Columbia                         56
Iran                             42
Taiwan                           42
Haiti                            42
Portugal                         34
Nicaragua                        33
Peru                             30
Greece                      

In [22]:
filt = df['NativeCountry'].str.strip() != '?'
df = df[filt]
df

Unnamed: 0,Age,FinalWeight,Education,EducationNum,Race,Gender,CapitalGain,CapitalLoss,HoursPerWeek,NativeCountry,Income,Workclass_ Federal-gov,Workclass_ Local-gov,Workclass_ Never-worked,Workclass_ Private,Workclass_ Self-emp-inc,Workclass_ Self-emp-not-inc,Workclass_ State-gov,Workclass_ Without-pay,MaritalStatus_Divorced,MaritalStatus_Married-AF-spouse,MaritalStatus_Married-civ-spouse,MaritalStatus_Married-spouse-absent,MaritalStatus_Never-married,MaritalStatus_Separated,MaritalStatus_Widowed,Occupation_Adm-clerical,Occupation_Armed-Forces,Occupation_Craft-repair,Occupation_Exec-managerial,Occupation_Farming-fishing,Occupation_Handlers-cleaners,Occupation_Machine-op-inspct,Occupation_Other-service,Occupation_Priv-house-serv,Occupation_Prof-specialty,Occupation_Protective-serv,Occupation_Sales,Occupation_Tech-support,Occupation_Transport-moving,Relationship_Husband,Relationship_Not-in-family,Relationship_Other-relative,Relationship_Own-child,Relationship_Unmarried,Relationship_Wife
0,39,77516,12,13,White,Male,2174,0,40,United-States,<=50K,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
1,50,83311,12,13,White,Male,0,0,13,United-States,<=50K,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
2,38,215646,8,9,White,Male,0,0,40,United-States,<=50K,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0
3,53,234721,6,7,Black,Male,0,0,40,United-States,<=50K,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0
4,28,338409,12,13,Black,Female,0,0,40,Cuba,<=50K,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,257302,10,12,White,Female,0,0,38,United-States,<=50K,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1
32557,40,154374,8,9,White,Male,0,0,40,United-States,>50K,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0
32558,58,151910,8,9,White,Female,0,0,40,United-States,<=50K,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
32559,22,201490,8,9,White,Male,0,0,20,United-States,<=50K,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0


In [23]:
# Applying one-hot encoding to the 'Race', 'Gender' and 'NativeCountry' columns
df_encoded = pd.get_dummies(df[['Race','Gender', 'NativeCountry']], columns=['Race','Gender', 'NativeCountry'], prefix=['Race','Gender', 'NativeCountry'])
df = pd.concat([df, df_encoded], axis=1)
df = df.drop(['Race','Gender', 'NativeCountry'], axis=1)
df

Unnamed: 0,Age,FinalWeight,Education,EducationNum,CapitalGain,CapitalLoss,HoursPerWeek,Income,Workclass_ Federal-gov,Workclass_ Local-gov,Workclass_ Never-worked,Workclass_ Private,Workclass_ Self-emp-inc,Workclass_ Self-emp-not-inc,Workclass_ State-gov,Workclass_ Without-pay,MaritalStatus_Divorced,MaritalStatus_Married-AF-spouse,MaritalStatus_Married-civ-spouse,MaritalStatus_Married-spouse-absent,MaritalStatus_Never-married,MaritalStatus_Separated,MaritalStatus_Widowed,Occupation_Adm-clerical,Occupation_Armed-Forces,Occupation_Craft-repair,Occupation_Exec-managerial,Occupation_Farming-fishing,Occupation_Handlers-cleaners,Occupation_Machine-op-inspct,Occupation_Other-service,Occupation_Priv-house-serv,Occupation_Prof-specialty,Occupation_Protective-serv,Occupation_Sales,Occupation_Tech-support,Occupation_Transport-moving,Relationship_Husband,Relationship_Not-in-family,Relationship_Other-relative,Relationship_Own-child,Relationship_Unmarried,Relationship_Wife,Race_Amer-Indian-Eskimo,Race_Asian-Pac-Islander,Race_Black,Race_Other,Race_White,Gender_Female,Gender_Male,NativeCountry_Cambodia,NativeCountry_Canada,NativeCountry_China,NativeCountry_Columbia,NativeCountry_Cuba,NativeCountry_Dominican-Republic,NativeCountry_Ecuador,NativeCountry_El-Salvador,NativeCountry_England,NativeCountry_France,NativeCountry_Germany,NativeCountry_Greece,NativeCountry_Guatemala,NativeCountry_Haiti,NativeCountry_Holand-Netherlands,NativeCountry_Honduras,NativeCountry_Hong,NativeCountry_Hungary,NativeCountry_India,NativeCountry_Iran,NativeCountry_Ireland,NativeCountry_Italy,NativeCountry_Jamaica,NativeCountry_Japan,NativeCountry_Laos,NativeCountry_Mexico,NativeCountry_Nicaragua,NativeCountry_Outlying-US(Guam-USVI-etc),NativeCountry_Peru,NativeCountry_Philippines,NativeCountry_Poland,NativeCountry_Portugal,NativeCountry_Puerto-Rico,NativeCountry_Scotland,NativeCountry_South,NativeCountry_Taiwan,NativeCountry_Thailand,NativeCountry_Trinadad&Tobago,NativeCountry_United-States,NativeCountry_Vietnam,NativeCountry_Yugoslavia
0,39,77516,12,13,2174,0,40,<=50K,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
1,50,83311,12,13,0,0,13,<=50K,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
2,38,215646,8,9,0,0,40,<=50K,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
3,53,234721,6,7,0,0,40,<=50K,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
4,28,338409,12,13,0,0,40,<=50K,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,257302,10,12,0,0,38,<=50K,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
32557,40,154374,8,9,0,0,40,>50K,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
32558,58,151910,8,9,0,0,40,<=50K,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
32559,22,201490,8,9,0,0,20,<=50K,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0


In [24]:
df['Income'].value_counts()

<=50K    22654
>50K      7508
Name: Income, dtype: int64

In [25]:
# Assuimg income as ordinal data
replacement = {
    '<=50K': 1,
    '>50K': 2
}

df['Income'] = df['Income'].map(replacement)
df

Unnamed: 0,Age,FinalWeight,Education,EducationNum,CapitalGain,CapitalLoss,HoursPerWeek,Income,Workclass_ Federal-gov,Workclass_ Local-gov,Workclass_ Never-worked,Workclass_ Private,Workclass_ Self-emp-inc,Workclass_ Self-emp-not-inc,Workclass_ State-gov,Workclass_ Without-pay,MaritalStatus_Divorced,MaritalStatus_Married-AF-spouse,MaritalStatus_Married-civ-spouse,MaritalStatus_Married-spouse-absent,MaritalStatus_Never-married,MaritalStatus_Separated,MaritalStatus_Widowed,Occupation_Adm-clerical,Occupation_Armed-Forces,Occupation_Craft-repair,Occupation_Exec-managerial,Occupation_Farming-fishing,Occupation_Handlers-cleaners,Occupation_Machine-op-inspct,Occupation_Other-service,Occupation_Priv-house-serv,Occupation_Prof-specialty,Occupation_Protective-serv,Occupation_Sales,Occupation_Tech-support,Occupation_Transport-moving,Relationship_Husband,Relationship_Not-in-family,Relationship_Other-relative,Relationship_Own-child,Relationship_Unmarried,Relationship_Wife,Race_Amer-Indian-Eskimo,Race_Asian-Pac-Islander,Race_Black,Race_Other,Race_White,Gender_Female,Gender_Male,NativeCountry_Cambodia,NativeCountry_Canada,NativeCountry_China,NativeCountry_Columbia,NativeCountry_Cuba,NativeCountry_Dominican-Republic,NativeCountry_Ecuador,NativeCountry_El-Salvador,NativeCountry_England,NativeCountry_France,NativeCountry_Germany,NativeCountry_Greece,NativeCountry_Guatemala,NativeCountry_Haiti,NativeCountry_Holand-Netherlands,NativeCountry_Honduras,NativeCountry_Hong,NativeCountry_Hungary,NativeCountry_India,NativeCountry_Iran,NativeCountry_Ireland,NativeCountry_Italy,NativeCountry_Jamaica,NativeCountry_Japan,NativeCountry_Laos,NativeCountry_Mexico,NativeCountry_Nicaragua,NativeCountry_Outlying-US(Guam-USVI-etc),NativeCountry_Peru,NativeCountry_Philippines,NativeCountry_Poland,NativeCountry_Portugal,NativeCountry_Puerto-Rico,NativeCountry_Scotland,NativeCountry_South,NativeCountry_Taiwan,NativeCountry_Thailand,NativeCountry_Trinadad&Tobago,NativeCountry_United-States,NativeCountry_Vietnam,NativeCountry_Yugoslavia
0,39,77516,12,13,2174,0,40,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
1,50,83311,12,13,0,0,13,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
2,38,215646,8,9,0,0,40,1,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
3,53,234721,6,7,0,0,40,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
4,28,338409,12,13,0,0,40,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,257302,10,12,0,0,38,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
32557,40,154374,8,9,0,0,40,2,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
32558,58,151910,8,9,0,0,40,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
32559,22,201490,8,9,0,0,20,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0


In [29]:
df.columns

Index(['Age', 'FinalWeight', 'Education', 'EducationNum', 'CapitalGain', 'CapitalLoss', 'HoursPerWeek', 'Income', 'Workclass_ Federal-gov', 'Workclass_ Local-gov', 'Workclass_ Never-worked', 'Workclass_ Private', 'Workclass_ Self-emp-inc', 'Workclass_ Self-emp-not-inc', 'Workclass_ State-gov', 'Workclass_ Without-pay', 'MaritalStatus_Divorced', 'MaritalStatus_Married-AF-spouse', 'MaritalStatus_Married-civ-spouse', 'MaritalStatus_Married-spouse-absent', 'MaritalStatus_Never-married', 'MaritalStatus_Separated', 'MaritalStatus_Widowed', 'Occupation_Adm-clerical', 'Occupation_Armed-Forces', 'Occupation_Craft-repair', 'Occupation_Exec-managerial', 'Occupation_Farming-fishing', 'Occupation_Handlers-cleaners', 'Occupation_Machine-op-inspct', 'Occupation_Other-service', 'Occupation_Priv-house-serv', 'Occupation_Prof-specialty', 'Occupation_Protective-serv', 'Occupation_Sales', 'Occupation_Tech-support', 'Occupation_Transport-moving', 'Relationship_Husband', 'Relationship_Not-in-family',
      

### Applying Clustering

In [37]:
# Standardizing numerical columns
numerical_columns = ['Age','FinalWeight','Education','EducationNum','CapitalGain','HoursPerWeek','Income']
scaler = StandardScaler()
df[numerical_columns] = scaler.fit_transform(df[numerical_columns])

In [32]:
# Specifying the number of clusters
num_clusters = 2

In [35]:
# Applying K-Means
df_kmeans = df[numerical_columns]
kmeans = KMeans(n_clusters=num_clusters, random_state=42, n_init=10)
df['KMeans_Cluster'] = kmeans.fit_predict(df_kmeans)

In [41]:
# Apply K-Modes (for categorical columns)
df_categorical = df[['Education', 'Income']]
kmodes = KModes(n_clusters=num_clusters, init='Cao', n_init=5, verbose=1)
df['KModes_Cluster'] = kmodes.fit_predict(df_categorical)

Initialization method and algorithm are deterministic. Setting n_init to 1.
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 1, iteration: 1/100, moves: 0, cost: 21152.0


In [42]:
# Apply K-Prototypes (for mixed data)
kprototypes = KPrototypes(n_clusters=num_clusters, init='Cao', n_init=5, verbose=1)
df['KPrototypes_Cluster'] = kprototypes.fit_predict(df, categorical=[2, 3, 4, 5])

Initialization method and algorithm are deterministic. Setting n_init to 1.
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run: 1, iteration: 1/100, moves: 4275, ncost: 228370.23315186796
Run: 1, iteration: 2/100, moves: 4107, ncost: 220806.8108878607
Run: 1, iteration: 3/100, moves: 6224, ncost: 207409.6490782719
Run: 1, iteration: 4/100, moves: 2701, ncost: 205386.56986766608
Run: 1, iteration: 5/100, moves: 1142, ncost: 204852.94587198744
Run: 1, iteration: 6/100, moves: 907, ncost: 204424.44800316307
Run: 1, iteration: 7/100, moves: 1068, ncost: 203739.0214353731
Run: 1, iteration: 8/100, moves: 955, ncost: 203101.7944985887
Run: 1, iteration: 9/100, moves: 346, ncost: 202987.16308102457
Run: 1, iteration: 10/100, moves: 79, ncost: 202979.7267499268
Run: 1, iteration: 11/100, moves: 15, ncost: 202979.4076225232
Run: 1, iteration: 12/100, moves: 0, ncost: 202979.4076225232
Init: initializing centroids
Init: initializing clusters
Starting iterations..

### Evaluating clustering quality using silhouette score

In [44]:
print("Silhouette Score for KMeans:", silhouette_score(df_kmeans, df['KMeans_Cluster']))
print("Silhouette Score for KModes:", silhouette_score(df_categorical, df['KModes_Cluster']))
print("Silhouette Score for KPrototypes:", silhouette_score(df, df['KPrototypes_Cluster']))

Silhouette Score for KMeans: 0.25293576165162984
Silhouette Score for KModes: -0.03773539699054791
Silhouette Score for KPrototypes: 0.35216202432234167


In [45]:
# Display the DataFrame with cluster assignments
df

Unnamed: 0,Age,FinalWeight,Education,EducationNum,CapitalGain,CapitalLoss,HoursPerWeek,Income,Workclass_ Federal-gov,Workclass_ Local-gov,Workclass_ Never-worked,Workclass_ Private,Workclass_ Self-emp-inc,Workclass_ Self-emp-not-inc,Workclass_ State-gov,Workclass_ Without-pay,MaritalStatus_Divorced,MaritalStatus_Married-AF-spouse,MaritalStatus_Married-civ-spouse,MaritalStatus_Married-spouse-absent,MaritalStatus_Never-married,MaritalStatus_Separated,MaritalStatus_Widowed,Occupation_Adm-clerical,Occupation_Armed-Forces,Occupation_Craft-repair,Occupation_Exec-managerial,Occupation_Farming-fishing,Occupation_Handlers-cleaners,Occupation_Machine-op-inspct,Occupation_Other-service,Occupation_Priv-house-serv,Occupation_Prof-specialty,Occupation_Protective-serv,Occupation_Sales,Occupation_Tech-support,Occupation_Transport-moving,Relationship_Husband,Relationship_Not-in-family,Relationship_Other-relative,Relationship_Own-child,Relationship_Unmarried,Relationship_Wife,Race_Amer-Indian-Eskimo,Race_Asian-Pac-Islander,Race_Black,Race_Other,Race_White,Gender_Female,Gender_Male,NativeCountry_Cambodia,NativeCountry_Canada,NativeCountry_China,NativeCountry_Columbia,NativeCountry_Cuba,NativeCountry_Dominican-Republic,NativeCountry_Ecuador,NativeCountry_El-Salvador,NativeCountry_England,NativeCountry_France,NativeCountry_Germany,NativeCountry_Greece,NativeCountry_Guatemala,NativeCountry_Haiti,NativeCountry_Holand-Netherlands,NativeCountry_Honduras,NativeCountry_Hong,NativeCountry_Hungary,NativeCountry_India,NativeCountry_Iran,NativeCountry_Ireland,NativeCountry_Italy,NativeCountry_Jamaica,NativeCountry_Japan,NativeCountry_Laos,NativeCountry_Mexico,NativeCountry_Nicaragua,NativeCountry_Outlying-US(Guam-USVI-etc),NativeCountry_Peru,NativeCountry_Philippines,NativeCountry_Poland,NativeCountry_Portugal,NativeCountry_Puerto-Rico,NativeCountry_Scotland,NativeCountry_South,NativeCountry_Taiwan,NativeCountry_Thailand,NativeCountry_Trinadad&Tobago,NativeCountry_United-States,NativeCountry_Vietnam,NativeCountry_Yugoslavia,KMeans_Cluster,KModes_Cluster,KPrototypes_Cluster
0,0.042796,-1.062722,1.115451,1.128918,0.146092,0,-0.077734,-0.575691,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,1
1,0.880288,-1.007871,1.115451,1.128918,-0.147445,0,-2.331531,-0.575691,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,1
2,-0.033340,0.244693,-0.442856,-0.439738,-0.147445,0,-0.077734,-0.575691,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1
3,1.108695,0.425240,-1.222009,-1.224066,-0.147445,0,-0.077734,-0.575691,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1
4,-0.794697,1.406658,1.115451,1.128918,-0.147445,0,-0.077734,-0.575691,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,-0.870832,0.638972,0.336298,0.736754,-0.147445,0,-0.244682,-0.575691,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1
32557,0.118931,-0.335252,-0.442856,-0.439738,-0.147445,0,-0.077734,1.737042,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0
32558,1.489374,-0.358575,-0.442856,-0.439738,-0.147445,0,-0.077734,-0.575691,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1
32559,-1.251511,0.110705,-0.442856,-0.439738,-0.147445,0,-1.747213,-0.575691,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1
