In [15]:
import pandas as pd
from sklearn.impute import KNNImputer
from sklearn.preprocessing import OneHotEncoder
import numpy as np

# Local data for each dataset

## Cleveland 

In [8]:
cleveland = 'data/processed.cleveland.data'

df_cleveland = pd.read_csv(cleveland, header=None)

columns_names = ['age','sex','cp','trestbps','chol','fbs','restecg','thalach','exang','oldpeak','slope','ca','thal','num']
df_cleveland.columns = columns_names
print(df_cleveland)

df_cleveland.replace('?', pd.np.nan, inplace=True)
count_missing = df_cleveland.isna().any(axis=1).sum()
print('Missing values:', count_missing)

      age  sex   cp  trestbps   chol  fbs  restecg  thalach  exang  oldpeak  \
0    63.0  1.0  1.0     145.0  233.0  1.0      2.0    150.0    0.0      2.3   
1    67.0  1.0  4.0     160.0  286.0  0.0      2.0    108.0    1.0      1.5   
2    67.0  1.0  4.0     120.0  229.0  0.0      2.0    129.0    1.0      2.6   
3    37.0  1.0  3.0     130.0  250.0  0.0      0.0    187.0    0.0      3.5   
4    41.0  0.0  2.0     130.0  204.0  0.0      2.0    172.0    0.0      1.4   
..    ...  ...  ...       ...    ...  ...      ...      ...    ...      ...   
298  45.0  1.0  1.0     110.0  264.0  0.0      0.0    132.0    0.0      1.2   
299  68.0  1.0  4.0     144.0  193.0  1.0      0.0    141.0    0.0      3.4   
300  57.0  1.0  4.0     130.0  131.0  0.0      0.0    115.0    1.0      1.2   
301  57.0  0.0  2.0     130.0  236.0  0.0      2.0    174.0    0.0      0.0   
302  38.0  1.0  3.0     138.0  175.0  0.0      0.0    173.0    0.0      0.0   

     slope   ca thal  num  
0      3.0  0.0  6.0   

  df_cleveland.replace('?', pd.np.nan, inplace=True)


In [27]:
#Use One-hot Encoder to use Hamming distance for KNN later

encoder = OneHotEncoder(sparse=False)
encoded_data = encoder.fit_transform(df_cleveland[['cp', 'restecg', 'slope', 'thal']])

cle_encoded = pd.DataFrame(encoded_data, columns=encoder.get_feature_names(['cp', 'restecg', 'slope', 'thal']))

cle_encoded = pd.concat([cle_encoded, df_cleveland[['age','sex','trestbps','chol','fbs','thalach','exang','oldpeak','ca','num']]], axis=1)
cle_encoded = cle_encoded.drop('thal_nan', axis=1)

print(cle_encoded)

     cp_1.0  cp_2.0  cp_3.0  cp_4.0  restecg_0.0  restecg_1.0  restecg_2.0  \
0       1.0     0.0     0.0     0.0          0.0          0.0          1.0   
1       0.0     0.0     0.0     1.0          0.0          0.0          1.0   
2       0.0     0.0     0.0     1.0          0.0          0.0          1.0   
3       0.0     0.0     1.0     0.0          1.0          0.0          0.0   
4       0.0     1.0     0.0     0.0          0.0          0.0          1.0   
..      ...     ...     ...     ...          ...          ...          ...   
298     1.0     0.0     0.0     0.0          1.0          0.0          0.0   
299     0.0     0.0     0.0     1.0          1.0          0.0          0.0   
300     0.0     0.0     0.0     1.0          1.0          0.0          0.0   
301     0.0     1.0     0.0     0.0          0.0          0.0          1.0   
302     0.0     0.0     1.0     0.0          1.0          0.0          0.0   

     slope_1.0  slope_2.0  slope_3.0  ...   age  sex  trestbps 



In [25]:
imputer = KNNImputer(n_neighbors=5)
df_imputed_cle = pd.DataFrame(imputer.fit_transform(cle_encoded))
df_imputed_cle.columns = cle_encoded.columns[0:]

print(df_imputed_cle)

     cp_1.0  cp_2.0  cp_3.0  cp_4.0  restecg_0.0  restecg_1.0  restecg_2.0  \
0       1.0     0.0     0.0     0.0          0.0          0.0          1.0   
1       0.0     0.0     0.0     1.0          0.0          0.0          1.0   
2       0.0     0.0     0.0     1.0          0.0          0.0          1.0   
3       0.0     0.0     1.0     0.0          1.0          0.0          0.0   
4       0.0     1.0     0.0     0.0          0.0          0.0          1.0   
..      ...     ...     ...     ...          ...          ...          ...   
298     1.0     0.0     0.0     0.0          1.0          0.0          0.0   
299     0.0     0.0     0.0     1.0          1.0          0.0          0.0   
300     0.0     0.0     0.0     1.0          1.0          0.0          0.0   
301     0.0     1.0     0.0     0.0          0.0          0.0          1.0   
302     0.0     0.0     1.0     0.0          1.0          0.0          0.0   

     slope_1.0  slope_2.0  slope_3.0  ...   age  sex  trestbps 

In [15]:
df_imputed_cle.to_csv('Cleveland Imputed Data.csv', index=False)

## Hungarian 

In [14]:
hungarian = 'data/processed.hungarian.data'

df_hungarian = pd.read_csv(hungarian, header=None)

df_hungarian.columns = columns_names
print(df_hungarian)

df_hungarian.replace('?', pd.np.nan, inplace=True)
count_missing = df_hungarian.isna().any(axis=1).sum()
print('Missing values:', count_missing)

     age  sex  cp trestbps chol fbs restecg thalach exang  oldpeak slope ca  \
0     28    1   2      130  132   0       2     185     0      0.0     ?  ?   
1     29    1   2      120  243   0       0     160     0      0.0     ?  ?   
2     29    1   2      140    ?   0       0     170     0      0.0     ?  ?   
3     30    0   1      170  237   0       1     170     0      0.0     ?  ?   
4     31    0   2      100  219   0       1     150     0      0.0     ?  ?   
..   ...  ...  ..      ...  ...  ..     ...     ...   ...      ...   ... ..   
289   52    1   4      160  331   0       0      94     1      2.5     ?  ?   
290   54    0   3      130  294   0       1     100     1      0.0     2  ?   
291   56    1   4      155  342   1       0     150     1      3.0     2  ?   
292   58    0   2      180  393   0       0     110     1      1.0     2  ?   
293   65    1   4      130  275   0       1     115     1      1.0     2  ?   

    thal  num  
0      ?    0  
1      ?    0  
2  

  df_hungarian.replace('?', pd.np.nan, inplace=True)


In [16]:
encoder = OneHotEncoder(sparse=False)
encoded_data = encoder.fit_transform(df_hungarian[['cp', 'restecg', 'slope', 'thal']])

hung_encoded = pd.DataFrame(encoded_data, columns=encoder.get_feature_names(['cp', 'restecg', 'slope', 'thal']))

hung_encoded = pd.concat([hung_encoded, df_hungarian[['age','sex','trestbps','chol','fbs','thalach','exang','oldpeak','ca','num']]], axis=1)

print(hung_encoded)

     cp_1  cp_2  cp_3  cp_4  restecg_0  restecg_1  restecg_2  restecg_nan  \
0     0.0   1.0   0.0   0.0        0.0        0.0        1.0          0.0   
1     0.0   1.0   0.0   0.0        1.0        0.0        0.0          0.0   
2     0.0   1.0   0.0   0.0        1.0        0.0        0.0          0.0   
3     1.0   0.0   0.0   0.0        0.0        1.0        0.0          0.0   
4     0.0   1.0   0.0   0.0        0.0        1.0        0.0          0.0   
..    ...   ...   ...   ...        ...        ...        ...          ...   
289   0.0   0.0   0.0   1.0        1.0        0.0        0.0          0.0   
290   0.0   0.0   1.0   0.0        0.0        1.0        0.0          0.0   
291   0.0   0.0   0.0   1.0        1.0        0.0        0.0          0.0   
292   0.0   1.0   0.0   0.0        1.0        0.0        0.0          0.0   
293   0.0   0.0   0.0   1.0        0.0        1.0        0.0          0.0   

     slope_1  slope_2  ...  age  sex  trestbps  chol  fbs  thalach  exang  



In [28]:
print(hung_encoded.columns)

Index(['cp_1', 'cp_2', 'cp_3', 'cp_4', 'restecg_0', 'restecg_1', 'restecg_2',
       'restecg_nan', 'slope_1', 'slope_2', 'slope_3', 'slope_nan', 'thal_3',
       'thal_6', 'thal_7', 'thal_nan', 'age', 'sex', 'trestbps', 'chol', 'fbs',
       'thalach', 'exang', 'oldpeak', 'ca', 'num'],
      dtype='object')


In [29]:
imputer = KNNImputer(n_neighbors=5)
imputer.fit(df_imputed_cle)

df_imputed_hungarian = pd.DataFrame(imputer.transform(hung_encoded), columns=hung_encoded.columns)
print(df_imputed_hungarian)

Feature names unseen at fit time:
- cp_1
- cp_2
- cp_3
- cp_4
- restecg_0
- ...
Feature names seen at fit time, yet now missing:
- cp_1.0
- cp_2.0
- cp_3.0
- cp_4.0
- restecg_0.0
- ...



ValueError: X has 26 features, but KNNImputer is expecting 23 features as input.

In [16]:
df_imputed_hungarian.to_csv('Hungarian Imputed Data.csv', index=False)

## Switzerland 

In [5]:
switzerland = '/Users/votri/Downloads/Pakula ML/processed.switzerland.data'

df_switzerland = pd.read_csv(switzerland, header=None)
print(df_switzerland)

df_switzerland.replace('?', pd.np.nan, inplace=True)
count_missing = df_switzerland.isna().any(axis=1).sum()
print('Missing values:', count_missing)

     0   1   2    3   4  5  6    7  8    9  10 11 12  13
0    32   1   1   95   0  ?  0  127  0   .7  1  ?  ?   1
1    34   1   4  115   0  ?  ?  154  0   .2  1  ?  ?   1
2    35   1   4    ?   0  ?  0  130  1    ?  ?  ?  7   3
3    36   1   4  110   0  ?  0  125  1    1  2  ?  6   1
4    38   0   4  105   0  ?  0  166  0  2.8  1  ?  ?   2
..   ..  ..  ..  ...  .. .. ..  ... ..  ... .. .. ..  ..
118  70   1   4  115   0  0  1   92  1    0  2  ?  7   1
119  70   1   4  140   0  1  0  157  1    2  2  ?  7   3
120  72   1   3  160   0  ?  2  114  0  1.6  2  2  ?   0
121  73   0   3  160   0  0  1  121  0    0  1  ?  3   1
122  74   1   2  145   0  ?  1  123  0  1.3  1  ?  ?   1

[123 rows x 14 columns]
Missing values: 

  df_switzerland.replace('?', pd.np.nan, inplace=True)


123


In [6]:
df_imputed_swi = pd.DataFrame(imputer.transform(df_switzerland), columns=df_switzerland.columns)

print(df_imputed_swi)

       0    1    2      3    4    5    6      7    8     9    10   11   12  \
0    32.0  1.0  1.0   95.0  0.0  0.2  0.0  127.0  0.0  0.70  1.0  1.0  4.6   
1    34.0  1.0  4.0  115.0  0.0  0.2  0.4  154.0  0.0  0.20  1.0  1.0  4.6   
2    35.0  1.0  4.0  123.6  0.0  0.2  0.0  130.0  1.0  0.88  1.6  1.0  7.0   
3    36.0  1.0  4.0  110.0  0.0  0.2  0.0  125.0  1.0  1.00  2.0  1.0  6.0   
4    38.0  0.0  4.0  105.0  0.0  0.2  0.0  166.0  0.0  2.80  1.0  1.0  4.6   
..    ...  ...  ...    ...  ...  ...  ...    ...  ...   ...  ...  ...  ...   
118  70.0  1.0  4.0  115.0  0.0  0.0  1.0   92.0  1.0  0.00  2.0  1.0  7.0   
119  70.0  1.0  4.0  140.0  0.0  1.0  0.0  157.0  1.0  2.00  2.0  1.0  7.0   
120  72.0  1.0  3.0  160.0  0.0  0.2  2.0  114.0  0.0  1.60  2.0  2.0  4.6   
121  73.0  0.0  3.0  160.0  0.0  0.0  1.0  121.0  0.0  0.00  1.0  1.0  3.0   
122  74.0  1.0  2.0  145.0  0.0  0.2  1.0  123.0  0.0  1.30  1.0  1.0  4.6   

      13  
0    1.0  
1    1.0  
2    3.0  
3    1.0  
4    2.0

In [17]:
df_imputed_swi.to_csv('Switzerland Imputed Data.csv', index=False)

## Long Beach, VA

In [7]:
va = '/Users/votri/Downloads/Pakula ML/processed.va.data'

df_va = pd.read_csv(va, header=None)
print(df_va)

df_va.replace('?', pd.np.nan, inplace=True)
count_missing = df_va.isna().any(axis=1).sum()
print('Missing values:', count_missing)

     0   1   2    3    4  5   6    7  8    9  10 11 12  13
0    63   1   4  140  260  0   1  112  1    3  2  ?  ?   2
1    44   1   4  130  209  0   1  127  0    0  ?  ?  ?   0
2    60   1   4  132  218  0   1  140  1  1.5  3  ?  ?   2
3    55   1   4  142  228  0   1  149  1  2.5  1  ?  ?   1
4    66   1   3  110  213  1   2   99  1  1.3  2  ?  ?   0
..   ..  ..  ..  ...  ... ..  ..  ... ..  ... .. .. ..  ..
195  54   0   4  127  333  1   1  154  0    0  ?  ?  ?   1
196  62   1   1    ?  139  0   1    ?  ?    ?  ?  ?  ?   0
197  55   1   4  122  223  1   1  100  0    0  ?  ?  6   2
198  58   1   4    ?  385  1   2    ?  ?    ?  ?  ?  ?   0
199  62   1   2  120  254  0   2   93  1    0  ?  ?  ?   1

[200 rows x 14 columns]
Missing values: 199


  df_va.replace('?', pd.np.nan, inplace=True)


In [8]:
df_imputed_va = pd.DataFrame(imputer.transform(df_va), columns=df_va.columns)

print(df_imputed_va)

       0    1    2      3      4    5    6      7    8     9    10    11  \
0    63.0  1.0  4.0  140.0  260.0  0.0  1.0  112.0  1.0  3.00  2.0  0.80   
1    44.0  1.0  4.0  130.0  209.0  0.0  1.0  127.0  0.0  0.00  1.6  1.00   
2    60.0  1.0  4.0  132.0  218.0  0.0  1.0  140.0  1.0  1.50  3.0  1.32   
3    55.0  1.0  4.0  142.0  228.0  0.0  1.0  149.0  1.0  2.50  1.0  0.40   
4    66.0  1.0  3.0  110.0  213.0  1.0  2.0   99.0  1.0  1.30  2.0  0.40   
..    ...  ...  ...    ...    ...  ...  ...    ...  ...   ...  ...   ...   
195  54.0  0.0  4.0  127.0  333.0  1.0  1.0  154.0  0.0  0.00  1.2  0.60   
196  62.0  1.0  1.0  123.6  139.0  0.0  1.0  142.8  0.2  0.88  1.6  1.00   
197  55.0  1.0  4.0  122.0  223.0  1.0  1.0  100.0  0.0  0.00  1.8  0.20   
198  58.0  1.0  4.0  140.8  385.0  1.0  2.0  155.0  0.4  1.70  1.6  1.00   
199  62.0  1.0  2.0  120.0  254.0  0.0  2.0   93.0  1.0  0.00  2.2  1.20   

       12   13  
0    7.00  2.0  
1    5.68  0.0  
2    6.00  2.0  
3    3.60  1.0  
4 

In [18]:
df_imputed_va.to_csv('Virginia Imputed Data.csv', index=False)

## Combined Data

In [10]:
combined_df = pd.concat([df_imputed_cle, df_imputed_hungarian, df_imputed_swi, df_imputed_va], ignore_index=True)
combined_df.reset_index(drop=True, inplace=True)
print(combined_df)

       0    1    2      3      4    5    6      7    8     9    10   11   12  \
0    63.0  1.0  1.0  145.0  233.0  1.0  2.0  150.0  0.0  2.30  3.0  0.0  6.0   
1    67.0  1.0  4.0  160.0  286.0  0.0  2.0  108.0  1.0  1.50  2.0  3.0  3.0   
2    67.0  1.0  4.0  120.0  229.0  0.0  2.0  129.0  1.0  2.60  2.0  2.0  7.0   
3    37.0  1.0  3.0  130.0  250.0  0.0  0.0  187.0  0.0  3.50  3.0  0.0  3.0   
4    41.0  0.0  2.0  130.0  204.0  0.0  2.0  172.0  0.0  1.40  1.0  0.0  3.0   
..    ...  ...  ...    ...    ...  ...  ...    ...  ...   ...  ...  ...  ...   
915  54.0  0.0  4.0  127.0  333.0  1.0  1.0  154.0  0.0  0.00  1.2  0.6  3.8   
916  62.0  1.0  1.0  123.6  139.0  0.0  1.0  142.8  0.2  0.88  1.6  1.0  4.6   
917  55.0  1.0  4.0  122.0  223.0  1.0  1.0  100.0  0.0  0.00  1.8  0.2  6.0   
918  58.0  1.0  4.0  140.8  385.0  1.0  2.0  155.0  0.4  1.70  1.6  1.0  4.6   
919  62.0  1.0  2.0  120.0  254.0  0.0  2.0   93.0  1.0  0.00  2.2  1.2  6.2   

      13  
0    0.0  
1    2.0  
2    1

In [19]:
combined_df.to_csv('Combined Imputed Data.csv', index=False)

# Rounding data

## Cleveland

In [3]:
columns_names = ['age','sex','cp','trestbps','chol','fbs','restecg','thalach','exang','oldpeak','slope','ca','thal','num']

In [84]:
def rounding_data(x):
    for i in range(x.shape[0]):
        #cleveland
        x['ca'][i] = x['ca'][i].round(0)
        x['thal'][i] = np.array([6,3,7])[np.argmin(abs(np.array([6,3,7])- x['thal'][i]))]
        #hungarian
        x['fbs'][i] = x['fbs'][i].round(0)
        x['restecg'][i] = x['restecg'][i].round(0)
        x['exang'][i] = x['exang'][i].round(0)
        x['slope'][i] = x['slope'][i].round(0)
    return x

In [81]:
path_list = ['data/Cleveland Imputed Data.csv','data/Hungarian Imputed Data.csv','data/Switzerland Imputed Data.csv','data/Virginia Imputed Data.csv']

In [91]:
def process_data(path):
    df = pd.read_csv(path)
    df.columns = columns_names
    df_rounded = rounding_data(df)
    df_rounded.to_csv(str(path[5:][:-17]) + '_rounded.csv',index=False)

In [92]:
for i in path_list:
    process_data(i)