In [1]:
import pandas as pd
from sklearn.impute import KNNImputer
from sklearn.preprocessing import OneHotEncoder
import numpy as np
from sklearn.preprocessing import MinMaxScaler

# Local data for each dataset

## Cleveland 

In [2]:
# Load the raw Cleveland file (it has no header row) and attach the column names.
cleveland = 'data/processed.cleveland.data'

df_cleveland = pd.read_csv(cleveland, header=None)

columns_names = ['age','sex','cp','trestbps','chol','fbs','restecg','thalach','exang','oldpeak','slope','ca','thal','num']
df_cleveland.columns = columns_names
print(df_cleveland)

# Missing entries are written as '?' in the file; turn them into NaN.
# `np.nan` is used instead of `pd.np.nan`: the `pd.np` alias is deprecated
# (it triggers a FutureWarning) and no longer exists in pandas 2.x.
df_cleveland = df_cleveland.replace('?', np.nan)

# Number of rows holding at least one NaN (not the number of NaN cells).
count_missing = df_cleveland.isna().any(axis=1).sum()
print('Missing values:', count_missing)

      age  sex   cp  trestbps   chol  fbs  restecg  thalach  exang  oldpeak  \
0    63.0  1.0  1.0     145.0  233.0  1.0      2.0    150.0    0.0      2.3   
1    67.0  1.0  4.0     160.0  286.0  0.0      2.0    108.0    1.0      1.5   
2    67.0  1.0  4.0     120.0  229.0  0.0      2.0    129.0    1.0      2.6   
3    37.0  1.0  3.0     130.0  250.0  0.0      0.0    187.0    0.0      3.5   
4    41.0  0.0  2.0     130.0  204.0  0.0      2.0    172.0    0.0      1.4   
..    ...  ...  ...       ...    ...  ...      ...      ...    ...      ...   
298  45.0  1.0  1.0     110.0  264.0  0.0      0.0    132.0    0.0      1.2   
299  68.0  1.0  4.0     144.0  193.0  1.0      0.0    141.0    0.0      3.4   
300  57.0  1.0  4.0     130.0  131.0  0.0      0.0    115.0    1.0      1.2   
301  57.0  0.0  2.0     130.0  236.0  0.0      2.0    174.0    0.0      0.0   
302  38.0  1.0  3.0     138.0  175.0  0.0      0.0    173.0    0.0      0.0   

     slope   ca thal  num  
0      3.0  0.0  6.0   

  df_cleveland.replace('?', pd.np.nan, inplace=True)


In [3]:
# One-hot encode the categorical codes so that, in the KNN step later on, a
# category mismatch contributes a 0/1 difference (Hamming-like) instead of a
# numeric gap between arbitrary codes.
# NOTE(review): `sparse=` and `get_feature_names` belong to the older
# scikit-learn API; newer releases use `sparse_output=` and
# `get_feature_names_out` — confirm the pinned scikit-learn version.
encoder = OneHotEncoder(sparse=False)
encoded_data = encoder.fit_transform(df_cleveland[['cp', 'restecg', 'slope', 'thal']])
cle_encoded = pd.DataFrame(
    encoded_data,
    columns=encoder.get_feature_names(['cp', 'restecg', 'slope', 'thal']),
)

# Put the columns that were not one-hot encoded to the right of the indicators.
cle_encoded = pd.concat(
    [
        cle_encoded,
        df_cleveland[['age','sex','trestbps','chol','fbs','thalach','exang','oldpeak','ca','num']],
    ],
    axis=1,
)

# Remove the indicator column the encoder produced for missing `thal` values.
# NOTE(review): rows with a missing `thal` keep all-zero thal_* indicators
# rather than NaN, so the imputer presumably never fills them — confirm intent.
cle_encoded = cle_encoded.drop(columns='thal_nan')

print(cle_encoded)

     cp_1.0  cp_2.0  cp_3.0  cp_4.0  restecg_0.0  restecg_1.0  restecg_2.0  \
0       1.0     0.0     0.0     0.0          0.0          0.0          1.0   
1       0.0     0.0     0.0     1.0          0.0          0.0          1.0   
2       0.0     0.0     0.0     1.0          0.0          0.0          1.0   
3       0.0     0.0     1.0     0.0          1.0          0.0          0.0   
4       0.0     1.0     0.0     0.0          0.0          0.0          1.0   
..      ...     ...     ...     ...          ...          ...          ...   
298     1.0     0.0     0.0     0.0          1.0          0.0          0.0   
299     0.0     0.0     0.0     1.0          1.0          0.0          0.0   
300     0.0     0.0     0.0     1.0          1.0          0.0          0.0   
301     0.0     1.0     0.0     0.0          0.0          0.0          1.0   
302     0.0     0.0     1.0     0.0          1.0          0.0          0.0   

     slope_1.0  slope_2.0  slope_3.0  ...   age  sex  trestbps 



In [4]:
# Min-max scale every column of the encoded Cleveland frame (the target `num`
# included) so that all features share the same [0, 1] range.
scaler = MinMaxScaler()
scaled_cle = scaler.fit_transform(cle_encoded)
cle_encoded = pd.DataFrame(data=scaled_cle, columns=cle_encoded.columns)

In [5]:
# Show the final column layout of the Cleveland frame; later cells copy these
# names onto the frames built for the other datasets.
print(cle_encoded.columns)

Index(['cp_1.0', 'cp_2.0', 'cp_3.0', 'cp_4.0', 'restecg_0.0', 'restecg_1.0',
       'restecg_2.0', 'slope_1.0', 'slope_2.0', 'slope_3.0', 'thal_3.0',
       'thal_6.0', 'thal_7.0', 'age', 'sex', 'trestbps', 'chol', 'fbs',
       'thalach', 'exang', 'oldpeak', 'ca', 'num'],
      dtype='object')


In [6]:
# Fill the remaining NaNs of the scaled Cleveland frame with a 5-nearest-neighbour
# imputer, then restore the column labels on the returned array.
imputer = KNNImputer(n_neighbors=5)
df_imputed_cle = pd.DataFrame(
    imputer.fit_transform(cle_encoded),
    columns=cle_encoded.columns,
)

print(df_imputed_cle)

     cp_1.0  cp_2.0  cp_3.0  cp_4.0  restecg_0.0  restecg_1.0  restecg_2.0  \
0       1.0     0.0     0.0     0.0          0.0          0.0          1.0   
1       0.0     0.0     0.0     1.0          0.0          0.0          1.0   
2       0.0     0.0     0.0     1.0          0.0          0.0          1.0   
3       0.0     0.0     1.0     0.0          1.0          0.0          0.0   
4       0.0     1.0     0.0     0.0          0.0          0.0          1.0   
..      ...     ...     ...     ...          ...          ...          ...   
298     1.0     0.0     0.0     0.0          1.0          0.0          0.0   
299     0.0     0.0     0.0     1.0          1.0          0.0          0.0   
300     0.0     0.0     0.0     1.0          1.0          0.0          0.0   
301     0.0     1.0     0.0     0.0          0.0          0.0          1.0   
302     0.0     0.0     1.0     0.0          1.0          0.0          0.0   

     slope_1.0  slope_2.0  slope_3.0  ...       age  sex  trest

In [7]:
# Write the imputed Cleveland frame to CSV in the working directory, without the
# index column. The stored values are the min-max scaled ones (target `num` included).
df_imputed_cle.to_csv('Cleveland Imputed Data.csv', index=False)

## Hungarian 

In [8]:
# Load the raw Hungarian file (it has no header row) and reuse the column names
# defined for the Cleveland data.
hungarian = 'data/processed.hungarian.data'

df_hungarian = pd.read_csv(hungarian, header=None)

df_hungarian.columns = columns_names
print(df_hungarian)

# Missing entries are written as '?' in the file; turn them into NaN.
# `np.nan` is used instead of `pd.np.nan`: the `pd.np` alias is deprecated
# (it triggers a FutureWarning) and no longer exists in pandas 2.x.
df_hungarian = df_hungarian.replace('?', np.nan)

# Number of rows holding at least one NaN (not the number of NaN cells).
count_missing = df_hungarian.isna().any(axis=1).sum()
print('Missing values:', count_missing)

     age  sex  cp trestbps chol fbs restecg thalach exang  oldpeak slope ca  \
0     28    1   2      130  132   0       2     185     0      0.0     ?  ?   
1     29    1   2      120  243   0       0     160     0      0.0     ?  ?   
2     29    1   2      140    ?   0       0     170     0      0.0     ?  ?   
3     30    0   1      170  237   0       1     170     0      0.0     ?  ?   
4     31    0   2      100  219   0       1     150     0      0.0     ?  ?   
..   ...  ...  ..      ...  ...  ..     ...     ...   ...      ...   ... ..   
289   52    1   4      160  331   0       0      94     1      2.5     ?  ?   
290   54    0   3      130  294   0       1     100     1      0.0     2  ?   
291   56    1   4      155  342   1       0     150     1      3.0     2  ?   
292   58    0   2      180  393   0       0     110     1      1.0     2  ?   
293   65    1   4      130  275   0       1     115     1      1.0     2  ?   

    thal  num  
0      ?    0  
1      ?    0  
2  

  df_hungarian.replace('?', pd.np.nan, inplace=True)


In [9]:
# One-hot encode the categorical codes of the Hungarian data with a fresh encoder.
# NOTE(review): `sparse=` and `get_feature_names` belong to the older
# scikit-learn API — confirm the pinned scikit-learn version.
encoder = OneHotEncoder(sparse=False)
encoded_data = encoder.fit_transform(df_hungarian[['cp', 'restecg', 'slope', 'thal']])
hung_encoded = pd.DataFrame(
    encoded_data,
    columns=encoder.get_feature_names(['cp', 'restecg', 'slope', 'thal']),
)

# Put the columns that were not one-hot encoded to the right of the indicators.
hung_encoded = pd.concat(
    [
        hung_encoded,
        df_hungarian[['age','sex','trestbps','chol','fbs','thalach','exang','oldpeak','ca','num']],
    ],
    axis=1,
)

print(hung_encoded)

     cp_1  cp_2  cp_3  cp_4  restecg_0  restecg_1  restecg_2  restecg_nan  \
0     0.0   1.0   0.0   0.0        0.0        0.0        1.0          0.0   
1     0.0   1.0   0.0   0.0        1.0        0.0        0.0          0.0   
2     0.0   1.0   0.0   0.0        1.0        0.0        0.0          0.0   
3     1.0   0.0   0.0   0.0        0.0        1.0        0.0          0.0   
4     0.0   1.0   0.0   0.0        0.0        1.0        0.0          0.0   
..    ...   ...   ...   ...        ...        ...        ...          ...   
289   0.0   0.0   0.0   1.0        1.0        0.0        0.0          0.0   
290   0.0   0.0   1.0   0.0        0.0        1.0        0.0          0.0   
291   0.0   0.0   0.0   1.0        1.0        0.0        0.0          0.0   
292   0.0   1.0   0.0   0.0        1.0        0.0        0.0          0.0   
293   0.0   0.0   0.0   1.0        0.0        1.0        0.0          0.0   

     slope_1  slope_2  ...  age  sex  trestbps  chol  fbs  thalach  exang  



In [10]:
# Inspect the encoded column names, in particular which `*_nan` indicator
# columns the encoder produced for this dataset.
print(hung_encoded.columns)

Index(['cp_1', 'cp_2', 'cp_3', 'cp_4', 'restecg_0', 'restecg_1', 'restecg_2',
       'restecg_nan', 'slope_1', 'slope_2', 'slope_3', 'slope_nan', 'thal_3',
       'thal_6', 'thal_7', 'thal_nan', 'age', 'sex', 'trestbps', 'chol', 'fbs',
       'thalach', 'exang', 'oldpeak', 'ca', 'num'],
      dtype='object')


In [11]:
# Remove the missing-value indicator columns, then relabel the remaining columns
# positionally with the Cleveland names (e.g. 'cp_1' becomes 'cp_1.0'). This
# relies on both frames holding the same columns in the same order.
hung_encoded = hung_encoded.drop(columns=['restecg_nan','slope_nan','thal_nan'])
hung_encoded.columns = cle_encoded.columns

In [12]:
# Min-max scale every column of the Hungarian frame to [0, 1].
# NOTE(review): this re-fits the shared `scaler` on the Hungarian data, so the
# range comes from this dataset's own min/max, while the imputer used in the
# next cell is fitted on Cleveland-scaled data — confirm the mismatch is acceptable.
scaled_hung = scaler.fit_transform(hung_encoded)
hung_encoded = pd.DataFrame(data=scaled_hung, columns=hung_encoded.columns)

In [13]:
# Build a new 5-nearest-neighbour imputer whose donor rows are the already
# imputed Cleveland data, then use it to fill the NaNs of the Hungarian frame.
imputer = KNNImputer(n_neighbors=5)
imputer.fit(df_imputed_cle)

df_imputed_hungarian = pd.DataFrame(
    data=imputer.transform(hung_encoded),
    columns=hung_encoded.columns,
)
print(df_imputed_hungarian)

     cp_1.0  cp_2.0  cp_3.0  cp_4.0  restecg_0.0  restecg_1.0  restecg_2.0  \
0       0.0     1.0     0.0     0.0          0.0          0.0          1.0   
1       0.0     1.0     0.0     0.0          1.0          0.0          0.0   
2       0.0     1.0     0.0     0.0          1.0          0.0          0.0   
3       1.0     0.0     0.0     0.0          0.0          1.0          0.0   
4       0.0     1.0     0.0     0.0          0.0          1.0          0.0   
..      ...     ...     ...     ...          ...          ...          ...   
289     0.0     0.0     0.0     1.0          1.0          0.0          0.0   
290     0.0     0.0     1.0     0.0          0.0          1.0          0.0   
291     0.0     0.0     0.0     1.0          1.0          0.0          0.0   
292     0.0     1.0     0.0     0.0          1.0          0.0          0.0   
293     0.0     0.0     0.0     1.0          0.0          1.0          0.0   

     slope_1.0  slope_2.0  slope_3.0  ...       age  sex  trest

In [14]:
# Write the imputed (min-max scaled) Hungarian frame to CSV in the working
# directory, without the index column.
df_imputed_hungarian.to_csv('Hungarian Imputed Data.csv', index=False)

## Switzerland 

In [15]:
# Load the raw Switzerland file (it has no header row) and reuse the column
# names defined for the Cleveland data.
switzerland = 'data/processed.switzerland.data'

df_switzerland = pd.read_csv(switzerland, header=None)

df_switzerland.columns = columns_names
print(df_switzerland)

# Missing entries are written as '?' in the file; turn them into NaN.
# `np.nan` is used instead of `pd.np.nan`: the `pd.np` alias is deprecated
# (it triggers a FutureWarning) and no longer exists in pandas 2.x.
df_switzerland = df_switzerland.replace('?', np.nan)

# Number of rows holding at least one NaN (not the number of NaN cells).
count_missing = df_switzerland.isna().any(axis=1).sum()
print('Missing values:', count_missing)

     age  sex  cp trestbps  chol fbs restecg thalach exang oldpeak slope ca  \
0     32    1   1       95     0   ?       0     127     0      .7     1  ?   
1     34    1   4      115     0   ?       ?     154     0      .2     1  ?   
2     35    1   4        ?     0   ?       0     130     1       ?     ?  ?   
3     36    1   4      110     0   ?       0     125     1       1     2  ?   
4     38    0   4      105     0   ?       0     166     0     2.8     1  ?   
..   ...  ...  ..      ...   ...  ..     ...     ...   ...     ...   ... ..   
118   70    1   4      115     0   0       1      92     1       0     2  ?   
119   70    1   4      140     0   1       0     157     1       2     2  ?   
120   72    1   3      160     0   ?       2     114     0     1.6     2  2   
121   73    0   3      160     0   0       1     121     0       0     1  ?   
122   74    1   2      145     0   ?       1     123     0     1.3     1  ?   

    thal  num  
0      ?    1  
1      ?    1  
2  

  df_switzerland.replace('?', pd.np.nan, inplace=True)


In [16]:
# One-hot encode the categorical codes of the Switzerland data, re-fitting the
# `encoder` object created in an earlier cell.
encoded_data = encoder.fit_transform(df_switzerland[['cp', 'restecg', 'slope', 'thal']])
swi_encoded = pd.DataFrame(
    encoded_data,
    columns=encoder.get_feature_names(['cp', 'restecg', 'slope', 'thal']),
)

# Put the columns that were not one-hot encoded to the right of the indicators.
swi_encoded = pd.concat(
    [
        swi_encoded,
        df_switzerland[['age','sex','trestbps','chol','fbs','thalach','exang','oldpeak','ca','num']],
    ],
    axis=1,
)

print(swi_encoded)

     cp_1  cp_2  cp_3  cp_4  restecg_0  restecg_1  restecg_2  restecg_nan  \
0     1.0   0.0   0.0   0.0        1.0        0.0        0.0          0.0   
1     0.0   0.0   0.0   1.0        0.0        0.0        0.0          1.0   
2     0.0   0.0   0.0   1.0        1.0        0.0        0.0          0.0   
3     0.0   0.0   0.0   1.0        1.0        0.0        0.0          0.0   
4     0.0   0.0   0.0   1.0        1.0        0.0        0.0          0.0   
..    ...   ...   ...   ...        ...        ...        ...          ...   
118   0.0   0.0   0.0   1.0        0.0        1.0        0.0          0.0   
119   0.0   0.0   0.0   1.0        1.0        0.0        0.0          0.0   
120   0.0   0.0   1.0   0.0        0.0        0.0        1.0          0.0   
121   0.0   0.0   1.0   0.0        0.0        1.0        0.0          0.0   
122   0.0   1.0   0.0   0.0        0.0        1.0        0.0          0.0   

     slope_1  slope_2  ...  age  sex  trestbps  chol  fbs  thalach  exang  



In [17]:
# Inspect the encoded column names, in particular which `*_nan` indicator
# columns the encoder produced for this dataset.
print(swi_encoded.columns)

Index(['cp_1', 'cp_2', 'cp_3', 'cp_4', 'restecg_0', 'restecg_1', 'restecg_2',
       'restecg_nan', 'slope_1', 'slope_2', 'slope_3', 'slope_nan', 'thal_3',
       'thal_6', 'thal_7', 'thal_nan', 'age', 'sex', 'trestbps', 'chol', 'fbs',
       'thalach', 'exang', 'oldpeak', 'ca', 'num'],
      dtype='object')


In [18]:
# Remove the missing-value indicator columns, then relabel the remaining columns
# positionally with the Cleveland names. This relies on both frames holding the
# same columns in the same order.
swi_encoded = swi_encoded.drop(columns=['restecg_nan','slope_nan','thal_nan'])
swi_encoded.columns = cle_encoded.columns

In [19]:
# Min-max scale every column of the Switzerland frame to [0, 1].
# NOTE(review): this re-fits the shared `scaler` on the Switzerland data, while
# the imputer applied afterwards was fitted on Cleveland-scaled data — confirm
# that scaling each dataset by its own min/max is intended.
scaled_swi = scaler.fit_transform(swi_encoded)
swi_encoded = pd.DataFrame(data=scaled_swi, columns=swi_encoded.columns)

In [20]:
# Fill the NaNs of the Switzerland frame with the `imputer` fitted in an earlier
# cell (its donor rows are the imputed Cleveland data; it is not re-fitted here).
df_imputed_swi = pd.DataFrame(
    data=imputer.transform(swi_encoded),
    columns=swi_encoded.columns,
)

print(df_imputed_swi)

     cp_1.0  cp_2.0  cp_3.0  cp_4.0  restecg_0.0  restecg_1.0  restecg_2.0  \
0       1.0     0.0     0.0     0.0          1.0          0.0          0.0   
1       0.0     0.0     0.0     1.0          0.0          0.0          0.0   
2       0.0     0.0     0.0     1.0          1.0          0.0          0.0   
3       0.0     0.0     0.0     1.0          1.0          0.0          0.0   
4       0.0     0.0     0.0     1.0          1.0          0.0          0.0   
..      ...     ...     ...     ...          ...          ...          ...   
118     0.0     0.0     0.0     1.0          0.0          1.0          0.0   
119     0.0     0.0     0.0     1.0          1.0          0.0          0.0   
120     0.0     0.0     1.0     0.0          0.0          0.0          1.0   
121     0.0     0.0     1.0     0.0          0.0          1.0          0.0   
122     0.0     1.0     0.0     0.0          0.0          1.0          0.0   

     slope_1.0  slope_2.0  slope_3.0  ...       age  sex  trest

In [21]:
# Write the imputed (min-max scaled) Switzerland frame to CSV in the working
# directory, without the index column.
df_imputed_swi.to_csv('Switzerland Imputed Data.csv', index=False)

## Long Beach, VA

In [22]:
# Load the raw Long Beach VA file (it has no header row) and reuse the column
# names defined for the Cleveland data.
va = 'data/processed.va.data'

df_va = pd.read_csv(va, header=None)
df_va.columns = columns_names
print(df_va)

# Missing entries are written as '?' in the file; turn them into NaN.
# `np.nan` is used instead of `pd.np.nan`: the `pd.np` alias is deprecated
# (it triggers a FutureWarning) and no longer exists in pandas 2.x.
df_va = df_va.replace('?', np.nan)

# Number of rows holding at least one NaN (not the number of NaN cells).
count_missing = df_va.isna().any(axis=1).sum()
print('Missing values:', count_missing)

     age  sex  cp trestbps chol fbs  restecg thalach exang oldpeak slope ca  \
0     63    1   4      140  260   0        1     112     1       3     2  ?   
1     44    1   4      130  209   0        1     127     0       0     ?  ?   
2     60    1   4      132  218   0        1     140     1     1.5     3  ?   
3     55    1   4      142  228   0        1     149     1     2.5     1  ?   
4     66    1   3      110  213   1        2      99     1     1.3     2  ?   
..   ...  ...  ..      ...  ...  ..      ...     ...   ...     ...   ... ..   
195   54    0   4      127  333   1        1     154     0       0     ?  ?   
196   62    1   1        ?  139   0        1       ?     ?       ?     ?  ?   
197   55    1   4      122  223   1        1     100     0       0     ?  ?   
198   58    1   4        ?  385   1        2       ?     ?       ?     ?  ?   
199   62    1   2      120  254   0        2      93     1       0     ?  ?   

    thal  num  
0      ?    2  
1      ?    0  
2  

  df_va.replace('?', pd.np.nan, inplace=True)


In [23]:
# One-hot encode the categorical codes of the VA data, re-fitting the `encoder`
# object created in an earlier cell.
encoded_data = encoder.fit_transform(df_va[['cp', 'restecg', 'slope', 'thal']])
va_encoded = pd.DataFrame(
    encoded_data,
    columns=encoder.get_feature_names(['cp', 'restecg', 'slope', 'thal']),
)

# Put the columns that were not one-hot encoded to the right of the indicators.
va_encoded = pd.concat(
    [
        va_encoded,
        df_va[['age','sex','trestbps','chol','fbs','thalach','exang','oldpeak','ca','num']],
    ],
    axis=1,
)

print(va_encoded)

     cp_1  cp_2  cp_3  cp_4  restecg_0  restecg_1  restecg_2  slope_1  \
0     0.0   0.0   0.0   1.0        0.0        1.0        0.0      0.0   
1     0.0   0.0   0.0   1.0        0.0        1.0        0.0      0.0   
2     0.0   0.0   0.0   1.0        0.0        1.0        0.0      0.0   
3     0.0   0.0   0.0   1.0        0.0        1.0        0.0      1.0   
4     0.0   0.0   1.0   0.0        0.0        0.0        1.0      0.0   
..    ...   ...   ...   ...        ...        ...        ...      ...   
195   0.0   0.0   0.0   1.0        0.0        1.0        0.0      0.0   
196   1.0   0.0   0.0   0.0        0.0        1.0        0.0      0.0   
197   0.0   0.0   0.0   1.0        0.0        1.0        0.0      0.0   
198   0.0   0.0   0.0   1.0        0.0        0.0        1.0      0.0   
199   0.0   1.0   0.0   0.0        0.0        0.0        1.0      0.0   

     slope_2  slope_3  ...  age  sex  trestbps  chol  fbs  thalach  exang  \
0        1.0      0.0  ...   63    1       140



In [24]:
# Inspect the encoded column names, in particular which `*_nan` indicator
# columns the encoder produced for this dataset.
print(va_encoded.columns)

Index(['cp_1', 'cp_2', 'cp_3', 'cp_4', 'restecg_0', 'restecg_1', 'restecg_2',
       'slope_1', 'slope_2', 'slope_3', 'slope_nan', 'thal_3', 'thal_6',
       'thal_7', 'thal_nan', 'age', 'sex', 'trestbps', 'chol', 'fbs',
       'thalach', 'exang', 'oldpeak', 'ca', 'num'],
      dtype='object')


In [25]:
# Remove the missing-value indicator columns (this dataset has no `restecg_nan`),
# then relabel the remaining columns positionally with the Cleveland names. This
# relies on both frames holding the same columns in the same order.
va_encoded = va_encoded.drop(columns=['slope_nan','thal_nan'])
va_encoded.columns = cle_encoded.columns

In [26]:
# Min-max scale every column of the VA frame to [0, 1].
# NOTE(review): this re-fits the shared `scaler` on the VA data, while the
# imputer applied afterwards was fitted on Cleveland-scaled data — confirm that
# scaling each dataset by its own min/max is intended.
scaled_va = scaler.fit_transform(va_encoded)
va_encoded = pd.DataFrame(data=scaled_va, columns=va_encoded.columns)

In [27]:
# Fill the NaNs of the VA frame with the `imputer` fitted in an earlier cell
# (its donor rows are the imputed Cleveland data; it is not re-fitted here).
df_imputed_va = pd.DataFrame(
    data=imputer.transform(va_encoded),
    columns=va_encoded.columns,
)

print(df_imputed_va)

     cp_1.0  cp_2.0  cp_3.0  cp_4.0  restecg_0.0  restecg_1.0  restecg_2.0  \
0       0.0     0.0     0.0     1.0          0.0          1.0          0.0   
1       0.0     0.0     0.0     1.0          0.0          1.0          0.0   
2       0.0     0.0     0.0     1.0          0.0          1.0          0.0   
3       0.0     0.0     0.0     1.0          0.0          1.0          0.0   
4       0.0     0.0     1.0     0.0          0.0          0.0          1.0   
..      ...     ...     ...     ...          ...          ...          ...   
195     0.0     0.0     0.0     1.0          0.0          1.0          0.0   
196     1.0     0.0     0.0     0.0          0.0          1.0          0.0   
197     0.0     0.0     0.0     1.0          0.0          1.0          0.0   
198     0.0     0.0     0.0     1.0          0.0          0.0          1.0   
199     0.0     1.0     0.0     0.0          0.0          0.0          1.0   

     slope_1.0  slope_2.0  slope_3.0  ...       age  sex  trest

In [28]:
# Write the imputed (min-max scaled) VA frame to CSV in the working directory,
# without the index column.
# NOTE(review): the section is titled "Long Beach, VA", so "VA" presumably does
# not mean Virginia; the file name is left untouched in case other code reads it.
df_imputed_va.to_csv('Virginia Imputed Data.csv', index=False)

## Combined Data

In [29]:
# Read the four raw files (none has a header row) and stack them row-wise.
files = ['data/processed.cleveland.data','data/processed.hungarian.data','data/processed.switzerland.data','data/processed.va.data']

dfs = []

for file in files:
    df = pd.read_csv(file, header=None)
    dfs.append(df)

# `ignore_index=True` gives the stacked frame a fresh 0..n-1 index, so the
# column-wise concat done in a later cell aligns row by row.
combined_df = pd.concat(dfs, axis=0, ignore_index=True)
combined_df.columns = columns_names
print(combined_df)

# Missing entries are written as '?' in the files; turn them into NaN.
# `np.nan` is used instead of `pd.np.nan`: the `pd.np` alias is deprecated
# (it triggers a FutureWarning) and no longer exists in pandas 2.x.
combined_df = combined_df.replace('?', np.nan)

# Number of rows holding at least one NaN (not the number of NaN cells).
count_missing = combined_df.isna().any(axis=1).sum()
print('Missing values:', count_missing)

      age  sex   cp trestbps   chol  fbs restecg thalach exang oldpeak slope  \
0    63.0  1.0  1.0    145.0  233.0  1.0     2.0   150.0   0.0     2.3   3.0   
1    67.0  1.0  4.0    160.0  286.0  0.0     2.0   108.0   1.0     1.5   2.0   
2    67.0  1.0  4.0    120.0  229.0  0.0     2.0   129.0   1.0     2.6   2.0   
3    37.0  1.0  3.0    130.0  250.0  0.0     0.0   187.0   0.0     3.5   3.0   
4    41.0  0.0  2.0    130.0  204.0  0.0     2.0   172.0   0.0     1.4   1.0   
..    ...  ...  ...      ...    ...  ...     ...     ...   ...     ...   ...   
915  54.0  0.0  4.0      127    333    1       1     154     0       0     ?   
916  62.0  1.0  1.0        ?    139    0       1       ?     ?       ?     ?   
917  55.0  1.0  4.0      122    223    1       1     100     0       0     ?   
918  58.0  1.0  4.0        ?    385    1       2       ?     ?       ?     ?   
919  62.0  1.0  2.0      120    254    0       2      93     1       0     ?   

      ca thal  num  
0    0.0  6.0    0

  combined_df.replace('?', pd.np.nan, inplace=True)


In [30]:
# Cast every column to float64 now that '?' has been replaced by NaN, so the
# columns that were read as text become numeric.
combined_df = combined_df.astype('float64')
print(combined_df)

      age  sex   cp  trestbps   chol  fbs  restecg  thalach  exang  oldpeak  \
0    63.0  1.0  1.0     145.0  233.0  1.0      2.0    150.0    0.0      2.3   
1    67.0  1.0  4.0     160.0  286.0  0.0      2.0    108.0    1.0      1.5   
2    67.0  1.0  4.0     120.0  229.0  0.0      2.0    129.0    1.0      2.6   
3    37.0  1.0  3.0     130.0  250.0  0.0      0.0    187.0    0.0      3.5   
4    41.0  0.0  2.0     130.0  204.0  0.0      2.0    172.0    0.0      1.4   
..    ...  ...  ...       ...    ...  ...      ...      ...    ...      ...   
915  54.0  0.0  4.0     127.0  333.0  1.0      1.0    154.0    0.0      0.0   
916  62.0  1.0  1.0       NaN  139.0  0.0      1.0      NaN    NaN      NaN   
917  55.0  1.0  4.0     122.0  223.0  1.0      1.0    100.0    0.0      0.0   
918  58.0  1.0  4.0       NaN  385.0  1.0      2.0      NaN    NaN      NaN   
919  62.0  1.0  2.0     120.0  254.0  0.0      2.0     93.0    1.0      0.0   

     slope   ca  thal  num  
0      3.0  0.0   6.0 

In [31]:
# One-hot encode the categorical codes of the combined data, re-fitting the
# `encoder` object created in an earlier cell.
encoded_data = encoder.fit_transform(combined_df[['cp', 'restecg', 'slope', 'thal']])
df_encoded = pd.DataFrame(
    encoded_data,
    columns=encoder.get_feature_names(['cp', 'restecg', 'slope', 'thal']),
)

# Put the columns that were not one-hot encoded to the right of the indicators.
df_encoded = pd.concat(
    [
        df_encoded,
        combined_df[['age','sex','trestbps','chol','fbs','thalach','exang','oldpeak','ca','num']],
    ],
    axis=1,
)

print(df_encoded)



     cp_1.0  cp_2.0  cp_3.0  cp_4.0  restecg_0.0  restecg_1.0  restecg_2.0  \
0       1.0     0.0     0.0     0.0          0.0          0.0          1.0   
1       0.0     0.0     0.0     1.0          0.0          0.0          1.0   
2       0.0     0.0     0.0     1.0          0.0          0.0          1.0   
3       0.0     0.0     1.0     0.0          1.0          0.0          0.0   
4       0.0     1.0     0.0     0.0          0.0          0.0          1.0   
..      ...     ...     ...     ...          ...          ...          ...   
915     0.0     0.0     0.0     1.0          0.0          1.0          0.0   
916     1.0     0.0     0.0     0.0          0.0          1.0          0.0   
917     0.0     0.0     0.0     1.0          0.0          1.0          0.0   
918     0.0     0.0     0.0     1.0          0.0          0.0          1.0   
919     0.0     1.0     0.0     0.0          0.0          0.0          1.0   

     restecg_nan  slope_1.0  slope_2.0  ...   age  sex  trestbp

In [32]:
# Inspect the encoded column names, in particular which `*_nan` indicator
# columns the encoder produced for the combined data.
print(df_encoded.columns)

Index(['cp_1.0', 'cp_2.0', 'cp_3.0', 'cp_4.0', 'restecg_0.0', 'restecg_1.0',
       'restecg_2.0', 'restecg_nan', 'slope_1.0', 'slope_2.0', 'slope_3.0',
       'slope_nan', 'thal_3.0', 'thal_6.0', 'thal_7.0', 'thal_nan', 'age',
       'sex', 'trestbps', 'chol', 'fbs', 'thalach', 'exang', 'oldpeak', 'ca',
       'num'],
      dtype='object')


In [33]:
# Remove the missing-value indicator columns, then relabel the remaining columns
# positionally with the Cleveland names. This relies on both frames holding the
# same columns in the same order.
df_encoded = df_encoded.drop(columns=['restecg_nan','slope_nan','thal_nan'])
df_encoded.columns = cle_encoded.columns

In [34]:
# Min-max scale every column of the combined frame to [0, 1].
# NOTE(review): this re-fits the shared `scaler` on the combined data, while the
# imputer applied afterwards was fitted on Cleveland-scaled data — confirm that
# the two scalings are meant to differ.
scaled_df = scaler.fit_transform(df_encoded)
df_encoded = pd.DataFrame(data=scaled_df, columns=df_encoded.columns)

In [35]:
# Fill the NaNs of the combined frame using 5 nearest neighbours.
# NOTE(review): `imputer` is the object fitted in an earlier cell on the imputed
# Cleveland data and is not re-fitted here, so every donor row comes from
# Cleveland — confirm this is intended for the combined dataset.
df_imputed = pd.DataFrame(
    data=imputer.transform(df_encoded),
    columns=df_encoded.columns,
)

print(df_imputed)

     cp_1.0  cp_2.0  cp_3.0  cp_4.0  restecg_0.0  restecg_1.0  restecg_2.0  \
0       1.0     0.0     0.0     0.0          0.0          0.0          1.0   
1       0.0     0.0     0.0     1.0          0.0          0.0          1.0   
2       0.0     0.0     0.0     1.0          0.0          0.0          1.0   
3       0.0     0.0     1.0     0.0          1.0          0.0          0.0   
4       0.0     1.0     0.0     0.0          0.0          0.0          1.0   
..      ...     ...     ...     ...          ...          ...          ...   
915     0.0     0.0     0.0     1.0          0.0          1.0          0.0   
916     1.0     0.0     0.0     0.0          0.0          1.0          0.0   
917     0.0     0.0     0.0     1.0          0.0          1.0          0.0   
918     0.0     0.0     0.0     1.0          0.0          0.0          1.0   
919     0.0     1.0     0.0     0.0          0.0          0.0          1.0   

     slope_1.0  slope_2.0  slope_3.0  ...       age  sex  trest

In [36]:
# Write the imputed (min-max scaled) combined frame to CSV in the working
# directory, without the index column.
df_imputed.to_csv('Combined Imputed Data.csv', index=False)