In [18]:
import os

import numpy as np
import pandas as pd

In [3]:
# Names assigned to the 76 values that make up one record of a heart disease .data file,
# in the order the values appear in the record. heart_to_df uses this list as the dataframe header.
column_names = [
    'id', 'ssn', 'age', 'sex', 'painloc', 'painexer', 'relrest', 'pncaden',                  # values  1-8
    'cp', 'trestbps', 'htn', 'chol', 'smoke', 'cigs', 'years', 'fbs',                        # values  9-16
    'dm', 'famhist', 'restecg', 'ekgmo', 'ekgday', 'ekgyr', 'dig', 'prop',                   # values 17-24
    'nitr', 'pro', 'diuretic', 'proto', 'thaldur', 'thaltime', 'met', 'thalach',             # values 25-32
    'thalrest', 'tpeakbps', 'tpeakbpd', 'dummy', 'trestbpd', 'exang', 'xhypo', 'oldpeak',    # values 33-40
    'slope', 'rldv5', 'rldv5e', 'ca', 'restckm', 'exerckm', 'restef', 'restwm',              # values 41-48
    'exeref', 'exerwm', 'thal', 'thalsev', 'thalpul', 'earlobe', 'cmo', 'cday',              # values 49-56
    'cyr', 'num', 'lmt', 'ladprox', 'laddist', 'diag', 'cxmain', 'ramus',                    # values 57-64
    'om1', 'om2', 'rcaprox', 'rcadist', 'lvx1', 'lvx2', 'lvx3', 'lvx4',                      # values 65-72
    'lvf', 'cathef', 'junk', 'name',                                                         # values 73-76
]

In [24]:
def heart_to_df(file, columns = None):
    '''
    PURPOSE   takes heart disease .data files from the UCI repository and converts to pandas dataframe
    
    ARGUMENTS
    file      file name (or path) as a string
    columns   optional list of column names, one per value in a record;
              defaults to the module-level column_names list
    
    OUTPUT    pandas dataframe with one row per record (all values are kept as strings),
              plus a 'loc' column holding the first four characters of the file's base name
    '''
    
    LINES_PER_RECORD = 10                                     # each record is spread over 10 consecutive lines of the file
    
    if columns is None:
        columns = column_names
    
    with open(file, 'r') as f:                                # context manager closes the handle even if reading fails
        text = f.read().split('\n')
    
    # only complete groups of 10 lines form a record; a trailing partial group
    # (including the empty string left after a final newline) is dropped
    n_complete = len(text) // LINES_PER_RECORD * LINES_PER_RECORD
    
    # join each group of lines into one record, then split on any run of whitespace
    # so doubled or trailing spaces do not create empty values
    entries_list_split = [' '.join(text[start:start + LINES_PER_RECORD]).split()
                          for start in range(0, n_complete, LINES_PER_RECORD)]
    
    heart_df = pd.DataFrame(data = entries_list_split, columns = columns)   # convert to dataframe
    heart_df['loc'] = os.path.basename(file)[:4]                            # location tag comes from the file name, not its directory
    return heart_df

In [25]:
# Parse the four site files into one dataframe each, in the same order as the names on the left.
swiss, hung, clev, long = (
    heart_to_df(data_file)
    for data_file in ('switzerland.data', 'hungarian.data', 'cleveland.data', 'long-beach-va.data')
)

In [26]:
def _to_numeric_if_possible(column):
    '''
    PURPOSE   converts a column to a numeric dtype when every value can be parsed as a number
    
    ARGUMENTS
    column    pandas series
    
    OUTPUT    the numeric series, or the original series unchanged if conversion fails
    '''
    # explicit try/except replaces pd.to_numeric(errors = 'ignore'), which newer pandas deprecates
    try:
        return pd.to_numeric(column)
    except (ValueError, TypeError):
        return column

combined = pd.concat([swiss, hung, clev, long], axis = 0)   # stack the four dataframes; each keeps its own row index
combined = combined.apply(_to_numeric_if_possible)          # convert each column to a numeric dtype where possible
combined = combined.replace(-9, np.nan)                     # -9 is treated as the missing-value marker
combined

Unnamed: 0,id,ssn,age,sex,painloc,painexer,relrest,pncaden,cp,trestbps,...,rcadist,lvx1,lvx2,lvx3,lvx4,lvf,cathef,junk,name,loc
0,3001,0,65,1,1.0,1.0,1.0,,4,115.0,...,1.0,1.0,1.0,1.0,1.0,1.0,75.00,,name,swit
1,3002,0,32,1,0.0,0.0,0.0,,1,95.0,...,1.0,1.0,1.0,1.0,5.0,1.0,63.00,,name,swit
2,3003,0,61,1,1.0,1.0,1.0,,4,105.0,...,1.0,1.0,1.0,1.0,1.0,1.0,67.00,,name,swit
3,3004,0,50,1,1.0,1.0,1.0,,4,145.0,...,1.0,1.0,1.0,1.0,5.0,4.0,36.00,,name,swit
4,3005,0,57,1,1.0,1.0,1.0,,4,110.0,...,1.0,1.0,1.0,1.0,1.0,1.0,60.00,,name,swit
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,200,0,54,0,1.0,1.0,1.0,,4,127.0,...,1.0,1.0,1.0,1.0,1.0,1.0,0.76,5.6,name,long
196,201,0,62,1,0.0,0.0,0.0,,1,,...,1.0,1.0,1.0,1.0,1.0,2.0,0.62,3.5,name,long
197,202,0,55,1,1.0,1.0,1.0,,4,122.0,...,1.0,1.0,1.0,1.0,1.0,1.0,0.69,5.6,name,long
198,116,0,58,1,1.0,1.0,1.0,,4,,...,1.0,1.0,1.0,1.0,1.0,1.0,0.81,6.0,name,long


In [27]:
# Write the combined dataframe to CSV in the working directory; the row index is saved as the first column.
combined.to_csv(path_or_buf = 'combined.csv')