In [1]:
import pandas as pd 

#### Load the Echocardiogram dataset
Source: http://archive.ics.uci.edu/ml/datasets/Echocardiogram

In [2]:
# Read the echocardiogram CSV into a DataFrame; the path is relative to the
# notebook's working directory.
ec_data = pd.read_csv("./datasets/echocardiogram.csv")

# Preview the first five rows.
ec_data.head(5)

Unnamed: 0,survival,alive,age,pericardialeffusion,fractionalshortening,epss,lvdd,wallmotion-score,wallmotion-index,mult,name,group,aliveat1
0,11.0,0.0,71.0,0.0,0.26,9.0,4.6,14.0,1.0,1.0,Hendrik,1,0.0
1,19.0,0.0,72.0,0.0,0.38,6.0,4.1,14.0,1.7,0.588,Abraham,1,0.0
2,3.0,1.0,,0.0,,12.0,,6.0,3.0,0.14,Candi,2,
3,16.0,0.0,55.0,0.0,0.26,4.0,3.42,14.0,1.0,1.0,Adam,1,0.0
4,57.0,0.0,60.0,0.0,0.253,12.062,4.603,16.0,1.45,0.788,Barret,1,0.0


#### Check number of nulls in the "age" column

In [3]:
# isna() flags each missing entry as True; summing the flags counts them.
ec_data['age'].isna().sum()

7

#### Check number of non-nulls in the "age" column

In [4]:
# notna() flags each present entry as True; summing the flags counts them.
ec_data['age'].notna().sum()

126

#### Replace all nulls in the DataFrame with 0
This will replace null values not just in the age column but in other cells across the DataFrame

In [5]:
# Substitute 0 for every missing value across the whole frame, not only "age".
# fillna() hands back a new DataFrame, so ec_data itself keeps its NaNs.
ec_data_filled = ec_data.fillna(value=0)

print('NaN Replaced with "0" :')
ec_data_filled.head(5)

NaN Replaced with "0" :


Unnamed: 0,survival,alive,age,pericardialeffusion,fractionalshortening,epss,lvdd,wallmotion-score,wallmotion-index,mult,name,group,aliveat1
0,11.0,0.0,71.0,0.0,0.26,9.0,4.6,14.0,1.0,1.0,Hendrik,1,0.0
1,19.0,0.0,72.0,0.0,0.38,6.0,4.1,14.0,1.7,0.588,Abraham,1,0.0
2,3.0,1.0,0.0,0.0,0.0,12.0,0.0,6.0,3.0,0.14,Candi,2,0.0
3,16.0,0.0,55.0,0.0,0.26,4.0,3.42,14.0,1.0,1.0,Adam,1,0.0
4,57.0,0.0,60.0,0.0,0.253,12.062,4.603,16.0,1.45,0.788,Barret,1,0.0


#### There are no longer nulls in the age column

In [6]:
# Count the missing ages that remain after the fill.
ec_data_filled['age'].isna().sum()

0

#### All 133 rows in the age column are now non-null

In [7]:
# Count the ages that are present after the fill.
ec_data_filled['age'].notna().sum()

133

#### The dropna() function
Rather than filling missing values with an explicit value, one can simply drop any rows that contain missing values. By default, the dropna() function drops an entire row even if only a single cell in it is missing.

We can ask Pandas to drop only rows which contain all nulls by setting the "how" argument to a value of 'all'.

In [8]:
# axis=0 with how='any' (both defaults, spelled out here) discards a row as
# soon as one of its cells is missing. The result is a new DataFrame.
ec_data_no_missing = ec_data.dropna(axis=0, how='any')

ec_data_no_missing.head(5)

Unnamed: 0,survival,alive,age,pericardialeffusion,fractionalshortening,epss,lvdd,wallmotion-score,wallmotion-index,mult,name,group,aliveat1
0,11.0,0.0,71.0,0.0,0.26,9.0,4.6,14.0,1.0,1.0,Hendrik,1,0.0
1,19.0,0.0,72.0,0.0,0.38,6.0,4.1,14.0,1.7,0.588,Abraham,1,0.0
3,16.0,0.0,55.0,0.0,0.26,4.0,3.42,14.0,1.0,1.0,Adam,1,0.0
4,57.0,0.0,60.0,0.0,0.253,12.062,4.603,16.0,1.45,0.788,Barret,1,0.0
5,19.0,1.0,57.0,0.0,0.16,22.0,5.75,18.0,2.25,0.571,Axel,1,0.0


#### There is no null data in the age column

In [9]:
# Count the missing ages left after dropping incomplete rows.
ec_data_no_missing['age'].isna().sum()

0

#### But the number of rows in the DataFrame has dropped considerably
It's not just the rows with null ages which were dropped but rows with nulls in any cell

In [10]:
# Every surviving row has a non-null age, so this count is also the number
# of rows left in the frame.
ec_data_no_missing['age'].notna().sum()

61

### Replacing values
It's not just nulls that can be replaced; any value in the DataFrame can be substituted

#### Examine the DataFrame again, specifically the "name" column

In [11]:
# Show the first ten rows of the original (unfilled) frame.
ec_data.head(n=10)

Unnamed: 0,survival,alive,age,pericardialeffusion,fractionalshortening,epss,lvdd,wallmotion-score,wallmotion-index,mult,name,group,aliveat1
0,11.0,0.0,71.0,0.0,0.26,9.0,4.6,14.0,1.0,1.0,Hendrik,1,0.0
1,19.0,0.0,72.0,0.0,0.38,6.0,4.1,14.0,1.7,0.588,Abraham,1,0.0
2,3.0,1.0,,0.0,,12.0,,6.0,3.0,0.14,Candi,2,
3,16.0,0.0,55.0,0.0,0.26,4.0,3.42,14.0,1.0,1.0,Adam,1,0.0
4,57.0,0.0,60.0,0.0,0.253,12.062,4.603,16.0,1.45,0.788,Barret,1,0.0
5,19.0,1.0,57.0,0.0,0.16,22.0,5.75,18.0,2.25,0.571,Axel,1,0.0
6,26.0,0.0,68.0,0.0,0.26,5.0,4.31,12.0,1.0,0.857,Bailey,1,0.0
7,13.0,0.0,62.0,0.0,0.23,31.0,5.43,22.5,1.875,0.857,Cesar,1,0.0
8,50.0,0.0,60.0,0.0,0.33,8.0,5.25,14.0,1.0,1.0,Keith,1,0.0
9,19.0,0.0,46.0,0.0,0.34,0.0,5.09,16.0,1.14,1.003,Davon,1,0.0


#### Use the replace() function to substitute values
We replace some of the first names in the data with full names (first name plus last name)

In [12]:
# Map each first name to the full name that should replace it.
full_name_by_first_name = {
    'Candi': 'Candi Olivarez',
    'Adam': 'Adam Schmidt',
    'Barret': 'Barret Gibbs',
    'Axel': 'Axel Kaunda',
    'Cesar': 'Cesar Rodrigeuz',
}

# Nest the mapping under 'name' so replace() only looks in that column.
# A flat dict would be matched against every column of the frame, which
# could silently rewrite an equal value stored somewhere else.
ec_data_replaced = ec_data.replace({'name': full_name_by_first_name})

ec_data_replaced.head(10)

Unnamed: 0,survival,alive,age,pericardialeffusion,fractionalshortening,epss,lvdd,wallmotion-score,wallmotion-index,mult,name,group,aliveat1
0,11.0,0.0,71.0,0.0,0.26,9.0,4.6,14.0,1.0,1.0,Hendrik,1,0.0
1,19.0,0.0,72.0,0.0,0.38,6.0,4.1,14.0,1.7,0.588,Abraham,1,0.0
2,3.0,1.0,,0.0,,12.0,,6.0,3.0,0.14,Candi Olivarez,2,
3,16.0,0.0,55.0,0.0,0.26,4.0,3.42,14.0,1.0,1.0,Adam Schmidt,1,0.0
4,57.0,0.0,60.0,0.0,0.253,12.062,4.603,16.0,1.45,0.788,Barret Gibbs,1,0.0
5,19.0,1.0,57.0,0.0,0.16,22.0,5.75,18.0,2.25,0.571,Axel Kaunda,1,0.0
6,26.0,0.0,68.0,0.0,0.26,5.0,4.31,12.0,1.0,0.857,Bailey,1,0.0
7,13.0,0.0,62.0,0.0,0.23,31.0,5.43,22.5,1.875,0.857,Cesar Rodrigeuz,1,0.0
8,50.0,0.0,60.0,0.0,0.33,8.0,5.25,14.0,1.0,1.0,Keith,1,0.0
9,19.0,0.0,46.0,0.0,0.34,0.0,5.09,16.0,1.14,1.003,Davon,1,0.0
