In [104]:
import numpy as np
import pandas as pd

In [105]:
# Toy contact records with deliberately messy gaps: real nulls (None / np.nan)
# are mixed with placeholder strings ("Missing", 'NA') that only look like values.
# Every column is a list of 7 entries; numeric-looking fields are kept as strings.
people = dict(
    email=[
        "dui.fusce.diam@icloud.edu",
        None,
        "orci.lobortis@icloud.net",
        "Missing",
        "quis@aol.org",
        "rutrum.magna@hotmail.com",
        None,
    ],
    name=[
        "Nolan Bernard",
        "Hayes Noble",
        "Joseph Mullins",
        "Cleo Mathews",
        "Owen Cantu",
        "NA",
        np.nan,
    ],
    postalZip=["54670", "59982", "72241", "3613", np.nan, "442851", None],
    country=[
        "Ukraine",
        "Cyprus",
        "Philippines",
        "Indonesia",
        "India",
        "Morocco",
        None,
    ],
    age=["23", "56", "41", "33", np.nan, "52", "29"],
)

In [106]:
# Build the working frame from the column-oriented dict: each key becomes a column.
df = pd.DataFrame(data=people)

df

Unnamed: 0,email,name,postalZip,country,age
0,dui.fusce.diam@icloud.edu,Nolan Bernard,54670.0,Ukraine,23.0
1,,Hayes Noble,59982.0,Cyprus,56.0
2,orci.lobortis@icloud.net,Joseph Mullins,72241.0,Philippines,41.0
3,Missing,Cleo Mathews,3613.0,Indonesia,33.0
4,quis@aol.org,Owen Cantu,,India,
5,rutrum.magna@hotmail.com,,442851.0,Morocco,52.0
6,,,,,29.0


# Cleaning

### Replace placeholder strings that should be treated as NaN

In [107]:
# Turn the placeholder strings 'NA' and 'Missing' into real missing values so
# that dropna / isna / fillna below can see them.
# Rebinding `df` to the returned frame (instead of inplace=True) keeps the call
# chainable and makes the data flow explicit.
df = df.replace({
    'NA': np.nan,
    'Missing': np.nan
})

df

Unnamed: 0,email,name,postalZip,country,age
0,dui.fusce.diam@icloud.edu,Nolan Bernard,54670.0,Ukraine,23.0
1,,Hayes Noble,59982.0,Cyprus,56.0
2,orci.lobortis@icloud.net,Joseph Mullins,72241.0,Philippines,41.0
3,,Cleo Mathews,3613.0,Indonesia,33.0
4,quis@aol.org,Owen Cantu,,India,
5,rutrum.magna@hotmail.com,,442851.0,Morocco,52.0
6,,,,,29.0


### Drop

In [108]:
# Preview only: keep the rows that have no missing value at all
# (how='any' is the default, spelled out here for clarity).
df.dropna(how='any')

Unnamed: 0,email,name,postalZip,country,age
0,dui.fusce.diam@icloud.edu,Nolan Bernard,54670,Ukraine,23
2,orci.lobortis@icloud.net,Joseph Mullins,72241,Philippines,41


In [109]:
# Preview only: drop every column that contains at least one missing value.
# Here every column has a gap, so only the index is left.
df.dropna(axis='columns', how='any')

0
1
2
3
4
5
6


In [110]:
# how='all' drops a row only when every column in it is missing;
# the last row survives because its age is still set.
df.dropna(axis='index', how='all')

Unnamed: 0,email,name,postalZip,country,age
0,dui.fusce.diam@icloud.edu,Nolan Bernard,54670.0,Ukraine,23.0
1,,Hayes Noble,59982.0,Cyprus,56.0
2,orci.lobortis@icloud.net,Joseph Mullins,72241.0,Philippines,41.0
3,,Cleo Mathews,3613.0,Indonesia,33.0
4,quis@aol.org,Owen Cantu,,India,
5,rutrum.magna@hotmail.com,,442851.0,Morocco,52.0
6,,,,,29.0


In [111]:
# With `subset`, how='all' only looks at email & country: a row is dropped when
# BOTH of those are missing, whatever the other columns contain.
# Rebinding `df` (instead of inplace=True) keeps the call chainable and makes
# the data flow explicit.
df = df.dropna(how='all', subset=['email', 'country'])

df

Unnamed: 0,email,name,postalZip,country,age
0,dui.fusce.diam@icloud.edu,Nolan Bernard,54670.0,Ukraine,23.0
1,,Hayes Noble,59982.0,Cyprus,56.0
2,orci.lobortis@icloud.net,Joseph Mullins,72241.0,Philippines,41.0
3,,Cleo Mathews,3613.0,Indonesia,33.0
4,quis@aol.org,Owen Cantu,,India,
5,rutrum.magna@hotmail.com,,442851.0,Morocco,52.0


In [112]:
# Boolean mask with the same shape as df: True wherever a cell is missing.
pd.isna(df)

Unnamed: 0,email,name,postalZip,country,age
0,False,False,False,False,False
1,True,False,False,False,False
2,False,False,False,False,False
3,True,False,False,False,False
4,False,False,True,False,True
5,False,True,False,False,False


In [113]:
# Preview only: show every missing cell as '-' (the result is not stored back into df).
df.fillna(value='-')

Unnamed: 0,email,name,postalZip,country,age
0,dui.fusce.diam@icloud.edu,Nolan Bernard,54670,Ukraine,23
1,-,Hayes Noble,59982,Cyprus,56
2,orci.lobortis@icloud.net,Joseph Mullins,72241,Philippines,41
3,-,Cleo Mathews,3613,Indonesia,33
4,quis@aol.org,Owen Cantu,-,India,-
5,rutrum.magna@hotmail.com,-,442851,Morocco,52


In [114]:
# Optional: persist a fill value for a single column (left disabled here):
# df['postalZip'] = df['postalZip'].fillna('0')

# Casting

In [115]:
# Inspect the storage type of each column before attempting numeric operations.
df.dtypes

email        object
name         object
postalZip    object
country      object
age          object
dtype: object

In [116]:
# df['age'].mean() # --> Error: 'age' is still object dtype (strings), so it cannot be averaged

- Convert the `age` column to float (not int: the column contains NaN, which is a float value and cannot be stored in a plain integer column)

In [120]:
# Parse the numeric strings in `age` into 64-bit floats so that aggregations
# such as mean() work; the missing entry stays NaN.
df['age'] = df['age'].astype('float64')

df.dtypes

email         object
name          object
postalZip     object
country       object
age          float64
dtype: object

In [121]:
# Works now that `age` is numeric; NaN entries are skipped (the default),
# so this is the average of the known ages only.
df['age'].mean(skipna=True)

41.0