In [1]:
import pandas as pd
import numpy as np

In [2]:
# Sample records used throughout the notebook. Missing data is encoded in
# several different ways on purpose of demonstration: np.nan, None, and the
# placeholder strings 'NA' and 'Missing'. Ages are stored as strings.
people = dict(
    first=['ali', 'omar', 'sayed', 'mohamed', np.nan, None, 'NA'],
    last=['gamal', 'khaled', 'mostafa', 'palsel', np.nan, np.nan, 'Missing'],
    email=['ali@gmail.com', 'omar@gmail.com', 'sayed@gmail.com', None, np.nan, 'mohamed@gamil.com', 'NA'],
    age=['33', '55', '63', '36', None, None, 'Missing'],
)

In [16]:
# Build the frame, then turn the placeholder strings 'NA' and 'Missing' into
# NaN so that dropna / isna / fillna treat them like real missing values
# (None and np.nan are already recognised as missing by pandas).
# A single replace() call handles both placeholders and returns a new frame,
# so no in-place mutation is needed.
df = pd.DataFrame(people).replace(['NA', 'Missing'], np.nan)

In [4]:
# Show the whole frame; it has only seven rows, so no .head() is needed.
df

Unnamed: 0,first,last,email,age
0,ali,gamal,ali@gmail.com,33
1,omar,khaled,omar@gmail.com,55
2,sayed,mostafa,sayed@gmail.com,63
3,mohamed,palsel,,36
4,,,,
5,,,mohamed@gamil.com,
6,,Missing,,Missing


# drop null values

In [5]:
# Drop every row that has a missing value in at least one of its columns.
# dropna() returns a new frame; df itself is left unchanged.
df.dropna()

Unnamed: 0,first,last,email,age
0,ali,gamal,ali@gmail.com,33
1,omar,khaled,omar@gmail.com,55
2,sayed,mostafa,sayed@gmail.com,63
6,,Missing,,Missing


In [6]:
# Same result as the bare dropna() call, with the default arguments spelled out:
# axis='index' means rows are dropped, how='any' means a row is dropped as soon
# as any one of its values is missing. Both arguments can be changed.
df.dropna(axis='index', how='any')

Unnamed: 0,first,last,email,age
0,ali,gamal,ali@gmail.com,33
1,omar,khaled,omar@gmail.com,55
2,sayed,mostafa,sayed@gmail.com,63
6,,Missing,,Missing


In [26]:
# Drop every column that has a missing value in at least one of its rows.
# Every column here has at least one missing value, so all columns are dropped
# and only the index remains.
df.dropna(axis='columns', how='any')

0
1
2
3
4
5
6


In [7]:
# Drop a column only if every value in that column is missing.
df.dropna(axis='columns', how='all')

Unnamed: 0,first,last,email,age
0,ali,gamal,ali@gmail.com,33
1,omar,khaled,omar@gmail.com,55
2,sayed,mostafa,sayed@gmail.com,63
3,mohamed,palsel,,36
4,,,,
5,,,mohamed@gamil.com,
6,,Missing,,Missing


In [8]:
# Drop a row only if its 'email' value is missing; the `subset` argument
# restricts the null check to the listed columns.
df.dropna(axis='index', how='any', subset=['email'])

Unnamed: 0,first,last,email,age
0,ali,gamal,ali@gmail.com,33
1,omar,khaled,omar@gmail.com,55
2,sayed,mostafa,sayed@gmail.com,63
5,,,mohamed@gamil.com,
6,,Missing,,Missing


# display null values

In [9]:
# Boolean mask with the same shape as df: True wherever a cell holds a missing value.
df.isna()

Unnamed: 0,first,last,email,age
0,False,False,False,False
1,False,False,False,False
2,False,False,False,False
3,False,False,True,False
4,True,True,True,True
5,True,True,False,True
6,False,False,False,False


In [10]:
# isnull() is an alias of isna() and produces the same boolean mask.
df.isnull()

Unnamed: 0,first,last,email,age
0,False,False,False,False
1,False,False,False,False
2,False,False,False,False
3,False,False,True,False
4,True,True,True,True
5,True,True,False,True
6,False,False,False,False


In [11]:
# Summing the boolean mask (True counts as 1) gives the number of missing
# values in each column.
df.isnull().sum()

first    2
last     2
email    2
age      2
dtype: int64

# fill null values

In [12]:
# Fill every missing cell with the placeholder string 'MISSING'.
# fillna() returns a new frame; df itself is not modified.
df.fillna('MISSING')

Unnamed: 0,first,last,email,age
0,ali,gamal,ali@gmail.com,33
1,omar,khaled,omar@gmail.com,55
2,sayed,mostafa,sayed@gmail.com,63
3,mohamed,palsel,MISSING,36
4,MISSING,MISSING,MISSING,MISSING
5,MISSING,MISSING,mohamed@gamil.com,MISSING
6,,Missing,,Missing


In [13]:
# Fill every missing cell with the integer 0 instead (again returning a new frame).
df.fillna(0)

Unnamed: 0,first,last,email,age
0,ali,gamal,ali@gmail.com,33
1,omar,khaled,omar@gmail.com,55
2,sayed,mostafa,sayed@gmail.com,63
3,mohamed,palsel,0,36
4,0,0,0,0
5,0,0,mohamed@gamil.com,0
6,,Missing,,Missing


#  data types 

In [14]:
# Display the data type of each column. 'age' was built from strings, so it
# shows up as object rather than a numeric dtype.
df.dtypes

first    object
last     object
email    object
age      object
dtype: object

In [17]:
# Convert the 'age' column from strings to float so numeric operations work.
# float is used rather than int because the column contains NaN, which the
# default integer dtype cannot represent.
# This cast relies on the 'Missing' placeholder having already been replaced
# with NaN; a leftover 'Missing' string cannot be converted to float.
df['age'] = df['age'].astype(float)

In [18]:
# Mean age; NaN entries are skipped by default, so only the known ages are averaged.
df['age'].mean()

46.75