In [1]:
import pandas as pd 
import numpy as np 

In [2]:
# Load the cars dataset from a CSV file in the working directory.
# NOTE(review): the variable is named nfl_data although the file is Cars.csv;
# the name is kept because later cells reference it.
nfl_data = pd.read_csv('Cars.csv')

In [3]:
# Display the loaded DataFrame (long frames are truncated in the rendered output).
nfl_data

Unnamed: 0,HP,MPG,VOL,SP,WT
0,49,53.700681,89,104.185353,28.762059
1,55,50.013401,92,105.461264,30.466833
2,55,50.013401,92,105.461264,30.193597
3,70,45.696322,92,113.461264,30.632114
4,53,50.504232,92,104.461264,29.889149
...,...,...,...,...,...
76,322,36.900000,50,169.598513,16.132947
77,238,19.197888,115,150.576579,37.923113
78,263,34.000000,50,151.598513,15.769625
79,295,19.833733,119,167.944460,39.423099


In [4]:
# Seed NumPy's legacy global random generator so any later random draws are reproducible.
np.random.seed(seed=0)

In [5]:
# Preview the first five rows of the dataset.
nfl_data.head(n=5)

Unnamed: 0,HP,MPG,VOL,SP,WT
0,49,53.700681,89,104.185353,28.762059
1,55,50.013401,92,105.461264,30.466833
2,55,50.013401,92,105.461264,30.193597
3,70,45.696322,92,113.461264,30.632114
4,53,50.504232,92,104.461264,29.889149


In [7]:
# Count the missing (NaN/None) entries in each column.
missing_values_count = nfl_data.isna().sum()

In [8]:
# Show the per-column count of missing values.
missing_values_count

HP     0
MPG    0
VOL    0
SP     0
WT     0
dtype: int64

In [9]:
# No missing values: every column has a missing-value count of 0.

In [10]:
# Total number of cells in the DataFrame (rows x columns).
# np.prod replaces np.product, which was deprecated in NumPy 1.25 and removed in NumPy 2.0.
total_cells = np.prod(nfl_data.shape)
# Total number of missing cells, summed across all columns.
total_missing = missing_values_count.sum()

In [11]:
# Show the total number of cells (rows x columns) in the DataFrame.
total_cells

405

In [13]:
# Summary statistics (count, mean, std, min, quartiles, max) for each column.
nfl_data.describe()

Unnamed: 0,HP,MPG,VOL,SP,WT
count,81.0,81.0,81.0,81.0,81.0
mean,117.469136,34.422076,98.765432,121.540272,32.412577
std,57.113502,9.131445,22.301497,14.181432,7.492813
min,49.0,12.101263,50.0,99.564907,15.712859
25%,84.0,27.856252,89.0,113.829145,29.591768
50%,100.0,35.152727,101.0,118.208698,32.734518
75%,140.0,39.531633,113.0,126.404312,37.392524
max,322.0,53.700681,160.0,169.598513,52.997752


In [14]:
# Number of non-missing entries in each column.
nfl_data.count()

HP     81
MPG    81
VOL    81
SP     81
WT     81
dtype: int64

In [15]:
# Percentage of all cells in the DataFrame that are missing.
percent_missing = (total_missing/total_cells) * 100
print(percent_missing)

0.0


In [16]:
# The percentage is 0 because there are no null values.

In [17]:
# Drop every row that contains at least one missing value. The result is only
# displayed, not assigned, so nfl_data itself is left unchanged.
nfl_data.dropna(axis=0, how="any")

Unnamed: 0,HP,MPG,VOL,SP,WT
0,49,53.700681,89,104.185353,28.762059
1,55,50.013401,92,105.461264,30.466833
2,55,50.013401,92,105.461264,30.193597
3,70,45.696322,92,113.461264,30.632114
4,53,50.504232,92,104.461264,29.889149
...,...,...,...,...,...
76,322,36.900000,50,169.598513,16.132947
77,238,19.197888,115,150.576579,37.923113
78,263,34.000000,50,151.598513,15.769625
79,295,19.833733,119,167.944460,39.423099


In [18]:
# Drop every column that has at least one missing value, then preview the result.
columns_with_na_dropped = nfl_data.dropna(axis="columns")
columns_with_na_dropped.head()

Unnamed: 0,HP,MPG,VOL,SP,WT
0,49,53.700681,89,104.185353,28.762059
1,55,50.013401,92,105.461264,30.466833
2,55,50.013401,92,105.461264,30.193597
3,70,45.696322,92,113.461264,30.632114
4,53,50.504232,92,104.461264,29.889149


In [20]:
# Just how much data did we lose by dropping columns with missing values?

In [23]:
# Compare the number of columns before and after dropping columns with missing values.
print("Columns in original dataset : %d \n" % nfl_data.shape[1])
print("Columns with na dropped: %d " % columns_with_na_dropped.shape[1])

Columns in original dataset : 5 

Columns with na dropped: 5 


In [24]:
# In this case we don't have any NA values, so no columns were dropped.

In [25]:
# Filling in missing values automatically

In [26]:
# Take a small subset of the dataset: the first five rows of the columns
# from 'MPG' through 'SP' (label-based slicing includes both endpoints).
subset_nfl_data = nfl_data.loc[:, "MPG":"SP"].head()
subset_nfl_data

Unnamed: 0,MPG,VOL,SP
0,53.700681,89,104.185353
1,50.013401,92,105.461264
2,50.013401,92,105.461264
3,45.696322,92,113.461264
4,50.504232,92,104.461264


In [27]:
# Replace every missing value with 0. The result is only displayed;
# subset_nfl_data itself is not modified.
subset_nfl_data.fillna(value=0)

Unnamed: 0,MPG,VOL,SP
0,53.700681,89,104.185353
1,50.013401,92,105.461264
2,50.013401,92,105.461264
3,45.696322,92,113.461264
4,50.504232,92,104.461264


In [28]:
# Replace each NA with the value that comes directly after it in the same column
# (backward fill), then replace any remaining NA with 0.
# DataFrame.bfill() is used because fillna(method='bfill') is deprecated in
# pandas 2.1 and scheduled for removal.
subset_nfl_data.bfill(axis=0).fillna(0)

Unnamed: 0,MPG,VOL,SP
0,53.700681,89,104.185353
1,50.013401,92,105.461264
2,50.013401,92,105.461264
3,45.696322,92,113.461264
4,50.504232,92,104.461264
