# Dealing with missing data

In [41]:
import pandas as pd 
import numpy as np

In [42]:
# Load the Toyota CSV into a DataFrame. The first CSV column becomes the row
# index, and the placeholder strings 'NA', '??' and '????' are parsed as NaN.
df = pd.read_csv('Datasets/Toyota.csv', index_col=0, na_values=['NA', '??', '????'])

# Preview the first rows.
df.head()

Unnamed: 0,Price,Age,KM,FuelType,HP,MetColor,Automatic,CC,Doors,Weight
0,13500,23.0,46986.0,Diesel,90.0,1.0,0,2000,three,1165
1,13750,23.0,72937.0,Diesel,90.0,1.0,0,2000,3,1165
2,13950,24.0,41711.0,Diesel,90.0,,0,2000,3,1165
3,14950,26.0,48000.0,Diesel,90.0,0.0,0,2000,3,1165
4,13750,30.0,38500.0,Diesel,90.0,0.0,0,2000,3,1170


In [43]:
# Number of missing (NaN) entries in each column.
df.isna().sum()

Price          0
Age          100
KM            15
FuelType     100
HP             6
MetColor     150
Automatic      0
CC             0
Doors          0
Weight         0
dtype: int64

In [44]:
# Select every row that has at least one NaN in any of its columns.
missing_values = df.loc[df.isna().any(axis='columns')]
missing_values.head()

Unnamed: 0,Price,Age,KM,FuelType,HP,MetColor,Automatic,CC,Doors,Weight
2,13950,24.0,41711.0,Diesel,90.0,,0,2000,3,1165
6,16900,27.0,,Diesel,,,0,2000,3,1245
7,18600,30.0,75889.0,,90.0,1.0,0,2000,3,1245
9,12950,23.0,71138.0,Diesel,,,0,1900,3,1105
15,22000,28.0,18739.0,Petrol,,0.0,0,1800,3,1185


* Fill in missing values of numeric columns with the column mean or median.
* Fill in missing values of categorical columns with the most frequent class (the mode).

In [45]:
# Summary statistics for the numeric columns. `count` only includes non-null
# entries, so a count below the number of rows signals missing values.

df.describe()

Unnamed: 0,Price,Age,KM,HP,MetColor,Automatic,CC,Weight
count,1436.0,1336.0,1421.0,1430.0,1286.0,1436.0,1436.0,1436.0
mean,10730.824513,55.672156,68647.239972,101.478322,0.674961,0.05571,1566.827994,1072.45961
std,3626.964585,18.589804,37333.023589,14.768255,0.468572,0.229441,187.182436,52.64112
min,4350.0,1.0,1.0,69.0,0.0,0.0,1300.0,1000.0
25%,8450.0,43.0,43210.0,90.0,0.0,0.0,1400.0,1040.0
50%,9900.0,60.0,63634.0,110.0,1.0,0.0,1600.0,1070.0
75%,11950.0,70.0,87000.0,110.0,1.0,0.0,1600.0,1085.0
max,32500.0,80.0,243000.0,192.0,1.0,1.0,2000.0,1615.0


In [46]:
# Show the two candidate fill values for Age: mean first, then median.
print(df['Age'].mean(), df['Age'].median(), sep='\n')

55.67215568862275
60.0


In [52]:
# Fill missing Age values with the column mean.
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on df['Age']: that chained in-place form is
# deprecated and, under pandas Copy-on-Write, leaves `df` unchanged.
# NOTE(review): Age's mean is below its median; use .median() here instead
# if a more outlier-robust fill is preferred.
df['Age'] = df['Age'].fillna(df['Age'].mean())

In [53]:
# Inspect the range and central tendency of KM before imputing.
print(df['KM'].max())
print(df['KM'].min())
print(df['KM'].mean())
print(df['KM'].median())

# Fill missing KM values with the column median.
# Assign the result back rather than using fillna(..., inplace=True) on
# df['KM']: the chained in-place form is deprecated and does not update
# `df` under pandas Copy-on-Write.
df['KM'] = df['KM'].fillna(df['KM'].median())

243000.0
1.0
68594.87325905293
63634.0


In [54]:
# Inspect the range and central tendency of HP before imputing.
print(df['HP'].min())
print(df['HP'].max())
print(df['HP'].mean())
print(df['HP'].median())

# Fill missing HP values with the column mean.
# Assign the result back rather than using fillna(..., inplace=True) on
# df['HP']: the chained in-place form is deprecated and does not update
# `df` under pandas Copy-on-Write.
df['HP'] = df['HP'].fillna(df['HP'].mean())

69.0
192.0
101.47832167832168
110.0


In [55]:
# Re-count the missing entries per column after filling Age, KM and HP.
df.isna().sum()

Price          0
Age            0
KM             0
FuelType     100
HP             0
MetColor     150
Automatic      0
CC             0
Doors          0
Weight         0
dtype: int64

In [61]:
# Fill missing FuelType entries with the most frequent category.
# value_counts() orders categories by descending count, so the first index
# label is the most common one; compute it once and reuse it.
most_common_fuel = df['FuelType'].value_counts().index[0]
print(most_common_fuel)

# Assign the result back rather than using fillna(..., inplace=True) on
# df['FuelType']: the chained in-place form is deprecated and does not
# update `df` under pandas Copy-on-Write.
df['FuelType'] = df['FuelType'].fillna(most_common_fuel)

Petrol


In [62]:
# Re-count the missing entries per column after filling FuelType.
df.isna().sum()

Price          0
Age            0
KM             0
FuelType       0
HP             0
MetColor     150
Automatic      0
CC             0
Doors          0
Weight         0
dtype: int64

In [68]:
# Fill the missing MetColor values with the column's most frequent value.
# First show the modal value of every column for reference.
print(df.mode())

# Series.mode() returns a Series of modal values; [0] picks the first one.
# Assign the result back rather than using fillna(..., inplace=True) on
# df['MetColor']: the chained in-place form is deprecated and does not
# update `df` under pandas Copy-on-Write.
df['MetColor'] = df['MetColor'].fillna(df['MetColor'].mode()[0])


   Price        Age       KM FuelType     HP  MetColor  Automatic    CC Doors  \
0   8950  55.672156  63634.0   Petrol  110.0       1.0          0  1600     5   

   Weight  
0    1075  


In [69]:
# Final count of missing entries per column after filling MetColor.
df.isna().sum()

Price        0
Age          0
KM           0
FuelType     0
HP           0
MetColor     0
Automatic    0
CC           0
Doors        0
Weight       0
dtype: int64