# Part 2: Handling Missing Values

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns 
import matplotlib.pyplot as plt
from scipy import stats
from scipy.stats.mstats import winsorize

import warnings
# NOTE(review): this suppresses every warning category for the rest of the session,
# including pandas deprecation / chained-assignment warnings — consider narrowing the filter.
warnings.filterwarnings("ignore")
# Sanity check: with the filter above active, this warning produces no output.
warnings.warn("this will not show")

%matplotlib inline
# %matplotlib notebook

# Default size (width, height) for every matplotlib figure in this notebook.
plt.rcParams.update({"figure.figsize": (10, 6)})
# plt.rcParams['figure.dpi'] = 100

sns.set_style("whitegrid")

# pandas display: three decimals for floats, and room for long / wide frames.
pd.set_option('display.float_format', lambda value: '%.3f' % value)
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 150)

In [2]:
# Load the dataset into the notebook-level frame `df`; the path is relative to the working directory.
df = pd.read_csv("clean_scout.csv")

In [3]:
df.shape

(15919, 33)

In [4]:
df.head().T

Unnamed: 0,0,1,2,3,4
Make_Model,Audi A1,Audi A1,Audi A1,Audi A1,Audi A1
Body_Type,Sedans,Sedans,Sedans,Sedans,Sedans
Price,15770,14500,14640,14500,16790
Vat,VAT deductible,Price negotiable,VAT deductible,,
Km,56013.000,80000.000,83450.000,73000.000,16200.000
Hp,66.000,141.000,85.000,66.000,66.000
Type,Used,Used,Used,Used,Used
Warranty,,,,,
Cylinders,3.000,4.000,,3.000,3.000
Fuel,Diesel,Benzine,Diesel,Diesel,Diesel


In [5]:
# Share of missing values per column, in percent, largest first.
df.isnull().mean().mul(100).sort_values(ascending=False)

Inspection_New        75.300
Warranty              69.514
Weight_kg             43.809
Drive_Chain           43.081
Previous_Owners       41.711
Paint_Type            36.259
Cylinders             35.681
Upholstery_Color      31.547
Gears                 29.600
Vat                   28.350
Upholstery_Type       28.287
Emission_Class        22.790
Extras                18.607
Co2_Emission          15.302
Cons_City             15.302
Cons_Country          14.926
Cons_Comb             12.771
Age                   10.032
Entertainment_Media    8.631
Km                     6.433
Safety_Security        6.169
Nr_Of_Seats            6.137
Comfort_Convenience    5.779
Body_Color             3.750
Displacement_cc        3.116
Nr_Of_Doors            1.332
Hp                     0.553
Body_Type              0.377
Type                   0.013
Gear_Type              0.000
Price                  0.000
Fuel                   0.000
Make_Model             0.000
dtype: float64

In [10]:
# Names of the columns that contain at least one missing value, in column order.
# Built directly as a list instead of using a comprehension only for its
# append() side effect (which also created a throwaway list of None values).
miss_val = [col for col in df.columns if df[col].isnull().any()]
miss_val

['Body_Type',
 'Vat',
 'Km',
 'Hp',
 'Type',
 'Warranty',
 'Cylinders',
 'Comfort_Convenience',
 'Entertainment_Media',
 'Extras',
 'Safety_Security',
 'Gears',
 'Age',
 'Previous_Owners',
 'Inspection_New',
 'Body_Color',
 'Paint_Type',
 'Upholstery_Type',
 'Upholstery_Color',
 'Nr_Of_Doors',
 'Nr_Of_Seats',
 'Displacement_cc',
 'Weight_kg',
 'Drive_Chain',
 'Cons_Comb',
 'Cons_City',
 'Cons_Country',
 'Co2_Emission',
 'Emission_Class']

In [11]:
df.nunique()

Make_Model                9
Body_Type                 9
Price                  2956
Vat                       2
Km                     6689
Hp                       80
Type                      5
Warranty                 41
Cylinders                 7
Fuel                      4
Comfort_Convenience    6198
Entertainment_Media     346
Extras                  659
Safety_Security        4443
Gears                    10
Age                       4
Previous_Owners           5
Inspection_New            1
Body_Color               14
Paint_Type                3
Upholstery_Type           6
Upholstery_Color         10
Nr_Of_Doors               6
Nr_Of_Seats               6
Gear_Type                 3
Displacement_cc          77
Weight_kg               434
Drive_Chain               3
Cons_Comb                72
Cons_City                86
Cons_Country             57
Co2_Emission            120
Emission_Class            3
dtype: int64

In [14]:
df.columns

Index(['Make_Model', 'Body_Type', 'Price', 'Vat', 'Km', 'Hp', 'Type',
       'Warranty', 'Cylinders', 'Fuel', 'Comfort_Convenience',
       'Entertainment_Media', 'Extras', 'Safety_Security', 'Gears', 'Age',
       'Previous_Owners', 'Inspection_New', 'Body_Color', 'Paint_Type',
       'Upholstery_Type', 'Upholstery_Color', 'Nr_Of_Doors', 'Nr_Of_Seats',
       'Gear_Type', 'Displacement_cc', 'Weight_kg', 'Drive_Chain', 'Cons_Comb',
       'Cons_City', 'Cons_Country', 'Co2_Emission', 'Emission_Class'],
      dtype='object')

In [15]:
# Normalise column names: lower-case, with '&' and spaces turned into underscores.
df.columns = [name.lower().replace('&', '_').replace(' ', '_') for name in df.columns]

In [16]:
df.columns

Index(['make_model', 'body_type', 'price', 'vat', 'km', 'hp', 'type',
       'warranty', 'cylinders', 'fuel', 'comfort_convenience',
       'entertainment_media', 'extras', 'safety_security', 'gears', 'age',
       'previous_owners', 'inspection_new', 'body_color', 'paint_type',
       'upholstery_type', 'upholstery_color', 'nr_of_doors', 'nr_of_seats',
       'gear_type', 'displacement_cc', 'weight_kg', 'drive_chain', 'cons_comb',
       'cons_city', 'cons_country', 'co2_emission', 'emission_class'],
      dtype='object')

## Let's examine and fill the missing values of all the columns/features one by one

In [12]:
# Helper for taking a first look at a single column.

def first_looking_col(col, data=None):
    """Print a quick missing-value and value-count summary for one column.

    Parameters
    ----------
    col : str
        Name of the column to summarise.
    data : pandas.DataFrame, optional
        Frame to inspect. When omitted, the notebook-level ``df`` is used, so
        existing calls such as ``first_looking_col("age")`` keep working.

    Returns
    -------
    None
        The summary is printed; nothing is returned.
    """
    frame = df if data is None else data
    series = frame[col]
    # Count the nulls once and reuse the figure for both the percentage and the total.
    null_count = series.isnull().sum()

    print("column name    : ", col)
    print("--------------------------------")
    print("per_of_nulls   : ", "%", round(null_count/frame.shape[0]*100, 2))
    print("num_of_nulls   : ", null_count)
    print("num_of_uniques : ", series.nunique())
    # dropna=False keeps NaN as its own row in the frequency table.
    print(series.value_counts(dropna = False))

## age

In [17]:
first_looking_col("age")

column name    :  age
--------------------------------
per_of_nulls   :  % 10.03
num_of_nulls   :  1597
num_of_uniques :  4
1.000    4522
3.000    3674
2.000    3273
0.000    2853
NaN      1597
Name: age, dtype: int64


In [23]:
# Mark missing ages with a '-' placeholder so they show up as their own group
# in the group-by summaries that follow.
# The result is assigned back rather than calling fillna(inplace=True) on the
# df['age'] selection: under pandas Copy-on-Write that in-place call works on a
# temporary object and leaves df unchanged.
df['age'] = df['age'].fillna('-')

In [24]:
df.groupby("age").km.describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
age,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0.0,2706.0,2085.355,5365.881,1.0,10.0,50.0,3000.0,127022.0
1.0,4484.0,18035.239,11052.524,1.0,9990.0,17872.0,25078.5,136000.0
2.0,3272.0,41754.941,28295.748,1.0,21541.75,34752.0,54805.5,317000.0
3.0,3674.0,77442.521,39170.143,10.0,48000.0,72914.5,99950.0,291800.0
-,759.0,934.497,7416.244,0.0,5.0,10.0,10.0,89982.0


In [26]:
# Distribution of km (NaN included) among the rows whose age is the '-' placeholder.
df.loc[df["age"] == "-", "km"].value_counts(dropna=False)

NaN          838
10.000       369
1.000        146
5.000         58
20.000        32
15.000        21
0.000         19
11.000        12
8.000         11
50.000        10
100.000        8
12.000         8
7.000          7
3.000          4
9.000          4
4.000          3
25.000         3
250.000        3
30.000         3
3000.000       2
22627.000      2
39962.000      2
2.000          2
19500.000      1
11000.000      1
85000.000      1
4307.000       1
89692.000      1
77.000         1
3500.000       1
68485.000      1
5000.000       1
141.000        1
150.000        1
34164.000      1
142.000        1
32084.000      1
81800.000      1
11200.000      1
20768.000      1
4500.000       1
40.000         1
784.000        1
89982.000      1
500.000        1
325.000        1
6100.000       1
196.000        1
6.000          1
60.000         1
497.000        1
99.000         1
281.000        1
Name: km, dtype: int64

In [27]:
# Preview km/age pairs for low-mileage cars (under 10,000 km).
# random_state pins the preview so a re-run shows the same rows.
df.loc[df['km'] < 10000, ["km", "age"]].sample(10, random_state=42)

Unnamed: 0,km,age
14809,849.0,0.000
8102,500.0,0.000
7095,5000.0,0.000
5428,50.0,0.000
14697,10.0,-
14745,5.0,0.000
135,7307.0,2.000
12782,5.0,-
2233,6000.0,0.000
5356,1.0,0.000


In [29]:
# Boolean masks that split the rows into four mileage bands; they are used to
# infer a missing "age" from "km". A row with a missing km is False in every mask.

cond1 = df['km'].lt(10000)
cond2 = df['km'].ge(10000) & df['km'].lt(28000)
cond3 = df['km'].ge(28000) & df['km'].lt(50000)
cond4 = df['km'].ge(50000)

In [30]:
# Within each mileage band, swap the '-' placeholder for that band's age (0-3).
for km_band, band_age in ((cond1, 0), (cond2, 1), (cond3, 2), (cond4, 3)):
    df.loc[km_band, 'age'] = df.loc[km_band, 'age'].replace('-', band_age)

In [31]:
df.groupby('age').km.mean()

age
0.0    1647.363
1.0   18035.130
2.0   41748.577
3.0   77450.063
-           NaN
Name: km, dtype: float64

In [32]:
# "km" was used to infer the missing "age" values, but "km" has missing values of its own — count them.
df["km"].isnull().sum()  

1024

In [33]:
df["age"].value_counts(dropna=False)

1.0    4528
3.0    3679
0.0    3597
2.0    3277
-       838
Name: age, dtype: int64

In [34]:
# km summary per model and age. Rows whose age is still '-' contribute no km values
# (their count is 0), so "km" cannot be used to fill the remaining ages.
df.groupby(['make_model', 'age']).km.describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,count,mean,std,min,25%,50%,75%,max
make_model,age,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Audi A1,0.0,569.0,2103.459,3258.85,0.0,10.0,100.0,3333.0,18000.0
Audi A1,1.0,744.0,13806.144,7975.693,1.0,7466.75,12413.0,20309.75,47000.0
Audi A1,2.0,432.0,25821.713,18608.364,10.0,14252.5,20730.5,32028.75,148257.0
Audi A1,3.0,629.0,54332.286,26281.269,3150.0,34914.0,50000.0,65500.0,192000.0
Audi A1,-,0.0,,,,,,,
Audi A2,1.0,1.0,26166.0,,26166.0,26166.0,26166.0,26166.0,26166.0
Audi A3,0.0,671.0,1517.817,6548.643,0.0,10.0,10.0,100.0,127022.0
Audi A3,1.0,776.0,18410.524,11054.42,1.0,11200.0,18000.0,24215.0,136000.0
Audi A3,2.0,675.0,43853.141,27349.704,15.0,25000.0,36677.0,55251.0,158000.0
Audi A3,3.0,818.0,90092.983,36464.156,35.0,62863.25,88000.0,112562.5,291800.0


In [35]:
# Unlike km, price is available for the rows whose age is still '-', so compare
# the price distribution of the '-' group with the known age groups.
df.groupby(['make_model',"body_type", 'age']).price.describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,count,mean,std,min,25%,50%,75%,max
make_model,body_type,age,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Audi A1,Compact,0.0,198.0,23277.434,3510.406,14900.0,20503.5,22492.0,26798.5,31990.0
Audi A1,Compact,1.0,268.0,18596.041,2659.91,13980.0,16445.0,16980.0,20950.0,23829.0
Audi A1,Compact,2.0,161.0,16602.807,2085.384,10999.0,15450.0,15850.0,17700.0,22150.0
Audi A1,Compact,3.0,234.0,14532.91,1908.909,9950.0,13407.5,13994.5,15480.0,18900.0
Audi A1,Compact,-,178.0,23996.264,3383.852,16220.0,21515.0,22875.0,27380.0,29181.0
Audi A1,Coupe,2.0,1.0,15900.0,,15900.0,15900.0,15900.0,15900.0,15900.0
Audi A1,Coupe,3.0,1.0,13950.0,,13950.0,13950.0,13950.0,13950.0,13950.0
Audi A1,Other,0.0,8.0,23826.25,2057.439,21490.0,22490.0,22720.0,25900.0,26900.0
Audi A1,Other,1.0,3.0,16796.667,178.979,16590.0,16745.0,16900.0,16900.0,16900.0
Audi A1,Other,2.0,1.0,23490.0,,23490.0,23490.0,23490.0,23490.0,23490.0


In [37]:
# Fill every remaining '-' placeholder in "age" with 0.
# Assigned back instead of replace(inplace=True) on the df['age'] selection, which
# leaves df untouched under pandas Copy-on-Write; the explicit float cast makes the
# column numeric again rather than relying on replace() to downcast implicitly.
df['age'] = df['age'].replace('-', 0).astype(float)

In [38]:
df.groupby('age').km.mean()


age
0.000    1647.363
1.000   18035.130
2.000   41748.577
3.000   77450.063
Name: km, dtype: float64

In [39]:
df["age"].value_counts(dropna=False)

1.000    4528
0.000    4435
3.000    3679
2.000    3277
Name: age, dtype: int64

In [48]:
df.age.isnull().any()

False

## km

In [40]:
first_looking_col("km")

column name    :  km
--------------------------------
per_of_nulls   :  % 6.43
num_of_nulls   :  1024
num_of_uniques :  6689
10.000        1045
NaN           1024
1.000          367
5.000          170
50.000         148
              ... 
160542.000       1
20719.000        1
91910.000        1
39860.000        1
57889.000        1
Name: km, Length: 6690, dtype: int64


In [41]:
df.groupby("age").km.mean()

age
0.000    1647.363
1.000   18035.130
2.000   41748.577
3.000   77450.063
Name: km, dtype: float64

In [42]:
# Broadcast each age group's mean km back onto every row of that group and
# preview 10 rows; random_state pins the preview so a re-run shows the same rows.
df.groupby("age").km.transform("mean").sample(10, random_state=42)

975     18035.130
8731    77450.063
13057   77450.063
15101   77450.063
7448    18035.130
8849    77450.063
455     77450.063
11681   41748.577
8821    77450.063
336     77450.063
Name: km, dtype: float64

In [44]:
# Fill each missing km with the mean km of the row's age group.
# Assigned back instead of fillna(inplace=True) on the df["km"] selection, which
# works on a temporary object and leaves df unchanged under pandas Copy-on-Write.
df["km"] = df["km"].fillna(df.groupby("age")["km"].transform("mean"))

In [45]:
df.km.value_counts(dropna=False)

10.000        1045
1647.363       985
1.000          367
5.000          170
50.000         148
              ... 
160542.000       1
20719.000        1
91910.000        1
39860.000        1
57889.000        1
Name: km, Length: 6692, dtype: int64

In [47]:
df.km.isnull().any()

False

## body type

In [50]:
first_looking_col("body_type")

column name    :  body_type
--------------------------------
per_of_nulls   :  % 0.38
num_of_nulls   :  60
num_of_uniques :  9
Sedans           7903
Station wagon    3553
Compact          3153
Van               783
Other             290
Transporter        88
NaN                60
Off-Road           56
Coupe              25
Convertible         8
Name: body_type, dtype: int64


In [52]:
# Treat the uninformative "Other" body type as missing so it is handled together with the NaN rows.
# Assigned back instead of replace(inplace=True) on the attribute-accessed column, which
# leaves df unchanged under pandas Copy-on-Write.
df["body_type"] = df["body_type"].replace("Other", np.nan)

In [53]:
df['body_type'].value_counts(dropna=False)

Sedans           7903
Station wagon    3553
Compact          3153
Van               783
NaN               350
Transporter        88
Off-Road           56
Coupe              25
Convertible         8
Name: body_type, dtype: int64

In [54]:
df["body_type"].mode()

0    Sedans
dtype: object