# PART- 2 ( Handling Missing Values )

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns 
import matplotlib.pyplot as plt
from scipy import stats
from scipy.stats.mstats import winsorize

import warnings
warnings.filterwarnings("ignore")
warnings.warn("this will not show")

%matplotlib inline
# %matplotlib notebook

plt.rcParams["figure.figsize"] = (10,6)
# plt.rcParams['figure.dpi'] = 100

sns.set_style("whitegrid")
pd.set_option('display.float_format', lambda x: '%.3f' % x)

pd.options.display.max_rows = 1000
pd.options.display.max_columns = 150

In [2]:
df = pd.read_csv("clean_scout.csv")

In [3]:
df.shape

(15919, 33)

In [4]:
df.head().T

Unnamed: 0,0,1,2,3,4
Make_Model,Audi A1,Audi A1,Audi A1,Audi A1,Audi A1
Body_Type,Sedans,Sedans,Sedans,Sedans,Sedans
Price,15770,14500,14640,14500,16790
Vat,VAT deductible,Price negotiable,VAT deductible,,
Km,56013.000,80000.000,83450.000,73000.000,16200.000
Hp,66.000,141.000,85.000,66.000,66.000
Type,Used,Used,Used,Used,Used
Warranty,,,,,
Cylinders,3.000,4.000,,3.000,3.000
Fuel,Diesel,Benzine,Diesel,Diesel,Diesel


In [5]:
(df.isnull().sum()/df.shape[0]*100).sort_values(ascending=False)

Inspection_New        75.300
Warranty              69.514
Weight_kg             43.809
Drive_Chain           43.081
Previous_Owners       41.711
Paint_Type            36.259
Cylinders             35.681
Upholstery_Color      31.547
Gears                 29.600
Vat                   28.350
Upholstery_Type       28.287
Emission_Class        22.790
Extras                18.607
Co2_Emission          15.302
Cons_City             15.302
Cons_Country          14.926
Cons_Comb             12.771
Age                   10.032
Entertainment_Media    8.631
Km                     6.433
Safety_Security        6.169
Nr_Of_Seats            6.137
Comfort_Convenience    5.779
Body_Color             3.750
Displacement_cc        3.116
Nr_Of_Doors            1.332
Hp                     0.553
Body_Type              0.377
Type                   0.013
Gear_Type              0.000
Price                  0.000
Fuel                   0.000
Make_Model             0.000
dtype: float64

In [6]:
miss_val = []
[miss_val.append(i) for i in df.columns if df[i].isnull().any()]
miss_val

['Body_Type',
 'Vat',
 'Km',
 'Hp',
 'Type',
 'Warranty',
 'Cylinders',
 'Comfort_Convenience',
 'Entertainment_Media',
 'Extras',
 'Safety_Security',
 'Gears',
 'Age',
 'Previous_Owners',
 'Inspection_New',
 'Body_Color',
 'Paint_Type',
 'Upholstery_Type',
 'Upholstery_Color',
 'Nr_Of_Doors',
 'Nr_Of_Seats',
 'Displacement_cc',
 'Weight_kg',
 'Drive_Chain',
 'Cons_Comb',
 'Cons_City',
 'Cons_Country',
 'Co2_Emission',
 'Emission_Class']

In [7]:
df.nunique()

Make_Model                9
Body_Type                 9
Price                  2956
Vat                       2
Km                     6689
Hp                       80
Type                      5
Warranty                 41
Cylinders                 7
Fuel                      4
Comfort_Convenience    6198
Entertainment_Media     346
Extras                  659
Safety_Security        4443
Gears                    10
Age                       4
Previous_Owners           5
Inspection_New            1
Body_Color               14
Paint_Type                3
Upholstery_Type           6
Upholstery_Color         10
Nr_Of_Doors               6
Nr_Of_Seats               6
Gear_Type                 3
Displacement_cc          77
Weight_kg               434
Drive_Chain               3
Cons_Comb                72
Cons_City                86
Cons_Country             57
Co2_Emission            120
Emission_Class            3
dtype: int64

In [8]:
df.columns

Index(['Make_Model', 'Body_Type', 'Price', 'Vat', 'Km', 'Hp', 'Type',
       'Warranty', 'Cylinders', 'Fuel', 'Comfort_Convenience',
       'Entertainment_Media', 'Extras', 'Safety_Security', 'Gears', 'Age',
       'Previous_Owners', 'Inspection_New', 'Body_Color', 'Paint_Type',
       'Upholstery_Type', 'Upholstery_Color', 'Nr_Of_Doors', 'Nr_Of_Seats',
       'Gear_Type', 'Displacement_cc', 'Weight_kg', 'Drive_Chain', 'Cons_Comb',
       'Cons_City', 'Cons_Country', 'Co2_Emission', 'Emission_Class'],
      dtype='object')

In [9]:
df.columns= df.columns.str.lower().str.replace('&', '_').str.replace(' ', '_')

In [10]:
df.columns

Index(['make_model', 'body_type', 'price', 'vat', 'km', 'hp', 'type',
       'warranty', 'cylinders', 'fuel', 'comfort_convenience',
       'entertainment_media', 'extras', 'safety_security', 'gears', 'age',
       'previous_owners', 'inspection_new', 'body_color', 'paint_type',
       'upholstery_type', 'upholstery_color', 'nr_of_doors', 'nr_of_seats',
       'gear_type', 'displacement_cc', 'weight_kg', 'drive_chain', 'cons_comb',
       'cons_city', 'cons_country', 'co2_emission', 'emission_class'],
      dtype='object')

## Let's examine and fill the missing values of all the columns/features one by one

In [1]:
def fill_most(df, group_col, col_name):
    '''Fills the missing values with the most existing value (mode) in the relevant column according to single-stage grouping'''
    for group in list(df[group_col].unique()):
        cond = df[group_col]==group
        mode = list(df[cond][col_name].mode())
        if mode != []:
            df.loc[cond, col_name] = df.loc[cond, col_name].fillna(df[cond][col_name].mode()[0])
        else:
            df.loc[cond, col_name] = df.loc[cond, col_name].fillna(df[col_name].mode()[0])
    print("Number of NaN : ",df[col_name].isnull().sum())
    print("------------------")
    print(df[col_name].value_counts(dropna=False))

In [2]:
def fill_prop(df, group_col, col_name):
    for group in list(df[group_col].unique()):
        cond = df[group_col]==group
        df.loc[cond, col_name] = df.loc[cond, col_name].fillna(method="ffill").fillna(method="bfill")
    df[col_name] = df[col_name].fillna(method="ffill").fillna(method="bfill")
    print("Number of NaN : ",df[col_name].isnull().sum())
    print("------------------")
    print(df[col_name].value_counts(dropna=False))

In [11]:
def fill(df, group_col1, group_col2, col_name, method): # method can be "mode" or "median" or "ffill"
    if method == "mode":
        for group1 in list(df[group_col1].unique()):
            for group2 in list(df[group_col2].unique()):
                cond1 = df[group_col1]==group1
                cond2 = (df[group_col1]==group1) & (df[group_col2]==group2)
                mode1 = list(df[cond1][col_name].mode())
                mode2 = list(df[cond2][col_name].mode())
                if mode2 != []:
                    df.loc[cond2, col_name] = df.loc[cond2, col_name].fillna(df[cond2][col_name].mode()[0])
                elif mode1 != []:
                    df.loc[cond2, col_name] = df.loc[cond2, col_name].fillna(df[cond1][col_name].mode()[0])
                else:
                    df.loc[cond2, col_name] = df.loc[cond2, col_name].fillna(df[col_name].mode()[0])
                
    elif method == "median":
        for group1 in list(df[group_col1].unique()):
            for group2 in list(df[group_col2].unique()):
                cond1 = df[group_col1]==group1
                cond2 = (df[group_col1]==group1) & (df[group_col2]==group2)
                df.loc[cond2, col_name] = df.loc[cond2, col_name].fillna(df[cond2][col_name].median()).fillna(df[cond1][col_name].median()).fillna(df[col_name].median())
                
    elif method == "ffill":           
        for group1 in list(df[group_col1].unique()):
            for group2 in list(df[group_col2].unique()):
                cond2 = (df[group_col1]==group1) & (df[group_col2]==group2)
                df.loc[cond2, col_name] = df.loc[cond2, col_name].fillna(method="ffill").fillna(method="bfill")
                
        for group1 in list(df[group_col1].unique()):
            cond1 = df[group_col1]==group1
            df.loc[cond1, col_name] = df.loc[cond1, col_name].fillna(method="ffill").fillna(method="bfill")            
           
        df[col_name] = df[col_name].fillna(method="ffill").fillna(method="bfill")
    
    print("Number of NaN : ",df[col_name].isnull().sum())
    print("------------------")
    print(df[col_name].value_counts(dropna=False))

In [12]:
# function for first looking to the columns

def first_looking_col(col):
    print("column name    : ", col)
    print("--------------------------------")
    print("per_of_nulls   : ", "%", round(df[col].isnull().sum()/df.shape[0]*100, 2))
    print("num_of_nulls   : ", df[col].isnull().sum())
    print("num_of_uniques : ", df[col].nunique())
    print(df[col].value_counts(dropna = False))

## age

In [13]:
first_looking_col("age")

column name    :  age
--------------------------------
per_of_nulls   :  % 10.03
num_of_nulls   :  1597
num_of_uniques :  4
1.000    4522
3.000    3674
2.000    3273
0.000    2853
NaN      1597
Name: age, dtype: int64


In [14]:
df['age'].fillna('-', inplace=True)

In [15]:
df.groupby("age").km.describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
age,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0.0,2706.0,2085.355,5365.881,1.0,10.0,50.0,3000.0,127022.0
1.0,4484.0,18035.239,11052.524,1.0,9990.0,17872.0,25078.5,136000.0
2.0,3272.0,41754.941,28295.748,1.0,21541.75,34752.0,54805.5,317000.0
3.0,3674.0,77442.521,39170.143,10.0,48000.0,72914.5,99950.0,291800.0
-,759.0,934.497,7416.244,0.0,5.0,10.0,10.0,89982.0


In [16]:
df[df["age"]=="-"]["km"].value_counts(dropna=False)

NaN          838
10.000       369
1.000        146
5.000         58
20.000        32
15.000        21
0.000         19
11.000        12
8.000         11
50.000        10
100.000        8
12.000         8
7.000          7
3.000          4
9.000          4
4.000          3
25.000         3
250.000        3
30.000         3
3000.000       2
22627.000      2
39962.000      2
2.000          2
19500.000      1
11000.000      1
85000.000      1
4307.000       1
89692.000      1
77.000         1
3500.000       1
68485.000      1
5000.000       1
141.000        1
150.000        1
34164.000      1
142.000        1
32084.000      1
81800.000      1
11200.000      1
20768.000      1
4500.000       1
40.000         1
784.000        1
89982.000      1
500.000        1
325.000        1
6100.000       1
196.000        1
6.000          1
60.000         1
497.000        1
99.000         1
281.000        1
Name: km, dtype: int64

In [17]:
df.loc[df['km'] < 10000, ["km", "age"]].sample(10)

Unnamed: 0,km,age
5007,251.0,0.000
8213,1443.0,0.000
2391,12.0,0.000
10421,50.0,0.000
12420,1500.0,0.000
2152,8000.0,0.000
745,8000.0,2.000
9684,1600.0,1.000
8165,10.0,-
8077,500.0,0.000


In [18]:
# Fill our nan values of age column; based on "km" value status

cond1 = (df['km'] < 10000)
cond2 = ((df['km'] >= 10000) & (df['km'] < 28000))
cond3 = ((df['km'] >= 28000) & (df['km'] < 50000))
cond4 = (df['km'] >= 50000)

In [19]:
df.loc[cond1,'age'] = df.loc[cond1,'age'].replace('-', 0)
df.loc[cond2,'age'] = df.loc[cond2,'age'].replace('-', 1)
df.loc[cond3,'age'] = df.loc[cond3,'age'].replace('-', 2)
df.loc[cond4,'age'] = df.loc[cond4,'age'].replace('-', 3)

In [20]:
df.groupby('age').km.mean()

age
0.0    1647.363
1.0   18035.130
2.0   41748.577
3.0   77450.063
-           NaN
Name: km, dtype: float64

In [21]:
# we used "km" column to fill nan values of "age" column, but we have nan values for "km" too 
df["km"].isnull().sum()  

1024

In [22]:
df["age"].value_counts(dropna=False)

1.0    4528
3.0    3679
0.0    3597
2.0    3277
-       838
Name: age, dtype: int64

In [23]:
# no missing value of "age" --> when we are talking about "km"
df.groupby(['make_model', 'age']).km.describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,count,mean,std,min,25%,50%,75%,max
make_model,age,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Audi A1,0.0,569.0,2103.459,3258.85,0.0,10.0,100.0,3333.0,18000.0
Audi A1,1.0,744.0,13806.144,7975.693,1.0,7466.75,12413.0,20309.75,47000.0
Audi A1,2.0,432.0,25821.713,18608.364,10.0,14252.5,20730.5,32028.75,148257.0
Audi A1,3.0,629.0,54332.286,26281.269,3150.0,34914.0,50000.0,65500.0,192000.0
Audi A1,-,0.0,,,,,,,
Audi A2,1.0,1.0,26166.0,,26166.0,26166.0,26166.0,26166.0,26166.0
Audi A3,0.0,671.0,1517.817,6548.643,0.0,10.0,10.0,100.0,127022.0
Audi A3,1.0,776.0,18410.524,11054.42,1.0,11200.0,18000.0,24215.0,136000.0
Audi A3,2.0,675.0,43853.141,27349.704,15.0,25000.0,36677.0,55251.0,158000.0
Audi A3,3.0,818.0,90092.983,36464.156,35.0,62863.25,88000.0,112562.5,291800.0


In [24]:
# if we check price, we have missing values for "age"
df.groupby(['make_model',"body_type", 'age']).price.describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,count,mean,std,min,25%,50%,75%,max
make_model,body_type,age,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Audi A1,Compact,0.0,198.0,23277.434,3510.406,14900.0,20503.5,22492.0,26798.5,31990.0
Audi A1,Compact,1.0,268.0,18596.041,2659.91,13980.0,16445.0,16980.0,20950.0,23829.0
Audi A1,Compact,2.0,161.0,16602.807,2085.384,10999.0,15450.0,15850.0,17700.0,22150.0
Audi A1,Compact,3.0,234.0,14532.91,1908.909,9950.0,13407.5,13994.5,15480.0,18900.0
Audi A1,Compact,-,178.0,23996.264,3383.852,16220.0,21515.0,22875.0,27380.0,29181.0
Audi A1,Coupe,2.0,1.0,15900.0,,15900.0,15900.0,15900.0,15900.0,15900.0
Audi A1,Coupe,3.0,1.0,13950.0,,13950.0,13950.0,13950.0,13950.0,13950.0
Audi A1,Other,0.0,8.0,23826.25,2057.439,21490.0,22490.0,22720.0,25900.0,26900.0
Audi A1,Other,1.0,3.0,16796.667,178.979,16590.0,16745.0,16900.0,16900.0,16900.0
Audi A1,Other,2.0,1.0,23490.0,,23490.0,23490.0,23490.0,23490.0,23490.0


In [25]:
df['age'].replace('-',0, inplace=True)  # okay let's fill all missing values of "age" columns' as 0

In [26]:
df.groupby('age').km.mean()


age
0.000    1647.363
1.000   18035.130
2.000   41748.577
3.000   77450.063
Name: km, dtype: float64

In [27]:
df["age"].value_counts(dropna=False)

1.000    4528
0.000    4435
3.000    3679
2.000    3277
Name: age, dtype: int64

In [28]:
df.age.isnull().any()

False

## km

In [29]:
first_looking_col("km")

column name    :  km
--------------------------------
per_of_nulls   :  % 6.43
num_of_nulls   :  1024
num_of_uniques :  6689
10.000        1045
NaN           1024
1.000          367
5.000          170
50.000         148
              ... 
160542.000       1
20719.000        1
91910.000        1
39860.000        1
57889.000        1
Name: km, Length: 6690, dtype: int64


In [30]:
df.groupby("age").km.mean()

age
0.000    1647.363
1.000   18035.130
2.000   41748.577
3.000   77450.063
Name: km, dtype: float64

In [31]:
df.groupby("age").km.transform("mean").sample(10)  # transform all km values into mean as above

5159     1647.363
179     41748.577
377     77450.063
11012   77450.063
7368    18035.130
10802   41748.577
12147    1647.363
5567     1647.363
13295   41748.577
7907     1647.363
Name: km, dtype: float64

In [32]:
df["km"].fillna(df.groupby("age").km.transform("mean"), inplace=True)

In [33]:
df.km.value_counts(dropna=False)

10.000        1045
1647.363       985
1.000          367
5.000          170
50.000         148
              ... 
160542.000       1
20719.000        1
91910.000        1
39860.000        1
57889.000        1
Name: km, Length: 6692, dtype: int64

In [34]:
df.km.isnull().any()

False

## body type

In [35]:
first_looking_col("body_type")

column name    :  body_type
--------------------------------
per_of_nulls   :  % 0.38
num_of_nulls   :  60
num_of_uniques :  9
Sedans           7903
Station wagon    3553
Compact          3153
Van               783
Other             290
Transporter        88
NaN                60
Off-Road           56
Coupe              25
Convertible         8
Name: body_type, dtype: int64


In [36]:
df.body_type.replace("Other", np.nan, inplace=True)

In [37]:
df['body_type'].value_counts(dropna=False)

Sedans           7903
Station wagon    3553
Compact          3153
Van               783
NaN               350
Transporter        88
Off-Road           56
Coupe              25
Convertible         8
Name: body_type, dtype: int64

In [38]:
df["body_type"].mode()

0    Sedans
dtype: object

In [39]:
df["body_type"].mode()[0]

'Sedans'

In [40]:
list(df["make_model"].unique())

['Audi A1',
 'Audi A2',
 'Audi A3',
 'Opel Astra',
 'Opel Corsa',
 'Opel Insignia',
 'Renault Clio',
 'Renault Duster',
 'Renault Espace']

In [41]:
df[df["make_model"]=='Audi A1']["body_type"].mode()

0    Sedans
dtype: object

In [42]:
for make_model in list(df["make_model"].unique()):  # for unique make_model
    cond = df["make_model"]==make_model
    mode = list(df[cond]["body_type"].mode())  # make_model's most repeated body_type 
    if mode != []:  
        df.loc[cond, "body_type"] = df.loc[cond, "body_type"].fillna(df[cond]["body_type"].mode()[0])  # fill your body_type as most repeated body typefor make_model
    else:
        df.loc[cond, "body_type"] = df.loc[cond, "body_type"].fillna(df["body_type"].mode()[0])

In [43]:
df['body_type'].value_counts(dropna=False)

Sedans           8005
Station wagon    3678
Compact          3242
Van               817
Transporter        88
Off-Road           56
Coupe              25
Convertible         8
Name: body_type, dtype: int64

## Previous_Owners


In [44]:
first_looking_col("previous_owners")

column name    :  previous_owners
--------------------------------
per_of_nulls   :  % 41.71
num_of_nulls   :  6640
num_of_uniques :  5
1.000    8294
NaN      6640
2.000     778
0.000     188
3.000      17
4.000       2
Name: previous_owners, dtype: int64


In [45]:
df["previous_owners"].fillna("-", inplace = True)

In [46]:
df["previous_owners"].value_counts(dropna=False)

1.0    8294
-      6640
2.0     778
0.0     188
3.0      17
4.0       2
Name: previous_owners, dtype: int64

In [47]:
df.groupby(['make_model', 'age', 'previous_owners']).km.describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,count,mean,std,min,25%,50%,75%,max
make_model,age,previous_owners,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Audi A1,0.0,0.0,45.0,958.365,815.362,0.0,10.0,1647.363,1647.363,1647.363
Audi A1,0.0,1.0,239.0,3069.543,3460.402,1.0,20.0,2500.0,5000.0,18000.0
Audi A1,0.0,2.0,1.0,3000.0,,3000.0,3000.0,3000.0,3000.0,3000.0
Audi A1,0.0,-,521.0,1549.992,2303.8,1.0,11.0,1647.363,1647.363,15500.0
Audi A1,1.0,0.0,1.0,15000.0,,15000.0,15000.0,15000.0,15000.0,15000.0
Audi A1,1.0,1.0,496.0,13772.192,8099.461,20.0,6898.0,11317.5,21054.0,35500.0
Audi A1,1.0,2.0,14.0,13734.286,10613.039,5000.0,8000.0,10500.0,13420.5,47000.0
Audi A1,1.0,-,236.0,13930.462,7544.057,1.0,9227.5,13437.0,19050.0,31877.0
Audi A1,2.0,0.0,1.0,68002.0,,68002.0,68002.0,68002.0,68002.0,68002.0
Audi A1,2.0,1.0,195.0,23858.369,19666.376,10.0,11659.5,18950.0,29968.5,148257.0


In [48]:
df[(df["make_model"]=="Renault Duster") & (df["previous_owners"] == "-")][["make_model", "previous_owners", "km"]]

Unnamed: 0,make_model,previous_owners,km
14894,Renault Duster,-,1647.363
14895,Renault Duster,-,1647.363
14896,Renault Duster,-,101.0
14897,Renault Duster,-,1647.363
14898,Renault Duster,-,101.0
14899,Renault Duster,-,101.0
14900,Renault Duster,-,1647.363
14901,Renault Duster,-,101.0
14903,Renault Duster,-,101.0
14904,Renault Duster,-,1647.363


In [49]:
cond = (df["make_model"]=="Renault Duster") & (df["previous_owners"] == "-")
df.loc[cond, "previous_owners"] = df.loc[cond, "previous_owners"].replace("-", 0.0)

In [50]:
df["previous_owners"].value_counts(dropna=False)

1.0    8294
-      6607
2.0     778
0.0     221
3.0      17
4.0       2
Name: previous_owners, dtype: int64

In [51]:
df["previous_owners"].replace("-", np.nan, inplace=True)

In [52]:
df["previous_owners"] = df["previous_owners"].fillna(method="ffill").fillna(method="bfill")

In [53]:
df["previous_owners"].value_counts(dropna=False)

1.000    14186
2.000     1176
0.000      528
3.000       27
4.000        2
Name: previous_owners, dtype: int64

## Warranty

In [54]:
first_looking_col("warranty")

column name    :  warranty
--------------------------------
per_of_nulls   :  % 69.51
num_of_nulls   :  11066
num_of_uniques :  41
NaN       11066
12.000     2594
24.000     1118
60.000      401
36.000      279
48.000      149
6.000       125
72.000       59
3.000        33
23.000       11
18.000       10
20.000        7
25.000        6
2.000         5
50.000        4
26.000        4
16.000        4
19.000        3
1.000         3
4.000         3
13.000        3
34.000        3
45.000        2
14.000        2
17.000        2
11.000        2
46.000        2
28.000        2
21.000        2
22.000        2
9.000         2
30.000        1
33.000        1
56.000        1
40.000        1
7.000         1
15.000        1
8.000         1
10.000        1
49.000        1
47.000        1
65.000        1
Name: warranty, dtype: int64


In [55]:
df["warranty"].fillna("-", inplace = True)

In [56]:
df.groupby(['make_model', 'age', 'warranty']).price.describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,count,mean,std,min,25%,50%,75%,max
make_model,age,warranty,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Audi A1,0.0,12.0,38.0,23846.342,2978.85,17900.0,21000.0,23650.0,25725.0,28990.0
Audi A1,0.0,18.0,1.0,19995.0,,19995.0,19995.0,19995.0,19995.0,19995.0
Audi A1,0.0,24.0,107.0,23766.308,3338.106,14900.0,21890.0,22900.0,26890.0,32000.0
Audi A1,0.0,36.0,26.0,25514.423,3205.836,19900.0,22489.25,26400.0,28677.5,29179.0
Audi A1,0.0,48.0,18.0,24441.667,2516.782,19650.0,22800.0,25300.0,26200.0,28500.0
Audi A1,0.0,56.0,1.0,21760.0,,21760.0,21760.0,21760.0,21760.0,21760.0
Audi A1,0.0,60.0,14.0,21829.857,717.879,20990.0,21340.0,21540.0,22105.0,22990.0
Audi A1,0.0,-,601.0,23854.852,3442.695,15980.0,21390.0,22900.0,26990.0,37900.0
Audi A1,1.0,6.0,9.0,22166.667,743.303,20900.0,22400.0,22400.0,22400.0,22900.0
Audi A1,1.0,12.0,64.0,18106.406,2408.814,14220.0,16429.75,16964.0,19912.75,23829.0


***There are too many nan values and when we analyzed these nan values according to the km, age and make_model columns, we decided that this column does not have healthy data.***

In [57]:
df.drop("warranty", axis=1, inplace=True)

## Vat

In [58]:
first_looking_col("vat")

column name    :  vat
--------------------------------
per_of_nulls   :  % 28.35
num_of_nulls   :  4513
num_of_uniques :  2
VAT deductible      10980
NaN                  4513
Price negotiable      426
Name: vat, dtype: int64


In [59]:
df.vat = df.vat.fillna(method="ffill").fillna(method="bfill")

In [60]:
df.vat.value_counts(dropna=False)

VAT deductible      15048
Price negotiable      871
Name: vat, dtype: int64

## Body_color

In [61]:
first_looking_col("body_color")

column name    :  body_color
--------------------------------
per_of_nulls   :  % 3.75
num_of_nulls   :  597
num_of_uniques :  14
Black     3745
Grey      3505
White     3406
Silver    1647
Blue      1431
Red        957
NaN        597
Brown      289
Green      154
Beige      108
Yellow      51
Violet      18
Bronze       6
Orange       3
Gold         2
Name: body_color, dtype: int64


In [62]:
df["body_color"].fillna("-", inplace = True)

In [63]:
df["body_color"].value_counts(dropna=False)

Black     3745
Grey      3505
White     3406
Silver    1647
Blue      1431
Red        957
-          597
Brown      289
Green      154
Beige      108
Yellow      51
Violet      18
Bronze       6
Orange       3
Gold         2
Name: body_color, dtype: int64

In [64]:
df.groupby(["make_model", "body_type", 'body_color']).price.describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,count,mean,std,min,25%,50%,75%,max
make_model,body_type,body_color,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Audi A1,Compact,-,44.0,20291.841,3456.616,15100.0,17844.0,19944.5,22020.0,29181.0
Audi A1,Compact,Beige,6.0,20556.5,2475.512,16240.0,19766.75,21420.0,21700.0,23250.0
Audi A1,Compact,Black,320.0,18196.281,4206.973,9950.0,14990.0,16890.0,21390.0,28997.0
Audi A1,Compact,Blue,96.0,19145.406,4541.863,11444.0,15870.0,16925.0,22226.0,28980.0
Audi A1,Compact,Brown,9.0,16982.0,2964.388,11445.0,15993.0,16820.0,18850.0,20750.0
Audi A1,Compact,Green,17.0,23558.118,3849.697,19388.0,19388.0,22490.0,28240.0,28400.0
Audi A1,Compact,Grey,126.0,20292.627,4626.229,11800.0,16682.5,20700.0,22790.0,29190.0
Audi A1,Compact,Red,89.0,19306.888,4779.498,11990.0,15900.0,17940.0,21949.0,29150.0
Audi A1,Compact,Silver,42.0,17745.0,3369.815,11630.0,15299.25,16900.0,19975.0,23500.0
Audi A1,Compact,White,274.0,19193.102,4644.28,10490.0,15850.0,18288.0,22467.5,29197.0


In [65]:
df.drop("body_color", axis=1, inplace=True)

## Paint Type

In [66]:
first_looking_col("paint_type")

column name    :  paint_type
--------------------------------
per_of_nulls   :  % 36.26
num_of_nulls   :  5772
num_of_uniques :  3
Metallic       9794
NaN            5772
Uni/basic       347
Perl effect       6
Name: paint_type, dtype: int64


In [67]:
df["paint_type"].fillna("-", inplace = True)

In [68]:
df["paint_type"].value_counts(dropna=False)

Metallic       9794
-              5772
Uni/basic       347
Perl effect       6
Name: paint_type, dtype: int64

In [69]:
df.groupby(["make_model", "body_type", "age", 'paint_type']).price.describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,count,mean,std,min,25%,50%,75%,max
make_model,body_type,age,paint_type,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Audi A1,Compact,0.0,-,145.0,22082.717,3341.754,14900.0,19850.0,21490.0,23475.0,29181.0
Audi A1,Compact,0.0,Metallic,229.0,24622.258,3172.063,17880.0,21990.0,23700.0,27780.0,31990.0
Audi A1,Compact,0.0,Uni/basic,2.0,19888.0,0.0,19888.0,19888.0,19888.0,19888.0,19888.0
Audi A1,Compact,1.0,-,56.0,17590.518,2369.313,14220.0,15852.25,16864.5,18367.25,23650.0
Audi A1,Compact,1.0,Metallic,212.0,18861.651,2674.019,13980.0,16448.75,17515.0,21482.5,23829.0
Audi A1,Compact,2.0,-,52.0,15750.115,1218.358,12490.0,15810.0,15850.0,15850.0,21490.0
Audi A1,Compact,2.0,Metallic,108.0,17001.352,2294.777,10999.0,15450.0,15954.5,18957.5,22150.0
Audi A1,Compact,2.0,Uni/basic,1.0,17900.0,,17900.0,17900.0,17900.0,17900.0,17900.0
Audi A1,Compact,3.0,-,58.0,14007.948,1921.949,10490.0,12942.5,13820.0,15423.75,18400.0
Audi A1,Compact,3.0,Metallic,174.0,14723.218,1881.559,9950.0,13665.0,14360.0,15732.5,18900.0


In [70]:
df["paint_type"].replace("-", np.nan, inplace = True)

In [71]:
df.groupby(["make_model", "body_type", 'paint_type'])[["make_model", "body_type", 'paint_type']].head()

Unnamed: 0,make_model,body_type,paint_type
0,Audi A1,Sedans,Metallic
1,Audi A1,Sedans,
2,Audi A1,Sedans,Metallic
3,Audi A1,Sedans,Metallic
4,Audi A1,Sedans,Metallic
5,Audi A1,Sedans,Metallic
6,Audi A1,Station wagon,Metallic
7,Audi A1,Compact,Metallic
9,Audi A1,Sedans,
11,Audi A1,Sedans,Uni/basic


In [72]:
fill(df, "make_model", "body_type", "paint_type", "ffill")  # fill(df, group_col1, group_col2, col_name, method)

Number of NaN :  0
------------------
Metallic       15250
Uni/basic        637
Perl effect       32
Name: paint_type, dtype: int64


In [73]:
df.paint_type.value_counts(dropna=False)

Metallic       15250
Uni/basic        637
Perl effect       32
Name: paint_type, dtype: int64

## Type

In [74]:
first_looking_col("type")

column name    :  type
--------------------------------
per_of_nulls   :  % 0.01
num_of_nulls   :  2
num_of_uniques :  5
Used              11096
New                1650
Pre-registered     1364
Employee's car     1011
Demonstration       796
NaN                   2
Name: type, dtype: int64


In [75]:
df.type.fillna("-", inplace=True)

In [76]:
df.groupby(["type", "make_model", "age"]).km.describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,count,mean,std,min,25%,50%,75%,max
type,make_model,age,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
-,Audi A3,0.0,1.0,1647.363,,1647.363,1647.363,1647.363,1647.363,1647.363
-,Audi A3,3.0,1.0,115137.0,,115137.0,115137.0,115137.0,115137.0,115137.0
Demonstration,Audi A1,0.0,110.0,3784.227,2498.415,10.0,2541.25,3000.5,4999.75,12700.0
Demonstration,Audi A1,1.0,19.0,9931.158,5209.543,1050.0,5500.0,9522.0,14560.0,18900.0
Demonstration,Audi A1,2.0,6.0,22963.5,17802.677,11940.0,15327.25,16378.5,18425.0,59000.0
Demonstration,Audi A3,0.0,41.0,4971.878,3092.353,50.0,3000.0,5000.0,6000.0,14000.0
Demonstration,Audi A3,1.0,16.0,12801.188,10920.15,1015.0,4562.25,9850.0,21020.75,38400.0
Demonstration,Audi A3,2.0,5.0,27950.0,9145.354,16000.0,23000.0,27500.0,33750.0,39500.0
Demonstration,Opel Astra,0.0,148.0,2893.376,2879.037,3.0,315.25,1773.681,4999.0,13542.0
Demonstration,Opel Astra,1.0,16.0,7319.938,7499.404,100.0,2160.5,2499.0,13514.75,22700.0


In [77]:
cond1 = (df['make_model'] == "Audi A3") & (df["age"] == 0)
cond2 = (df['make_model'] == "Audi A3") & (df["age"] == 3)

In [78]:
df.loc[cond1,'type'] = df.loc[cond1,'type'].replace('-','New')
df.loc[cond2,'type'] = df.loc[cond2,'type'].replace('-','Used')

In [79]:
df['type'].value_counts(dropna=False)

Used              11097
New                1651
Pre-registered     1364
Employee's car     1011
Demonstration       796
Name: type, dtype: int64

## Inspection new

In [80]:
first_looking_col("inspection_new")

column name    :  inspection_new
--------------------------------
per_of_nulls   :  % 75.3
num_of_nulls   :  11987
num_of_uniques :  1
NaN    11987
Yes     3932
Name: inspection_new, dtype: int64


In [81]:
df["inspection_new"].fillna("-", inplace=True)

In [82]:
df["inspection_new"].value_counts(dropna=False)

-      11987
Yes     3932
Name: inspection_new, dtype: int64

In [83]:
df.groupby(["make_model", "body_type", "age", "inspection_new"]).price.describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,count,mean,std,min,25%,50%,75%,max
make_model,body_type,age,inspection_new,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Audi A1,Compact,0.0,-,243.0,24206.37,3530.095,16220.0,21435.0,23480.0,27390.0,31990.0
Audi A1,Compact,0.0,Yes,133.0,22542.248,3076.119,14900.0,20388.0,22400.0,22950.0,29197.0
Audi A1,Compact,1.0,-,150.0,19065.307,2836.071,14500.0,16490.0,18800.0,21949.75,23777.0
Audi A1,Compact,1.0,Yes,118.0,17999.517,2293.776,13980.0,16445.0,16910.0,19963.5,23829.0
Audi A1,Compact,2.0,-,115.0,16425.67,1985.79,10999.0,15450.0,15850.0,16890.0,22150.0
Audi A1,Compact,2.0,Yes,46.0,17045.652,2278.592,12490.0,15610.0,15980.0,18367.0,21490.0
Audi A1,Compact,3.0,-,128.0,14745.82,1877.293,10900.0,13500.0,14940.0,15492.5,18900.0
Audi A1,Compact,3.0,Yes,106.0,14275.811,1923.911,9950.0,12922.5,13925.0,15443.75,18880.0
Audi A1,Coupe,2.0,-,1.0,15900.0,,15900.0,15900.0,15900.0,15900.0,15900.0
Audi A1,Coupe,3.0,-,1.0,13950.0,,13950.0,13950.0,13950.0,13950.0,13950.0


In [84]:
df["inspection_new"].replace("-", "No", inplace=True)

In [85]:
df["inspection_new"].value_counts(dropna=False)

No     11987
Yes     3932
Name: inspection_new, dtype: int64

In [86]:
df["inspection_new"].replace(["Yes", "No"], [1,0], inplace = True)

In [87]:
df["inspection_new"].value_counts(dropna=False)

0    11987
1     3932
Name: inspection_new, dtype: int64

## Upholstery_type

In [88]:
first_looking_col("upholstery_type")

column name    :  upholstery_type
--------------------------------
per_of_nulls   :  % 28.29
num_of_nulls   :  4503
num_of_uniques :  6
Cloth           8423
NaN             4503
Part leather    1499
Full leather    1009
Other            368
Velour            60
alcantara         57
Name: upholstery_type, dtype: int64


In [89]:
df["upholstery_type"].replace(["Velour", "alcantara", "Part leather", "Full leather","Other"], ["Cloth", "Part/Full Leather", "Part/Full Leather", "Part/Full Leather",np.nan], inplace=True)

In [90]:
df["upholstery_type"].value_counts(dropna=False)

Cloth                8483
NaN                  4871
Part/Full Leather    2565
Name: upholstery_type, dtype: int64

In [91]:
df.groupby(["make_model", "body_type", "upholstery_type"])["make_model", "body_type", "upholstery_type"].head()

Unnamed: 0,make_model,body_type,upholstery_type
0,Audi A1,Sedans,Cloth
1,Audi A1,Sedans,Cloth
2,Audi A1,Sedans,Cloth
3,Audi A1,Sedans,
4,Audi A1,Sedans,Cloth
5,Audi A1,Sedans,Part/Full Leather
6,Audi A1,Station wagon,Part/Full Leather
7,Audi A1,Compact,Cloth
8,Audi A1,Sedans,Cloth
9,Audi A1,Sedans,


In [92]:
fill(df, "make_model", "body_type", "upholstery_type", "ffill")

Number of NaN :  0
------------------
Cloth                12238
Part/Full Leather     3681
Name: upholstery_type, dtype: int64


## Upholstery_color

In [95]:
first_looking_col("upholstery_color")

column name    :  upholstery_color
--------------------------------
per_of_nulls   :  % 31.55
num_of_nulls   :  5022
num_of_uniques :  10
Black     8201
NaN       5022
Grey      1376
Other     1016
Brown      207
Beige       54
Blue        16
White       13
Red          9
Yellow       4
Orange       1
Name: upholstery_color, dtype: int64


In [96]:
df.drop("upholstery_color", axis=1, inplace=True) #it's unnecessary

## Nr. of Doors

In [97]:
first_looking_col("nr_of_doors")

column name    :  nr_of_doors
--------------------------------
per_of_nulls   :  % 1.33
num_of_nulls   :  212
num_of_uniques :  6
5.000    11575
4.000     3079
3.000      832
2.000      219
NaN        212
7.000        1
1.000        1
Name: nr_of_doors, dtype: int64


In [98]:
df.groupby(["make_model", "body_type", "nr_of_doors"])[["make_model", "body_type", "nr_of_doors"]].head()

Unnamed: 0,make_model,body_type,nr_of_doors
0,Audi A1,Sedans,5.0
1,Audi A1,Sedans,3.0
2,Audi A1,Sedans,4.0
3,Audi A1,Sedans,3.0
4,Audi A1,Sedans,5.0
5,Audi A1,Sedans,4.0
6,Audi A1,Station wagon,4.0
7,Audi A1,Compact,5.0
8,Audi A1,Sedans,5.0
9,Audi A1,Sedans,5.0


In [99]:
fill(df, "make_model", "body_type", "nr_of_doors", "mode") 

Number of NaN :  0
------------------
5.000    11787
4.000     3079
3.000      832
2.000      219
7.000        1
1.000        1
Name: nr_of_doors, dtype: int64


## Nr. of Seats

In [101]:
first_looking_col("nr_of_seats")

column name    :  nr_of_seats
--------------------------------
per_of_nulls   :  % 6.14
num_of_nulls   :  977
num_of_uniques :  6
5.000    13336
4.000     1125
NaN        977
7.000      362
2.000      116
6.000        2
3.000        1
Name: nr_of_seats, dtype: int64


In [102]:
fill(df, "make_model", "body_type", "nr_of_seats", "mode") 

Number of NaN :  0
------------------
5.000    14308
4.000     1127
7.000      362
2.000      119
6.000        2
3.000        1
Name: nr_of_seats, dtype: int64


## Cylinders

In [103]:
first_looking_col("cylinders")

column name    :  cylinders
--------------------------------
per_of_nulls   :  % 35.68
num_of_nulls   :  5680
num_of_uniques :  7
4.000    8105
NaN      5680
3.000    2104
5.000      22
6.000       3
8.000       2
2.000       2
1.000       1
Name: cylinders, dtype: int64


In [104]:
fill(df, "make_model", "body_type", "cylinders", "mode") 

Number of NaN :  0
------------------
4.000    12926
3.000     2963
5.000       22
6.000        3
8.000        2
2.000        2
1.000        1
Name: cylinders, dtype: int64


In [105]:
df.drop("cylinders", axis = 1, inplace = True)  # will not give us good insights

## Drive chain

In [106]:
first_looking_col("drive_chain")

column name    :  drive_chain
--------------------------------
per_of_nulls   :  % 43.08
num_of_nulls   :  6858
num_of_uniques :  3
front    8886
NaN      6858
4WD       171
rear        4
Name: drive_chain, dtype: int64


In [107]:
df["drive_chain"].fillna("-", inplace=True)

In [108]:
df.groupby(["make_model", "body_type", "drive_chain"]).price.describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,count,mean,std,min,25%,50%,75%,max
make_model,body_type,drive_chain,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Audi A1,Compact,-,352.0,17620.869,4226.116,10490.0,14990.0,15900.0,20885.75,29190.0
Audi A1,Compact,4WD,2.0,14790.0,1258.65,13900.0,14345.0,14790.0,15235.0,15680.0
Audi A1,Compact,front,685.0,20008.223,4511.348,9950.0,16430.0,19890.0,22690.0,31990.0
Audi A1,Coupe,-,2.0,14925.0,1378.858,13950.0,14437.5,14925.0,15412.5,15900.0
Audi A1,Sedans,-,561.0,17830.44,4362.321,8999.0,14900.0,16490.0,20700.0,37900.0
Audi A1,Sedans,4WD,1.0,15450.0,,15450.0,15450.0,15450.0,15450.0,15450.0
Audi A1,Sedans,front,989.0,19133.794,4441.969,10000.0,15838.0,18500.0,21999.0,32000.0
Audi A1,Station wagon,-,3.0,24593.0,7537.216,15890.0,22390.0,28890.0,28944.5,28999.0
Audi A1,Station wagon,front,18.0,16681.111,2493.673,12950.0,15000.0,16356.0,17300.0,21450.0
Audi A1,Van,front,1.0,29000.0,,29000.0,29000.0,29000.0,29000.0,29000.0


In [109]:
cond = (df['make_model'] == "Renault Duster") & (df["body_type"] == "Off-Road")  # it could be 4WD

In [110]:
df.loc[cond,'drive_chain'] = df.loc[cond,'drive_chain'].replace('-','4WD')

In [111]:
df["drive_chain"].value_counts(dropna=False)

front    8886
-        6826
4WD       203
rear        4
Name: drive_chain, dtype: int64

In [112]:
df["drive_chain"] = df["drive_chain"].replace('-', np.nan)

In [113]:
df["drive_chain"].value_counts(dropna=False)

front    8886
NaN      6826
4WD       203
rear        4
Name: drive_chain, dtype: int64

In [114]:
fill(df, "make_model", "body_type", "drive_chain", "mode")  # all NaN to "front"

Number of NaN :  0
------------------
front    15711
4WD        204
rear         4
Name: drive_chain, dtype: int64


## Emission Class

In [115]:
first_looking_col("emission_class")

column name    :  emission_class
--------------------------------
per_of_nulls   :  % 22.79
num_of_nulls   :  3628
num_of_uniques :  3
Euro 6    12173
NaN        3628
Euro 5       78
Euro 4       40
Name: emission_class, dtype: int64


In [116]:
df["emission_class"].fillna("-", inplace=True)

In [117]:
df["emission_class"].value_counts(dropna=False)

Euro 6    12173
-          3628
Euro 5       78
Euro 4       40
Name: emission_class, dtype: int64

In [118]:
df.groupby(["make_model", "age", "fuel", "emission_class"]).price.describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,count,mean,std,min,25%,50%,75%,max
make_model,age,fuel,emission_class,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Audi A1,0.0,Benzine,-,267.0,23770.318,3136.485,14900.0,21875.0,22900.0,26330.0,31990.0
Audi A1,0.0,Benzine,Euro 6,535.0,23948.533,3472.575,15550.0,21285.0,22900.0,27107.5,37900.0
Audi A1,0.0,Diesel,Euro 6,4.0,19370.0,1181.891,17900.0,18717.5,19495.0,20147.5,20590.0
Audi A1,1.0,Benzine,-,99.0,19317.061,2855.338,14490.0,16684.5,19887.0,21900.0,23880.0
Audi A1,1.0,Benzine,Euro 5,3.0,17862.667,1840.593,16800.0,16800.0,16800.0,18394.0,19988.0
Audi A1,1.0,Benzine,Euro 6,334.0,18793.06,2829.853,13450.0,16445.75,18333.0,21356.25,33900.0
Audi A1,1.0,Diesel,-,15.0,18789.8,2474.874,15500.0,16675.0,18200.0,20789.5,22900.0
Audi A1,1.0,Diesel,Euro 5,1.0,16800.0,,16800.0,16800.0,16800.0,16800.0,16800.0
Audi A1,1.0,Diesel,Euro 6,295.0,17755.217,2063.238,14900.0,16295.0,16890.0,18900.0,23700.0
Audi A1,2.0,Benzine,-,57.0,17858.07,2525.66,14500.0,15450.0,17900.0,20490.0,23490.0


In [119]:
df["emission_class"].replace("-", np.nan, inplace=True)

In [120]:
df["emission_class"].value_counts(dropna=False)

Euro 6    12173
NaN        3628
Euro 5       78
Euro 4       40
Name: emission_class, dtype: int64

In [123]:
df.groupby(["make_model","fuel","emission_class"])["make_model","fuel","emission_class"].head()

Unnamed: 0,make_model,fuel,emission_class
0,Audi A1,Diesel,Euro 6
1,Audi A1,Benzine,Euro 6
2,Audi A1,Diesel,Euro 6
3,Audi A1,Diesel,Euro 6
4,Audi A1,Diesel,Euro 6
5,Audi A1,Diesel,Euro 6
9,Audi A1,Benzine,Euro 6
11,Audi A1,Benzine,Euro 6
12,Audi A1,Benzine,Euro 6
13,Audi A1,Benzine,Euro 6


In [122]:
fill(df, "make_model", "fuel", "emission_class", "ffill")

Number of NaN :  0
------------------
Euro 6    15773
Euro 5      100
Euro 4       46
Name: emission_class, dtype: int64


In [124]:
df.drop("emission_class", axis=1, inplace=True) # drop it because most of cars euro 6

## Gears

In [125]:
first_looking_col("gears")

column name    :  gears
--------------------------------
per_of_nulls   :  % 29.6
num_of_nulls   :  4712
num_of_uniques :  10
6.000     5822
NaN       4712
5.000     3239
7.000     1908
8.000      224
9.000        6
4.000        2
1.000        2
3.000        2
50.000       1
2.000        1
Name: gears, dtype: int64


In [126]:
df["gears"].fillna("-", inplace=True)

In [127]:
df["gears"].value_counts(dropna=False)

6.0     5822
-       4712
5.0     3239
7.0     1908
8.0      224
9.0        6
1.0        2
3.0        2
4.0        2
2.0        1
50.0       1
Name: gears, dtype: int64

In [129]:
df.groupby(["make_model", "body_type", "gear_type", "gears"]).price.describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,count,mean,std,min,25%,50%,75%,max
make_model,body_type,gear_type,gears,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Audi A1,Compact,Automatic,5.0,3.0,22184.333,3421.302,18497.0,20648.5,22800.0,24028.0,25256.0
Audi A1,Compact,Automatic,6.0,6.0,21038.333,4282.039,16430.0,18725.0,20920.0,21060.0,28860.0
Audi A1,Compact,Automatic,7.0,199.0,22059.251,3918.116,13990.0,18970.0,21790.0,24365.0,29181.0
Audi A1,Compact,Automatic,8.0,1.0,16880.0,,16880.0,16880.0,16880.0,16880.0,16880.0
Audi A1,Compact,Automatic,-,253.0,21640.427,4965.5,13880.0,16975.0,20950.0,26980.0,29197.0
Audi A1,Compact,Manual,5.0,277.0,16329.469,3040.933,9950.0,13990.0,15900.0,16940.0,22990.0
Audi A1,Compact,Manual,6.0,77.0,20538.299,2061.004,12550.0,19588.0,20881.0,21990.0,22989.0
Audi A1,Compact,Manual,-,220.0,16756.727,3182.991,10490.0,14430.0,15880.0,19032.5,22990.0
Audi A1,Compact,Semi-automatic,7.0,3.0,24028.333,7208.44,17945.0,20047.5,22150.0,27070.0,31990.0
Audi A1,Coupe,Manual,5.0,1.0,13950.0,,13950.0,13950.0,13950.0,13950.0,13950.0


In [130]:
df["gears"].replace([1,2,3,4,9,50,"-"], np.nan, inplace=True)  # most rare value_counts

In [132]:
for group1 in list(df["make_model"].unique()):
    for group2 in list(df["body_type"].unique()):
        for group3 in list(df["gear_type"].unique()):
            cond1 = df["make_model"]==group1
            cond2 = (df["make_model"]==group1) & (df["body_type"]==group2)
            cond3 = (df["make_model"]==group1) & (df["body_type"]==group2) & (df["gear_type"]==group3)
            mode1 = list(df[cond1]["gears"].mode())
            mode2 = list(df[cond2]["gears"].mode())
            mode3 = list(df[cond3]["gears"].mode())
            if mode3 != []:
                df.loc[cond3, "gears"] = df.loc[cond3, "gears"].fillna(df[cond3]["gears"].mode()[0])
            elif mode2 != []:
                df.loc[cond3, "gears"] = df.loc[cond3, "gears"].fillna(df[cond2]["gears"].mode()[0])
            elif mode1 != []:
                df.loc[cond3, "gears"] = df.loc[cond3, "gears"].fillna(df[cond1]["gears"].mode()[0])
            else:
                df.loc[cond3, "gears"] = df.loc[cond3, "gears"].fillna(df["gears"].mode()[0])

In [134]:
df["gears"].value_counts(dropna=False)

6.000    8626
5.000    4258
7.000    2810
8.000     225
Name: gears, dtype: int64

## Hp

In [136]:
first_looking_col("hp")

column name    :  hp
--------------------------------
per_of_nulls   :  % 0.55
num_of_nulls   :  88
num_of_uniques :  80
85.000     2542
66.000     2122
81.000     1402
100.000    1308
110.000    1112
70.000      888
125.000     707
51.000      695
55.000      569
118.000     516
92.000      466
121.000     392
147.000     380
77.000      345
56.000      286
54.000      276
103.000     253
87.000      232
165.000     194
88.000      177
60.000      160
162.000      98
NaN          88
74.000       81
96.000       72
71.000       59
101.000      47
67.000       40
154.000      39
122.000      35
119.000      30
164.000      27
135.000      24
82.000       22
52.000       22
1.000        20
78.000       20
146.000      18
294.000      18
141.000      16
57.000       10
120.000       8
104.000       8
191.000       7
112.000       7
155.000       6
117.000       6
184.000       5
90.000        4
76.000        4
65.000        4
149.000       3
98.000        3
93.000        3
80.000        3

In [137]:
df["hp"].fillna("-", inplace=True)

In [138]:
df.groupby(["make_model","body_type","hp"]).price.describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,count,mean,std,min,25%,50%,75%,max
make_model,body_type,hp,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Audi A1,Compact,60.0,58.0,15189.828,1663.611,10900.0,14390.0,15774.5,16345.0,16978.0
Audi A1,Compact,66.0,162.0,15398.21,1934.301,10490.0,14042.5,15465.0,16445.0,23700.0
Audi A1,Compact,70.0,332.0,17983.262,3315.814,9950.0,15480.0,17447.0,21190.0,28990.0
Audi A1,Compact,71.0,32.0,20831.594,2410.84,15890.0,18937.5,21425.0,22462.25,25256.0
Audi A1,Compact,85.0,330.0,22604.103,4572.568,11100.0,19700.0,22497.0,26980.0,31990.0
Audi A1,Compact,86.0,1.0,14295.0,,14295.0,14295.0,14295.0,14295.0,14295.0
Audi A1,Compact,92.0,90.0,18029.056,3027.431,12550.0,15850.0,16935.0,20624.25,28880.0
Audi A1,Compact,93.0,2.0,21447.5,774.282,20900.0,21173.75,21447.5,21721.25,21995.0
Audi A1,Compact,110.0,20.0,23620.45,3777.344,15490.0,21299.5,23325.0,26932.5,28980.0
Audi A1,Compact,141.0,2.0,22495.0,841.457,21900.0,22197.5,22495.0,22792.5,23090.0


In [139]:
df["hp"].replace("-", np.nan, inplace=True)

In [140]:
fill(df, "make_model", "body_type", "hp", "mode")

Number of NaN :  0
------------------
85.000     2543
66.000     2124
81.000     1403
100.000    1314
110.000    1113
70.000      890
125.000     711
51.000      696
55.000      589
118.000     550
92.000      466
121.000     392
147.000     380
77.000      353
56.000      294
54.000      276
103.000     253
87.000      232
165.000     194
88.000      177
60.000      160
162.000      98
74.000       81
96.000       72
71.000       59
101.000      47
67.000       40
154.000      39
122.000      35
119.000      30
164.000      27
135.000      24
52.000       22
82.000       22
78.000       20
1.000        20
146.000      18
294.000      18
141.000      16
57.000       10
120.000       8
104.000       8
191.000       7
112.000       7
155.000       6
117.000       6
184.000       5
76.000        4
65.000        4
90.000        4
80.000        3
98.000        3
168.000       3
93.000        3
149.000       3
150.000       2
89.000        2
63.000        2
86.000        2
53.000        2
27

## Displacement_cc

In [143]:
first_looking_col("displacement_cc")

column name    :  displacement_cc
--------------------------------
per_of_nulls   :  % 3.12
num_of_nulls   :  496
num_of_uniques :  77
1598.000     4761
999.000      2438
1398.000     1314
1399.000      749
1229.000      677
1956.000      670
1461.000      595
1490.000      559
NaN           496
1422.000      467
1197.000      353
898.000       351
1395.000      320
1968.000      301
1149.000      288
1618.000      212
1798.000      210
1498.000      196
1600.000      130
1248.000      110
1997.000      103
1364.000      102
1400.000       90
998.000        72
1500.000       50
2000.000       46
1000.000       40
1.000          36
1998.000       25
2480.000       20
1984.000       18
1200.000       18
899.000        11
1397.000       11
160.000         6
1499.000        5
929.000         5
139.000         4
900.000         4
1596.000        4
997.000         4
1199.000        3
1599.000        3
1396.000        3
1495.000        2
1300.000        2
1589.000        2
2.000           2
9

In [145]:
df["displacement_cc"].fillna("-", inplace=True)

In [146]:
df.groupby(["make_model", "body_type","displacement_cc"]).price.describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,count,mean,std,min,25%,50%,75%,max
make_model,body_type,displacement_cc,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Audi A1,Compact,929.0,5.0,16364.0,1504.819,14980.0,15480.0,16240.0,16240.0,18880.0
Audi A1,Compact,995.0,2.0,21990.0,1414.214,20990.0,21490.0,21990.0,22490.0,22990.0
Audi A1,Compact,999.0,663.0,20294.101,4570.798,9950.0,16480.0,20380.0,22820.0,31990.0
Audi A1,Compact,1000.0,7.0,19498.429,5396.196,13500.0,16024.5,16800.0,22695.0,28750.0
Audi A1,Compact,1395.0,98.0,18246.622,3084.081,12550.0,15850.0,17835.0,20900.0,28880.0
Audi A1,Compact,1422.0,158.0,15396.424,1943.65,10490.0,14035.0,15465.0,16445.0,23700.0
Audi A1,Compact,1498.0,14.0,24982.143,3186.063,20582.0,22453.0,24382.5,28380.0,28980.0
Audi A1,Compact,1596.0,1.0,15585.0,,15585.0,15585.0,15585.0,15585.0,15585.0
Audi A1,Compact,1598.0,68.0,16568.309,3059.451,11100.0,14271.25,15900.0,18263.0,23500.0
Audi A1,Compact,1600.0,1.0,16800.0,,16800.0,16800.0,16800.0,16800.0,16800.0


In [147]:
df["displacement_cc"].replace("-", np.nan, inplace=True)

In [148]:
fill(df, "make_model", "body_type", "displacement_cc", "mode")

Number of NaN :  0
------------------
1598.000     5044
999.000      2467
1398.000     1387
1399.000      768
1229.000      678
1956.000      670
1461.000      667
1490.000      559
1422.000      467
1197.000      372
898.000       351
1395.000      320
1968.000      301
1149.000      288
1618.000      212
1798.000      210
1498.000      196
1600.000      130
1248.000      110
1997.000      103
1364.000      102
1400.000       90
998.000        72
1500.000       50
2000.000       46
1000.000       40
1.000          36
1998.000       25
2480.000       20
1984.000       18
1200.000       18
1397.000       11
899.000        11
160.000         6
929.000         5
1499.000        5
139.000         4
1596.000        4
997.000         4
900.000         4
1396.000        3
1599.000        3
1199.000        3
1589.000        2
2.000           2
995.000         2
1300.000        2
1495.000        2
15898.000       1
1568.000        1
1368.000        1
890.000         1
1239.000        1
1496.000

## Weight_kg

In [151]:
first_looking_col("weight_kg")

column name    :  weight_kg
--------------------------------
per_of_nulls   :  % 43.81
num_of_nulls   :  6974
num_of_uniques :  434
NaN         6974
1163.000     574
1360.000     356
1165.000     301
1335.000     242
1135.000     213
1199.000     205
1734.000     170
1180.000     168
1503.000     165
1350.000     155
1355.000     135
1260.000     127
1275.000     112
1278.000     110
1425.000     109
1487.000     109
1255.000     108
1200.000     107
1522.000     103
1273.000     103
1280.000     102
1403.000      91
1120.000      90
1195.000      89
1659.000      89
1701.000      87
1250.000      84
1441.000      82
1308.000      80
1285.000      80
1110.000      75
1613.000      75
1279.000      72
1364.000      70
1345.000      67
1733.000      65
1685.000      64
1071.000      64
1325.000      64
1141.000      64
1230.000      63
1845.000      56
1090.000      54
1052.000      53
1664.000      52
1154.000      52
1513.000      51
1065.000      50
1237.000      49
1440.000      46
1

In [152]:
df["weight_kg"].fillna("-", inplace=True)

In [153]:
df.groupby(["make_model", "body_type","weight_kg"]).price.describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,count,mean,std,min,25%,50%,75%,max
make_model,body_type,weight_kg,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Audi A1,Compact,102.0,1.0,19229.0,,19229.0,19229.0,19229.0,19229.0,19229.0
Audi A1,Compact,1010.0,2.0,15450.0,707.107,14950.0,15200.0,15450.0,15700.0,15950.0
Audi A1,Compact,1035.0,6.0,16796.667,2617.867,14390.0,15892.5,15900.0,16575.0,21900.0
Audi A1,Compact,1040.0,2.0,20424.5,2933.786,18350.0,19387.25,20424.5,21461.75,22499.0
Audi A1,Compact,1065.0,36.0,20971.778,1982.554,15500.0,18987.5,21690.0,22400.0,23550.0
Audi A1,Compact,1090.0,9.0,21059.222,2945.233,15998.0,18990.0,22450.0,22800.0,25256.0
Audi A1,Compact,1100.0,2.0,27525.0,1732.412,26300.0,26912.5,27525.0,28137.5,28750.0
Audi A1,Compact,1105.0,12.0,13629.167,228.093,13400.0,13400.0,13625.0,13825.0,13900.0
Audi A1,Compact,1110.0,22.0,14322.0,1738.097,9950.0,13116.75,14559.5,15430.0,16879.0
Audi A1,Compact,1115.0,3.0,20833.333,1950.855,18850.0,19875.0,20900.0,21825.0,22750.0


In [154]:
df["weight_kg"].replace("-", np.nan, inplace=True)

In [155]:
fill(df, "make_model", "body_type", "weight_kg", "mode")

Number of NaN :  0
------------------
1163.000    1582
1360.000    1419
1487.000     966
1135.000     837
1425.000     744
1180.000     694
1273.000     656
1165.000     603
1503.000     561
1734.000     556
1087.000     291
1335.000     242
1365.000     211
1199.000     205
1350.000     156
1119.000     153
1355.000     136
1280.000     127
1260.000     127
1275.000     112
1278.000     110
1255.000     108
1200.000     107
1522.000     103
1659.000     102
1195.000      96
1120.000      93
1403.000      91
1701.000      87
1250.000      85
1685.000      83
1441.000      82
1308.000      80
1285.000      80
1613.000      75
1110.000      75
1279.000      72
1364.000      70
1345.000      67
1733.000      65
1325.000      64
1209.000      64
1071.000      64
1141.000      64
1230.000      63
1845.000      56
1090.000      54
1052.000      53
1154.000      52
1664.000      52
1513.000      51
1065.000      50
1237.000      49
1088.000      46
1205.000      46
1440.000      46
1265.000  

## CO2 Emission

In [156]:
first_looking_col("co2_emission")

column name    :  co2_emission
--------------------------------
per_of_nulls   :  % 15.3
num_of_nulls   :  2436
num_of_uniques :  120
NaN        2436
120.000     740
99.000      545
97.000      537
104.000     501
10.000      477
103.000     445
114.000     382
124.000     372
107.000     362
108.000     362
119.000     361
106.000     349
128.000     329
126.000     282
85.000      275
118.000     270
110.000     266
127.000     257
117.000     254
111.000     237
113.000     235
109.000     234
139.000     224
140.000     218
129.000     213
135.000     202
105.000     202
9.000       198
130.000     180
123.000     178
150.000     174
11.000      171
143.000     171
95.000      161
116.000     157
141.000     156
98.000      151
133.000     145
136.000     145
137.000     133
125.000     132
134.000     130
145.000     126
149.000     117
153.000     113
147.000     109
101.000     105
13.000      100
115.000      86
1.000        84
121.000      82
138.000      75
93.000       66
14

In [157]:
df["co2_emission"].fillna("-", inplace=True)

In [158]:
df.groupby(["make_model", "body_type","co2_emission"]).price.describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,count,mean,std,min,25%,50%,75%,max
make_model,body_type,co2_emission,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Audi A1,Compact,1.0,1.0,20881.0,,20881.0,20881.0,20881.0,20881.0,20881.0
Audi A1,Compact,9.0,1.0,12479.0,,12479.0,12479.0,12479.0,12479.0,12479.0
Audi A1,Compact,10.0,147.0,16946.871,2850.063,9950.0,15765.0,16450.0,18333.0,29150.0
Audi A1,Compact,11.0,25.0,19960.36,2676.262,14930.0,17970.0,19990.0,21460.0,28980.0
Audi A1,Compact,14.0,2.0,28750.0,268.701,28560.0,28655.0,28750.0,28845.0,28940.0
Audi A1,Compact,90.0,1.0,12880.0,,12880.0,12880.0,12880.0,12880.0,12880.0
Audi A1,Compact,91.0,5.0,13038.0,1108.927,11800.0,11850.0,13800.0,13800.0,13940.0
Audi A1,Compact,94.0,5.0,13960.0,1232.071,12900.0,12900.0,13900.0,14200.0,15900.0
Audi A1,Compact,97.0,113.0,15449.593,2445.755,10900.0,13770.0,15400.0,16700.0,23500.0
Audi A1,Compact,98.0,33.0,16574.364,2637.087,13999.0,15290.0,15900.0,16800.0,25256.0


In [159]:
df["co2_emission"].replace("-", np.nan, inplace=True)

In [160]:
fill(df, "make_model", "body_type", "co2_emission", "median")

Number of NaN :  0
------------------
120.000    836
104.000    679
106.000    674
114.000    556
99.000     546
97.000     539
139.000    523
124.000    516
10.000     477
117.000    460
103.000    445
123.000    410
98.000     366
108.000    362
107.000    362
119.000    361
129.000    333
133.000    330
128.000    329
118.000    314
110.000    289
126.000    284
85.000     275
127.000    257
111.000    237
113.000    235
109.000    234
140.000    221
135.000    202
105.000    202
9.000      198
130.000    189
150.000    174
143.000    171
11.000     171
95.000     161
116.000    157
141.000    156
118.500    147
136.000    145
137.000    135
125.000    133
134.000    130
145.000    126
149.000    117
153.000    113
147.000    109
101.000    105
13.000     100
115.000     86
1.000       84
121.000     82
138.000     75
93.000      66
14.000      59
168.000     58
90.000      54
148.000     48
131.000     48
154.000     40
144.000     40
94.000      37
100.000     36
146.000     36
15

In [161]:
df.drop("co2_emission", axis=1, inplace=True)

## Comfort_Convenience

In [163]:
first_looking_col("comfort_convenience")

column name    :  comfort_convenience
--------------------------------
per_of_nulls   :  % 5.78
num_of_nulls   :  920
num_of_uniques :  6198
NaN                                                                                                                                                                                                                                                                                                                                                                                                                                                                                               920
Air conditioning,Electrical side mirrors,Hill Holder,Power windows                                                                                                                                                                                                                                                                                                                   

In [164]:
fill(df, "make_model", "body_type", "comfort_convenience", "mode")

Number of NaN :  0
------------------
Air conditioning,Electrical side mirrors,Hill Holder,Power windows                                                                                                                                                                                                                                                                                                                                                                                                                                388
Air conditioning,Armrest,Automatic climate control,Cruise control,Electrical side mirrors,Leather steering wheel,Light sensor,Lumbar support,Multi-function steering wheel,Navigation system,Park Distance Control,Parking assist system sensors front,Parking assist system sensors rear,Power windows,Rain sensor,Seat heating,Start-stop system                                                                                                                                          

## Entertainment_Media

In [166]:
first_looking_col("entertainment_media")

column name    :  entertainment_media
--------------------------------
per_of_nulls   :  % 8.63
num_of_nulls   :  1374
num_of_uniques :  346
NaN                                                                                                               1374
Bluetooth,Hands-free equipment,On-board computer,Radio,USB                                                        1282
Bluetooth,Hands-free equipment,MP3,On-board computer,Radio,USB                                                     982
Bluetooth,CD player,Hands-free equipment,MP3,On-board computer,Radio,USB                                           783
On-board computer,Radio                                                                                            487
Radio                                                                                                              477
Bluetooth,CD player,Hands-free equipment,On-board computer,Radio,USB                                               465
On-board computer         

In [167]:
fill(df, "make_model", "body_type", "entertainment_media", "mode")

Number of NaN :  0
------------------
Bluetooth,Hands-free equipment,On-board computer,Radio,USB                                                        1738
Bluetooth,Hands-free equipment,MP3,On-board computer,Radio,USB                                                    1134
Bluetooth,CD player,Hands-free equipment,MP3,On-board computer,Radio,USB                                          1010
On-board computer                                                                                                  615
Radio                                                                                                              558
Bluetooth,Hands-free equipment,On-board computer,Radio                                                             515
On-board computer,Radio                                                                                            487
Bluetooth,CD player,Hands-free equipment,On-board computer,Radio,USB                                               466
Bluetooth,

## Extras

In [168]:
first_looking_col("extras")

column name    :  extras
--------------------------------
per_of_nulls   :  % 18.61
num_of_nulls   :  2962
num_of_uniques :  659
Alloy wheels                                                                                                                                   3245
NaN                                                                                                                                            2962
Alloy wheels,Touch screen                                                                                                                       697
Alloy wheels,Voice Control                                                                                                                      577
Alloy wheels,Touch screen,Voice Control                                                                                                         541
Alloy wheels,Roof rack                                                                                                             

In [169]:
fill(df, "make_model", "body_type", "extras", "mode")

Number of NaN :  0
------------------
Alloy wheels                                                                                                                                   5786
Alloy wheels,Touch screen                                                                                                                       697
Roof rack                                                                                                                                       596
Alloy wheels,Voice Control                                                                                                                      582
Alloy wheels,Touch screen,Voice Control                                                                                                         544
Alloy wheels,Roof rack                                                                                                                          529
Alloy wheels,Sport seats                                                  

## Safety_Security

In [170]:
first_looking_col("safety_security")

column name    :  safety_security
--------------------------------
per_of_nulls   :  % 6.17
num_of_nulls   :  982
num_of_uniques :  4443
NaN                                                                                                                                                                                                                                                                                                                   982
ABS,Central door lock,Daytime running lights,Driver-side airbag,Electronic stability control,Fog lights,Immobilizer,Isofix,Passenger-side airbag,Power steering,Side airbag,Tire pressure monitoring system,Traction control                                                                                          538
ABS,Central door lock,Daytime running lights,Driver-side airbag,Electronic stability control,Immobilizer,Isofix,Passenger-side airbag,Power steering,Side airbag,Tire pressure monitoring system,Traction control                          

In [171]:
fill(df, "make_model", "body_type", "safety_security", "mode")

Number of NaN :  0
------------------
ABS,Central door lock,Daytime running lights,Driver-side airbag,Electronic stability control,Fog lights,Immobilizer,Isofix,Passenger-side airbag,Power steering,Side airbag,Tire pressure monitoring system,Traction control                                                    729
ABS,Central door lock,Daytime running lights,Driver-side airbag,Electronic stability control,Immobilizer,Isofix,Passenger-side airbag,Power steering,Side airbag,Tire pressure monitoring system,Traction control                                                               480
ABS,Central door lock,Daytime running lights,Driver-side airbag,Electronic stability control,Fog lights,Immobilizer,Isofix,LED Daytime Running Lights,Passenger-side airbag,Power steering,Side airbag,Tire pressure monitoring system,Traction control                         373
ABS,Central door lock,Daytime running lights,Driver-side airbag,Electronic stability control,Immobilizer,Isofix,Passenger-side airbag,

## cons_comb

In [173]:
first_looking_col("cons_comb")

column name    :  cons_comb
--------------------------------
per_of_nulls   :  % 12.77
num_of_nulls   :  2033
num_of_uniques :  72
NaN       2033
5.400      770
3.900      733
4.000      713
5.100      657
4.400      623
5.600      618
4.700      602
3.800      585
4.800      546
5.000      545
4.500      523
5.200      454
4.200      435
4.600      426
4.900      393
5.300      380
5.500      380
5.900      369
3.700      369
5.700      342
4.100      342
6.000      331
4.300      307
3.300      307
3.500      288
6.200      216
3.600      194
6.300      181
6.100      175
5.800      164
6.600      148
6.800      136
3.400      106
6.400       75
3.000       69
7.400       66
6.700       43
6.500       43
7.100       38
10.000      34
6.900       27
3.200       25
8.300       20
7.600       14
7.000       10
3.100        7
7.200        6
7.800        6
8.000        5
51.000       4
8.600        4
8.700        3
1.600        3
7.900        3
38.000       2
0.000        2
40.000       2

In [174]:
cons_comb = (df["cons_country"] + df["cons_city"])/2

In [179]:
df["cons_comb"] = df["cons_comb"].fillna(cons_comb)

In [180]:
df["cons_comb"].fillna("-", inplace=True)

In [181]:
df.groupby(["make_model", "body_type","cons_comb"]).price.describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,count,mean,std,min,25%,50%,75%,max
make_model,body_type,cons_comb,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Audi A1,Compact,3.0,4.0,14370.25,4481.393,11100.0,11242.5,12840.5,15968.25,20700.0
Audi A1,Compact,3.4,6.0,12755.0,900.75,11800.0,11987.5,12640.0,13570.0,13800.0
Audi A1,Compact,3.5,3.0,14613.0,1848.098,12479.0,14079.5,15680.0,15680.0,15680.0
Audi A1,Compact,3.6,5.0,13960.0,1232.071,12900.0,12900.0,13900.0,14200.0,15900.0
Audi A1,Compact,3.7,50.0,15902.2,2960.324,12900.0,13900.0,15040.0,15967.5,23500.0
Audi A1,Compact,3.8,18.0,16158.056,2265.763,10900.0,15240.0,16900.0,17927.5,18880.0
Audi A1,Compact,3.9,41.0,15771.902,1567.307,10490.0,15780.0,16445.0,16680.0,16950.0
Audi A1,Compact,4.0,79.0,16795.671,2620.874,12200.0,14990.0,15850.0,18583.5,23700.0
Audi A1,Compact,4.1,38.0,16540.737,1754.464,12750.0,15870.0,16430.0,16869.75,22390.0
Audi A1,Compact,4.2,87.0,15631.126,2420.347,10900.0,13970.0,15599.0,16822.5,25256.0


In [182]:
df["cons_comb"].replace([0.0, 1.0, 1.2, 1.6, 10, 11, 13.8, 32.0, 33.0, 38.0, 40.0, 43.0, 46.0, 50.0, 51.0, 54.0, 55.0, "-"], np.nan, inplace=True)

In [183]:
df["cons_comb"].value_counts(dropna=False)

NaN      1983
5.400     770
3.900     733
4.000     713
5.100     657
4.400     623
5.600     618
4.700     607
3.800     585
4.800     546
5.000     545
4.500     524
5.200     454
4.200     435
4.600     426
4.900     393
5.500     380
5.300     380
5.900     369
3.700     369
4.100     342
5.700     342
6.000     331
3.300     307
4.300     307
3.500     288
6.200     216
3.600     194
6.300     181
6.100     175
5.800     165
6.600     148
6.800     136
3.400     125
6.400      75
3.000      69
7.400      66
6.500      43
6.700      43
7.100      38
3.600      37
6.900      27
3.200      25
8.300      20
3.650      20
5.150      19
7.600      14
7.000      10
3.100       7
7.800       6
7.200       6
8.000       5
8.600       4
8.700       3
3.950       3
7.900       3
7.300       2
5.450       2
8.100       2
7.500       1
9.100       1
4.550       1
Name: cons_comb, dtype: int64

In [184]:
df.groupby(["make_model", "body_type", "cons_comb"])["make_model", "body_type", "cons_comb"].head()

Unnamed: 0,make_model,body_type,cons_comb
0,Audi A1,Sedans,3.800
1,Audi A1,Sedans,5.600
2,Audi A1,Sedans,3.800
3,Audi A1,Sedans,3.800
4,Audi A1,Sedans,4.100
...,...,...,...
15889,Renault Espace,Station wagon,7.400
15890,Renault Espace,Station wagon,5.700
15902,Renault Espace,Sedans,5.300
15905,Renault Espace,Compact,6.800


In [185]:
fill(df, "make_model", "body_type", "cons_comb", "median")

Number of NaN :  0
------------------
4.000    1020
5.600     965
5.400     900
5.000     810
4.700     766
3.900     733
4.500     731
4.400     668
5.100     657
3.800     585
4.800     580
4.200     579
4.900     535
5.300     530
5.200     454
4.600     426
5.500     389
5.900     369
3.700     369
4.100     362
5.700     342
6.000     331
4.300     307
3.300     307
3.500     288
6.200     219
3.600     194
6.300     181
6.100     175
5.800     165
6.600     148
6.800     136
3.400     125
6.400      75
3.000      69
7.400      66
6.500      43
6.700      43
7.100      38
3.600      37
6.900      27
3.200      25
8.300      20
3.650      20
5.150      19
3.450      19
7.600      14
7.000      10
3.100       7
7.800       6
7.200       6
8.000       5
8.600       4
7.900       3
8.700       3
3.950       3
4.550       2
8.100       2
7.300       2
5.450       2
9.100       1
7.500       1
4.300       1
Name: cons_comb, dtype: int64


## cons_country and cons_city

In [186]:
df.drop(["cons_country","cons_city"], axis = 1, inplace = True)

## End of this phase

In [187]:
df.shape

(15919, 25)

In [188]:
df.isnull().sum()/df.shape[0]*100

make_model            0.000
body_type             0.000
price                 0.000
vat                   0.000
km                    0.000
hp                    0.000
type                  0.000
fuel                  0.000
comfort_convenience   0.000
entertainment_media   0.000
extras                0.000
safety_security       0.000
gears                 0.000
age                   0.000
previous_owners       0.000
inspection_new        0.000
paint_type            0.000
upholstery_type       0.000
nr_of_doors           0.000
nr_of_seats           0.000
gear_type             0.000
displacement_cc       0.000
weight_kg             0.000
drive_chain           0.000
cons_comb             0.000
dtype: float64

In [189]:
df.to_csv("filled_scout.csv", index=False)