# Dataset fields description
fields:
- Maker: Brand name 
- Genmodel: Model name
- Genmodel_ID: Unique model ID
- Adv_ID: Advertisement ID, unique key for each row
- Adv_Year: Advertisement year
- Adv_Month: Advertisement month
- Color
- Reg_Year: Registration year
- Body_Type
- Mileage
- Engine_Size: Engine capacity in liters
- Gearbox_Type: type of the gearbox
- Fuel_Type
- Price: in USD
- Engine_Power: in Horse Power
- Annual_Tax
- Wheelbase
- Height
- Width
- Length
- Average_Mpg: miles per gallon
- Top_Speed: mph
- Seat_Num
- Door_Num


In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the raw advert data from the CSV that sits next to the notebook.
raw_csv_path = 'capstone_data.csv'
df = pd.read_csv(raw_csv_path)

### Renaming columns with typos etc.

In [3]:
def _normalize_column_name(name):
    """Return ``name`` stripped of surrounding whitespace and re-cased.

    The first ``_``-separated part is capitalised. Every following part is
    capitalised as well, except a part equal to ``id`` (any case), which
    becomes ``ID``. All parts are kept, so a label with two or more
    underscores is no longer truncated after its second part.
    """
    # ``capitalize`` lower-cases everything after the first character, so the
    # trailing parts are already lower-case when they are compared to "id".
    head, *tail = name.strip().capitalize().split("_")
    fixed_tail = [part.upper() if part == "id" else part.capitalize() for part in tail]
    return "_".join([head, *fixed_tail])


# Build a fresh list of labels instead of writing into ``df.columns.values``,
# which is the buffer backing the existing column Index.
names = [_normalize_column_name(col) for col in df.columns]

df = df.set_axis(labels=names, axis="columns")

In [4]:
# Correct the misspelt label and bring the remaining ones in line with the
# Word_Word naming used by the other columns.
column_fixes = {
    "Engin_Size": "Engine_Size",
    "Runned_Miles": "Mileage",
    "Bodytype": "Body_Type",
    "Gearbox": "Gearbox_Type",
}
df.rename(columns=column_fixes, inplace=True)
df.columns

Index(['Maker', 'Genmodel', 'Genmodel_ID', 'Adv_ID', 'Adv_Year', 'Adv_Month',
       'Color', 'Reg_Year', 'Body_Type', 'Mileage', 'Engine_Size',
       'Gearbox_Type', 'Fuel_Type', 'Price', 'Engine_Power', 'Annual_Tax',
       'Wheelbase', 'Height', 'Width', 'Length', 'Average_Mpg', 'Top_Speed',
       'Seat_Num', 'Door_Num'],
      dtype='object')

## Pre-cleaning

In [5]:
df.dtypes

Maker            object
Genmodel         object
Genmodel_ID      object
Adv_ID           object
Adv_Year          int64
Adv_Month         int64
Color            object
Reg_Year        float64
Body_Type        object
Mileage          object
Engine_Size      object
Gearbox_Type     object
Fuel_Type        object
Price           float64
Engine_Power    float64
Annual_Tax       object
Wheelbase       float64
Height          float64
Width           float64
Length          float64
Average_Mpg      object
Top_Speed        object
Seat_Num        float64
Door_Num        float64
dtype: object

### Handling null values in wrong format

In [6]:
# The literal string "nan" stands for a missing value everywhere in the frame.
df = df.replace("nan", np.nan)

# A zero in any of the dimension columns is likewise treated as missing.
dimension_cols = ["Length", "Height", "Width", "Wheelbase"]
df[dimension_cols] = df[dimension_cols].replace(0, np.nan)

### Replacing $$ in Adv_ID and setting it as an index

In [7]:
def dolars(x):
    """Replace every ``$$`` in an advert id with ``_``.

    Parameters
    ----------
    x : object
        Usually a string id such as ``"10_1$$1"``.

    Returns
    -------
    object
        The id with each ``$$`` replaced by ``_``. Anything that is not a
        string (for example a NaN left by a missing cell) is returned
        unchanged instead of raising ``TypeError`` on the substring test.
    """
    if not isinstance(x, str):
        return x
    # str.replace is already a no-op when "$$" is absent.
    return x.replace("$$", "_")

# Clean every advert id element by element, then preview the frame.
cleaned_ids = df["Adv_ID"].map(dolars)
df["Adv_ID"] = cleaned_ids
df.head()

Unnamed: 0,Maker,Genmodel,Genmodel_ID,Adv_ID,Adv_Year,Adv_Month,Color,Reg_Year,Body_Type,Mileage,...,Engine_Power,Annual_Tax,Wheelbase,Height,Width,Length,Average_Mpg,Top_Speed,Seat_Num,Door_Num
0,Bentley,Arnage,10_1,10_1_1,2018,4,Silver,2000.0,Saloon,60000,...,,,3116.0,1515.0,2125.0,5390.0,,,5.0,4.0
1,Bentley,Arnage,10_1,10_1_2,2018,6,Grey,2002.0,Saloon,44000,...,450.0,315.0,3116.0,1515.0,2125.0,5390.0,13.7 mpg,179 mph,5.0,4.0
2,Bentley,Arnage,10_1,10_1_3,2017,11,Blue,2002.0,Saloon,55000,...,400.0,315.0,3116.0,1515.0,2125.0,5390.0,14.7 mpg,155 mph,5.0,4.0
3,Bentley,Arnage,10_1,10_1_4,2018,4,Green,2003.0,Saloon,14000,...,,,3116.0,1515.0,2125.0,5390.0,,,5.0,4.0
4,Bentley,Arnage,10_1,10_1_5,2017,11,Grey,2003.0,Saloon,61652,...,,,3116.0,1515.0,2125.0,5390.0,,,5.0,4.0


In [8]:
df["Adv_ID"].nunique()

268255

In [9]:
len(df)

268255

In [10]:
# Adv_ID becomes the row index. verify_integrity=True makes pandas raise a
# ValueError if an id repeats, so uniqueness is enforced here rather than
# only compared by eye in the two cells above.
df = df.set_index("Adv_ID", verify_integrity=True)

### Finding duplicates

In [11]:
df[df.duplicated()]

Unnamed: 0_level_0,Maker,Genmodel,Genmodel_ID,Adv_Year,Adv_Month,Color,Reg_Year,Body_Type,Mileage,Engine_Size,...,Engine_Power,Annual_Tax,Wheelbase,Height,Width,Length,Average_Mpg,Top_Speed,Seat_Num,Door_Num
Adv_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10_5_599,Bentley,Continental,10_5,2018,4,Purple,2014.0,Convertible,4800,6.0L,...,,,2746.0,1394.0,2227.0,4806.0,,,4.0,2.0
16_5_91,Chevrolet,Captiva,16_5,2018,8,Black,2012.0,SUV,115000,2.2L,...,181.00,315,2705.0,1720.0,1850.0,4635.0,36.6 mpg,,7.0,5.0
18_10_374,Citroen,C3 Picasso,18_10,2017,12,Silver,2017.0,MPV,10,1.6L,...,98.63,140*,2540.0,1624.0,1766.0,4078.0,72.4 mpg,111 mph,5.0,5.0
18_12_892,Citroen,C4,18_12,2018,8,Blue,2007.0,Hatchback,186000,2.0L,...,138.00,140,2608.0,1458.0,1773.0,4260.0,51.4 mpg,129 mph,5.0,5.0
18_13_70,Citroen,C4 Cactus,18_13,2018,4,White,2018.0,Hatchback,10,1.6L,...,99.00,140*,2595.0,1540.0,1946.0,4157.0,83.1 mpg,114 mph,5.0,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96_3_163,Volvo,S40,96_3,2018,7,Silver,2007.0,Saloon,79500,2.0L,...,136.00,195,2640.0,1452.0,1770.0,4468.0,48.7 mpg,127 mph,5.0,4.0
96_4_340,Volvo,S60,96_4,2018,6,Grey,2018.0,Saloon,18,2.0L,...,190.00,140*,2776.0,1484.0,2097.0,4635.0,48.7 mpg,143 mph,5.0,4.0
96_6_82,Volvo,S90,96_6,2018,5,Black,2018.0,Saloon,100,2.0L,...,190.00,140*,2941.0,1443.0,2019.0,4963.0,64.2 mpg,140 mph,5.0,4.0
96_7_266,Volvo,V40,96_7,2018,4,White,2016.0,Hatchback,15094,2.0L,...,120.00,0,2647.0,1439.0,2041.0,4369.0,78.5 mpg,118 mph,5.0,5.0


In [12]:
# Rows are compared on their column values only (the Adv_ID index is ignored);
# the first occurrence of each repeated row is kept.
repeated_rows = df.duplicated(keep="first")
df = df[~repeated_rows]
len(df)

267366

### Deleting rows with Adv_Month greater than 12

In [13]:
df[df["Adv_Month"] > 12]

Unnamed: 0_level_0,Maker,Genmodel,Genmodel_ID,Adv_Year,Adv_Month,Color,Reg_Year,Body_Type,Mileage,Engine_Size,...,Engine_Power,Annual_Tax,Wheelbase,Height,Width,Length,Average_Mpg,Top_Speed,Seat_Num,Door_Num
Adv_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
29_32_75,Ford,Streetka,29_32,2012,13,Blue,2005.0,Convertible,22000,1.6L,...,94.0,,2450.0,1332.0,1852.0,3650.0,35.8 mpg,108 mph,2.0,2.0
69_23_2,Peugeot,407,69_23,2012,17,Blue,2006.0,Coupe,77000,2.7L,...,205.0,,2725.0,1399.0,1868.0,4815.0,33.2 mpg,143 mph,4.0,2.0
77_4_10,Rolls-Royce,Phantom,77_4,2012,33,Black,2007.0,Saloon,20000,6.7L,...,,555.0,,,,,,,5.0,


In [14]:
# Keep only rows whose advert month is a real calendar month (1-12, both ends
# inclusive). Months above 12 are removed as before; months below 1 are now
# removed too, since index arithmetic on 0 would otherwise map it to "Dec".
# .copy() gives an independent frame for the column assignments that follow.
df = df[df["Adv_Month"].between(1, 12)].copy()

In [15]:
def to_month_abr(x):
    """Convert a month number (1-12) to its three-letter English abbreviation.

    Parameters
    ----------
    x : object
        A month number, given as an integer or as a string of decimal digits.

    Returns
    -------
    object
        ``"Jan"`` ... ``"Dec"`` when ``x`` denotes a whole number from 1 to 12.
        Any other value is returned unchanged: non-numeric text, values whose
        string form is not pure digits (such as ``4.0``), and numbers outside
        1-12. Previously 0 wrapped around to ``"Dec"``, numbers above 12
        raised ``IndexError`` and digit strings raised ``TypeError``.
    """
    months = ["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]
    text = str(x)
    # isdecimal() only accepts characters that int() can parse.
    if text.isdecimal():
        number = int(text)
        if 1 <= number <= len(months):
            return months[number - 1]
    return x

In [16]:
# Translate month numbers to abbreviations, then store them as a categorical.
month_labels = df["Adv_Month"].map(to_month_abr)
df["Adv_Month"] = month_labels.astype("category")

### Removing units or non-numerical characters from numeric columns

In [17]:
def remove_units(x):
    """Strip units and other non-numeric characters from a value.

    Parameters
    ----------
    x : object
        A raw cell such as ``"13.7 mpg"``, ``"140*"``, ``"2.0L"``, a number,
        or a missing value.

    Returns
    -------
    float
        * ``np.nan`` for ``None`` and NaN inputs.
        * ``float(x)`` for real numbers (int/float and their numpy
          counterparts), so the sign and exponent are preserved rather than
          mangled by a round trip through ``str``.
        * For everything else, the decimal digits and dots of ``str(x)``
          joined together and parsed as a float. If that text is empty or not
          a valid number (for example ``"1.2.3"`` or a lone ``"."``) the
          result is ``np.nan`` instead of a ``ValueError``.

    Notes
    -----
    Signs inside strings are still discarded (``"-3 mph"`` gives ``3.0``),
    exactly as before.
    """
    # Missing markers: None, or any float NaN (NaN is the only value != itself).
    if x is None or (isinstance(x, float) and x != x):
        return np.nan

    # Real numbers need no parsing; bool is excluded because it subclasses int.
    if isinstance(x, (int, float, np.integer, np.floating)) and not isinstance(x, bool):
        return float(x)

    digits = "".join(ch for ch in str(x) if ch.isdecimal() or ch == ".")
    try:
        return float(digits)
    except ValueError:
        # Nothing numeric left, or the leftovers do not form one number.
        return np.nan

In [18]:
# Columns that arrive as text with units or stray symbols attached.
for unit_col in ["Engine_Size", "Average_Mpg", "Top_Speed", "Mileage", "Annual_Tax"]:
    df[unit_col] = df[unit_col].apply(remove_units)
df.dtypes

Maker             object
Genmodel          object
Genmodel_ID       object
Adv_Year           int64
Adv_Month       category
Color             object
Reg_Year         float64
Body_Type         object
Mileage          float64
Engine_Size      float64
Gearbox_Type      object
Fuel_Type         object
Price            float64
Engine_Power     float64
Annual_Tax       float64
Wheelbase        float64
Height           float64
Width            float64
Length           float64
Average_Mpg      float64
Top_Speed        float64
Seat_Num         float64
Door_Num         float64
dtype: object

### Genmodel - managing date-like formats

In [19]:
# Two model names appear in the data as date-like strings; restore them.
date_like_models = {"09-Mar": "9-3", "09-May": "9-5"}
df["Genmodel"] = df["Genmodel"].replace(date_like_models)

### Engine_Size - correcting outliers

In [20]:
# Largest non-null engine sizes, in ascending order.
df["Engine_Size"].dropna().sort_values().tail()

Adv_ID
71_8_487       8.2
87_3_349      10.0
64_6_2      1400.0
62_11_3     2998.0
64_8_77     3500.0
Name: Engine_Size, dtype: float64

In [21]:
def eng_size_outliers(x):
    """Rescale an engine size that was evidently recorded in cc instead of litres.

    Parameters
    ----------
    x : float
        Engine size; may be NaN.

    Returns
    -------
    float
        ``x / 1000`` when ``x`` is 1000 or more, otherwise ``x`` unchanged.
        NaN fails the comparison and is returned as is. The bound is
        inclusive so that a value of exactly 1000 is converted to 1.0 as well.
    """
    if x >= 1000:
        return x / 1000
    return x

# Rescale the cc-valued entries element by element.
rescaled_sizes = df["Engine_Size"].map(eng_size_outliers)
df["Engine_Size"] = rescaled_sizes

### Fuel_Type - merging duplicate categories

In [22]:
df["Fuel_Type"].unique()

array(['Petrol', 'Diesel', 'Petrol Plug-in Hybrid', 'Petrol Ethanol',
       'Hybrid  Petrol/Electric Plug-in', 'Electric', nan,
       'Hybrid  Diesel/Electric', 'Bi Fuel', 'Hybrid  Petrol/Electric',
       'Petrol Hybrid', 'Diesel Hybrid', 'Diesel Plug-in Hybrid',
       'Hybrid  Diesel/Electric Plug-in'], dtype=object)

In [23]:
# Stringify every value (a missing value becomes the text "nan"), then
# collapse double spaces into single ones.
fuel_as_text = df["Fuel_Type"].astype(str)
df["Fuel_Type"] = fuel_as_text.str.replace("  ", " ", regex=False)
df["Fuel_Type"].unique()

array(['Petrol', 'Diesel', 'Petrol Plug-in Hybrid', 'Petrol Ethanol',
       'Hybrid Petrol/Electric Plug-in', 'Electric', 'nan',
       'Hybrid Diesel/Electric', 'Bi Fuel', 'Hybrid Petrol/Electric',
       'Petrol Hybrid', 'Diesel Hybrid', 'Diesel Plug-in Hybrid',
       'Hybrid Diesel/Electric Plug-in'], dtype=object)

In [24]:
# Map each alternative spelling of a hybrid variant onto one canonical label.
# Whole values are matched, and no canonical label is itself an alias.
fuel_type_aliases = {
    "Hybrid Petrol/Electric Plug-in": "Petrol Plug-in Hybrid",
    "Hybrid Petrol Plug-in": "Petrol Plug-in Hybrid",
    "Hybrid Diesel Plug-in": "Diesel Plug-in Hybrid",
    "Hybrid Diesel/Electric Plug-in": "Diesel Plug-in Hybrid",
    "Hybrid Diesel/Electric": "Diesel Hybrid",
    "Hybrid Petrol/Electric": "Petrol Hybrid",
}
df["Fuel_Type"] = df["Fuel_Type"].replace(fuel_type_aliases)
# There should be 10 resulting categories including NaN

In [25]:
# Turn the text "nan" back into a real missing value.
df["Fuel_Type"] = df["Fuel_Type"].replace({"nan": np.nan})
df["Fuel_Type"].unique()

array(['Petrol', 'Diesel', 'Petrol Plug-in Hybrid', 'Petrol Ethanol',
       'Electric', nan, 'Diesel Hybrid', 'Bi Fuel', 'Petrol Hybrid',
       'Diesel Plug-in Hybrid'], dtype=object)

### Mpg

In [26]:
df["Average_Mpg"].describe()

count    225552.000000
mean         50.902868
std          15.807056
min           9.400000
25%          41.500000
50%          50.400000
75%          60.100000
max         471.000000
Name: Average_Mpg, dtype: float64

In [27]:
df[df["Average_Mpg"] > 100]["Fuel_Type"].unique()

array(['Petrol Plug-in Hybrid', 'Petrol Hybrid', 'Diesel Plug-in Hybrid',
       'Electric'], dtype=object)

### Removing outliers

In [28]:
df[df["Price"] > 3000000]

Unnamed: 0_level_0,Maker,Genmodel,Genmodel_ID,Adv_Year,Adv_Month,Color,Reg_Year,Body_Type,Mileage,Engine_Size,...,Engine_Power,Annual_Tax,Wheelbase,Height,Width,Length,Average_Mpg,Top_Speed,Seat_Num,Door_Num
Adv_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
58_6_13,McLaren,675LT,58_6,2017,Jun,Silver,2016.0,Coupe,200.0,3.8,...,666.0,555.0,2670.0,1188.0,2095.0,4546.0,,205.0,2.0,2.0


In [29]:
df[df["Genmodel"] == "675LT"]

Unnamed: 0_level_0,Maker,Genmodel,Genmodel_ID,Adv_Year,Adv_Month,Color,Reg_Year,Body_Type,Mileage,Engine_Size,...,Engine_Power,Annual_Tax,Wheelbase,Height,Width,Length,Average_Mpg,Top_Speed,Seat_Num,Door_Num
Adv_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
58_6_1,McLaren,675LT,58_6,2017,Jun,Orange,2016.0,Convertible,193.0,3.8,...,666.0,555.0,2670.0,1188.0,2095.0,4546.0,11.7,203.0,2.0,2.0
58_6_2,McLaren,675LT,58_6,2017,Oct,Orange,2016.0,Coupe,662.0,3.8,...,666.0,555.0,2670.0,1188.0,2095.0,4546.0,,205.0,2.0,2.0
58_6_3,McLaren,675LT,58_6,2018,May,Grey,2016.0,Convertible,1100.0,3.8,...,,555.0,,,,,,,2.0,
58_6_4,McLaren,675LT,58_6,2018,Feb,Black,2016.0,Convertible,840.0,3.8,...,666.0,555.0,2670.0,1188.0,2095.0,4546.0,11.7,203.0,2.0,2.0
58_6_5,McLaren,675LT,58_6,2017,Sep,Orange,2016.0,Coupe,9843.0,3.8,...,,555.0,,,,,,,2.0,
58_6_6,McLaren,675LT,58_6,2018,Jan,Orange,2016.0,Coupe,2850.0,3.8,...,666.0,555.0,2670.0,1188.0,2095.0,4546.0,,205.0,2.0,2.0
58_6_7,McLaren,675LT,58_6,2018,May,White,2016.0,Convertible,2402.0,3.8,...,666.0,555.0,2670.0,1188.0,2095.0,4546.0,11.7,203.0,2.0,2.0
58_6_8,McLaren,675LT,58_6,2018,Jan,Black,2016.0,Coupe,2800.0,3.8,...,666.0,555.0,2670.0,1188.0,2095.0,4546.0,,205.0,2.0,2.0
58_6_9,McLaren,675LT,58_6,2018,Mar,Purple,2015.0,Coupe,800.0,3.8,...,666.0,555.0,2670.0,1188.0,2095.0,4546.0,,205.0,2.0,2.0
58_6_10,McLaren,675LT,58_6,2018,May,White,2015.0,Coupe,4000.0,3.8,...,,555.0,,,,,,,2.0,


In [30]:
# Remove adverts priced above 3,000,000.
overpriced = df["Price"] > 3000000
df.drop(index=df.index[overpriced], inplace=True)


In [31]:
# df["Reg_Year"].sort_values().head()

In [32]:
df[df["Reg_Year"] == 1900]

Unnamed: 0_level_0,Maker,Genmodel,Genmodel_ID,Adv_Year,Adv_Month,Color,Reg_Year,Body_Type,Mileage,Engine_Size,...,Engine_Power,Annual_Tax,Wheelbase,Height,Width,Length,Average_Mpg,Top_Speed,Seat_Num,Door_Num
Adv_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
94_9_342,Vauxhall,Corsa,94_9,2018,May,Grey,1900.0,Hatchback,4386.0,1.4,...,,,,,,,,,,


In [33]:
# Remove the rows whose registration year is 1900. Selecting them by value
# (rather than "whichever row has the smallest year") keeps the cell
# idempotent: running it again cannot delete a valid row.
df.drop(df[df["Reg_Year"] == 1900].index, inplace=True)

### Correcting data types

In [34]:
df.dtypes

Maker             object
Genmodel          object
Genmodel_ID       object
Adv_Year           int64
Adv_Month       category
Color             object
Reg_Year         float64
Body_Type         object
Mileage          float64
Engine_Size      float64
Gearbox_Type      object
Fuel_Type         object
Price            float64
Engine_Power     float64
Annual_Tax       float64
Wheelbase        float64
Height           float64
Width            float64
Length           float64
Average_Mpg      float64
Top_Speed        float64
Seat_Num         float64
Door_Num         float64
dtype: object

In [35]:
df.nunique()

Maker              88
Genmodel          896
Genmodel_ID       881
Adv_Year            9
Adv_Month          12
Color              22
Reg_Year           25
Body_Type          18
Mileage         72420
Engine_Size        72
Gearbox_Type        3
Fuel_Type           9
Price           20454
Engine_Power      466
Annual_Tax         24
Wheelbase         426
Height            578
Width             475
Length            983
Average_Mpg       358
Top_Speed         138
Seat_Num           10
Door_Num            7
dtype: int64

In [36]:
# Low-cardinality columns are stored as pandas categoricals.
category_casts = {
    "Adv_Year": "category",
    "Gearbox_Type": "category",
    "Fuel_Type": "category",
}
df = df.astype(category_casts)

### Reordering columns

In [37]:
col_names = df.columns
col_names

Index(['Maker', 'Genmodel', 'Genmodel_ID', 'Adv_Year', 'Adv_Month', 'Color',
       'Reg_Year', 'Body_Type', 'Mileage', 'Engine_Size', 'Gearbox_Type',
       'Fuel_Type', 'Price', 'Engine_Power', 'Annual_Tax', 'Wheelbase',
       'Height', 'Width', 'Length', 'Average_Mpg', 'Top_Speed', 'Seat_Num',
       'Door_Num'],
      dtype='object')

In [38]:
# Target column order: identifiers and advert date first, then the
# categorical attributes, then the numeric measurements.
new_order = [
    'Maker', 'Genmodel', 'Genmodel_ID',
    'Adv_Year', 'Adv_Month',
    'Color', 'Body_Type', 'Gearbox_Type', 'Fuel_Type',
    'Reg_Year', 'Mileage', 'Engine_Size', 'Price', 'Engine_Power', 'Annual_Tax',
    'Wheelbase', 'Height', 'Width', 'Length',
    'Average_Mpg', 'Top_Speed', 'Seat_Num', 'Door_Num',
]
len(new_order)

23

In [39]:
# Rearrange the columns into the order listed in new_order.
df = df.reindex(columns=new_order)
df.head()

Unnamed: 0_level_0,Maker,Genmodel,Genmodel_ID,Adv_Year,Adv_Month,Color,Body_Type,Gearbox_Type,Fuel_Type,Reg_Year,...,Engine_Power,Annual_Tax,Wheelbase,Height,Width,Length,Average_Mpg,Top_Speed,Seat_Num,Door_Num
Adv_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10_1_1,Bentley,Arnage,10_1,2018,Apr,Silver,Saloon,Automatic,Petrol,2000.0,...,,,3116.0,1515.0,2125.0,5390.0,,,5.0,4.0
10_1_2,Bentley,Arnage,10_1,2018,Jun,Grey,Saloon,Automatic,Petrol,2002.0,...,450.0,315.0,3116.0,1515.0,2125.0,5390.0,13.7,179.0,5.0,4.0
10_1_3,Bentley,Arnage,10_1,2017,Nov,Blue,Saloon,Automatic,Petrol,2002.0,...,400.0,315.0,3116.0,1515.0,2125.0,5390.0,14.7,155.0,5.0,4.0
10_1_4,Bentley,Arnage,10_1,2018,Apr,Green,Saloon,Automatic,Petrol,2003.0,...,,,3116.0,1515.0,2125.0,5390.0,,,5.0,4.0
10_1_5,Bentley,Arnage,10_1,2017,Nov,Grey,Saloon,Automatic,Petrol,2003.0,...,,,3116.0,1515.0,2125.0,5390.0,,,5.0,4.0


### Filling null values with Genmodel mode

In [40]:
def null_to_mode(data_frame, grouping_column, column_to_fill):
    """Fill the nulls of one column with the most frequent value of each row's group.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Frame holding both columns; it is not modified.
    grouping_column : str
        Column whose values define the groups.
    column_to_fill : str
        Column whose nulls are filled.

    Returns
    -------
    pandas.Series
        ``data_frame[column_to_fill]`` with each null replaced by the first
        mode of its group. Nulls stay in place when the group has no
        non-null value at all.
    """

    def _first_mode(values):
        # Series.mode() ignores nulls; an empty result means the whole group is null.
        modes = values.mode()
        if modes.empty:
            return np.nan
        return modes.iloc[0]

    per_row_mode = data_frame.groupby(grouping_column)[column_to_fill].transform(_first_mode)
    return data_frame[column_to_fill].fillna(per_row_mode)

In [41]:
df["Seat_Num"].info()

<class 'pandas.core.series.Series'>
Index: 267361 entries, 10_1_1 to 99_1_3
Series name: Seat_Num
Non-Null Count   Dtype  
--------------   -----  
260923 non-null  float64
dtypes: float64(1)
memory usage: 4.1+ MB


In [42]:
null_to_mode(df, "Genmodel_ID", "Seat_Num").info()

<class 'pandas.core.series.Series'>
Index: 267361 entries, 10_1_1 to 99_1_3
Series name: Seat_Num
Non-Null Count   Dtype  
--------------   -----  
267296 non-null  float64
dtypes: float64(1)
memory usage: 4.1+ MB


In [43]:
# Fill each of these columns from the mode of the same Genmodel_ID.
for sparse_col in ["Body_Type", "Gearbox_Type", "Fuel_Type", "Seat_Num", "Door_Num"]:
    df[sparse_col] = null_to_mode(df, "Genmodel_ID", sparse_col)

In [44]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 267361 entries, 10_1_1 to 99_1_3
Data columns (total 23 columns):
 #   Column        Non-Null Count   Dtype   
---  ------        --------------   -----   
 0   Maker         267361 non-null  object  
 1   Genmodel      267361 non-null  object  
 2   Genmodel_ID   267361 non-null  object  
 3   Adv_Year      267361 non-null  category
 4   Adv_Month     267361 non-null  category
 5   Color         245616 non-null  object  
 6   Body_Type     267342 non-null  object  
 7   Gearbox_Type  267355 non-null  category
 8   Fuel_Type     267359 non-null  category
 9   Reg_Year      267355 non-null  float64 
 10  Mileage       266318 non-null  float64 
 11  Engine_Size   265308 non-null  float64 
 12  Price         266235 non-null  float64 
 13  Engine_Power  235665 non-null  float64 
 14  Annual_Tax    219941 non-null  float64 
 15  Wheelbase     232043 non-null  float64 
 16  Height        238705 non-null  float64 
 17  Width         239367 non-null

In [45]:
# Adv_ID has been the index since the set_index cell, so the index must be
# written out; index=False would drop the unique row key from the export.
# The file is written next to the notebook, like the relative path used to
# read the raw CSV, rather than to a user-specific absolute path.
df.to_csv("capstone_data_clean.csv", index=True)