In [1]:
import pandas as pd
import numpy as np
import urllib.request
import os
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import linear_model

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Download the car-price dataset once and load it into a DataFrame.

# Remote location of the CSV and the name of the local cached copy.
data_url = "https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/master/chapter-02-car-price/data.csv"
filename = "cars_data.csv"

# Only hit the network when no local copy exists yet, so re-running the
# notebook reuses the file that is already on disk.
if not os.path.exists(filename):
    urllib.request.urlretrieve(url=data_url, filename=filename)

# Raw, uncleaned data exactly as stored in the CSV.
df_in = pd.read_csv(filepath_or_buffer=filename)

In [3]:
def df_clean(df):
    """
    Return a cleaned copy of the dataframe.

    Cleaning steps:
      * column names are lower-cased and spaces become underscores;
      * values of object (string) columns are lower-cased and spaces become
        underscores;
      * missing values are replaced with 0 (they are filled, not dropped).

    The input frame is left untouched; use the returned frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. Column labels must be strings.

    Returns
    -------
    pandas.DataFrame
        New frame with normalised names/values and no NaNs.
    """
    # Work on a copy so the caller's frame is never left half-modified.
    cleaned = df.copy()

    # regex=False: the space is a literal character, not a pattern.
    cleaned.columns = cleaned.columns.str.lower().str.replace(" ", "_", regex=False)

    string_columns = cleaned.dtypes[cleaned.dtypes == "object"].index

    for col in string_columns:
        cleaned[col] = cleaned[col].str.lower().str.replace(" ", "_", regex=False)

    # Fill after the string normalisation so NaNs produced by .str on
    # non-string cells are replaced as well.
    return cleaned.fillna(0)

# Clean the dataframe. The result is bound back to the same name, so from this
# cell on `df_in` refers to the cleaned data (normalised names, NaNs filled).
df_in  = df_clean(df_in)

In [10]:
# Makes that receive their own indicator column in process_features.
MAKES = [
    "chevrolet", "ford", "volkswagen", "toyota", "dodge",
    "nissan", "gmc", "honda", "mazda", "cadillac",
]

# For the remaining categorical columns every distinct value found in the
# cleaned frame becomes a category. Because the frame was cleaned with
# fillna(0), a list may contain the placeholder 0 next to the string values.
FUEL_TYPES = list(df_in["engine_fuel_type"].unique())
TRANSMISSIONS = list(df_in["transmission_type"].unique())
WD_TYPES = list(df_in["driven_wheels"].unique())
MARKET_CATEGORIES = list(df_in["market_category"].unique())
VEHICLE_SIZES = list(df_in["vehicle_size"].unique())
VEHICLE_STYLES = list(df_in["vehicle_style"].unique())

def _dummy_columns(series, prefix, values):
    """Map f"{prefix}_{value}" to a 0/1 indicator Series for each value."""
    return {f"{prefix}_{value}": (series == value).astype(int) for value in values}


def process_features(df, reference_year=2017):
    """
    Process features - convert categoricals into dummies.
    Returns a new dataframe; the input frame is not modified.

    The output columns are, in order: the numeric base features, ``age``,
    one 0/1 indicator per door count / make / fuel type / transmission /
    drive type / market category / vehicle size / vehicle style, and finally
    ``msrp``. The category values come from the module-level lists MAKES,
    FUEL_TYPES, TRANSMISSIONS, WD_TYPES, MARKET_CATEGORIES, VEHICLE_SIZES and
    VEHICLE_STYLES.

    Parameters
    ----------
    df : pandas.DataFrame
        Cleaned frame containing the base feature columns, ``year``, the
        categorical source columns and ``msrp``.
    reference_year : int, default 2017
        Year the car age is measured against (``age = reference_year - year``).

    Returns
    -------
    pandas.DataFrame
        Frame with the columns described above, indexed like ``df``.

    Raises
    ------
    KeyError
        If one of the required columns is missing from ``df``.
    """
    base_features = ["engine_hp", "engine_cylinders", "highway_mpg", "city_mpg", "popularity"]

    # (source column, indicator prefix, values to encode) in output order.
    categorical_specs = [
        ("number_of_doors", "num_doors", [2, 3, 4]),
        ("make", "make", MAKES),
        ("engine_fuel_type", "fuel", FUEL_TYPES),
        ("transmission_type", "transmission", TRANSMISSIONS),
        ("driven_wheels", "wd", WD_TYPES),
        ("market_category", "cat", MARKET_CATEGORIES),
        ("vehicle_size", "size", VEHICLE_SIZES),
        ("vehicle_style", "style", VEHICLE_STYLES),
    ]

    # Collect every output column first and build the frame once; inserting
    # columns one at a time into a DataFrame fragments it.
    columns = {name: df[name] for name in base_features}
    columns["age"] = reference_year - df["year"]

    for source, prefix, values in categorical_specs:
        columns.update(_dummy_columns(df[source], prefix, values))

    columns["msrp"] = df["msrp"]

    # A category value of 0 is the fill used for missing data, not a real
    # category, so names ending in "_0" (e.g. "fuel_0") are dropped.
    kept = {name: col for name, col in columns.items() if not name.endswith("_0")}

    return pd.DataFrame(kept)

In [11]:
# feature engineering: derive the feature frame (base features, age, dummy
# columns and msrp) from the cleaned data; df_in itself is passed in unchanged

df = process_features(df_in)

In [13]:
# functional testing

# Both the cleaned frame (df_in) and the engineered frame (df) are checked:
# the string values that df_clean normalises live in df_in, while df is what
# the model will see.
frames_to_check = [("df_in", df_in), ("df", df)]

# verify if column names are converted to lower case and spaces are replaced with underscores
for frame_name, frame in frames_to_check:
    for col in frame.columns:
        assert col == col.lower().replace(" ", "_"), f"{frame_name}: column name {col!r} is not normalised"

# verify if string columns are converted to lower case and spaces are replaced with underscores
for frame_name, frame in frames_to_check:
    string_columns = frame.dtypes[frame.dtypes == "object"].index
    for col in string_columns:
        # Only real strings are inspected; non-string cells (such as the 0
        # written by fillna) are skipped.
        not_normalised = [
            value
            for value in frame[col].unique()
            if isinstance(value, str) and value != value.lower().replace(" ", "_")
        ]
        assert not not_normalised, f"{frame_name}.{col}: values not normalised: {not_normalised[:5]}"

# expected feature set, built in the same order process_features emits it
REQUIRED_FEATURES = ["engine_hp", "engine_cylinders", "highway_mpg", "city_mpg", "popularity"]
REQUIRED_FEATURES += ["age"]

expected_dummies = [
    ("num_doors", [2, 3, 4]),
    ("make", MAKES),
    ("fuel", FUEL_TYPES),
    ("transmission", TRANSMISSIONS),
    ("wd", WD_TYPES),
    ("cat", MARKET_CATEGORIES),
    ("size", VEHICLE_SIZES),
    ("style", VEHICLE_STYLES),
]
for prefix, values in expected_dummies:
    REQUIRED_FEATURES += [f"{prefix}_{value}" for value in values]

REQUIRED_FEATURES += ["msrp"]

# dummies for the 0 placeholder must not appear in the feature frame
REQUIRED_FEATURES = [name for name in REQUIRED_FEATURES if not name.endswith("_0")]

# set equality alone would hide duplicated columns, so check uniqueness too
assert df.columns.is_unique, "df contains duplicated column names"

missing_features = set(REQUIRED_FEATURES) - set(df.columns)
unexpected_features = set(df.columns) - set(REQUIRED_FEATURES)
assert not missing_features and not unexpected_features, (
    f"missing: {sorted(missing_features)}, unexpected: {sorted(unexpected_features)}"
)

In [14]:
# df contains pre-engineered dataset: show how many columns it has and
# what they are called

column_names = df.columns.tolist()

print(f"{len(column_names)} columns:")
print(column_names)

# columns for the 0 placeholder category; expected to be an empty list
print([name for name in column_names if name.endswith("_0")])

129 columns:
['engine_hp', 'engine_cylinders', 'highway_mpg', 'city_mpg', 'popularity', 'age', 'num_doors_2', 'num_doors_3', 'num_doors_4', 'make_chevrolet', 'make_ford', 'make_volkswagen', 'make_toyota', 'make_dodge', 'make_nissan', 'make_gmc', 'make_honda', 'make_mazda', 'make_cadillac', 'fuel_premium_unleaded_(required)', 'fuel_regular_unleaded', 'fuel_premium_unleaded_(recommended)', 'fuel_flex-fuel_(unleaded/e85)', 'fuel_diesel', 'fuel_electric', 'fuel_flex-fuel_(premium_unleaded_recommended/e85)', 'fuel_natural_gas', 'fuel_flex-fuel_(premium_unleaded_required/e85)', 'fuel_flex-fuel_(unleaded/natural_gas)', 'transmission_manual', 'transmission_automatic', 'transmission_automated_manual', 'transmission_direct_drive', 'transmission_unknown', 'wd_rear_wheel_drive', 'wd_front_wheel_drive', 'wd_all_wheel_drive', 'wd_four_wheel_drive', 'cat_factory_tuner,luxury,high-performance', 'cat_luxury,performance', 'cat_luxury,high-performance', 'cat_luxury', 'cat_performance', 'cat_flex_fuel', '