In [23]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder

In [3]:
# Load the car-sales dataset (the variant that contains missing values) from a
# path relative to the notebook's working directory.
car_sales = pd.read_csv("data/cars-ext-missing.csv")

In [4]:
# (rows, columns) of the frame as loaded, before any cleaning.
car_sales.shape

(1000, 5)

In [5]:
# Number of missing (NaN) entries in each column of the freshly loaded data.
car_sales.isna().sum()

Make             49
Colour           50
Odometer (KM)    50
Doors            50
Price            50
dtype: int64

In [6]:
# Fill the gaps in the feature columns with a single, non-in-place call.
# Calling `car_sales[col].fillna(..., inplace=True)` operates on a column selected
# out of the frame (chained assignment): recent pandas warns about it and, with
# Copy-on-Write enabled, it no longer updates `car_sales` at all.
#   - "Make" / "Colour": label the gap explicitly with the string "missing"
#   - "Odometer (KM)":   use the mean of the observed readings
#   - "Doors":           use the constant 4
# "Price" is not filled here, so its missing values remain.
car_sales = car_sales.fillna({
    "Make": "missing",
    "Colour": "missing",
    "Odometer (KM)": car_sales["Odometer (KM)"].mean(),
    "Doors": 4,
})

In [7]:
# Re-count missing values after the fill step: only "Price", which that step
# does not touch, should still report gaps.
car_sales.isna().sum()

Make              0
Colour            0
Odometer (KM)     0
Doors             0
Price            50
dtype: int64

In [8]:
# Rows without a "Price" have no target value to learn from, so they are
# dropped rather than filled.
car_sales = car_sales.dropna(subset=["Price"])

In [9]:
# Final check after dropping rows without a "Price": every column should now
# report 0 missing values.
car_sales.isna().sum()

Make             0
Colour           0
Odometer (KM)    0
Doors            0
Price            0
dtype: int64

In [10]:
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer


In [15]:
# One imputation strategy per column group:
#   - categorical columns: fill gaps with the literal string "missing"
#   - "Doors":             fill gaps with the constant 4
#   - numeric columns:     fill gaps with the column mean
cat_imp = SimpleImputer(strategy="constant", fill_value="missing")
door_imp = SimpleImputer(strategy="constant", fill_value=4)
num_imp = SimpleImputer(strategy="mean")

# Columns each imputer is applied to.
cat_feature = ["Make", "Colour"]
door_feature = ["Doors"]
num_feature = ["Odometer (KM)"]

# Each step is named after the imputer it wraps so the names stay consistent.
# The transformed output lists its columns in step order
# (Make, Colour, Doors, Odometer (KM)); with the default `remainder`, columns
# not named in any step are dropped.
imputer = ColumnTransformer([
    ("cat_imp", cat_imp, cat_feature),
    ("door_imp", door_imp, door_feature),
    ("num_imp", num_imp, num_feature),
])

In [16]:
# Transform the data
X = car_sales.drop("Price", axis=1)
filled_X = imputer.fit_transform(X)
filled_X

array([['Honda', 'White', 4.0, 35431.0],
       ['BMW', 'Blue', 5.0, 192714.0],
       ['Honda', 'White', 4.0, 84714.0],
       ...,
       ['Nissan', 'Blue', 4.0, 66604.0],
       ['Honda', 'White', 4.0, 215883.0],
       ['Toyota', 'Blue', 4.0, 248360.0]], dtype=object)

In [21]:
# Rebuild a labelled DataFrame from the imputer's output array.
# Column order follows the imputer's steps: Make, Colour, Doors, Odometer.
car_sales_filled = pd.DataFrame(
    filled_X,
    columns=["Make", "Colour", "Doors", "Odometer"],
    # Reuse the row labels of X; a fresh 0..n-1 index would no longer line up
    # with `car_sales`, whose index has gaps where rows were dropped.
    index=X.index,
).astype(
    # The imputer output is a dtype=object array, so restore numeric dtypes.
    {"Doors": "float64", "Odometer": "float64"}
)
car_sales_filled.isna().sum()
                                                    

Make        0
Colour      0
Doors       0
Odometer    0
dtype: int64

In [28]:
# One-hot encode the categorical columns. "Doors" is listed here, so it is
# treated as a category rather than as a quantity. The remaining column
# ("Odometer") is passed through unchanged.
categorical_features = ["Make", "Colour", "Doors"]
one_hot = OneHotEncoder()
transformer = ColumnTransformer(
    [("one_hot", one_hot, categorical_features)],
    remainder="passthrough",
)
transformed_X = transformer.fit_transform(car_sales_filled)

# Target values. They are paired with the rows of `transformed_X` by position,
# which relies on `car_sales` keeping the row order it had when the features
# were taken from it.
y = car_sales["Price"]

In [36]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

# Seed NumPy's global RNG right before the split. `train_test_split` is given
# no `random_state`, so its shuffle draws from this global generator.
np.random.seed(42)

# Hold out 30% of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    transformed_X,
    y,
    test_size=0.3,
)

In [37]:
# Fit a 100-tree random forest on the training split, then report its score
# (R^2 for regressors) on the held-out split.
model = RandomForestRegressor(n_estimators=100).fit(X_train, y_train)
model.score(X_test, y_test)

0.2397454343710379