In [None]:
# Organize the data
# Sort missing values
# Change to numerical
# Split the data
# Instantiate the model
# Make predictions

In [29]:
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

In [2]:
data = pd.read_csv("car-sales-extended-missing-data.csv")
data.dtypes

Make              object
Colour            object
Odometer (KM)    float64
Doors            float64
Price            float64
dtype: object

In [3]:
data.dropna(subset="Price", inplace=True)

In [4]:
data.isna().sum()

Make             47
Colour           46
Odometer (KM)    48
Doors            47
Price             0
dtype: int64

In [7]:
X = data.drop("Price", axis=1)
y = data["Price"]

In [11]:
cat_features = ["Make", "Colour"]
cat_imputer = SimpleImputer(strategy="constant", fill_value="missing")

door_features = ["Doors"]
door_imputer = SimpleImputer(strategy="constant", fill_value=4)

odo_features = ["Odometer (KM)"]
odo_imputer = SimpleImputer(strategy="mean")

one_hot = OneHotEncoder()

transformer = ColumnTransformer([
    ("cat", cat_imputer, cat_features),
    ("door", door_imputer, door_features),
    ("odo", odo_imputer, odo_features)
])

In [12]:
transformed_X = transformer.fit_transform(X)

In [13]:
X_df = pd.DataFrame(data=transformed_X, columns=["Make", "Colour", "Doors", "Odometer (KM)"])

In [14]:
X_df.head()

Unnamed: 0,Make,Colour,Doors,Odometer (KM)
0,Honda,White,4.0,35431.0
1,BMW,Blue,5.0,192714.0
2,Honda,White,4.0,84714.0
3,Toyota,White,4.0,154365.0
4,Nissan,Blue,3.0,181577.0


In [18]:
one_hot = OneHotEncoder()

column_data = ["Make", "Colour", "Doors", "Odometer (KM)"]
transformer = ColumnTransformer([
    ("one_hot", one_hot, column_data)
], sparse_threshold=0, remainder="passthrough")

In [22]:
transformed_X = transformer.fit_transform(X_df)

In [23]:
X_train, X_test, y_train, y_test = train_test_split(transformed_X, y, test_size=0.2)
model = RandomForestRegressor(n_jobs=-1).fit(X_train, y_train)

In [24]:
model.score(X_test, y_test)

0.17042528493792275

In [28]:
model.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'criterion': 'squared_error',
 'max_depth': None,
 'max_features': 1.0,
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': -1,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [32]:
X_df.isna().sum()

Make             0
Colour           0
Doors            0
Odometer (KM)    0
dtype: int64

In [35]:
param_distributions = {
    "n_estimators": [100, 400, 700, 1000, 1200],
    "max_depth": [2, 6, 10, 14],
    "min_samples_split": [2, 4, 6, 8, 10],
    "min_samples_leaf": [2, 4, 6, 8, 10],
    "max_features": [2, 4, 6, 8, 10],
}

rs_model = RandomizedSearchCV(model, param_distributions, n_iter=20, cv=5, verbose=2).fit(X_test, y_test)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
[CV] END max_depth=6, max_features=6, min_samples_leaf=8, min_samples_split=10, n_estimators=100; total time=   1.9s
[CV] END max_depth=6, max_features=6, min_samples_leaf=8, min_samples_split=10, n_estimators=100; total time=   0.0s
[CV] END max_depth=6, max_features=6, min_samples_leaf=8, min_samples_split=10, n_estimators=100; total time=   0.1s
[CV] END max_depth=6, max_features=6, min_samples_leaf=8, min_samples_split=10, n_estimators=100; total time=   0.0s
[CV] END max_depth=6, max_features=6, min_samples_leaf=8, min_samples_split=10, n_estimators=100; total time=   0.1s
[CV] END max_depth=2, max_features=10, min_samples_leaf=2, min_samples_split=10, n_estimators=400; total time=   0.3s
[CV] END max_depth=2, max_features=10, min_samples_leaf=2, min_samples_split=10, n_estimators=400; total time=   0.4s
[CV] END max_depth=2, max_features=10, min_samples_leaf=2, min_samples_split=10, n_estimators=400; total time=   0.3s

[CV] END max_depth=10, max_features=4, min_samples_leaf=10, min_samples_split=2, n_estimators=1000; total time=   0.9s
[CV] END max_depth=10, max_features=4, min_samples_leaf=10, min_samples_split=2, n_estimators=1000; total time=   0.9s
[CV] END max_depth=10, max_features=4, min_samples_leaf=10, min_samples_split=2, n_estimators=1000; total time=   0.8s
[CV] END max_depth=10, max_features=4, min_samples_leaf=10, min_samples_split=2, n_estimators=1000; total time=   0.8s
[CV] END max_depth=10, max_features=4, min_samples_leaf=10, min_samples_split=2, n_estimators=1000; total time=   0.9s
[CV] END max_depth=2, max_features=10, min_samples_leaf=10, min_samples_split=10, n_estimators=700; total time=   0.6s
[CV] END max_depth=2, max_features=10, min_samples_leaf=10, min_samples_split=10, n_estimators=700; total time=   0.6s
[CV] END max_depth=2, max_features=10, min_samples_leaf=10, min_samples_split=10, n_estimators=700; total time=   0.6s
[CV] END max_depth=2, max_features=10, min_sampl

In [36]:
rs_model.best_params_

{'n_estimators': 100,
 'min_samples_split': 4,
 'min_samples_leaf': 6,
 'max_features': 10,
 'max_depth': 14}

In [37]:
rs_model.score(X_test, y_test)

0.03576495568107152