In [None]:
# Organize the data
# Handle missing values
# Convert categorical features to numerical ones
# Split the data into training and test sets
# Instantiate and fit the model
# Make predictions

In [None]:
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from joblib import dump, load

In [None]:
# Load the car-sales dataset and inspect the dtype pandas inferred per column.
csv_path = "car-sales-extended-missing-data.csv"
data = pd.read_csv(csv_path)
data.dtypes

In [None]:
# Rows without a target value cannot be used for supervised training, so drop them.
# `subset` is given as a list (accepted by every pandas version) and the result is
# reassigned rather than mutated in place, which keeps the cell safe to re-run.
data = data.dropna(subset=["Price"])

In [None]:
# Count the missing values left in each column; "Price" should be 0 after the
# rows without a price were dropped.
data.isna().sum()

In [None]:
# Target: the sale price. Features: every other column of the frame.
y = data["Price"]
X = data.drop(columns=["Price"])

In [None]:
# Imputation plan: each group of columns gets its own fill strategy.

# Text categories: a missing entry becomes its own explicit "missing" category.
cat_features = ["Make", "Colour"]
cat_imputer = SimpleImputer(strategy="constant", fill_value="missing")

# Door count: a missing entry is filled with the constant 4.
door_features = ["Doors"]
door_imputer = SimpleImputer(strategy="constant", fill_value=4)

# Odometer reading: a missing entry is filled with the column mean.
odo_features = ["Odometer (KM)"]
odo_imputer = SimpleImputer(strategy="mean")

# The OneHotEncoder that used to be instantiated here was never used by this
# transformer (encoding happens in a later cell that creates its own encoder),
# so the dead assignment has been removed.

# The output columns follow the order of the entries below:
# Make, Colour, Doors, Odometer (KM).
transformer = ColumnTransformer(
    transformers=[
        ("cat", cat_imputer, cat_features),
        ("door", door_imputer, door_features),
        ("odo", odo_imputer, odo_features),
    ]
)

In [None]:
# Fit the per-column imputers on X and fill the missing values in one pass.
# NOTE(review): the imputers are fitted on the full feature set before the
# train/test split further down, so the odometer mean also reflects rows that
# end up in the test set — consider fitting on the training split only.
transformed_X = transformer.fit_transform(X)

In [None]:
# Wrap the imputed array in a DataFrame again; the labels are listed in the
# order the imputing ColumnTransformer emits its columns.
imputed_column_names = ["Make", "Colour", "Doors", "Odometer (KM)"]
X_df = pd.DataFrame(transformed_X, columns=imputed_column_names)

In [None]:
# Preview the first rows of the imputed feature frame.
X_df.head()

In [None]:
# One-hot encode the categorical columns only. "Odometer (KM)" is a continuous
# measurement: encoding it would create one indicator column per distinct
# reading and discard its numeric ordering. It is therefore left out of the
# encoder and forwarded unchanged by remainder="passthrough".
one_hot = OneHotEncoder()

column_data = ["Make", "Colour", "Doors"]
transformer = ColumnTransformer(
    [("one_hot", one_hot, column_data)],
    sparse_threshold=0,  # always return a dense array rather than a sparse matrix
    remainder="passthrough",
)

In [None]:
# Fit the one-hot ColumnTransformer on the imputed frame and encode it.
# This rebinds `transformed_X`, replacing the imputed array produced earlier.
transformed_X = transformer.fit_transform(X_df)

In [None]:
# Hold out 20% of the rows for evaluation and fit a baseline random forest on
# the remaining 80%. Both the split and the forest are seeded so the score is
# reproducible across "Restart & Run All".
X_train, X_test, y_train, y_test = train_test_split(
    transformed_X, y, test_size=0.2, random_state=42
)
model = RandomForestRegressor(n_jobs=-1, random_state=42).fit(X_train, y_train)

In [None]:
# Baseline performance: R^2 of the fitted forest on the 20% test split.
model.score(X_test, y_test)

In [None]:
# List the baseline model's hyperparameters (candidates for tuning).
model.get_params()

In [None]:
# Sanity check: count NaNs per column of the imputed frame (expected to be 0
# everywhere once the imputers have run).
X_df.isna().sum()

In [None]:
# Hyperparameter values the randomized search samples from.
param_distributions = {
    "n_estimators": [100, 400, 700, 1000, 1200],
    "max_depth": [2, 6, 10, 14],
    "min_samples_split": [2, 4, 6, 8, 10],
    "min_samples_leaf": [2, 4, 6, 8, 10],
    "max_features": [2, 4, 6, 8, 10],
}

# Try 20 sampled combinations with 5-fold cross-validation (100 fits in total).
# The search is fitted on the TRAINING split: fitting it on X_test / y_test would
# tune and refit the model on the very rows later used to score it, so the
# reported test score would no longer be a held-out estimate.
rs_model = RandomizedSearchCV(
    model,
    param_distributions,
    n_iter=20,
    cv=5,
    verbose=2,
).fit(X_train, y_train)

In [None]:
# Hyperparameter combination that achieved the best cross-validated score.
rs_model.best_params_

In [None]:
# Score the refit best estimator on X_test / y_test.
# NOTE(review): this is only a held-out estimate if the search itself was not
# fitted on X_test / y_test — confirm against the cell that fits `rs_model`.
rs_model.score(X_test, y_test)

In [None]:
# Persist the fitted search object to disk; it can be restored with joblib's `load`.
dump(rs_model, filename="test.joblib")

# Try using Pipeline()

In [1]:
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from joblib import dump, load

In [2]:
# Reload the raw dataset and count the missing values in each column.
csv_path = "car-sales-extended-missing-data.csv"
data = pd.read_csv(csv_path)
data.isna().sum()

Make             49
Colour           50
Odometer (KM)    50
Doors            50
Price            50
dtype: int64

In [3]:
# Drop the rows that have no target value, then re-count the remaining gaps.
# `subset` is given as a list (accepted by every pandas version) and the result is
# reassigned rather than mutated in place, which keeps the cell safe to re-run.
data = data.dropna(subset=["Price"])
data.isna().sum()

Make             47
Colour           46
Odometer (KM)    48
Doors            47
Price             0
dtype: int64