In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import RandomizedSearchCV
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeRegressor

import ml_util as m_util

In [2]:
# Load the car price dataset from the CSV next to this notebook into a DataFrame.
car_df = pd.read_csv(filepath_or_buffer='car_price_prediction_data.csv')

In [3]:
# Estimators to evaluate, one fresh default-configured instance per class.
# Other candidates that can be added to the tuple: LogisticRegression, SVC,
# LinearRegression, DecisionTreeRegressor, GradientBoostingRegressor,
# ExtraTreesRegressor.
models_to_test = [make_model() for make_model in (RandomForestRegressor,)]

In [4]:
# Keep only rows whose 'Price' lies in the closed interval [1000, 100000];
# rows outside that range are dropped before any modelling.
car_df = car_df.loc[car_df['Price'].between(1000, 100000)]

In [5]:
# Model comparison step, currently disabled:
# m_util.check_model_accuracy(car_df, models_to_test)

In [6]:
# Hand the filtered frame to the project helper and unpack its four results.
# NOTE(review): preprocess_car_data lives in ml_util and is not visible here;
# presumably it encodes features and performs the train/test split — confirm,
# including whether the split is seeded.
(
    X_train,
    X_test,
    y_train,
    y_test,
) = m_util.preprocess_car_data(car_df)

In [7]:
# Base estimator for the hyper-parameter search.
# The forest is seeded so that every cross-validation fit is deterministic:
# without a seed, CV scores (and therefore the selected best_params_) can
# change from one run of the notebook to the next.
random_tuned_model = RandomForestRegressor(random_state=0)

In [8]:
# Candidate hyper-parameter values for the random forest; the randomised
# search draws its combinations from these lists.
param_grid = dict(
    n_estimators=[100, 200, 300],    # how many trees the forest grows
    max_features=[None, 'sqrt'],     # features examined when looking for a split
    max_depth=[None, 10, 20, 30],    # upper bound on the number of levels per tree
    min_samples_split=[2, 5, 10],    # samples a node needs before it may be split
    min_samples_leaf=[1, 2, 4],      # samples that must remain in every leaf
    bootstrap=[True, False],         # whether each tree trains on a bootstrap sample
)

In [9]:
# Randomised hyper-parameter search: samples combinations from `param_grid`
# and scores each one with cross-validation on the base forest.
# n_iter and cv are written out explicitly; the values match the logged run
# ("Fitting 5 folds for each of 10 candidates").
# random_state fixes which combinations are sampled; verbose=3 prints one
# line per fold with its score and timing.
# NOTE: despite the `_clf` suffix this wraps a regressor; the name is kept
# because later cells refer to it.
random_clf = RandomizedSearchCV(
    estimator=random_tuned_model,
    param_distributions=param_grid,
    n_iter=10,
    cv=5,
    random_state=0,
    verbose=3,
)

In [10]:
# Run the search on the training split only; the test split stays untouched
# until the final evaluation. This is the slow cell (individual fits in the
# log below take up to ~36 s).
random_clf.fit(X=X_train, y=y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5] END bootstrap=True, max_depth=20, max_features=sqrt, min_samples_leaf=4, min_samples_split=2, n_estimators=200;, score=0.372 total time=   3.8s
[CV 2/5] END bootstrap=True, max_depth=20, max_features=sqrt, min_samples_leaf=4, min_samples_split=2, n_estimators=200;, score=0.394 total time=   3.2s
[CV 3/5] END bootstrap=True, max_depth=20, max_features=sqrt, min_samples_leaf=4, min_samples_split=2, n_estimators=200;, score=0.396 total time=   3.3s
[CV 4/5] END bootstrap=True, max_depth=20, max_features=sqrt, min_samples_leaf=4, min_samples_split=2, n_estimators=200;, score=0.370 total time=   3.1s
[CV 5/5] END bootstrap=True, max_depth=20, max_features=sqrt, min_samples_leaf=4, min_samples_split=2, n_estimators=200;, score=0.361 total time=   3.3s
[CV 1/5] END bootstrap=False, max_depth=30, max_features=None, min_samples_leaf=2, min_samples_split=5, n_estimators=100;, score=0.571 total time=  35.8s
[CV 2/5] END bootstr

In [11]:
# Best hyper-parameter combination found by the search. Left as the cell's
# last expression so the notebook renders it, instead of wrapping it in print().
random_clf.best_params_

{'n_estimators': 200, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_features': None, 'max_depth': None, 'bootstrap': True}


In [22]:
# Final model with hand-set hyper-parameters.
# NOTE(review): these differ from the best_params_ printed by the search
# (n_estimators=200, min_samples_leaf=2); presumably they were adjusted by
# hand afterwards — confirm, or build the model from random_clf.best_params_.
# random_state is fixed so the train/test scores below can be reproduced.
tuned_model = RandomForestRegressor(
    n_estimators=350,
    min_samples_split=2,
    min_samples_leaf=1,
    max_features=None,
    max_depth=None,
    bootstrap=True,
    random_state=0,
)

In [23]:
# Train the final forest on the training split.
tuned_model.fit(X=X_train, y=y_train)

In [24]:
# Report the model's .score() on both splits, one per line; a large gap
# between the two numbers indicates overfitting to the training data.
print(
    f"Train Score: {tuned_model.score(X_train, y_train)}",
    f"Test Score: {tuned_model.score(X_test, y_test)}",
    sep="\n",
)

Train Score: 0.9693505557318618
Test Score: 0.7986109106211777
