In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso, Lars, ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.tree import DecisionTreeRegressor
from xgboost.sklearn import XGBRegressor
import warnings
warnings.filterwarnings('ignore')

In [2]:
data = pd.read_csv('model_data.csv')

In [3]:
data.columns

Index(['id', 'url', 'region', 'region_url', 'price', 'year', 'make', 'model',
       'condition', 'cylinders', 'fuel', 'odometer', 'title_status',
       'transmission', 'drive', 'size', 'type', 'paint_color', 'description',
       'state', 'age', 'distance_per_year', 'make_median_price',
       'model_median_price', 'state_median_price'],
      dtype='object')

In [4]:
model_data = data.drop(['id','url','model','region','description','region_url','transmission','distance_per_year'], axis=1)

In [5]:
model_data.columns

Index(['price', 'year', 'make', 'condition', 'cylinders', 'fuel', 'odometer',
       'title_status', 'drive', 'size', 'type', 'paint_color', 'state', 'age',
       'make_median_price', 'model_median_price', 'state_median_price'],
      dtype='object')

In [6]:
y = model_data['price']
model_data.drop('price', inplace=True, axis=1)

In [7]:
model_data['make'] = pd.factorize(model_data['make'])[0]

In [8]:
model_data['state'] = pd.factorize(model_data['state'])[0]

In [9]:
model_data.columns

Index(['year', 'make', 'condition', 'cylinders', 'fuel', 'odometer',
       'title_status', 'drive', 'size', 'type', 'paint_color', 'state', 'age',
       'make_median_price', 'model_median_price', 'state_median_price'],
      dtype='object')

In [10]:
model_data['make_median_price'] = data['make_median_price']

In [18]:
model_data = pd.get_dummies(model_data)

In [19]:
X_train, X_tune, y_train, y_tune = train_test_split(model_data, y, test_size=0.30)
X_tune, X_test, y_tune, y_test = train_test_split(X_tune, y_tune, test_size=0.33)

In [20]:
X_train = pd.DataFrame(X_train)
X_tune = pd.DataFrame(X_tune)

In [14]:
X_train.shape

(265888, 16)

In [21]:
X_tune.shape

(76348, 65)

In [22]:
model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_tune)
mean_absolute_error(y_tune,y_pred)

3823.510027927267

In [23]:
model = RandomForestRegressor().fit(X_train, y_train)
y_pred = model.predict(X_tune)
mean_absolute_error(y_tune,y_pred)

1640.1750913023493

In [24]:
model = DecisionTreeRegressor().fit(X_train, y_train)
y_pred = model.predict(X_tune)
mean_absolute_error(y_tune,y_pred)

2099.5219504843258

In [25]:
model = Lasso().fit(X_train, y_train)
y_pred = model.predict(X_tune)
print(mean_absolute_error(y_tune, y_pred))

3822.1093203449127


In [26]:
model = Ridge().fit(X_train, y_train)
y_pred = model.predict(X_tune)
print(mean_absolute_error(y_tune, y_pred))

3823.4929236628886


In [27]:
#>>> reg = linear_model.Lars(n_nonzero_coefs=1)
model = Lars().fit(X_train, y_train)
y_pred = model.predict(X_tune)
print(mean_absolute_error(y_tune, y_pred))

3822.3116134345364


In [28]:
#>>> reg = linear_model.Lars(n_nonzero_coefs=1)
model = ElasticNet().fit(X_train, y_train)
y_pred = model.predict(X_tune)
print(mean_absolute_error(y_tune, y_pred))

3882.5120348362934


In [29]:
#>>> reg = linear_model.Lars(n_nonzero_coefs=1)
model = XGBRegressor().fit(X_train, y_train)
y_pred = model.predict(X_tune)
print(mean_absolute_error(y_tune, y_pred))

2281.8258414475845


In [30]:
features = model_data.columns
model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_tune)
best_score = mean_absolute_error(y_tune,y_pred)
remove_feature = ''
print(best_score)
for feature in features:
    #print(feature)
    x = X_train.drop(feature, axis=1)
    x_tune = X_tune.drop(feature, axis=1)
    model = LinearRegression().fit(x, y_train)
    y_pred = model.predict(x_tune)
    score = mean_absolute_error(y_tune, y_pred)
    if score < best_score:
        best_score = score
        remove_feature = feature
    

3823.510027927267


In [31]:
alphas = [i/10 for i in range(0,51)]

In [32]:
errors = list()

for i in range(0,len(alphas)):
    model = Lasso(alpha=alphas[i]).fit(X_train, y_train)
    y_pred = model.predict(X_tune)
    errors.append(mean_absolute_error(y_tune, y_pred))
    
min_error = min(errors)
best_alpha = alphas[errors.index(min_error)]

In [33]:
errors = list()

for i in range(0,len(alphas)):
    model = Ridge(alpha=alphas[i]).fit(X_train, y_train)
    y_pred = model.predict(X_tune)
    errors.append(mean_absolute_error(y_tune, y_pred))
    
min_error = min(errors)
best_alpha = alphas[errors.index(min_error)]

In [34]:
min_error

3823.4260088144188

In [35]:
best_alpha

5.0

In [36]:
best_score = 0
best_params = dict()
best_params_list = list()
index = 0
best_index = 0
#max_depths = list()
max_depths = [2,3,4,5]
max_features = [2,3,4,5]
min_samples_leaf = [2,3,4,5]
max_leaf_nodes = [2,3,4,5]

print(f"Max Depths: {max_depths}")
print(f"Max Features: {max_features}")



Max Depths: [2, 3, 4, 5]
Max Features: [2, 3, 4, 5]


In [37]:
model = DecisionTreeRegressor().fit(X_train, y_train)
y_pred = model.predict(X_tune)
mean_absolute_error(y_tune,y_pred)

2101.5148835528216

In [38]:
index = 0

In [39]:
model = DecisionTreeRegressor().fit(X_train, y_train)
y_pred = model.predict(X_tune)
best_score = mean_absolute_error(y_tune,y_pred)

In [40]:
for x in max_features:
    for y in max_depths:
        for a in min_samples_leaf:
            for b in max_leaf_nodes:
                print(f"Model Number: {index}")
                model = DecisionTreeRegressor(max_features=x,max_depth=y,min_samples_leaf=a,max_leaf_nodes=b)
                model = model.fit(X_train, y_train)
                y_pred = model.predict(X_tune)
                score = mean_absolute_error(y_tune, y_pred)
                print(f"Model Score: {score} Max Depth: {y} Max Feature: {x} Min Samples/Leaf: {a} Max Leaf Nodes: {b}")
                if score < best_score:
                    best_score = score
                    best_index = index
                aa = dict()
                aa['max_depth'] = y
                aa['max_features'] = x
                aa['max_leaf_nodes'] = b
                aa['min_samples'] = a
                best_params_list.append(aa)
                index += 1

Model Number: 0
Model Score: 6892.47818335742 Max Depth: 2 Max Feature: 2 Min Samples/Leaf: 2 Max Leaf Nodes: 2
Model Number: 1
Model Score: 7009.207612681585 Max Depth: 2 Max Feature: 2 Min Samples/Leaf: 2 Max Leaf Nodes: 3
Model Number: 2
Model Score: 6599.350314869828 Max Depth: 2 Max Feature: 2 Min Samples/Leaf: 2 Max Leaf Nodes: 4
Model Number: 3
Model Score: 6750.406540230571 Max Depth: 2 Max Feature: 2 Min Samples/Leaf: 2 Max Leaf Nodes: 5
Model Number: 4
Model Score: 7060.498881399769 Max Depth: 2 Max Feature: 2 Min Samples/Leaf: 3 Max Leaf Nodes: 2
Model Number: 5
Model Score: 6798.603662314845 Max Depth: 2 Max Feature: 2 Min Samples/Leaf: 3 Max Leaf Nodes: 3
Model Number: 6
Model Score: 6116.325543936688 Max Depth: 2 Max Feature: 2 Min Samples/Leaf: 3 Max Leaf Nodes: 4
Model Number: 7
Model Score: 5565.8848015872 Max Depth: 2 Max Feature: 2 Min Samples/Leaf: 3 Max Leaf Nodes: 5
Model Number: 8
Model Score: 7058.9467683875455 Max Depth: 2 Max Feature: 2 Min Samples/Leaf: 4 Max

Model Score: 7060.498881399769 Max Depth: 2 Max Feature: 3 Min Samples/Leaf: 4 Max Leaf Nodes: 2
Model Number: 73
Model Score: 5720.685217371751 Max Depth: 2 Max Feature: 3 Min Samples/Leaf: 4 Max Leaf Nodes: 3
Model Number: 74
Model Score: 6038.369131290913 Max Depth: 2 Max Feature: 3 Min Samples/Leaf: 4 Max Leaf Nodes: 4
Model Number: 75
Model Score: 6891.605362050015 Max Depth: 2 Max Feature: 3 Min Samples/Leaf: 4 Max Leaf Nodes: 5
Model Number: 76
Model Score: 6977.993200390629 Max Depth: 2 Max Feature: 3 Min Samples/Leaf: 5 Max Leaf Nodes: 2
Model Number: 77
Model Score: 6871.500722045727 Max Depth: 2 Max Feature: 3 Min Samples/Leaf: 5 Max Leaf Nodes: 3
Model Number: 78
Model Score: 6977.477032063233 Max Depth: 2 Max Feature: 3 Min Samples/Leaf: 5 Max Leaf Nodes: 4
Model Number: 79
Model Score: 7070.695883744829 Max Depth: 2 Max Feature: 3 Min Samples/Leaf: 5 Max Leaf Nodes: 5
Model Number: 80
Model Score: 6868.043471049998 Max Depth: 3 Max Feature: 3 Min Samples/Leaf: 2 Max Leaf 

Model Score: 5826.805654595018 Max Depth: 3 Max Feature: 4 Min Samples/Leaf: 2 Max Leaf Nodes: 2
Model Number: 145
Model Score: 6317.280208953411 Max Depth: 3 Max Feature: 4 Min Samples/Leaf: 2 Max Leaf Nodes: 3
Model Number: 146
Model Score: 5662.755315526265 Max Depth: 3 Max Feature: 4 Min Samples/Leaf: 2 Max Leaf Nodes: 4
Model Number: 147
Model Score: 5474.505508763158 Max Depth: 3 Max Feature: 4 Min Samples/Leaf: 2 Max Leaf Nodes: 5
Model Number: 148
Model Score: 5826.805654595018 Max Depth: 3 Max Feature: 4 Min Samples/Leaf: 3 Max Leaf Nodes: 2
Model Number: 149
Model Score: 6890.79928655709 Max Depth: 3 Max Feature: 4 Min Samples/Leaf: 3 Max Leaf Nodes: 3
Model Number: 150
Model Score: 5690.559876081194 Max Depth: 3 Max Feature: 4 Min Samples/Leaf: 3 Max Leaf Nodes: 4
Model Number: 151
Model Score: 5761.654631479516 Max Depth: 3 Max Feature: 4 Min Samples/Leaf: 3 Max Leaf Nodes: 5
Model Number: 152
Model Score: 5826.805654595018 Max Depth: 3 Max Feature: 4 Min Samples/Leaf: 4 Ma

Model Score: 5826.805654595018 Max Depth: 3 Max Feature: 5 Min Samples/Leaf: 4 Max Leaf Nodes: 2
Model Number: 217
Model Score: 6566.680359464432 Max Depth: 3 Max Feature: 5 Min Samples/Leaf: 4 Max Leaf Nodes: 3
Model Number: 218
Model Score: 6631.526583720163 Max Depth: 3 Max Feature: 5 Min Samples/Leaf: 4 Max Leaf Nodes: 4
Model Number: 219
Model Score: 6688.866905773544 Max Depth: 3 Max Feature: 5 Min Samples/Leaf: 4 Max Leaf Nodes: 5
Model Number: 220
Model Score: 7088.688060326396 Max Depth: 3 Max Feature: 5 Min Samples/Leaf: 5 Max Leaf Nodes: 2
Model Number: 221
Model Score: 5971.5171051189645 Max Depth: 3 Max Feature: 5 Min Samples/Leaf: 5 Max Leaf Nodes: 3
Model Number: 222
Model Score: 5399.436953224333 Max Depth: 3 Max Feature: 5 Min Samples/Leaf: 5 Max Leaf Nodes: 4
Model Number: 223
Model Score: 6628.569084901634 Max Depth: 3 Max Feature: 5 Min Samples/Leaf: 5 Max Leaf Nodes: 5
Model Number: 224
Model Score: 7081.773014690341 Max Depth: 4 Max Feature: 5 Min Samples/Leaf: 2 

In [41]:
best_params = best_params_list[best_index]
print(f"Best Params: Max Depth = {best_params['max_depth']} Max Features = {best_params['max_features']} Max Leaf Nodes = {best_params['max_leaf_nodes']} Min Samples/Leaf = {best_params['min_samples']}")

Best Params: Max Depth = 2 Max Features = 2 Max Leaf Nodes = 2 Min Samples/Leaf = 2


In [42]:
model = RandomForestRegressor().fit(X_train, y_train)
y_pred = model.predict(X_tune)
best_score = mean_absolute_error(y_tune,y_pred)

In [43]:
best_score

1642.9863887223278

In [44]:
#best_score = 0
best_params = dict()
best_params_list = list()
index = 0
best_index = 0
#max_depths = list()
max_depths = [2,3,4,5]
max_features = [2,3,4,5]
min_samples_leaf = [2,3,4,5]
max_leaf_nodes = [2,3,4,5]

print(f"Max Depths: {max_depths}")
print(f"Max Features: {max_features}")

for x in max_features:
    for y in max_depths:
        for a in min_samples_leaf:
            for b in max_leaf_nodes:
                print(f"Model Number: {index}")
                model = RandomForestRegressor(max_features=x,max_depth=y,min_samples_leaf=a,max_leaf_nodes=b).fit(X_train,y_train)
                y_pred = model.predict(X_tune)
                score = mean_absolute_error(y_tune, y_pred)
                print(f"Model Score: {score} Max Depth: {y} Max Feature: {x} Min Samples/Leaf: {a} Max Leaf Nodes: {b}")
                if score < best_score:
                    best_score = score
                    best_index = index
                aa = dict()
                aa['max_depth'] = y
                aa['max_features'] = x
                aa['max_leaf_nodes'] = b
                aa['min_samples'] = a
                best_params_list.append(aa)
    index += 1

Max Depths: [2, 3, 4, 5]
Max Features: [2, 3, 4, 5]
Model Number: 0
Model Score: 6644.093399589005 Max Depth: 2 Max Feature: 2 Min Samples/Leaf: 2 Max Leaf Nodes: 2
Model Number: 0
Model Score: 6346.390501056436 Max Depth: 2 Max Feature: 2 Min Samples/Leaf: 2 Max Leaf Nodes: 3
Model Number: 0
Model Score: 6458.217718425514 Max Depth: 2 Max Feature: 2 Min Samples/Leaf: 2 Max Leaf Nodes: 4
Model Number: 0
Model Score: 6401.785074437036 Max Depth: 2 Max Feature: 2 Min Samples/Leaf: 2 Max Leaf Nodes: 5
Model Number: 0
Model Score: 6781.728559266186 Max Depth: 2 Max Feature: 2 Min Samples/Leaf: 3 Max Leaf Nodes: 2
Model Number: 0
Model Score: 6421.974983705731 Max Depth: 2 Max Feature: 2 Min Samples/Leaf: 3 Max Leaf Nodes: 3
Model Number: 0
Model Score: 6379.337830973298 Max Depth: 2 Max Feature: 2 Min Samples/Leaf: 3 Max Leaf Nodes: 4
Model Number: 0
Model Score: 6476.500994732268 Max Depth: 2 Max Feature: 2 Min Samples/Leaf: 3 Max Leaf Nodes: 5
Model Number: 0
Model Score: 6747.6847206186

Model Score: 6540.948113044977 Max Depth: 2 Max Feature: 3 Min Samples/Leaf: 4 Max Leaf Nodes: 2
Model Number: 1
Model Score: 6015.635661254562 Max Depth: 2 Max Feature: 3 Min Samples/Leaf: 4 Max Leaf Nodes: 3
Model Number: 1
Model Score: 6044.5363963384325 Max Depth: 2 Max Feature: 3 Min Samples/Leaf: 4 Max Leaf Nodes: 4
Model Number: 1
Model Score: 6172.453262815203 Max Depth: 2 Max Feature: 3 Min Samples/Leaf: 4 Max Leaf Nodes: 5
Model Number: 1
Model Score: 6768.336198468192 Max Depth: 2 Max Feature: 3 Min Samples/Leaf: 5 Max Leaf Nodes: 2
Model Number: 1
Model Score: 6149.54238995424 Max Depth: 2 Max Feature: 3 Min Samples/Leaf: 5 Max Leaf Nodes: 3
Model Number: 1
Model Score: 6100.191539960305 Max Depth: 2 Max Feature: 3 Min Samples/Leaf: 5 Max Leaf Nodes: 4
Model Number: 1
Model Score: 6383.745173003868 Max Depth: 2 Max Feature: 3 Min Samples/Leaf: 5 Max Leaf Nodes: 5
Model Number: 1
Model Score: 6566.630707910005 Max Depth: 3 Max Feature: 3 Min Samples/Leaf: 2 Max Leaf Nodes: 2

Model Score: 5955.469281270959 Max Depth: 3 Max Feature: 4 Min Samples/Leaf: 2 Max Leaf Nodes: 3
Model Number: 2
Model Score: 5710.199022611193 Max Depth: 3 Max Feature: 4 Min Samples/Leaf: 2 Max Leaf Nodes: 4
Model Number: 2
Model Score: 5623.961014831788 Max Depth: 3 Max Feature: 4 Min Samples/Leaf: 2 Max Leaf Nodes: 5
Model Number: 2
Model Score: 6639.930289295942 Max Depth: 3 Max Feature: 4 Min Samples/Leaf: 3 Max Leaf Nodes: 2
Model Number: 2
Model Score: 5882.206913356412 Max Depth: 3 Max Feature: 4 Min Samples/Leaf: 3 Max Leaf Nodes: 3
Model Number: 2
Model Score: 5621.8891429902915 Max Depth: 3 Max Feature: 4 Min Samples/Leaf: 3 Max Leaf Nodes: 4
Model Number: 2
Model Score: 5586.882215974899 Max Depth: 3 Max Feature: 4 Min Samples/Leaf: 3 Max Leaf Nodes: 5
Model Number: 2
Model Score: 6497.53449676306 Max Depth: 3 Max Feature: 4 Min Samples/Leaf: 4 Max Leaf Nodes: 2
Model Number: 2
Model Score: 5947.2342691255235 Max Depth: 3 Max Feature: 4 Min Samples/Leaf: 4 Max Leaf Nodes: 

Model Score: 5625.657098313749 Max Depth: 3 Max Feature: 5 Min Samples/Leaf: 4 Max Leaf Nodes: 4
Model Number: 3
Model Score: 5255.271198966921 Max Depth: 3 Max Feature: 5 Min Samples/Leaf: 4 Max Leaf Nodes: 5
Model Number: 3
Model Score: 6434.765661672234 Max Depth: 3 Max Feature: 5 Min Samples/Leaf: 5 Max Leaf Nodes: 2
Model Number: 3
Model Score: 5850.542385820684 Max Depth: 3 Max Feature: 5 Min Samples/Leaf: 5 Max Leaf Nodes: 3
Model Number: 3
Model Score: 5366.875914127459 Max Depth: 3 Max Feature: 5 Min Samples/Leaf: 5 Max Leaf Nodes: 4
Model Number: 3
Model Score: 5317.5687265606875 Max Depth: 3 Max Feature: 5 Min Samples/Leaf: 5 Max Leaf Nodes: 5
Model Number: 3
Model Score: 6298.479052026382 Max Depth: 4 Max Feature: 5 Min Samples/Leaf: 2 Max Leaf Nodes: 2
Model Number: 3
Model Score: 5722.558601435025 Max Depth: 4 Max Feature: 5 Min Samples/Leaf: 2 Max Leaf Nodes: 3
Model Number: 3
Model Score: 5364.661710061968 Max Depth: 4 Max Feature: 5 Min Samples/Leaf: 2 Max Leaf Nodes: 