# Steps to train a supervsied machine learning model
 - Load and preprocess dataset 
    - Choose the columns tha you will need for training the model (X and Y)
    - Set the correct data types if not
    - Encode the columns where necessary (ML algorithms can only work with numbers)
    - Transform/ Scale columns where necessary
 - Check X and Y variables
 - Split your data into training data and testing data (75-25 ratio/80-20)
 - Choose 3 most appropriate ML models 
 - For each model, train and test
 - For the best performing mode, tune the model
 - Make a sample prediction to test the model

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn as sk

from sklearn.model_selection import train_test_split


In [2]:
df=pd.read_csv('car_prices.csv')
df.head()

Unnamed: 0,year,make,model,trim,body,transmission,vin,state,condition,odometer,color,interior,seller,mmr,sellingprice,saledate
0,2015,Kia,Sorento,LX,SUV,automatic,5xyktca69fg566472,ca,5.0,16639.0,white,black,kia motors america inc,20500.0,21500.0,Tue Dec 16 2014 12:30:00 GMT-0800 (PST)
1,2015,Kia,Sorento,LX,SUV,automatic,5xyktca69fg561319,ca,5.0,9393.0,white,beige,kia motors america inc,20800.0,21500.0,Tue Dec 16 2014 12:30:00 GMT-0800 (PST)
2,2014,BMW,3 Series,328i SULEV,Sedan,automatic,wba3c1c51ek116351,ca,45.0,1331.0,gray,black,financial services remarketing (lease),31900.0,30000.0,Thu Jan 15 2015 04:30:00 GMT-0800 (PST)
3,2015,Volvo,S60,T5,Sedan,automatic,yv1612tb4f1310987,ca,41.0,14282.0,white,black,volvo na rep/world omni,27500.0,27750.0,Thu Jan 29 2015 04:30:00 GMT-0800 (PST)
4,2014,BMW,6 Series Gran Coupe,650i,Sedan,automatic,wba6b2c57ed129731,ca,43.0,2641.0,gray,black,financial services remarketing (lease),66000.0,67000.0,Thu Dec 18 2014 12:30:00 GMT-0800 (PST)


In [3]:
#some checks
df['state'].unique()

array(['ca', 'tx', 'pa', 'mn', 'az', 'wi', 'tn', 'md', 'fl', 'ne', 'nj',
       'nv', 'oh', 'mi', 'ga', 'va', 'sc', 'nc', 'in', 'il', 'co', 'ut',
       'mo', 'ny', 'ma', 'pr', 'or', 'la', 'wa', 'hi', 'qc', 'ab', 'on',
       'ok', 'ms', 'nm', 'al', '3vwd17aj4fm201708', 'ns',
       '3vwd17aj2fm258506', '3vwd17aj3fm276741', '3vwd17aj2fm285365',
       '3vwd17aj0fm227318', '3vwd17aj6fm218641', '3vwd17aj7fm223475',
       '3vwd17aj5fm297123', '3vwd17aj5fm219943', '3vwd17aj9fm219766',
       '3vwd17aj3fm259017', '3vwd17aj5fm206111', '3vwd17aj5fm273601',
       '3vwd17aj5fm221322', '3vwd17aj5fm268964', '3vwd17aj6fm231972',
       '3vwd17aj7fm222388', '3vwd17aj7fm218440', '3vwd17ajxfm315938',
       '3vwd17aj7fm229552', '3vwd17aj8fm298895', '3vwd17aj4fm236636',
       '3vwd17aj5fm225953', '3vwd17aj7fm326640', '3vwd17aj8fm239622',
       '3vwd17aj2fm261566'], dtype=object)

In [4]:
#filter states
df2=df[~df['state'].isin(['3vwd17aj4fm201708', 'ns',
       '3vwd17aj2fm258506', '3vwd17aj3fm276741', '3vwd17aj2fm285365',
       '3vwd17aj0fm227318', '3vwd17aj6fm218641', '3vwd17aj7fm223475',
       '3vwd17aj5fm297123', '3vwd17aj5fm219943', '3vwd17aj9fm219766',
       '3vwd17aj3fm259017', '3vwd17aj5fm206111', '3vwd17aj5fm273601',
       '3vwd17aj5fm221322', '3vwd17aj5fm268964', '3vwd17aj6fm231972',
       '3vwd17aj7fm222388', '3vwd17aj7fm218440', '3vwd17ajxfm315938',
       '3vwd17aj7fm229552', '3vwd17aj8fm298895', '3vwd17aj4fm236636',
       '3vwd17aj5fm225953', '3vwd17aj7fm326640', '3vwd17aj8fm239622',
       '3vwd17aj2fm261566'])]
df2['state'].unique()

array(['ca', 'tx', 'pa', 'mn', 'az', 'wi', 'tn', 'md', 'fl', 'ne', 'nj',
       'nv', 'oh', 'mi', 'ga', 'va', 'sc', 'nc', 'in', 'il', 'co', 'ut',
       'mo', 'ny', 'ma', 'pr', 'or', 'la', 'wa', 'hi', 'qc', 'ab', 'on',
       'ok', 'ms', 'nm', 'al'], dtype=object)

In [5]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
Index: 558750 entries, 0 to 558836
Data columns (total 16 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   year          558750 non-null  int64  
 1   make          548464 non-null  object 
 2   model         548366 non-null  object 
 3   trim          548114 non-null  object 
 4   body          545570 non-null  object 
 5   transmission  493459 non-null  object 
 6   vin           558750 non-null  object 
 7   state         558750 non-null  object 
 8   condition     546956 non-null  float64
 9   odometer      558656 non-null  float64
 10  color         558001 non-null  object 
 11  interior      558001 non-null  object 
 12  seller        558750 non-null  object 
 13  mmr           558738 non-null  float64
 14  sellingprice  558738 non-null  float64
 15  saledate      558738 non-null  object 
dtypes: float64(4), int64(1), object(11)
memory usage: 72.5+ MB


In [6]:
df['condition'].unique()

array([ 5., 45., 41., 43.,  1., 34.,  2., 42.,  3., 48., nan, 49., 17.,
       19., 29., 38., 44., 47., 32.,  4., 25., 37., 39., 31., 28., 46.,
       36., 35., 26., 21., 22., 27., 24., 33., 23., 15., 16., 18., 12.,
       14., 11., 13.])

In [7]:
#keep only columns that we need
df2=df2.drop(['vin','seller','saledate','condition'],axis=1)


In [8]:
df2.head()

Unnamed: 0,year,make,model,trim,body,transmission,state,odometer,color,interior,mmr,sellingprice
0,2015,Kia,Sorento,LX,SUV,automatic,ca,16639.0,white,black,20500.0,21500.0
1,2015,Kia,Sorento,LX,SUV,automatic,ca,9393.0,white,beige,20800.0,21500.0
2,2014,BMW,3 Series,328i SULEV,Sedan,automatic,ca,1331.0,gray,black,31900.0,30000.0
3,2015,Volvo,S60,T5,Sedan,automatic,ca,14282.0,white,black,27500.0,27750.0
4,2014,BMW,6 Series Gran Coupe,650i,Sedan,automatic,ca,2641.0,gray,black,66000.0,67000.0


In [9]:
#dropped null values
df2.isna().sum()

year                0
make            10286
model           10384
trim            10636
body            13180
transmission    65291
state               0
odometer           94
color             749
interior          749
mmr                12
sellingprice       12
dtype: int64

In [10]:
df2.dropna(inplace=True)

In [11]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
Index: 481693 entries, 0 to 558836
Data columns (total 12 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   year          481693 non-null  int64  
 1   make          481693 non-null  object 
 2   model         481693 non-null  object 
 3   trim          481693 non-null  object 
 4   body          481693 non-null  object 
 5   transmission  481693 non-null  object 
 6   state         481693 non-null  object 
 7   odometer      481693 non-null  float64
 8   color         481693 non-null  object 
 9   interior      481693 non-null  object 
 10  mmr           481693 non-null  float64
 11  sellingprice  481693 non-null  float64
dtypes: float64(3), int64(1), object(8)
memory usage: 47.8+ MB


In [12]:
df2.head()

Unnamed: 0,year,make,model,trim,body,transmission,state,odometer,color,interior,mmr,sellingprice
0,2015,Kia,Sorento,LX,SUV,automatic,ca,16639.0,white,black,20500.0,21500.0
1,2015,Kia,Sorento,LX,SUV,automatic,ca,9393.0,white,beige,20800.0,21500.0
2,2014,BMW,3 Series,328i SULEV,Sedan,automatic,ca,1331.0,gray,black,31900.0,30000.0
3,2015,Volvo,S60,T5,Sedan,automatic,ca,14282.0,white,black,27500.0,27750.0
4,2014,BMW,6 Series Gran Coupe,650i,Sedan,automatic,ca,2641.0,gray,black,66000.0,67000.0


In [13]:
df3=df2.drop(['mmr'],axis=1)
df3.head()

Unnamed: 0,year,make,model,trim,body,transmission,state,odometer,color,interior,sellingprice
0,2015,Kia,Sorento,LX,SUV,automatic,ca,16639.0,white,black,21500.0
1,2015,Kia,Sorento,LX,SUV,automatic,ca,9393.0,white,beige,21500.0
2,2014,BMW,3 Series,328i SULEV,Sedan,automatic,ca,1331.0,gray,black,30000.0
3,2015,Volvo,S60,T5,Sedan,automatic,ca,14282.0,white,black,27750.0
4,2014,BMW,6 Series Gran Coupe,650i,Sedan,automatic,ca,2641.0,gray,black,67000.0


In [14]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
Index: 481693 entries, 0 to 558836
Data columns (total 12 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   year          481693 non-null  int64  
 1   make          481693 non-null  object 
 2   model         481693 non-null  object 
 3   trim          481693 non-null  object 
 4   body          481693 non-null  object 
 5   transmission  481693 non-null  object 
 6   state         481693 non-null  object 
 7   odometer      481693 non-null  float64
 8   color         481693 non-null  object 
 9   interior      481693 non-null  object 
 10  mmr           481693 non-null  float64
 11  sellingprice  481693 non-null  float64
dtypes: float64(3), int64(1), object(8)
memory usage: 47.8+ MB


In [15]:
#encode categorical variables
# to encode: make	model	trim	body	transmission	state	color	interior	

from sklearn.preprocessing import LabelEncoder

make_encoder=LabelEncoder()
model_encoder=LabelEncoder()
trim_encoder=LabelEncoder()
body_encoder=LabelEncoder()
transmission_encoder=LabelEncoder()
state_encoder=LabelEncoder()
color_encoder=LabelEncoder()
interior_encoder=LabelEncoder()

df3['make']=make_encoder.fit_transform(df3['make'])
df3['model']=model_encoder.fit_transform(df3['model'])
df3['trim']=trim_encoder.fit_transform(df3['trim'])
df3['body']=body_encoder.fit_transform(df3['body'])
df3['transmission']=transmission_encoder.fit_transform(df3['transmission'])
df3['state']=state_encoder.fit_transform(df3['state'])
df3['color']=color_encoder.fit_transform(df3['color'])
df3['interior']=interior_encoder.fit_transform(df3['interior'])

df3.head()



Unnamed: 0,year,make,model,trim,body,transmission,state,odometer,color,interior,sellingprice
0,2015,24,640,822,35,0,2,16639.0,17,1,21500.0
1,2015,24,640,822,35,0,2,9393.0,17,0,21500.0
2,2014,3,8,253,36,0,2,1331.0,7,1,30000.0
3,2015,51,578,1217,36,0,2,14282.0,17,1,27750.0
4,2014,3,33,335,36,0,2,2641.0,7,1,67000.0


In [16]:
#check encoder properties
print(make_encoder.classes_)
#to get back encoded values to original
print(make_encoder.classes_[3]) # this hsould give bmw

['Acura' 'Aston Martin' 'Audi' 'BMW' 'Bentley' 'Buick' 'Cadillac'
 'Chevrolet' 'Chrysler' 'Daewoo' 'Dodge' 'FIAT' 'Ferrari' 'Fisker' 'Ford'
 'GMC' 'Geo' 'HUMMER' 'Honda' 'Hyundai' 'Infiniti' 'Isuzu' 'Jaguar' 'Jeep'
 'Kia' 'Lamborghini' 'Land Rover' 'Lexus' 'Lincoln' 'Lotus' 'MINI'
 'Maserati' 'Mazda' 'Mercedes-Benz' 'Mercury' 'Mitsubishi' 'Nissan'
 'Oldsmobile' 'Plymouth' 'Pontiac' 'Porsche' 'Ram' 'Rolls-Royce' 'Saab'
 'Saturn' 'Scion' 'Subaru' 'Suzuki' 'Tesla' 'Toyota' 'Volkswagen' 'Volvo'
 'smart']
BMW


In [17]:
#scale some columns such as odometer (Minmax or standard scaler) - preferred is minmax scaler
from sklearn.preprocessing import MinMaxScaler
odometer_scaler=MinMaxScaler()
df3['odometer']=odometer_scaler.fit_transform(df3[['odometer']])
df3.head()

Unnamed: 0,year,make,model,trim,body,transmission,state,odometer,color,interior,sellingprice
0,2015,24,640,822,35,0,2,0.016638,17,1,21500.0
1,2015,24,640,822,35,0,2,0.009392,17,0,21500.0
2,2014,3,8,253,36,0,2,0.00133,7,1,30000.0
3,2015,51,578,1217,36,0,2,0.014281,17,1,27750.0
4,2014,3,33,335,36,0,2,0.00264,7,1,67000.0


In [18]:
Y=df3['sellingprice']
X=df3.drop(['sellingprice'],axis=1)

In [19]:
X.head()

Unnamed: 0,year,make,model,trim,body,transmission,state,odometer,color,interior
0,2015,24,640,822,35,0,2,0.016638,17,1
1,2015,24,640,822,35,0,2,0.009392,17,0
2,2014,3,8,253,36,0,2,0.00133,7,1
3,2015,51,578,1217,36,0,2,0.014281,17,1
4,2014,3,33,335,36,0,2,0.00264,7,1


In [20]:
Y.head()

0    21500.0
1    21500.0
2    30000.0
3    27750.0
4    67000.0
Name: sellingprice, dtype: float64

In [21]:
#Split data into train and test
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.20,random_state=42)

In [22]:
print(X_train.shape,X_test.shape,Y_train.shape,Y_test.shape)

(385354, 10) (96339, 10) (385354,) (96339,)


In [23]:
#choose 3 appropriate regression models 
# linear regression, ridge regression, support vector regression

In [24]:
#train linear regression model
from sklearn.linear_model import LinearRegression

lr_model=LinearRegression()
lr_model.fit(X_train,Y_train) #training process

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [25]:
#evaliate linear regression model
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, root_mean_squared_error

y_pred=lr_model.predict(X_test)


#evaliate metrics
mse=mean_squared_error(Y_test,y_pred)
rmse=root_mean_squared_error(Y_test,y_pred)
mae=mean_absolute_error(Y_test,y_pred)
r2=r2_score(Y_test,y_pred)

print('LinearRegression Evaluation Metrics:')
print('MSE:',mse)
print('RMSE:',rmse)
print('MAE:',mae)
print('R2:',r2)

lr_model.score(X_test,Y_test) #r2 score

LinearRegression Evaluation Metrics:
MSE: 55405069.80933529
RMSE: 7443.458188861901
MAE: 5141.061213964768
R2: 0.39495176583451996


0.39495176583451996

In [26]:
#ridge regression
from sklearn.linear_model import Ridge
ridge_model=Ridge(alpha=1.0)
ridge_model.fit(X_train,Y_train)
y_pred_ridge=ridge_model.predict(X_test)

mse_ridge=mean_squared_error(Y_test,y_pred_ridge)
rmse_ridge=root_mean_squared_error(Y_test,y_pred_ridge)
mae_ridge=mean_absolute_error(Y_test,y_pred_ridge)
r2_ridge=r2_score(Y_test,y_pred_ridge)

print('Ridge Regression Evaluation Metrics:')
print('MSE:',mse_ridge)
print('RMSE:',rmse_ridge)
print('MAE:',mae_ridge)
print('R2:',r2_ridge)


Ridge Regression Evaluation Metrics:
MSE: 55405052.89009199
RMSE: 7443.457052344159
MAE: 5140.984976950029
R2: 0.394951950600252


In [27]:
# # support vector regression
# from sklearn.svm import SVR
# svr_model=SVR(kernel='rbf')
# svr_model.fit(X_train,Y_train)
# y_pred_svr=svr_model.predict(X_test)
# mse_svr=mean_squared_error(Y_test,y_pred_svr)
# rmse_svr=root_mean_squared_error(Y_test,y_pred_svr)
# mae_svr=mean_absolute_error(Y_test,y_pred_svr)
# r2_svr=r2_score(Y_test,y_pred_svr)

# print('Support Vector Regression Evaluation Metrics:')
# print('MSE:',mse_svr)
# print('RMSE:',rmse_svr)
# print('MAE:',mae_svr)
# print('R2:',r2_svr)


In [28]:
#tune ridge regression model using grid search cv
from sklearn.model_selection import GridSearchCV

grid={
    'alpha':[1,10],
    # 'solver':['svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga', 'lbfgs'],
    'solver':['saga', 'lbfgs']
}

ridge=Ridge()
grid_search=GridSearchCV(estimator=ridge,param_grid=grid,cv=5)
grid_search.fit(X_train,Y_train)

print('Best Hyperparameters:',grid_search.best_params_)

#print best model evaluation metrics
best_ridge_model=grid_search.best_estimator_
y_pred_best_ridge=best_ridge_model.predict(X_test)
mse_best_ridge=mean_squared_error(Y_test,y_pred_best_ridge)
rmse_best_ridge=root_mean_squared_error(Y_test,y_pred_best_ridge)
mae_best_ridge=mean_absolute_error(Y_test,y_pred_best_ridge)
r2_best_ridge=r2_score(Y_test,y_pred_best_ridge)

print('Best Ridge Regression Evaluation Metrics:')
print('MSE:',mse_best_ridge)
print('RMSE:',rmse_best_ridge)
print('MAE:',mae_best_ridge)
print('R2:',r2_best_ridge)


10 fits failed out of a total of 20.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
10 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/nirmal/.pyenv/versions/3.12.2/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 859, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/nirmal/.pyenv/versions/3.12.2/lib/python3.12/site-packages/sklearn/base.py", line 1365, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/nirmal/.pyenv/versions/3.12.2/lib/python3.12/site-packages/sklearn/linear_model/_ridge.py", line 1248, in fit
    return super().fit(X, y, sample_weight=sample_weig

Best Hyperparameters: {'alpha': 1, 'solver': 'saga'}
Best Ridge Regression Evaluation Metrics:
MSE: 57565262.539135605
RMSE: 7587.177508081355
MAE: 5266.640358411908
R2: 0.3713614914946334




In [29]:
#predict price for a new car

new_car=np.array([[2015,24,640,822,35,0,2,0.016638,17,1]])
new_car_prediction=best_ridge_model.predict(new_car)
print("Prediction for new car:",new_car_prediction[0])

Prediction for new car: 21235.567380342167


