In [118]:
%load_ext sql
%sql sqlite:///car_prediction_dataset.sqlite3

The sql extension is already loaded. To reload it, use:
  %reload_ext sql


In [119]:
from sqlalchemy import create_engine
engine = create_engine('sqlite:///car_prediction_dataset.sqlite3')

In [120]:
import pandas as pd
query = """
SELECT 
       year,
       price,
       km_traveled,
       tax,
       enginesize,
       km_per_liters,
       mi.model,
       transmission,
       fuel_type
FROM   models_with_companies mc
       INNER JOIN model_info mi
               ON mc.model_id = mi."index"
WHERE  make = 'Chevrolet'; 


"""

In [121]:
df = pd.read_sql_query(query, con=engine)

In [122]:
df.head()

Unnamed: 0,year,price,km_traveled,tax,engineSize,km_per_liters,model,transmission,fuel_type
0,2018,9491,39842.43038,0,1.5,20.364398,Camaro,Manual,Petrol
1,2002,3750,18158.18322,145,3.0,30.015166,TrailBlazer,Other,Petrol
2,2002,27000,24391.15704,0,1.5,25.040982,Silverado 2500 Extended Cab,Manual,Diesel
3,2005,19200,14405.20234,0,1.8,24.530809,Astro Cargo,Semi-Auto,Hybrid
4,2002,11995,34639.43416,145,1.0,24.658352,S10 Extended Cab,Automatic,Diesel


In [123]:
df.shape

(5101, 9)

In [124]:
# !pip install scikit-learn



You should consider upgrading via the 'F:\TS Analysis\ML_end_end_car_prediction\myenv\Scripts\python.exe -m pip install --upgrade pip' command.





In [125]:
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [126]:
df.isnull().sum()

year             0
price            0
km_traveled      0
tax              0
engineSize       0
km_per_liters    0
model            0
transmission     0
fuel_type        0
dtype: int64

In [127]:
y = df['price']
X = df.drop('price', axis = 1)

In [128]:
X.shape

(5101, 8)

In [129]:
X.dtypes

year              object
km_traveled      float64
tax                int64
engineSize       float64
km_per_liters    float64
model             object
transmission      object
fuel_type         object
dtype: object

In [130]:
X['model'].value_counts()

Tahoe                                   157
Impala                                  148
Corvette                                137
Suburban 1500                           125
Express 2500 Cargo                      123
                                       ... 
Silverado 1500 LD Double Cab              4
Silverado (Classic) 1500 Regular Cab      3
Caprice                                   3
Tahoe (New)                               3
3500 HD Regular Cab                       2
Name: model, Length: 120, dtype: int64

In [131]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=1)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1,random_state=1)

In [132]:
X_train.shape, X_test.shape, X_val.shape

((4131, 8), (511, 8), (459, 8))

In [133]:
numeric_col = X.select_dtypes(exclude='object').columns

In [134]:
unodered_col = ['transmission', 'fuel_type']
ordered_col = ['year','model']

In [135]:
print(numeric_col)
print(unodered_col)
print(ordered_col)

Index(['km_traveled', 'tax', 'engineSize', 'km_per_liters'], dtype='object')
['transmission', 'fuel_type']
['year', 'model']


In [136]:
OHE = OneHotEncoder(handle_unknown='ignore', sparse=False)
OH_cols_train = pd.DataFrame(OHE.fit_transform(X_train[unodered_col]))
OH_cols_val = pd.DataFrame(OHE.transform(X_val[unodered_col]))
OH_cols_train.index = X_train.index
OH_cols_val.index = X_val.index

In [137]:
Ord_enc = OrdinalEncoder()
Ord_cols_train = pd.DataFrame(Ord_enc.fit_transform(X_train[ordered_col]))
Ord_cols_val =pd.DataFrame(Ord_enc.fit_transform(X_val[ordered_col]))
Ord_cols_train.index = X_train.index
Ord_cols_val.index = X_val.index

In [138]:
sc = StandardScaler()
sc_cols_train = pd.DataFrame(sc.fit_transform(X_train[numeric_col]))
sc_cols_val = pd.DataFrame(sc.fit_transform(X_val[numeric_col]))
sc_cols_train.index = X_train.index
sc_cols_val.index = X_val.index

In [139]:
encoded_X_train = pd.concat([OH_cols_train, Ord_cols_train, sc_cols_train], axis = 1)
encoded_X_val = pd.concat([OH_cols_val, Ord_cols_val, sc_cols_val], axis = 1)

In [140]:
encoded_X_train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,0.1,1.1,0.2,1.2,2.1,3.1
146,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,10.0,14.0,0.563678,0.551531,0.766464,0.368858
4421,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,22.0,41.0,1.207064,-1.286746,0.766464,1.437213
249,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,24.0,41.0,0.107253,0.6877,0.077996,0.827622
4017,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,8.0,13.0,2.081205,0.6877,-1.069449,-0.265871
3414,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,18.0,107.0,-0.804152,-1.286746,-1.069449,-0.341284


In [141]:
def score_dataset(X_train, X_val, y_train,y_val, input_model):
    model = input_model
    model.fit(X_train,y_train)
    preds = model.predict(X_val)
    return np.sqrt(mean_squared_error(y_val, preds))

In [142]:
model = LinearRegression()
score_dataset(encoded_X_train, encoded_X_val, y_train, y_val, model)

6456.234104419897

In [143]:
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor()
score_dataset(encoded_X_train, encoded_X_val, y_train, y_val, model)

6579.953137613072

In [144]:
from sklearn.tree import DecisionTreeRegressor
model = DecisionTreeRegressor()
score_dataset(encoded_X_train, encoded_X_val, y_train, y_val, model)

9135.642771566389

In [145]:
from sklearn.svm import SVR
model = SVR()
score_dataset(encoded_X_train, encoded_X_val, y_train, y_val, model)

6754.0772751614395

In [146]:
from sklearn.ensemble import GradientBoostingRegressor
model = GradientBoostingRegressor()
score_dataset(encoded_X_train, encoded_X_val, y_train, y_val, model)

6518.553729369462

In [147]:
from sklearn.linear_model import ElasticNet
model = ElasticNet()
score_dataset(encoded_X_train, encoded_X_val, y_train, y_val, model)

6453.52705842894

In [148]:
models = [LinearRegression(),RandomForestRegressor(),
         DecisionTreeRegressor(),SVR(),
         GradientBoostingRegressor(),
         ElasticNet()]
for model in models:
    score = score_dataset(encoded_X_train, encoded_X_val, y_train, y_val, model)
    print('RMSE of %s is %.3f'%(model,score))



RMSE of LinearRegression() is 6456.234
RMSE of RandomForestRegressor() is 6606.292
RMSE of DecisionTreeRegressor() is 9382.959
RMSE of SVR() is 6754.077
RMSE of GradientBoostingRegressor() is 6515.593
RMSE of ElasticNet() is 6453.527


In [149]:
elastic_net_parag_grid =  {
                'alpha'     : [0.1,1,10,0.01,0.01,5],
                'l1_ratio'  :  np.arange(0.40,1.00,0.10),
            }
elasticnet_reg = ElasticNet()
elasticnet_reg_grid_search = GridSearchCV(elasticnet_reg, elastic_net_parag_grid,cv=5, n_jobs=-1, verbose=2)
elasticnet_reg_grid_search.fit(encoded_X_train, y_train)


Fitting 5 folds for each of 36 candidates, totalling 180 fits


In [150]:
pred = elasticnet_reg_grid_search.best_estimator_.predict(encoded_X_val)
final_test_score = np.sqrt(mean_squared_error(y_val,pred ))


In [151]:
elasticnet_reg_grid_search.best_estimator_

In [162]:
X = df.copy()

In [163]:
X.head()

Unnamed: 0,year,price,km_traveled,tax,engineSize,km_per_liters,model,transmission,fuel_type
0,2018,9491,39842.43038,0,1.5,20.364398,Camaro,Manual,Petrol
1,2002,3750,18158.18322,145,3.0,30.015166,TrailBlazer,Other,Petrol
2,2002,27000,24391.15704,0,1.5,25.040982,Silverado 2500 Extended Cab,Manual,Diesel
3,2005,19200,14405.20234,0,1.8,24.530809,Astro Cargo,Semi-Auto,Hybrid
4,2002,11995,34639.43416,145,1.0,24.658352,S10 Extended Cab,Automatic,Diesel


In [164]:
y =df['price']
X = df.drop('price', axis = 1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [168]:
from sklearn import set_config
set_config(display='diagram')

numeric_cols = X.select_dtypes(exclude='object').columns
numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])

unordered_cols = ['transmission', 'fuel_type']
unordered_tranformer = Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore', sparse=False))])

ordered_cols = ['year', 'model']
ordered_tranformer = Pipeline(steps=[('ordinal',OrdinalEncoder())])

preprocessor = ColumnTransformer(
                transformers=[
                    ('numeric_transformer', numeric_transformer, numeric_cols),
                    ('unordered_tranformer', unordered_tranformer, unordered_cols),
                    ('ordered_tranformer', ordered_tranformer, ordered_cols),
                ])
reg = Pipeline(steps=[('preprocessor',preprocessor),
                     ('regressor', ElasticNet())])



reg.fit(X_train, y_train)

pred = reg.predict(X_test)
score = np.sqrt(mean_squared_error(y_test, pred))

In [170]:
# pipe = Pipeline(reg)
# pipe

In [171]:
param_grid = {
                'regressor__alpha': [0.1,1,10,0.01,0.01,5],
                'regressor__l1_ratio': np.arange(0.40,1.00,0.10),
            }

grid_search = GridSearchCV(reg, param_grid, cv=2)
grid_search

In [173]:
grid_search.fit(X_train, y_train)


Traceback (most recent call last):
  File "F:\TS Analysis\ML_end_end_car_prediction\myenv\lib\site-packages\sklearn\model_selection\_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "F:\TS Analysis\ML_end_end_car_prediction\myenv\lib\site-packages\sklearn\metrics\_scorer.py", line 429, in _passthrough_scorer
    return estimator.score(*args, **kwargs)
  File "F:\TS Analysis\ML_end_end_car_prediction\myenv\lib\site-packages\sklearn\pipeline.py", line 695, in score
    Xt = transform.transform(Xt)
  File "F:\TS Analysis\ML_end_end_car_prediction\myenv\lib\site-packages\sklearn\compose\_column_transformer.py", line 746, in transform
    Xs = self._fit_transform(
  File "F:\TS Analysis\ML_end_end_car_prediction\myenv\lib\site-packages\sklearn\compose\_column_transformer.py", line 604, in _fit_transform
    return Parallel(n_jobs=self.n_jobs)(
  File "F:\TS Analysis\ML_end_end_car_prediction\myenv\lib\site-packages\joblib\parallel.py", line 1046, in 

Traceback (most recent call last):
  File "F:\TS Analysis\ML_end_end_car_prediction\myenv\lib\site-packages\sklearn\model_selection\_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "F:\TS Analysis\ML_end_end_car_prediction\myenv\lib\site-packages\sklearn\metrics\_scorer.py", line 429, in _passthrough_scorer
    return estimator.score(*args, **kwargs)
  File "F:\TS Analysis\ML_end_end_car_prediction\myenv\lib\site-packages\sklearn\pipeline.py", line 695, in score
    Xt = transform.transform(Xt)
  File "F:\TS Analysis\ML_end_end_car_prediction\myenv\lib\site-packages\sklearn\compose\_column_transformer.py", line 746, in transform
    Xs = self._fit_transform(
  File "F:\TS Analysis\ML_end_end_car_prediction\myenv\lib\site-packages\sklearn\compose\_column_transformer.py", line 604, in _fit_transform
    return Parallel(n_jobs=self.n_jobs)(
  File "F:\TS Analysis\ML_end_end_car_prediction\myenv\lib\site-packages\joblib\parallel.py", line 1046, in 

Traceback (most recent call last):
  File "F:\TS Analysis\ML_end_end_car_prediction\myenv\lib\site-packages\sklearn\model_selection\_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "F:\TS Analysis\ML_end_end_car_prediction\myenv\lib\site-packages\sklearn\metrics\_scorer.py", line 429, in _passthrough_scorer
    return estimator.score(*args, **kwargs)
  File "F:\TS Analysis\ML_end_end_car_prediction\myenv\lib\site-packages\sklearn\pipeline.py", line 695, in score
    Xt = transform.transform(Xt)
  File "F:\TS Analysis\ML_end_end_car_prediction\myenv\lib\site-packages\sklearn\compose\_column_transformer.py", line 746, in transform
    Xs = self._fit_transform(
  File "F:\TS Analysis\ML_end_end_car_prediction\myenv\lib\site-packages\sklearn\compose\_column_transformer.py", line 604, in _fit_transform
    return Parallel(n_jobs=self.n_jobs)(
  File "F:\TS Analysis\ML_end_end_car_prediction\myenv\lib\site-packages\joblib\parallel.py", line 1046, in 

Traceback (most recent call last):
  File "F:\TS Analysis\ML_end_end_car_prediction\myenv\lib\site-packages\sklearn\model_selection\_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "F:\TS Analysis\ML_end_end_car_prediction\myenv\lib\site-packages\sklearn\metrics\_scorer.py", line 429, in _passthrough_scorer
    return estimator.score(*args, **kwargs)
  File "F:\TS Analysis\ML_end_end_car_prediction\myenv\lib\site-packages\sklearn\pipeline.py", line 695, in score
    Xt = transform.transform(Xt)
  File "F:\TS Analysis\ML_end_end_car_prediction\myenv\lib\site-packages\sklearn\compose\_column_transformer.py", line 746, in transform
    Xs = self._fit_transform(
  File "F:\TS Analysis\ML_end_end_car_prediction\myenv\lib\site-packages\sklearn\compose\_column_transformer.py", line 604, in _fit_transform
    return Parallel(n_jobs=self.n_jobs)(
  File "F:\TS Analysis\ML_end_end_car_prediction\myenv\lib\site-packages\joblib\parallel.py", line 1046, in 

Traceback (most recent call last):
  File "F:\TS Analysis\ML_end_end_car_prediction\myenv\lib\site-packages\sklearn\model_selection\_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "F:\TS Analysis\ML_end_end_car_prediction\myenv\lib\site-packages\sklearn\metrics\_scorer.py", line 429, in _passthrough_scorer
    return estimator.score(*args, **kwargs)
  File "F:\TS Analysis\ML_end_end_car_prediction\myenv\lib\site-packages\sklearn\pipeline.py", line 695, in score
    Xt = transform.transform(Xt)
  File "F:\TS Analysis\ML_end_end_car_prediction\myenv\lib\site-packages\sklearn\compose\_column_transformer.py", line 746, in transform
    Xs = self._fit_transform(
  File "F:\TS Analysis\ML_end_end_car_prediction\myenv\lib\site-packages\sklearn\compose\_column_transformer.py", line 604, in _fit_transform
    return Parallel(n_jobs=self.n_jobs)(
  File "F:\TS Analysis\ML_end_end_car_prediction\myenv\lib\site-packages\joblib\parallel.py", line 1046, in 

Traceback (most recent call last):
  File "F:\TS Analysis\ML_end_end_car_prediction\myenv\lib\site-packages\sklearn\model_selection\_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "F:\TS Analysis\ML_end_end_car_prediction\myenv\lib\site-packages\sklearn\metrics\_scorer.py", line 429, in _passthrough_scorer
    return estimator.score(*args, **kwargs)
  File "F:\TS Analysis\ML_end_end_car_prediction\myenv\lib\site-packages\sklearn\pipeline.py", line 695, in score
    Xt = transform.transform(Xt)
  File "F:\TS Analysis\ML_end_end_car_prediction\myenv\lib\site-packages\sklearn\compose\_column_transformer.py", line 746, in transform
    Xs = self._fit_transform(
  File "F:\TS Analysis\ML_end_end_car_prediction\myenv\lib\site-packages\sklearn\compose\_column_transformer.py", line 604, in _fit_transform
    return Parallel(n_jobs=self.n_jobs)(
  File "F:\TS Analysis\ML_end_end_car_prediction\myenv\lib\site-packages\joblib\parallel.py", line 1046, in 

Traceback (most recent call last):
  File "F:\TS Analysis\ML_end_end_car_prediction\myenv\lib\site-packages\sklearn\model_selection\_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "F:\TS Analysis\ML_end_end_car_prediction\myenv\lib\site-packages\sklearn\metrics\_scorer.py", line 429, in _passthrough_scorer
    return estimator.score(*args, **kwargs)
  File "F:\TS Analysis\ML_end_end_car_prediction\myenv\lib\site-packages\sklearn\pipeline.py", line 695, in score
    Xt = transform.transform(Xt)
  File "F:\TS Analysis\ML_end_end_car_prediction\myenv\lib\site-packages\sklearn\compose\_column_transformer.py", line 746, in transform
    Xs = self._fit_transform(
  File "F:\TS Analysis\ML_end_end_car_prediction\myenv\lib\site-packages\sklearn\compose\_column_transformer.py", line 604, in _fit_transform
    return Parallel(n_jobs=self.n_jobs)(
  File "F:\TS Analysis\ML_end_end_car_prediction\myenv\lib\site-packages\joblib\parallel.py", line 1046, in 

Traceback (most recent call last):
  File "F:\TS Analysis\ML_end_end_car_prediction\myenv\lib\site-packages\sklearn\model_selection\_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "F:\TS Analysis\ML_end_end_car_prediction\myenv\lib\site-packages\sklearn\metrics\_scorer.py", line 429, in _passthrough_scorer
    return estimator.score(*args, **kwargs)
  File "F:\TS Analysis\ML_end_end_car_prediction\myenv\lib\site-packages\sklearn\pipeline.py", line 695, in score
    Xt = transform.transform(Xt)
  File "F:\TS Analysis\ML_end_end_car_prediction\myenv\lib\site-packages\sklearn\compose\_column_transformer.py", line 746, in transform
    Xs = self._fit_transform(
  File "F:\TS Analysis\ML_end_end_car_prediction\myenv\lib\site-packages\sklearn\compose\_column_transformer.py", line 604, in _fit_transform
    return Parallel(n_jobs=self.n_jobs)(
  File "F:\TS Analysis\ML_end_end_car_prediction\myenv\lib\site-packages\joblib\parallel.py", line 1046, in 

Traceback (most recent call last):
  File "F:\TS Analysis\ML_end_end_car_prediction\myenv\lib\site-packages\sklearn\model_selection\_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "F:\TS Analysis\ML_end_end_car_prediction\myenv\lib\site-packages\sklearn\metrics\_scorer.py", line 429, in _passthrough_scorer
    return estimator.score(*args, **kwargs)
  File "F:\TS Analysis\ML_end_end_car_prediction\myenv\lib\site-packages\sklearn\pipeline.py", line 695, in score
    Xt = transform.transform(Xt)
  File "F:\TS Analysis\ML_end_end_car_prediction\myenv\lib\site-packages\sklearn\compose\_column_transformer.py", line 746, in transform
    Xs = self._fit_transform(
  File "F:\TS Analysis\ML_end_end_car_prediction\myenv\lib\site-packages\sklearn\compose\_column_transformer.py", line 604, in _fit_transform
    return Parallel(n_jobs=self.n_jobs)(
  File "F:\TS Analysis\ML_end_end_car_prediction\myenv\lib\site-packages\joblib\parallel.py", line 1046, in 

In [175]:
pred = grid_search.best_estimator_.predict(X_test)
score = np.sqrt(mean_squared_error(y_test, pred))

In [180]:
# set_config(display='text')
# grid_search.best_estimator_

In [181]:
import joblib

joblib.dump(grid_search.best_estimator_, 'final_car_prediction.pkl')  # saving the model 
joblib.dump(list(X.columns), 'column_name.pkl')  # saving the column names 

['column_name.pkl']

In [182]:
loaded_model = joblib.load('final_car_prediction.pkl')

In [186]:
load_col_names = joblib.load('column_name.pkl')
load_col_names

['year',
 'km_traveled',
 'tax',
 'engineSize',
 'km_per_liters',
 'model',
 'transmission',
 'fuel_type']

In [185]:
loaded_model.predict(pd.DataFrame(data = [['2018', 39842.43038, 0, 1.5, 20.3643976, 'Camaro', 'Manual','Petrol']],
                                  columns = joblib.load('column_name.pkl')))

array([12693.99485813])