## Model Training

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("./data/CAR DETAILS FROM CAR DEKHO.csv")
df.head()

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner
0,Maruti 800 AC,2007,60000,70000,Petrol,Individual,Manual,First Owner
1,Maruti Wagon R LXI Minor,2007,135000,50000,Petrol,Individual,Manual,First Owner
2,Hyundai Verna 1.6 SX,2012,600000,100000,Diesel,Individual,Manual,First Owner
3,Datsun RediGO T Option,2017,250000,46000,Petrol,Individual,Manual,First Owner
4,Honda Amaze VX i-DTEC,2014,450000,141000,Diesel,Individual,Manual,Second Owner


In [3]:
df = df.drop(labels=['name'],axis=1)
df.head()

Unnamed: 0,year,selling_price,km_driven,fuel,seller_type,transmission,owner
0,2007,60000,70000,Petrol,Individual,Manual,First Owner
1,2007,135000,50000,Petrol,Individual,Manual,First Owner
2,2012,600000,100000,Diesel,Individual,Manual,First Owner
3,2017,250000,46000,Petrol,Individual,Manual,First Owner
4,2014,450000,141000,Diesel,Individual,Manual,Second Owner


#### Independent and dependent features

In [31]:
X = df.drop(labels=['selling_price'],axis=1)
Y = df['selling_price']

#### Define which columns should be ordinal-encoded and which should be scaled

In [32]:
categorical_columns = X.select_dtypes(include="object").columns
numerical_columns = X.select_dtypes(exclude="object").columns


#### Define the custom ranking for each ordinal variable

In [34]:
fuel_categories = ["Petrol","Diesel","CNG","LPG","Electric"]
seller_categories = ['Individual', 'Dealer', 'Trustmark Dealer']
transmission_categories = ['Manual', 'Automatic']
owner_categories = ['First Owner', 'Second Owner', 'Fourth & Above Owner','Third Owner', 'Test Drive Car']


In [35]:
from sklearn.impute import SimpleImputer ## Handling Missing Values
from sklearn.preprocessing import StandardScaler # Handling Feature Scaling
from sklearn.preprocessing import OrdinalEncoder # Ordinal Encoding
## pipelines
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from category_encoders import TargetEncoder

In [36]:
num_pipeline = Pipeline(
    steps=[
            ('imputer',SimpleImputer(strategy='median')),
            ('scaler',StandardScaler())
    ]
)

In [42]:
cat_pipeline = Pipeline(
    steps=[
            ('imputer',SimpleImputer(strategy='most_frequent')),
            ('ordinalencoder',OrdinalEncoder(categories=[fuel_categories,seller_categories,transmission_categories,owner_categories])),
            ('scaler',StandardScaler())
    ]
)

In [43]:
preprocessor = ColumnTransformer([
('num_pipeline',num_pipeline,numerical_columns),
("cat_pipeline",cat_pipeline,categorical_columns)
])

#### Train Test Split

In [44]:
from sklearn.model_selection import train_test_split

In [45]:
X_train,X_test,y_train,y_test=train_test_split(X,Y,test_size=0.30,random_state=30)

In [46]:
X_train=pd.DataFrame(preprocessor.fit_transform(X_train),columns=preprocessor.get_feature_names_out())
X_test=pd.DataFrame(preprocessor.transform(X_test),columns=preprocessor.get_feature_names_out())

In [47]:
X_train.head()

Unnamed: 0,num_pipeline__year,num_pipeline__km_driven,cat_pipeline__fuel,cat_pipeline__seller_type,cat_pipeline__transmission,cat_pipeline__owner
0,1.173176,0.274316,0.834659,-0.549624,-0.341323,-0.600183
1,0.455088,0.063098,0.834659,-0.549624,-0.341323,0.525345
2,-1.459812,-0.317095,-0.977846,-0.549624,-0.341323,0.525345
3,0.215726,1.668356,0.834659,-0.549624,-0.341323,-0.600183
4,-0.262999,0.907971,0.834659,-0.549624,-0.341323,-0.600183


#### Model Training

In [48]:
from sklearn.linear_model import LinearRegression,Lasso,Ridge,ElasticNet
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error

In [49]:
regression=LinearRegression()
regression.fit(X_train,y_train)

In [50]:
regression.coef_

array([162880.87796658, -27458.66524935, 112130.61677212,  46448.1533605 ,
       262329.64062214, -10143.85056073])

In [51]:
regression.intercept_

505564.67774852004

In [52]:
import numpy as np
def evaluate_model(true, predicted):
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mean_squared_error(true, predicted))
    r2_square = r2_score(true, predicted)
    return mae, rmse, r2_square

#### Train multiple models

In [53]:


models={
    'LinearRegression':LinearRegression(),
    'Lasso':Lasso(),
    'Ridge':Ridge(),
    'Elasticnet':ElasticNet()
}
trained_model_list=[]
model_list=[]
r2_list=[]

for i in range(len(list(models))):
    model=list(models.values())[i]
    model.fit(X_train,y_train)

    #Make Predictions
    y_pred=model.predict(X_test)

    mae, rmse, r2_square=evaluate_model(y_test,y_pred)

    print("***",list(models.keys())[i],'***')
    model_list.append(list(models.keys())[i])

    print('Model Training Performance')
    print("RMSE:",rmse)
    print("MAE:",mae)
    print("R2 score",r2_square*100)

    r2_list.append(r2_square)
    
    print('='*35)
    print('\n')

*** LinearRegression ***
Model Training Performance
RMSE: 460271.4339932791
MAE: 236960.48297012737
R2 score 42.24189292583564


*** Lasso ***
Model Training Performance
RMSE: 460271.522257743
MAE: 236960.22597332887
R2 score 42.241870773738135


*** Ridge ***
Model Training Performance
RMSE: 460276.638170407
MAE: 236945.65139021302
R2 score 42.24058680472148


*** Elasticnet ***
Model Training Performance
RMSE: 477301.7752559639
MAE: 234566.69037048097
R2 score 37.888644542876946


