In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error

In [2]:
cclass=pd.read_csv("../input/used-car-dataset-ford-and-mercedes/cclass.csv")
cclass.head()

Unnamed: 0,model,year,price,transmission,mileage,fuelType,engineSize
0,C Class,2020,30495,Automatic,1200,Diesel,2.0
1,C Class,2020,29989,Automatic,1000,Petrol,1.5
2,C Class,2020,37899,Automatic,500,Diesel,2.0
3,C Class,2019,30399,Automatic,5000,Diesel,2.0
4,C Class,2019,29899,Automatic,4500,Diesel,2.0


In [3]:
cclass.dtypes

model            object
year              int64
price             int64
transmission     object
mileage           int64
fuelType         object
engineSize      float64
dtype: object

In [4]:
cclass.shape

(3899, 7)

In [5]:
cclass.describe

<bound method NDFrame.describe of          model  year  price transmission  mileage fuelType  engineSize
0      C Class  2020  30495    Automatic     1200   Diesel         2.0
1      C Class  2020  29989    Automatic     1000   Petrol         1.5
2      C Class  2020  37899    Automatic      500   Diesel         2.0
3      C Class  2019  30399    Automatic     5000   Diesel         2.0
4      C Class  2019  29899    Automatic     4500   Diesel         2.0
...        ...   ...    ...          ...      ...      ...         ...
3894   C Class  2017  14700       Manual    31357   Diesel         1.6
3895   C Class  2018  18500    Automatic    28248   Diesel         2.1
3896   C Class  2014  11900       Manual    48055   Diesel         2.1
3897   C Class  2014  11300    Automatic    49865   Diesel         2.1
3898   C Class  2014  14800    Automatic    55445   Diesel         2.1

[3899 rows x 7 columns]>

In [6]:
cclass.info

<bound method DataFrame.info of          model  year  price transmission  mileage fuelType  engineSize
0      C Class  2020  30495    Automatic     1200   Diesel         2.0
1      C Class  2020  29989    Automatic     1000   Petrol         1.5
2      C Class  2020  37899    Automatic      500   Diesel         2.0
3      C Class  2019  30399    Automatic     5000   Diesel         2.0
4      C Class  2019  29899    Automatic     4500   Diesel         2.0
...        ...   ...    ...          ...      ...      ...         ...
3894   C Class  2017  14700       Manual    31357   Diesel         1.6
3895   C Class  2018  18500    Automatic    28248   Diesel         2.1
3896   C Class  2014  11900       Manual    48055   Diesel         2.1
3897   C Class  2014  11300    Automatic    49865   Diesel         2.1
3898   C Class  2014  14800    Automatic    55445   Diesel         2.1

[3899 rows x 7 columns]>

In [7]:
cclass.isnull().sum()

model           0
year            0
price           0
transmission    0
mileage         0
fuelType        0
engineSize      0
dtype: int64

In [8]:
encoder = LabelEncoder()
cclass['model'] = encoder.fit_transform(cclass['model'])
model_mapping = {index : label for index, label in enumerate(encoder.classes_)}
model_mapping

{0: ' C Class'}

In [9]:
cclass['transmission'] = encoder.fit_transform(cclass['transmission'])
transmission_mapping = {index : label for index, label in enumerate(encoder.classes_)}
transmission_mapping

{0: 'Automatic', 1: 'Manual', 2: 'Other', 3: 'Semi-Auto'}

In [10]:
cclass['fuelType'] = encoder.fit_transform(cclass['fuelType'])
fuelType_mapping = {index : label for index, label in enumerate(encoder.classes_)}
fuelType_mapping

{0: 'Diesel', 1: 'Hybrid', 2: 'Other', 3: 'Petrol'}

In [11]:
x = cclass.drop('price', axis=1)
y = cclass['price']

In [12]:
scaler = MinMaxScaler(copy=True, feature_range=(0, 1))
X = scaler.fit_transform(x)

#showing data
print('x \n' , X[:10])
print('y \n' , y[:10])

x 
 [[0.         1.         0.         0.00693068 0.         0.32258065]
 [0.         1.         0.         0.0057746  1.         0.24193548]
 [0.         1.         0.         0.00288441 0.         0.32258065]
 [0.         0.96551724 0.         0.02889612 0.         0.32258065]
 [0.         0.96551724 0.         0.02600593 0.         0.32258065]
 [0.         1.         0.         0.0057746  0.         0.32258065]
 [0.         1.         0.         0.00288441 0.         0.32258065]
 [0.         0.96551724 0.         0.00815612 1.         0.48387097]
 [0.         0.96551724 0.         0.0206244  0.         0.32258065]
 [0.         0.96551724 0.         0.0210059  0.         0.32258065]]
y 
 0    30495
1    29989
2    37899
3    30399
4    29899
5    30999
6    35999
7    37990
8    28990
9    28990
Name: price, dtype: int64


In [13]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=30,random_state=0)

In [14]:
print("x_train shape is:",x_train.shape)

x_train shape is: (3869, 6)


In [15]:
print("x_test shape is:",x_test.shape)

x_test shape is: (30, 6)


In [16]:
print("y_train shape is:",y_train.shape)

y_train shape is: (3869,)


In [17]:
print("y_test shape is:",y_test.shape)

y_test shape is: (30,)


In [18]:
RandomForestRegressorModel = RandomForestRegressor(n_estimators=100,max_depth=9, random_state=33)
RandomForestRegressorModel.fit(x_train, y_train)

print('Random Forest Regressor Train Score is : ' , RandomForestRegressorModel.score(x_train, y_train))
print('Random Forest Regressor Test Score is : ' , RandomForestRegressorModel.score(x_test, y_test))
print('Random Forest Regressor No. of features are : ' , RandomForestRegressorModel.n_features_)
print('----------------------------------------------------')

y_pred = RandomForestRegressorModel.predict(x_test)
print('Predicted Value for Random Forest Regressor is : ' , y_pred[:10])

Random Forest Regressor Train Score is :  0.9382210520516331
Random Forest Regressor Test Score is :  0.940704848157113
Random Forest Regressor No. of features are :  6
----------------------------------------------------
Predicted Value for Random Forest Regressor is :  [35323.13246182 41280.15436236 25525.27861933 28863.8108408
 26934.78623555 17282.22020143 26283.30365959 27302.2854322
 21597.11618416 23441.26405571]


In [19]:
LinearRegressionModel = LinearRegression(fit_intercept=True, normalize=True,copy_X=True,n_jobs=-1)
LinearRegressionModel.fit(x_train, y_train)

print('Linear Regression Train Score is : ' , LinearRegressionModel.score(x_train, y_train))
print('Linear Regression Test Score is : ' , LinearRegressionModel.score(x_test, y_test))
print('Linear Regression Coef is : ' , LinearRegressionModel.coef_)
print('Linear Regression intercept is : ' , LinearRegressionModel.intercept_)
print('----------------------------------------------------')

y_pred = LinearRegressionModel.predict(x_test)
print('Predicted Value for Linear Regression is : ' , y_pred[:10])


Linear Regression Train Score is :  0.7615257786830939
Linear Regression Test Score is :  0.8440894288432877
Linear Regression Coef is :  [ 0.00000000e+00  1.95262417e+03 -5.74673038e+01 -1.29445008e-01
  7.66377664e+02  8.83314599e+03]
Linear Regression intercept is :  -3931295.01298173
----------------------------------------------------
Predicted Value for Linear Regression is :  [30409.08108203 38988.24050822 25016.34781873 26170.7384
 26988.16219029 17367.63973006 25670.30399911 24638.01232682
 24170.08550582 23096.06606543]


In [20]:
DecisionTreeRegressorModel = DecisionTreeRegressor( max_depth=11,random_state=33)
DecisionTreeRegressorModel.fit(x_train, y_train)

print('DecisionTreeRegressor Train Score is : ' , DecisionTreeRegressorModel.score(x_train, y_train))
print('DecisionTreeRegressor Test Score is : ' , DecisionTreeRegressorModel.score(x_test, y_test))
print('----------------------------------------------------')

y_pred = DecisionTreeRegressorModel.predict(x_test)
print('Predicted Value for DecisionTreeRegressorModel is : ' , y_pred[:10])

DecisionTreeRegressor Train Score is :  0.9488810259680093
DecisionTreeRegressor Test Score is :  0.9409890006566614
----------------------------------------------------
Predicted Value for DecisionTreeRegressorModel is :  [30500.         41720.80769231 26594.71428571 26696.
 24914.4        17973.         25975.34615385 27082.44444444
 22153.75       24132.16666667]


In [21]:
MSEValue = mean_squared_error(y_test, y_pred, multioutput='uniform_average') # it can be raw_values
print('Mean Squared Error Value is : ', MSEValue)

Mean Squared Error Value is :  6647820.985910559
