In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error


In [2]:
merc=pd.read_csv("../input/used-car-dataset-ford-and-mercedes/merc.csv")
merc.head()

Unnamed: 0,model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize
0,SLK,2005,5200,Automatic,63000,Petrol,325,32.1,1.8
1,S Class,2017,34948,Automatic,27000,Hybrid,20,61.4,2.1
2,SL CLASS,2016,49948,Automatic,6200,Petrol,555,28.0,5.5
3,G Class,2016,61948,Automatic,16000,Petrol,325,30.4,4.0
4,G Class,2016,73948,Automatic,4000,Petrol,325,30.1,4.0


In [3]:
merc.shape

(13119, 9)

In [4]:
merc.dtypes

model            object
year              int64
price             int64
transmission     object
mileage           int64
fuelType         object
tax               int64
mpg             float64
engineSize      float64
dtype: object

In [5]:
merc.describe

<bound method NDFrame.describe of             model  year  price transmission  mileage fuelType  tax   mpg  \
0             SLK  2005   5200    Automatic    63000   Petrol  325  32.1   
1         S Class  2017  34948    Automatic    27000   Hybrid   20  61.4   
2        SL CLASS  2016  49948    Automatic     6200   Petrol  555  28.0   
3         G Class  2016  61948    Automatic    16000   Petrol  325  30.4   
4         G Class  2016  73948    Automatic     4000   Petrol  325  30.1   
...           ...   ...    ...          ...      ...      ...  ...   ...   
13114     C Class  2020  35999    Automatic      500   Diesel  145  55.4   
13115     B Class  2020  24699    Automatic     2500   Diesel  145  55.4   
13116   GLC Class  2019  30999    Automatic    11612   Diesel  145  41.5   
13117   CLS Class  2019  37990    Automatic     2426   Diesel  145  45.6   
13118     S Class  2019  54999    Automatic     2075   Diesel  145  52.3   

       engineSize  
0             1.8  
1            

In [6]:
merc.info

<bound method DataFrame.info of             model  year  price transmission  mileage fuelType  tax   mpg  \
0             SLK  2005   5200    Automatic    63000   Petrol  325  32.1   
1         S Class  2017  34948    Automatic    27000   Hybrid   20  61.4   
2        SL CLASS  2016  49948    Automatic     6200   Petrol  555  28.0   
3         G Class  2016  61948    Automatic    16000   Petrol  325  30.4   
4         G Class  2016  73948    Automatic     4000   Petrol  325  30.1   
...           ...   ...    ...          ...      ...      ...  ...   ...   
13114     C Class  2020  35999    Automatic      500   Diesel  145  55.4   
13115     B Class  2020  24699    Automatic     2500   Diesel  145  55.4   
13116   GLC Class  2019  30999    Automatic    11612   Diesel  145  41.5   
13117   CLS Class  2019  37990    Automatic     2426   Diesel  145  45.6   
13118     S Class  2019  54999    Automatic     2075   Diesel  145  52.3   

       engineSize  
0             1.8  
1             2

In [7]:
encoder = LabelEncoder()
merc['model'] = encoder.fit_transform(merc['model'])
model_mapping = {index : label for index, label in enumerate(encoder.classes_)}
model_mapping

{0: ' A Class',
 1: ' B Class',
 2: ' C Class',
 3: ' CL Class',
 4: ' CLA Class',
 5: ' CLC Class',
 6: ' CLK',
 7: ' CLS Class',
 8: ' E Class',
 9: ' G Class',
 10: ' GL Class',
 11: ' GLA Class',
 12: ' GLB Class',
 13: ' GLC Class',
 14: ' GLE Class',
 15: ' GLS Class',
 16: ' M Class',
 17: ' R Class',
 18: ' S Class',
 19: ' SL CLASS',
 20: ' SLK',
 21: ' V Class',
 22: ' X-CLASS',
 23: '180',
 24: '200',
 25: '220',
 26: '230'}

In [8]:
merc['transmission'] = encoder.fit_transform(merc['transmission'])
transmission_mapping = {index : label for index, label in enumerate(encoder.classes_)}
transmission_mapping

{0: 'Automatic', 1: 'Manual', 2: 'Other', 3: 'Semi-Auto'}

In [9]:
merc['fuelType'] = encoder.fit_transform(merc['fuelType'])
fuelType_mapping = {index : label for index, label in enumerate(encoder.classes_)}
fuelType_mapping

{0: 'Diesel', 1: 'Hybrid', 2: 'Other', 3: 'Petrol'}

In [10]:
x = merc.drop('price', axis=1)
y = merc['price']

In [11]:
scaler = MinMaxScaler(copy=True, feature_range=(0, 1))
X = scaler.fit_transform(x)

#showing data
print('x \n' , X[:10])
print('y \n' , y[:10])

x 
 [[0.76923077 0.7        0.         0.24324032 1.         0.56034483
  0.14338575 0.29032258]
 [0.69230769 0.94       0.         0.10424365 0.33333333 0.03448276
  0.27890842 0.33870968]
 [0.73076923 0.92       0.         0.02393446 1.         0.95689655
  0.12442183 0.88709677]
 [0.34615385 0.92       0.         0.06177244 1.         0.56034483
  0.13552266 0.64516129]
 [0.34615385 0.92       0.         0.01544021 1.         0.56034483
  0.13413506 0.64516129]
 [0.73076923 0.82       0.         0.0115792  1.         0.98275862
  0.09389454 1.        ]
 [0.53846154 0.96       0.         0.06177244 0.         0.25
  0.21646623 0.33870968]
 [0.69230769 0.84       0.         0.41312515 1.         0.45689655
  0.16466235 0.56451613]
 [0.34615385 0.98       0.         0.04632836 1.         0.25
  0.09389454 0.64516129]
 [0.42307692 0.94       0.         0.05890756 0.         0.05172414
  0.29185939 0.33870968]]
y 
 0      5200
1     34948
2     49948
3     61948
4     73948
5    149948
6

In [12]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=30,random_state=0)

In [13]:
print("x_train shape is:",x_train.shape)

x_train shape is: (13089, 8)


In [14]:
print("x_test shape is:",x_test.shape)


x_test shape is: (30, 8)


In [15]:
print("y_train shape is:",y_train.shape)


y_train shape is: (13089,)


In [16]:
print("y_test shape is:",y_test.shape)


y_test shape is: (30,)


In [17]:
RandomForestRegressorModel = RandomForestRegressor(n_estimators=100,max_depth=8, random_state=33)
RandomForestRegressorModel.fit(x_train, y_train)

print('Random Forest Regressor Train Score is : ' , RandomForestRegressorModel.score(x_train, y_train))
print('Random Forest Regressor Test Score is : ' , RandomForestRegressorModel.score(x_test, y_test))
print('Random Forest Regressor No. of features are : ' , RandomForestRegressorModel.n_features_)
print('----------------------------------------------------')

y_pred = RandomForestRegressorModel.predict(x_test)
print('Predicted Value for Random Forest Regressor is : ' , y_pred[:10])

Random Forest Regressor Train Score is :  0.9507658163107037
Random Forest Regressor Test Score is :  0.913195119154258
Random Forest Regressor No. of features are :  8
----------------------------------------------------
Predicted Value for Random Forest Regressor is :  [21455.51562599 13266.0881097  34459.59883034 40083.97499224
 20690.63632851 17546.24938285 44605.16698856 24719.72149106
 20269.35837227 19062.24818446]


In [18]:
DecisionTreeRegressorModel = DecisionTreeRegressor( max_depth=13,random_state=33)
DecisionTreeRegressorModel.fit(x_train, y_train)

print('DecisionTreeRegressor Train Score is : ' , DecisionTreeRegressorModel.score(x_train, y_train))
print('DecisionTreeRegressor Test Score is : ' , DecisionTreeRegressorModel.score(x_test, y_test))
print('----------------------------------------------------')

y_pred = DecisionTreeRegressorModel.predict(x_test)
print('Predicted Value for DecisionTreeRegressorModel is : ' , y_pred[:10])

DecisionTreeRegressor Train Score is :  0.9819040143124543
DecisionTreeRegressor Test Score is :  0.9114634138245683
----------------------------------------------------
Predicted Value for DecisionTreeRegressorModel is :  [19845.0625     12785.         38300.66666667 40320.375
 20783.77570093 17391.76666667 45235.77272727 22544.6
 19797.5        20386.4893617 ]


In [19]:
MSEValue = mean_squared_error(y_test, y_pred, multioutput='uniform_average') # it can be raw_values
print('Mean Squared Error Value is : ', MSEValue)

Mean Squared Error Value is :  9463703.049079943
