In [172]:
#Importing needed libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.linear_model import LinearRegression

In [173]:
#Load the used-car listings from the CSV file into a pandas DataFrame
used_cars = pd.read_csv(filepath_or_buffer='Used_cars.csv')

In [174]:
#Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
used_cars.describe()

Unnamed: 0.1,Unnamed: 0,Id,year,price,distance_travelled(kms),brand_rank,car_age,distance below 30k km,new and less used,inv_car_price,inv_car_dist,inv_car_age,inv_brand,std_invprice,std_invdistance_travelled,std_invrank,best_buy1,best_buy2
count,1725.0,1725.0,1725.0,1725.0,1725.0,1725.0,1725.0,1725.0,1725.0,1725.0,1725.0,1725.0,1725.0,1725.0,1725.0,1725.0,1725.0,1725.0
mean,862.0,862.0,2015.390725,1494837.0,53848.256232,15.731014,5.609275,0.269565,0.209275,1.416237e-06,4.1e-05,inf,0.18781,0.084623,0.013809,0.177658,88.962902,32.537208
std,498.108924,498.108924,3.207504,1671658.0,44725.541963,12.951122,3.207504,0.443863,0.406909,1.291449e-06,0.00011,,0.254849,0.08106,0.038689,0.258034,188.95069,158.662274
min,0.0,0.0,1990.0,62500.0,350.0,1.0,0.0,0.0,0.0,6.802721e-08,1e-06,0.032258,0.012346,0.0,0.0,0.0,0.0,0.0
25%,431.0,431.0,2013.0,545000.0,29000.0,5.0,3.0,0.0,0.0,5.479452e-07,1.4e-05,0.125,0.041667,0.030123,0.004524,0.029687,14.237358,0.0
50%,862.0,862.0,2016.0,875000.0,49000.0,14.0,5.0,0.0,0.0,1.142857e-06,2e-05,0.2,0.071429,0.067464,0.006703,0.059821,36.716166,0.0
75%,1293.0,1293.0,2018.0,1825000.0,70500.0,24.0,8.0,1.0,0.0,1.834862e-06,3.4e-05,0.333333,0.2,0.110899,0.011631,0.19,90.776658,0.0
max,1724.0,1724.0,2021.0,14700000.0,790000.0,81.0,31.0,1.0,1.0,1.6e-05,0.002857,inf,1.0,1.0,1.0,1.0,2477.51764,2477.51764


In [175]:
#Visualising columns correlation with the price target column.
#Only numeric/boolean columns are kept first: the frame also holds text columns
#(brand, full_model_name, model_name, fuel_type, city) and DataFrame.corr()
#raises on those with pandas >= 2.0 instead of silently skipping them.
used_cars.select_dtypes(include=['number', 'bool']).corr()['price']

Unnamed: 0                  -0.105696
Id                          -0.105696
year                         0.288483
price                        1.000000
distance_travelled(kms)     -0.137351
brand_rank                  -0.164591
car_age                     -0.288483
distance below 30k km        0.212197
new and less used            0.219786
inv_car_price               -0.517723
inv_car_dist                 0.081735
inv_car_age                  0.267973
inv_brand                    0.185660
std_invprice                -0.517723
std_invdistance_travelled    0.081735
std_invrank                  0.185660
best_buy1                   -0.106855
best_buy2                    0.008077
Name: price, dtype: float64

In [176]:
#Work on an independent (deep) copy so the original used_cars frame stays untouched
df = used_cars.copy(deep=True)

In [177]:
#Remove the columns whose absolute correlation with price is below 0.20
df = df.drop(
    columns=[
        'Id', 'Unnamed: 0', 'distance_travelled(kms)', 'brand_rank', 'inv_car_dist',
        'inv_brand', 'best_buy2', 'best_buy1', 'std_invrank', 'std_invdistance_travelled',
    ]
)

In [178]:
#Visualising columns correlation with the price target column.
#Only numeric/boolean columns are kept first: df still holds text columns
#(brand, full_model_name, model_name, fuel_type, city) and DataFrame.corr()
#raises on those with pandas >= 2.0 instead of silently skipping them.
df.select_dtypes(include=['number', 'bool']).corr()['price']

year                     0.288483
price                    1.000000
car_age                 -0.288483
distance below 30k km    0.212197
new and less used        0.219786
inv_car_price           -0.517723
inv_car_age              0.267973
std_invprice            -0.517723
Name: price, dtype: float64

In [179]:
#Drop the two remaining indicator columns, whose correlation with price is only about 0.21-0.22
df = df.drop(columns=['distance below 30k km', 'new and less used'])

In [180]:
#Preview the first rows of the reduced dataframe
df.head()

Unnamed: 0,year,brand,full_model_name,model_name,price,fuel_type,city,car_age,inv_car_price,inv_car_age,std_invprice
0,2016,Honda,Honda Brio S MT,Brio,425000.0,Petrol,Mumbai,5.0,2.352941e-06,0.2,0.143417
1,2012,Nissan,Nissan Sunny XV Diesel,Sunny,325000.0,Diesel,Mumbai,9.0,3.076923e-06,0.111111,0.188859
2,2017,Toyota,Toyota Fortuner 2.8 4x2 MT [2016-2020],Fortuner,2650000.0,Diesel,Thane,4.0,3.773585e-07,0.25,0.019416
3,2017,Mercedes-Benz,Mercedes-Benz E-Class E 220d Expression [2019-...,E-Class,4195000.0,Diesel,Mumbai,4.0,2.38379e-07,0.25,0.010692
4,2012,Hyundai,Hyundai Verna Fluidic 1.6 CRDi SX,Verna,475000.0,Diesel,Mumbai,9.0,2.105263e-06,0.111111,0.127871


In [181]:
#Frequency of each distinct fuel_type value (this column has the fewest distinct values)
df.loc[:, 'fuel_type'].value_counts()

Diesel        922
Petrol        788
CNG + 1         8
Petrol + 1      6
Hybrid          1
Name: fuel_type, dtype: int64

In [182]:
#One-hot encode fuel_type, keeping one indicator column per category (no level dropped)
df = pd.get_dummies(
    data=df,
    columns=['fuel_type'],
    drop_first=False,
)

In [183]:
#Visualising columns correlation with the price target column.
#Numeric and boolean columns are selected first: df still holds text columns
#(brand, full_model_name, model_name, city), on which DataFrame.corr() raises
#with pandas >= 2.0; including 'bool' keeps the fuel_type dummy columns in the result
#on pandas versions where get_dummies produces booleans.
df.select_dtypes(include=['number', 'bool']).corr()['price']

year                    0.288483
price                   1.000000
car_age                -0.288483
inv_car_price          -0.517723
inv_car_age             0.267973
std_invprice           -0.517723
fuel_type_CNG + 1      -0.043825
fuel_type_Diesel        0.269330
fuel_type_Hybrid        0.008721
fuel_type_Petrol       -0.260109
fuel_type_Petrol + 1   -0.034109
Name: price, dtype: float64

In [184]:
#Drop the three fuel_type dummy columns that correlate least with the price target
df = df.drop(columns=['fuel_type_Petrol + 1', 'fuel_type_Hybrid', 'fuel_type_CNG + 1'])

In [185]:
#Visualising columns correlation with the price target column.
#Numeric and boolean columns are selected first: df still holds text columns
#(brand, full_model_name, model_name, city), on which DataFrame.corr() raises
#with pandas >= 2.0; including 'bool' keeps the fuel_type dummy columns in the result
#on pandas versions where get_dummies produces booleans.
df.select_dtypes(include=['number', 'bool']).corr()['price']

year                0.288483
price               1.000000
car_age            -0.288483
inv_car_price      -0.517723
inv_car_age         0.267973
std_invprice       -0.517723
fuel_type_Diesel    0.269330
fuel_type_Petrol   -0.260109
Name: price, dtype: float64

In [186]:
#Drop the free-text columns, which are not used as model features
df = df.drop(columns=['brand', 'full_model_name', 'model_name', 'city'])

In [187]:
#Listing the names of all columns that remain in the dataframe
df.columns

Index(['year', 'price', 'car_age', 'inv_car_price', 'inv_car_age',
       'std_invprice', 'fuel_type_Diesel', 'fuel_type_Petrol'],
      dtype='object')

In [188]:
#Count NaN cells across the whole dataframe (the result is 0, so there are no NaN values).
#Note: isnull() does not flag infinite values; the describe() output for inv_car_age
#shows inf, and those entries are replaced separately later via np.isinf.
df.isnull().values.sum()

0

## Linear Regression with the year column

In [189]:
#Single predictor (year) and the price target
X, y = df['year'], df['price']

In [190]:
#Convert the feature to a numpy array shaped (n_samples, 1), the 2-D layout sklearn expects
X = np.array(X).reshape(-1, 1)

In [191]:
#Hold out 30% of the rows as a test set; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [192]:
#Create a fresh, unfitted instance of sklearn's LinearRegression class with default settings.
model = LinearRegression()

In [193]:
#Fit the linear regression on the training split
model.fit(X=X_train, y=y_train)

LinearRegression()

In [194]:
#Define a variable that stores the predictions of our model on the test split.
y_pred = model.predict(X_test)
#Report the mean squared error and mean absolute error to get an idea of how well the model has done.
#scikit-learn's metric signature is (y_true, y_pred), so the ground truth goes first;
#both metrics are symmetric, so the reported numbers are unchanged.
mean_squared_error(y_test, y_pred), mean_absolute_error(y_test, y_pred)

(2430277649702.0493, 1034106.735078442)

## Linear Regression with inv_car_price - the column with 0.51 correlation

In [195]:
#Single-feature model using inv_car_price as the predictor and price as the target.
#NOTE(review): the describe() output suggests inv_car_price is 1/price
#(min 6.8e-08 ~ 1/14,700,000 and max 1.6e-05 = 1/62,500), i.e. derived from the target;
#if so this is target leakage and the resulting error is optimistic -- confirm.
X = df['inv_car_price']
y = df['price']

In [196]:
#Convert the feature to a numpy array shaped (n_samples, 1), the 2-D layout sklearn expects
X = np.array(X).reshape(-1, 1)

In [197]:
#Hold out 30% of the rows as a test set; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [198]:
#Deleting the previous model; the name is bound to a new estimator in the next cell
del model

In [199]:
#Create a fresh, unfitted instance of sklearn's LinearRegression class with default settings.
model = LinearRegression()

In [200]:
#Fit the linear regression on the training split
model.fit(X=X_train, y=y_train)

LinearRegression()

In [201]:
#Define a variable that stores the predictions of our model on the test split.
y_pred = model.predict(X_test)
#Report the mean squared error and mean absolute error to get an idea of how well the model has done.
#scikit-learn's metric signature is (y_true, y_pred), so the ground truth goes first;
#both metrics are symmetric, so the reported numbers are unchanged.
mean_squared_error(y_test, y_pred), mean_absolute_error(y_test, y_pred)

(1995087811509.7795, 844323.1253167955)

In [202]:
#This is the best feature until now.
#Ground truth is passed first to match scikit-learn's (y_true, y_pred) signature;
#the mean absolute error is symmetric, so the value is unchanged.
mean_absolute_error(y_test, y_pred)

844323.1253167955

## Linear Regression with std_invprice - the column with 0.51 correlation

In [203]:
#Single-feature model using std_invprice as the predictor and price as the target.
#NOTE(review): std_invprice has the same correlation with price as inv_car_price (-0.517723)
#and looks like a rescaled copy of it, so it is presumably derived from the target too
#(target leakage) -- confirm.
X = df['std_invprice']
y = df['price']

In [204]:
#Convert the feature to a numpy array shaped (n_samples, 1), the 2-D layout sklearn expects
X = np.array(X).reshape(-1, 1)

In [205]:
#Hold out 30% of the rows as a test set; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [206]:
#Deleting the previous model; the name is bound to a new estimator in the next cell
del model

In [207]:
#Create a fresh, unfitted instance of sklearn's LinearRegression class with default settings.
model = LinearRegression()

In [208]:
#Fit the linear regression on the training split
model.fit(X=X_train, y=y_train)

LinearRegression()

In [209]:
#Define a variable that stores the predictions of our model on the test split.
y_pred = model.predict(X_test)
#Report the mean squared error and mean absolute error to get an idea of how well the model has done.
#scikit-learn's metric signature is (y_true, y_pred), so the ground truth goes first;
#both metrics are symmetric, so the reported numbers are unchanged.
mean_squared_error(y_test, y_pred), mean_absolute_error(y_test, y_pred)

(1995087811509.7795, 844323.1253167957)

## Linear Regression with more features - year, car_age, ...

In [210]:
#Multi-feature design matrix: year, car age and the two fuel-type dummy columns
X = df.loc[:, ['year', 'car_age', 'fuel_type_Diesel', 'fuel_type_Petrol']]
#Target column
y = df.loc[:, 'price']

In [211]:
#Hold out 30% of the rows as a test set; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [212]:
#Deleting the previous model; the name is bound to a new estimator in the next cell
del model

In [213]:
#Create a fresh, unfitted instance of sklearn's LinearRegression class with default settings.
model = LinearRegression()

In [214]:
#Fit the linear regression on the training split
model.fit(X=X_train, y=y_train)

LinearRegression()

In [215]:
#Define a variable that stores the predictions of our model on the test split.
y_pred = model.predict(X_test)
#Report the mean squared error and mean absolute error to get an idea of how well the model has done.
#scikit-learn's metric signature is (y_true, y_pred), so the ground truth goes first;
#both metrics are symmetric, so the reported numbers are unchanged.
mean_squared_error(y_test, y_pred), mean_absolute_error(y_test, y_pred)

(2295920243217.421, 929833.1891891892)

## Linear Regression with more features - year, car_age, ...
#### This time including std_invprice, inv_car_price and inv_car_age

In [221]:
#Replace any NaN cells with 0 (the earlier isnull() check reported 0 NaN cells)
df = df.fillna(value=0)

In [256]:
#Feature matrix with the year/age/fuel columns plus inv_car_price, std_invprice and inv_car_age.
#to_numpy(dtype=float) forces a float array: when the fuel_type dummies are boolean
#(the get_dummies default on pandas >= 2.0) `.values` returns an object array instead,
#and the np.isinf call in the next cell raises TypeError on object arrays.
#NOTE(review): inv_car_price / std_invprice appear to be derived from price
#(target leakage), which would make the resulting error optimistic -- confirm.
X = df[['year', 'car_age',
        'fuel_type_Diesel', 'fuel_type_Petrol','inv_car_price', 'std_invprice', 'inv_car_age']].to_numpy(dtype=float)

#Target as a plain numpy array
y = df['price'].values

In [257]:
#Overwrite every infinite entry of the feature matrix with 0 (boolean-mask assignment)
X[np.isinf(X)] = 0

In [258]:
#Hold out 30% of the rows as a test set; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [259]:
#Deleting the previous model; the name is bound to a new estimator in the next cell
del model

In [260]:
#Create a fresh, unfitted instance of sklearn's LinearRegression class with default settings.
model = LinearRegression()

In [261]:
#Fit the linear regression on the training split
model.fit(X=X_train, y=y_train)

LinearRegression()

In [262]:
#Define a variable that stores the predictions of our model on the test split.
y_pred = model.predict(X_test)
#Report the mean squared error and mean absolute error to get an idea of how well the model has done.
#scikit-learn's metric signature is (y_true, y_pred), so the ground truth goes first;
#both metrics are symmetric, so the reported numbers are unchanged.
mean_squared_error(y_test, y_pred), mean_absolute_error(y_test, y_pred)

(1934306055653.9304, 829775.3359073359)