# Importing Libraries and Modules

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler, LabelEncoder, PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import GridSearchCV

# Reading the Dataset

In [5]:
# Load the processed listings and discard the 'Unnamed: 0' column in one
# expression (presumably a leftover saved index -- confirm against the CSV).
data = (
    pd.read_csv('../car_price_prediction/data/processes2.csv')
    .drop(columns='Unnamed: 0')
)

In [7]:
# Preview the first five rows to sanity-check the load
data.head(n=5)

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,seats,max_power (in bph),Mileage Unit,Mileage,Engine (CC)
0,Maruti,2014,450000,145500,Diesel,Individual,Manual,First Owner,5,74.0,kmpl,23.4,1248
1,Hyundai,2010,225000,127000,Diesel,Individual,Manual,First Owner,5,90.0,kmpl,23.0,1396
2,Hyundai,2017,440000,45000,Petrol,Individual,Manual,First Owner,5,81.86,kmpl,20.14,1197
3,Toyota,2011,350000,90000,Diesel,Individual,Manual,First Owner,5,67.1,kmpl,23.59,1364
4,Ford,2013,200000,169000,Diesel,Individual,Manual,First Owner,5,68.1,kmpl,20.0,1399


In [9]:
# Summarize the frame: column names, non-null counts, dtypes and memory usage
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2095 entries, 0 to 2094
Data columns (total 13 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   name                2095 non-null   object 
 1   year                2095 non-null   int64  
 2   selling_price       2095 non-null   int64  
 3   km_driven           2095 non-null   int64  
 4   fuel                2095 non-null   object 
 5   seller_type         2095 non-null   object 
 6   transmission        2095 non-null   object 
 7   owner               2095 non-null   object 
 8   seats               2095 non-null   int64  
 9   max_power (in bph)  2095 non-null   float64
 10  Mileage Unit        2095 non-null   object 
 11  Mileage             2095 non-null   float64
 12  Engine (CC)         2095 non-null   int64  
dtypes: float64(2), int64(5), object(6)
memory usage: 212.9+ KB


- The dataset doesn't have missing values in any column;

- The dataset has 13 columns: 7 numeric and 6 categorical;

- The numeric columns are: `year`, `selling_price`, `km_driven`, `seats`, `max_power (in bph)`, `Mileage`, `Engine (CC)`;

- The categorical columns are: `name`, `fuel`, `seller_type`, `transmission`, `owner`, `Mileage Unit`.

In [12]:
# Render the frame; pandas elides the middle rows, so only the leading and trailing rows are shown
data

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,seats,max_power (in bph),Mileage Unit,Mileage,Engine (CC)
0,Maruti,2014,450000,145500,Diesel,Individual,Manual,First Owner,5,74.00,kmpl,23.40,1248
1,Hyundai,2010,225000,127000,Diesel,Individual,Manual,First Owner,5,90.00,kmpl,23.00,1396
2,Hyundai,2017,440000,45000,Petrol,Individual,Manual,First Owner,5,81.86,kmpl,20.14,1197
3,Toyota,2011,350000,90000,Diesel,Individual,Manual,First Owner,5,67.10,kmpl,23.59,1364
4,Ford,2013,200000,169000,Diesel,Individual,Manual,First Owner,5,68.10,kmpl,20.00,1399
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2090,Maruti,2017,425000,12000,Petrol,Individual,Manual,First Owner,5,67.04,kmpl,23.10,998
2091,Toyota,2014,425000,50000,Diesel,Individual,Manual,First Owner,5,67.06,kmpl,23.59,1364
2092,Maruti,2011,200000,73000,Petrol,Individual,Manual,First Owner,5,46.30,kmpl,19.70,796
2093,Maruti,2017,360000,80000,Petrol,Individual,Manual,First Owner,5,67.04,kmpl,20.51,998


In [23]:
# One-hot encode the categorical columns and display only the indicator columns.
# The categorical columns are selected by name rather than by slicing the
# encoded frame at a fixed position, so the result stays correct if the set of
# non-encoded columns changes. `dtype=int` yields 0/1 integers directly.
categorical_cols = ['name', 'fuel', 'transmission', 'owner', 'seller_type', 'Mileage Unit']
pd.get_dummies(data[categorical_cols], columns=categorical_cols, dtype=int)

Unnamed: 0,name_Chevrolet,name_Ford,name_Honda,name_Hyundai,name_Mahindra,name_Maruti,name_Renault,name_Tata,name_Toyota,name_Volkswagen,...,owner_First Owner,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner,seller_type_Dealer,seller_type_Individual,seller_type_Trustmark Dealer,Mileage Unit_km/kg,Mileage Unit_kmpl
0,0,0,0,0,0,1,0,0,0,0,...,1,0,0,0,0,0,1,0,0,1
1,0,0,0,1,0,0,0,0,0,0,...,1,0,0,0,0,0,1,0,0,1
2,0,0,0,1,0,0,0,0,0,0,...,1,0,0,0,0,0,1,0,0,1
3,0,0,0,0,0,0,0,0,1,0,...,1,0,0,0,0,0,1,0,0,1
4,0,1,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2090,0,0,0,0,0,1,0,0,0,0,...,1,0,0,0,0,0,1,0,0,1
2091,0,0,0,0,0,0,0,0,1,0,...,1,0,0,0,0,0,1,0,0,1
2092,0,0,0,0,0,1,0,0,0,0,...,1,0,0,0,0,0,1,0,0,1
2093,0,0,0,0,0,1,0,0,0,0,...,1,0,0,0,0,0,1,0,0,1
