# Car Prediction Model

## Importing libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib as plt

## Loading the dataset

In [2]:
train_data=pd.read_csv("train.csv")
test_data=pd.read_csv("test.csv")

## Exploring the dataset

In [3]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19237 entries, 0 to 19236
Data columns (total 18 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   ID                19237 non-null  int64  
 1   Price             19237 non-null  int64  
 2   Levy              19237 non-null  object 
 3   Manufacturer      19237 non-null  object 
 4   Model             19237 non-null  object 
 5   Prod. year        19237 non-null  int64  
 6   Category          19237 non-null  object 
 7   Leather interior  19237 non-null  object 
 8   Fuel type         19237 non-null  object 
 9   Engine volume     19237 non-null  object 
 10  Mileage           19237 non-null  object 
 11  Cylinders         19237 non-null  float64
 12  Gear box type     19237 non-null  object 
 13  Drive wheels      19237 non-null  object 
 14  Doors             19237 non-null  object 
 15  Wheel             19237 non-null  object 
 16  Color             19237 non-null  object

In [4]:
train_data.head()

Unnamed: 0,ID,Price,Levy,Manufacturer,Model,Prod. year,Category,Leather interior,Fuel type,Engine volume,Mileage,Cylinders,Gear box type,Drive wheels,Doors,Wheel,Color,Airbags
0,45654403,13328,1399,LEXUS,RX 450,2010,Jeep,Yes,Hybrid,3.5,186005 km,6.0,Automatic,4x4,04-May,Left wheel,Silver,12
1,44731507,16621,1018,CHEVROLET,Equinox,2011,Jeep,No,Petrol,3.0,192000 km,6.0,Tiptronic,4x4,04-May,Left wheel,Black,8
2,45774419,8467,-,HONDA,FIT,2006,Hatchback,No,Petrol,1.3,200000 km,4.0,Variator,Front,04-May,Right-hand drive,Black,2
3,45769185,3607,862,FORD,Escape,2011,Jeep,Yes,Hybrid,2.5,168966 km,4.0,Automatic,4x4,04-May,Left wheel,White,0
4,45809263,11726,446,HONDA,FIT,2014,Hatchback,Yes,Petrol,1.3,91901 km,4.0,Automatic,Front,04-May,Left wheel,Silver,4


## Converting the Year feature to the age of the car till date

In [5]:
train_data['Prod. year'] = pd.to_datetime(train_data['Prod. year'], format='%Y').dt.year

current_year = 2023 
train_data['Car Age'] = current_year - train_data['Prod. year']

In [6]:
test_data['Prod. year'] = pd.to_datetime(test_data['Prod. year'], format='%Y').dt.year

current_year = 2023  
test_data['Car Age'] = current_year - test_data['Prod. year']

In [7]:
train_data.drop(columns=['Prod. year'],axis=1,inplace=True)

In [8]:
test_data.drop(columns=['Prod. year'],axis=1,inplace=True)

In [9]:
train_data.head()

Unnamed: 0,ID,Price,Levy,Manufacturer,Model,Category,Leather interior,Fuel type,Engine volume,Mileage,Cylinders,Gear box type,Drive wheels,Doors,Wheel,Color,Airbags,Car Age
0,45654403,13328,1399,LEXUS,RX 450,Jeep,Yes,Hybrid,3.5,186005 km,6.0,Automatic,4x4,04-May,Left wheel,Silver,12,13
1,44731507,16621,1018,CHEVROLET,Equinox,Jeep,No,Petrol,3.0,192000 km,6.0,Tiptronic,4x4,04-May,Left wheel,Black,8,12
2,45774419,8467,-,HONDA,FIT,Hatchback,No,Petrol,1.3,200000 km,4.0,Variator,Front,04-May,Right-hand drive,Black,2,17
3,45769185,3607,862,FORD,Escape,Jeep,Yes,Hybrid,2.5,168966 km,4.0,Automatic,4x4,04-May,Left wheel,White,0,12
4,45809263,11726,446,HONDA,FIT,Hatchback,Yes,Petrol,1.3,91901 km,4.0,Automatic,Front,04-May,Left wheel,Silver,4,9


In [10]:
train_data.describe()

Unnamed: 0,ID,Price,Cylinders,Airbags,Car Age
count,19237.0,19237.0,19237.0,19237.0,19237.0
mean,45576540.0,18555.93,4.582991,6.582627,12.087176
std,936591.4,190581.3,1.199933,4.320168,5.668673
min,20746880.0,1.0,1.0,0.0,3.0
25%,45698370.0,5331.0,4.0,4.0,8.0
50%,45772310.0,13172.0,4.0,6.0,11.0
75%,45802040.0,22075.0,4.0,12.0,14.0
max,45816650.0,26307500.0,16.0,16.0,84.0


## Remove 'km' from the 'Mileage' feature and convert to numeric

In [11]:
train_data['Mileage'] = train_data['Mileage'].str.replace(' km', '', regex=False).astype(int)

In [12]:
test_data['Mileage'] = test_data['Mileage'].str.replace(' km', '', regex=False).astype(int)

## Converting feature levy to numeric values

In [13]:
train_data['Levy'] = pd.to_numeric(train_data['Levy'], errors='coerce')

In [14]:
test_data['Levy'] = pd.to_numeric(test_data['Levy'], errors='coerce') 

In [15]:
train_data.isnull().sum()

ID                     0
Price                  0
Levy                5819
Manufacturer           0
Model                  0
Category               0
Leather interior       0
Fuel type              0
Engine volume          0
Mileage                0
Cylinders              0
Gear box type          0
Drive wheels           0
Doors                  0
Wheel                  0
Color                  0
Airbags                0
Car Age                0
dtype: int64

## As there are many null values in levy so dropping the column

In [16]:
train_data.drop(columns=['Levy'],axis=1,inplace=True)

In [17]:
test_data.drop(columns=['Levy'],axis=1,inplace=True)

## Removig the string turbo from the feature Engine volume and making it a separate feature also making the Engine volume as an int 

In [18]:
train_data['Turbo'] = train_data['Engine volume'].str.contains('Turbo', case=False).astype(int)
train_data['Engine volume'] = train_data['Engine volume'].str.extract(r'(\d+\.\d+|\d+)').astype(float)

In [19]:
test_data['Turbo'] = test_data['Engine volume'].str.contains('Turbo', case=False).astype(int)
test_data['Engine volume'] = test_data['Engine volume'].str.extract(r'(\d+\.\d+|\d+)').astype(float)

## Dropping the irrelevent columns 

In [20]:
train_data.drop(columns=['ID'],axis=1,inplace=True)

In [21]:
test_data.drop(columns=['ID'],axis=1,inplace=True)

In [22]:
train_data.drop(columns=['Model'],axis=1,inplace=True)

In [23]:
test_data.drop(columns=['Model'],axis=1,inplace=True)

In [24]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19237 entries, 0 to 19236
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Price             19237 non-null  int64  
 1   Manufacturer      19237 non-null  object 
 2   Category          19237 non-null  object 
 3   Leather interior  19237 non-null  object 
 4   Fuel type         19237 non-null  object 
 5   Engine volume     19237 non-null  float64
 6   Mileage           19237 non-null  int32  
 7   Cylinders         19237 non-null  float64
 8   Gear box type     19237 non-null  object 
 9   Drive wheels      19237 non-null  object 
 10  Doors             19237 non-null  object 
 11  Wheel             19237 non-null  object 
 12  Color             19237 non-null  object 
 13  Airbags           19237 non-null  int64  
 14  Car Age           19237 non-null  int64  
 15  Turbo             19237 non-null  int32  
dtypes: float64(2), int32(2), int64(3), objec

## Applying one hot encoding  to categorical values

In [25]:
train_data = pd.get_dummies(train_data, drop_first=True)

In [26]:
test_data = pd.get_dummies(test_data, drop_first=True)

In [27]:
train_data.head()

Unnamed: 0,Price,Engine volume,Mileage,Cylinders,Airbags,Car Age,Turbo,Manufacturer_ALFA ROMEO,Manufacturer_ASTON MARTIN,Manufacturer_AUDI,...,Color_Green,Color_Grey,Color_Orange,Color_Pink,Color_Purple,Color_Red,Color_Silver,Color_Sky blue,Color_White,Color_Yellow
0,13328,3.5,186005,6.0,12,13,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1,16621,3.0,192000,6.0,8,12,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,8467,1.3,200000,4.0,2,17,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3607,2.5,168966,4.0,0,12,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,11726,1.3,91901,4.0,4,9,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0


In [28]:
feature_names = train_data.columns
print(feature_names)

Index(['Price', 'Engine volume', 'Mileage', 'Cylinders', 'Airbags', 'Car Age',
       'Turbo', 'Manufacturer_ALFA ROMEO', 'Manufacturer_ASTON MARTIN',
       'Manufacturer_AUDI',
       ...
       'Color_Green', 'Color_Grey', 'Color_Orange', 'Color_Pink',
       'Color_Purple', 'Color_Red', 'Color_Silver', 'Color_Sky blue',
       'Color_White', 'Color_Yellow'],
      dtype='object', length=111)


In [29]:
test_data.head()

Unnamed: 0,Engine volume,Mileage,Cylinders,Airbags,Price,Car Age,Turbo,Manufacturer_ALFA ROMEO,Manufacturer_ASTON MARTIN,Manufacturer_AUDI,...,Color_Green,Color_Grey,Color_Orange,Color_Pink,Color_Purple,Color_Red,Color_Silver,Color_Sky blue,Color_White,Color_Yellow
0,2.0,0,4,10,18817.0,11,1,0,0,0,...,0,1,0,0,0,0,0,0,0,0
1,2.4,26000,4,10,22580.0,11,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
2,1.5,168000,4,8,8154.0,18,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,3.2,143000,6,12,470.0,11,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1.6,200000,4,0,4500.0,30,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [30]:
test_data.head()

Unnamed: 0,Engine volume,Mileage,Cylinders,Airbags,Price,Car Age,Turbo,Manufacturer_ALFA ROMEO,Manufacturer_ASTON MARTIN,Manufacturer_AUDI,...,Color_Green,Color_Grey,Color_Orange,Color_Pink,Color_Purple,Color_Red,Color_Silver,Color_Sky blue,Color_White,Color_Yellow
0,2.0,0,4,10,18817.0,11,1,0,0,0,...,0,1,0,0,0,0,0,0,0,0
1,2.4,26000,4,10,22580.0,11,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
2,1.5,168000,4,8,8154.0,18,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,3.2,143000,6,12,470.0,11,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1.6,200000,4,0,4500.0,30,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Getting the columns which are not present in one but present in the other

In [31]:
columns_df1 = set(train_data.columns)
columns_df2 = set(test_data.columns)

different_columns = columns_df1.symmetric_difference(columns_df2)

print("Columns that are different:")
for column in different_columns:
    print(column)


Columns that are different:
Manufacturer_HAVAL
Manufacturer_ROLLS-ROYCE
Manufacturer_LAMBORGHINI
Manufacturer_PONTIAC
Manufacturer_TATA
Manufacturer_MG
Manufacturer_LANCIA
Manufacturer_FOTON
Manufacturer_SEAT


In [32]:
columns_df1 = set(train_data.columns)
columns_df2 = set(test_data.columns)

columns_only_in_df1 = columns_df1 - columns_df2
columns_only_in_df2 = columns_df2 - columns_df1

print("Columns only in df1:")
for column in columns_only_in_df1:
    print(f"{column} is in df1 but not in df2")

print("Columns only in df2:")
for column in columns_only_in_df2:
    print(f"{column} is in df2 but not in df1")


Columns only in df1:
Manufacturer_PONTIAC is in df1 but not in df2
Manufacturer_ROLLS-ROYCE is in df1 but not in df2
Manufacturer_LAMBORGHINI is in df1 but not in df2
Manufacturer_HAVAL is in df1 but not in df2
Manufacturer_SEAT is in df1 but not in df2
Manufacturer_LANCIA is in df1 but not in df2
Columns only in df2:
Manufacturer_FOTON is in df2 but not in df1
Manufacturer_MG is in df2 but not in df1
Manufacturer_TATA is in df2 but not in df1


## Adding the columns which are not present in the respective dataframe

In [33]:
new_column_names = ['Manufacturer_FOTON', 'Manufacturer_TATA', 'Manufacturer_MG']

for column_name in new_column_names:
    train_data[column_name] = 0

In [34]:
new_column_names = ['Manufacturer_HAVAL', 'Manufacturer_LAMBORGHINI', 'Manufacturer_PONTIAC','Manufacturer_SEAT','Manufacturer_ROLLS-ROYCE','Manufacturer_LANCIA']

for column_name in new_column_names:
    test_data[column_name] = 0

In [35]:
test_data.head()

Unnamed: 0,Engine volume,Mileage,Cylinders,Airbags,Price,Car Age,Turbo,Manufacturer_ALFA ROMEO,Manufacturer_ASTON MARTIN,Manufacturer_AUDI,...,Color_Silver,Color_Sky blue,Color_White,Color_Yellow,Manufacturer_HAVAL,Manufacturer_LAMBORGHINI,Manufacturer_PONTIAC,Manufacturer_SEAT,Manufacturer_ROLLS-ROYCE,Manufacturer_LANCIA
0,2.0,0,4,10,18817.0,11,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2.4,26000,4,10,22580.0,11,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1.5,168000,4,8,8154.0,18,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
3,3.2,143000,6,12,470.0,11,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1.6,200000,4,0,4500.0,30,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [36]:
train_data.head()

Unnamed: 0,Price,Engine volume,Mileage,Cylinders,Airbags,Car Age,Turbo,Manufacturer_ALFA ROMEO,Manufacturer_ASTON MARTIN,Manufacturer_AUDI,...,Color_Pink,Color_Purple,Color_Red,Color_Silver,Color_Sky blue,Color_White,Color_Yellow,Manufacturer_FOTON,Manufacturer_TATA,Manufacturer_MG
0,13328,3.5,186005,6.0,12,13,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
1,16621,3.0,192000,6.0,8,12,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,8467,1.3,200000,4.0,2,17,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3607,2.5,168966,4.0,0,12,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,11726,1.3,91901,4.0,4,9,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0


## Making the columns in order of both test and train set

In [37]:
training_column_order = train_data.columns
test_data = test_data[training_column_order]

In [38]:
train_data.head()

Unnamed: 0,Price,Engine volume,Mileage,Cylinders,Airbags,Car Age,Turbo,Manufacturer_ALFA ROMEO,Manufacturer_ASTON MARTIN,Manufacturer_AUDI,...,Color_Pink,Color_Purple,Color_Red,Color_Silver,Color_Sky blue,Color_White,Color_Yellow,Manufacturer_FOTON,Manufacturer_TATA,Manufacturer_MG
0,13328,3.5,186005,6.0,12,13,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
1,16621,3.0,192000,6.0,8,12,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,8467,1.3,200000,4.0,2,17,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3607,2.5,168966,4.0,0,12,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,11726,1.3,91901,4.0,4,9,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0


In [39]:
test_data.head()

Unnamed: 0,Price,Engine volume,Mileage,Cylinders,Airbags,Car Age,Turbo,Manufacturer_ALFA ROMEO,Manufacturer_ASTON MARTIN,Manufacturer_AUDI,...,Color_Pink,Color_Purple,Color_Red,Color_Silver,Color_Sky blue,Color_White,Color_Yellow,Manufacturer_FOTON,Manufacturer_TATA,Manufacturer_MG
0,18817.0,2.0,0,4,10,11,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,22580.0,2.4,26000,4,10,11,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,8154.0,1.5,168000,4,8,18,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,470.0,3.2,143000,6,12,11,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4500.0,1.6,200000,4,0,30,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Separating the Input and output from the dataset

In [40]:
X_train = train_data.iloc[:,1:]
y_train = train_data.iloc[:,0]

In [41]:
X_test = test_data.iloc[:,1:]
y_test = test_data.iloc[:,0]

In [42]:
y_test.head()

0    18817.0
1    22580.0
2     8154.0
3      470.0
4     4500.0
Name: Price, dtype: float64

## Traning the model through Decision trees

In [43]:
from sklearn.tree import DecisionTreeRegressor

model = DecisionTreeRegressor()
model.fit(X_train, y_train)

DecisionTreeRegressor()

## Evaluating the model through Mean squared error and the R squared metrics

In [44]:
from sklearn.metrics import mean_squared_error, r2_score
y_pred = model.predict(X_train)

mse = mean_squared_error(y_train, y_pred)
r2 = r2_score(y_train, y_pred)

print("Mean Squared Error (MSE):", mse)
print("R-squared (R2):", r2)

Mean Squared Error (MSE): 3806812.727854393
R-squared (R2): 0.9998951849472252


## Predicting the test set and saving it in the test.csv file

In [45]:
predictions= model.predict(X_test)

print(predictions)

[ 1500.         22580.          7700.         ...   540.15384615
 35839.          5331.        ]


In [46]:
# Load the 'test.csv' dataset into a DataFrame
df = pd.read_csv('test.csv')

# Replace the 'Price' column with your predicted values
# Suppose 'prediction' is a NumPy array or a list of predicted values
# Make sure the length of 'prediction' matches the number of rows in your DataFrame
# Replace with your actual predictions

df['Price'] = predictions

In [47]:
df.head()

Unnamed: 0,ID,Levy,Manufacturer,Model,Prod. year,Category,Leather interior,Fuel type,Engine volume,Mileage,Cylinders,Gear box type,Drive wheels,Doors,Wheel,Color,Airbags,Price
0,44020629,-,VOLKSWAGEN,Golf,2012,Hatchback,No,Diesel,2.0 Turbo,0 km,4,Manual,Front,02-Mar,Left wheel,Grey,10,1500.0
1,45784798,-,HYUNDAI,Sonata,2012,Sedan,Yes,Petrol,2.4,26000 km,4,Tiptronic,Front,04-May,Left wheel,Grey,10,22580.0
2,45800257,-,NISSAN,Tiida,2005,Sedan,No,Petrol,1.5,168000 km,4,Automatic,Front,04-May,Right-hand drive,Sky blue,8,7700.0
3,45797981,975,VOLVO,XC90,2012,Jeep,Yes,Petrol,3.2,143000 km,6,Automatic,4x4,04-May,Left wheel,Blue,12,282.0
4,45814303,-,OPEL,Astra,1993,Hatchback,No,Petrol,1.6,200000 km,4,Manual,Front,04-May,Left wheel,Black,0,4500.0


In [48]:
# Save the modified DataFrame back to 'test.csv'
df.to_csv('test.csv', index=False)  # index=False to avoid saving the index column