In [26]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
import seaborn as sns

In [27]:
# Read the raw car listings into a DataFrame and preview the first five rows.
DATA_FILE = 'CAR DETAILS FROM CAR DEKHO.csv'
df = pd.read_csv(DATA_FILE)
df.head()

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner
0,Maruti 800 AC,2007,60000,70000,Petrol,Individual,Manual,First Owner
1,Maruti Wagon R LXI Minor,2007,135000,50000,Petrol,Individual,Manual,First Owner
2,Hyundai Verna 1.6 SX,2012,600000,100000,Diesel,Individual,Manual,First Owner
3,Datsun RediGO T Option,2017,250000,46000,Petrol,Individual,Manual,First Owner
4,Honda Amaze VX i-DTEC,2014,450000,141000,Diesel,Individual,Manual,Second Owner


In [28]:
# Remove the seller_type column from the working frame.
# Rebinding `df` (rather than dropping in place) together with errors="ignore"
# keeps this cell idempotent: running it a second time in the same kernel is a
# no-op instead of raising KeyError because the column is already gone.
df = df.drop(columns="seller_type", errors="ignore")
df.head()

Unnamed: 0,name,year,selling_price,km_driven,fuel,transmission,owner
0,Maruti 800 AC,2007,60000,70000,Petrol,Manual,First Owner
1,Maruti Wagon R LXI Minor,2007,135000,50000,Petrol,Manual,First Owner
2,Hyundai Verna 1.6 SX,2012,600000,100000,Diesel,Manual,First Owner
3,Datsun RediGO T Option,2017,250000,46000,Petrol,Manual,First Owner
4,Honda Amaze VX i-DTEC,2014,450000,141000,Diesel,Manual,Second Owner


In [29]:
# Report the frame's dimensions as a (rows, columns) tuple.
df.shape

(4340, 7)

In [30]:
# Print a per-column summary (non-null counts and dtypes) plus memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4340 entries, 0 to 4339
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   name           4340 non-null   object
 1   year           4340 non-null   int64 
 2   selling_price  4340 non-null   int64 
 3   km_driven      4340 non-null   int64 
 4   fuel           4340 non-null   object
 5   transmission   4340 non-null   object
 6   owner          4340 non-null   object
dtypes: int64(3), object(4)
memory usage: 237.5+ KB


In [31]:
# List the distinct values that occur in the year column.
df['year'].unique()

array([2007, 2012, 2017, 2014, 2016, 2015, 2018, 2019, 2013, 2011, 2010,
       2009, 2006, 1996, 2005, 2008, 2004, 1998, 2003, 2002, 2020, 2000,
       1999, 2001, 1995, 1997, 1992], dtype=int64)

In [32]:
# Remove the owner column from the working frame.
# Rebinding `df` (rather than dropping in place) together with errors="ignore"
# keeps this cell idempotent: running it a second time in the same kernel is a
# no-op instead of raising KeyError because the column is already gone.
df = df.drop(columns="owner", errors="ignore")
df.head()

Unnamed: 0,name,year,selling_price,km_driven,fuel,transmission
0,Maruti 800 AC,2007,60000,70000,Petrol,Manual
1,Maruti Wagon R LXI Minor,2007,135000,50000,Petrol,Manual
2,Hyundai Verna 1.6 SX,2012,600000,100000,Diesel,Manual
3,Datsun RediGO T Option,2017,250000,46000,Petrol,Manual
4,Honda Amaze VX i-DTEC,2014,450000,141000,Diesel,Manual


In [33]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,year,selling_price,km_driven
count,4340.0,4340.0,4340.0
mean,2013.090783,504127.3,66215.777419
std,4.215344,578548.7,46644.102194
min,1992.0,20000.0,1.0
25%,2011.0,208749.8,35000.0
50%,2014.0,350000.0,60000.0
75%,2016.0,600000.0,90000.0
max,2020.0,8900000.0,806599.0


In [34]:
# Persist the trimmed frame to disk.
# index=False leaves out the default 0..N-1 row labels; written by default they
# become an extra unnamed first column that shows up as "Unnamed: 0" when the
# file is read back with pd.read_csv.
df.to_csv('Cleaned Data.csv', index=False)

In [35]:
# Separate the regression target (selling_price) from the predictor columns,
# then echo both so the split can be checked by eye.
TARGET_COLUMN = 'selling_price'
Y = df[TARGET_COLUMN]
X = df.drop(columns=[TARGET_COLUMN])
print(X)
print(Y)

                                     name  year  km_driven    fuel  \
0                           Maruti 800 AC  2007      70000  Petrol   
1                Maruti Wagon R LXI Minor  2007      50000  Petrol   
2                    Hyundai Verna 1.6 SX  2012     100000  Diesel   
3                  Datsun RediGO T Option  2017      46000  Petrol   
4                   Honda Amaze VX i-DTEC  2014     141000  Diesel   
...                                   ...   ...        ...     ...   
4335  Hyundai i20 Magna 1.4 CRDi (Diesel)  2014      80000  Diesel   
4336           Hyundai i20 Magna 1.4 CRDi  2014      80000  Diesel   
4337                  Maruti 800 AC BSIII  2009      83000  Petrol   
4338     Hyundai Creta 1.6 CRDi SX Option  2016      90000  Diesel   
4339                     Renault KWID RXT  2016      40000  Petrol   

     transmission  
0          Manual  
1          Manual  
2          Manual  
3          Manual  
4          Manual  
...           ...  
4335       Manual  

In [36]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# partition the same on every run.
split_parts = train_test_split(X, Y, test_size=0.2, random_state=2)
X_train, X_test, Y_train, Y_test = split_parts

# Echo both feature partitions for a quick visual check.
for partition in (X_train, X_test):
    print(partition)

                                         name  year  km_driven    fuel  \
3581         Volkswagen Ameo 1.5 TDI Highline  2017      70000  Diesel   
4191  Mahindra Scorpio VLX 2WD AIRBAG SE BSIV  2012      72000  Diesel   
2450                  Maruti Ciaz 1.4 AT Zeta  2017      40000  Petrol   
984         Fiat Grande Punto EVO 1.3 Dynamic  2014      70000  Diesel   
3546                     Maruti Swift 1.2 DLX  2018      35000  Petrol   
...                                       ...   ...        ...     ...   
3335         Hyundai Grand i10 1.2 Kappa Asta  2018      32000  Petrol   
1099                     Honda City i DTEC VX  2014     110000  Diesel   
2514      Hyundai i20 Magna Optional 1.4 CRDi  2013      50000  Diesel   
3606                         Mahindra Xylo D2  2010      70000  Diesel   
2575                      Datsun RediGO 1.0 S  2017      15000  Petrol   

     transmission  
3581       Manual  
4191       Manual  
2450    Automatic  
984        Manual  
3546       

In [39]:
import pandas as pd
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import make_pipeline

# Seed for the random forest. Without it every run grows a different forest,
# so the predictions and any metrics computed from them are not reproducible.
MODEL_SEED = 2

# Text columns that are one-hot encoded; every other column is passed through unchanged.
CATEGORICAL_COLUMNS = ['name', 'fuel', 'transmission']


def _as_dense(matrix):
    """Return `matrix` as a dense array.

    The column transformer yields a SciPy sparse matrix only while the encoded
    output is sparse enough; otherwise it already returns a dense ndarray, which
    has no `.toarray()`. Accept both so the preview below never breaks.
    """
    return matrix.toarray() if hasattr(matrix, "toarray") else matrix


# Categorical preprocessing: fill missing entries with the literal 'missing',
# then one-hot encode. handle_unknown='ignore' makes categories that appear only
# in the test split encode as all-zero rows instead of raising at transform time.
categorical_encoder = make_pipeline(
    SimpleImputer(strategy='constant', fill_value='missing'),
    OneHotEncoder(handle_unknown='ignore'),
)
column_trans = make_column_transformer(
    (categorical_encoder, CATEGORICAL_COLUMNS),
    remainder='passthrough',
)

# Preview of the encoded design matrices. The encoder is fitted on the training
# split only and then applied to the test split.
X_train_transformed = column_trans.fit_transform(X_train)
X_test_transformed = column_trans.transform(X_test)

print("Shape of X_train_transformed:", X_train_transformed.shape)
print("Shape of X_test_transformed:", X_test_transformed.shape)

# Labelled frames of the encoded data, for inspection only; the model below is
# trained through the pipeline, not on these frames.
transformed_feature_names = column_trans.get_feature_names_out()
X_train_transformed_df = pd.DataFrame(_as_dense(X_train_transformed), columns=transformed_feature_names)
X_test_transformed_df = pd.DataFrame(_as_dense(X_test_transformed), columns=transformed_feature_names)

print("Transformed X_train DataFrame:\n", X_train_transformed_df.head())
print("Transformed X_test DataFrame:\n", X_test_transformed_df.head())

# Full model: preprocessing followed by a seeded random forest regressor.
pipeline = make_pipeline(
    column_trans,
    RandomForestRegressor(random_state=MODEL_SEED),
)

# Fit on the raw training frame; the pipeline refits the transformer itself.
pipeline.fit(X_train, Y_train)

# Predict selling prices for the held-out rows.
Y_pred = pipeline.predict(X_test)

print("Predictions:", Y_pred)


Shape of X_train_transformed: (3472, 1342)
Shape of X_test_transformed: (868, 1342)
Transformed X_train DataFrame:
    pipeline__name_Ambassador CLASSIC 1500 DSL AC  \
0                                            0.0   
1                                            0.0   
2                                            0.0   
3                                            0.0   
4                                            0.0   

   pipeline__name_Ambassador Classic 2000 Dsz  \
0                                         0.0   
1                                         0.0   
2                                         0.0   
3                                         0.0   
4                                         0.0   

   pipeline__name_Ambassador Grand 1800 ISZ MPFI PW CL  \
0                                                0.0     
1                                                0.0     
2                                                0.0     
3                                           

In [42]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Score the held-out predictions with three regression metrics.
mae = mean_absolute_error(Y_test, Y_pred)
mse = mean_squared_error(Y_test, Y_pred)
r2 = r2_score(Y_test, Y_pred)

# Print each score on its own line, in the order MAE, MSE, R-squared.
metric_report = (
    ("Mean Absolute Error (MAE):", mae),
    ("Mean Squared Error (MSE):", mse),
    ("R-squared (R²):", r2),
)
for metric_label, metric_value in metric_report:
    print(metric_label, metric_value)

Mean Absolute Error (MAE): 114211.51064776134
Mean Squared Error (MSE): 65048036263.13385
R-squared (R²): 0.7998285586470039
