In [1]:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split

In [2]:
data = pd.read_csv("C:/Users/patan/DS/Machine Learning/Dataset/cars.csv")
print(data.shape)
data.head()

(8128, 5)


Unnamed: 0,brand,km_driven,fuel,owner,selling_price
0,Maruti,145500.0,Diesel,First Owner,450000
1,Skoda,120000.0,Diesel,Second Owner,370000
2,Honda,140000.0,Petrol,Third Owner,158000
3,Hyundai,127000.0,Diesel,First Owner,225000
4,Maruti,120000.0,Petrol,First Owner,130000


In [3]:
data["owner"].unique()

array(['First Owner', 'Second Owner', 'Third Owner',
       'Fourth & Above Owner', 'Test Drive Car'], dtype=object)

In [13]:
data.isnull().sum()

brand            0
km_driven        4
fuel             0
owner            0
selling_price    0
dtype: int64

In [5]:
len(data.isnull().sum())/len(data)

0.0006151574803149606

In [6]:
data.isnull().mean()

brand            0.000000
km_driven        0.000492
fuel             0.000000
owner            0.000000
selling_price    0.000000
dtype: float64

In [7]:
X = data.drop("selling_price", axis = 1)

In [8]:
X.head()

Unnamed: 0,brand,km_driven,fuel,owner
0,Maruti,145500.0,Diesel,First Owner
1,Skoda,120000.0,Diesel,Second Owner
2,Honda,140000.0,Petrol,Third Owner
3,Hyundai,127000.0,Diesel,First Owner
4,Maruti,120000.0,Petrol,First Owner


In [9]:
y = data["selling_price"]
y

0       450000
1       370000
2       158000
3       225000
4       130000
         ...  
8123    320000
8124    135000
8125    382000
8126    290000
8127    290000
Name: selling_price, Length: 8128, dtype: int64

In [10]:
X_train, X_test, y_train,t_test = train_test_split(X, y ,test_size = 0.2, shuffle = False )

In [11]:
X_train.shape

(6502, 4)

In [12]:
ct = ColumnTransformer(transformers= [("imputer1", SimpleImputer(strategy="median"), ["km_driven"]),
                                      ("imputer2" , OrdinalEncoder(categories=[['First Owner', 'Second Owner', 'Third Owner',
       'Fourth & Above Owner', 'Test Drive Car']]), ["owner"]),("imputer3" , OneHotEncoder(sparse_output = False) , ["brand","fuel"])] ,remainder =  "passthrough")

In [13]:
ct

In [14]:
ct_array = ct.fit_transform(X_train)

In [15]:
ct_array

array([[1.455e+05, 0.000e+00, 0.000e+00, ..., 1.000e+00, 0.000e+00,
        0.000e+00],
       [1.200e+05, 1.000e+00, 0.000e+00, ..., 1.000e+00, 0.000e+00,
        0.000e+00],
       [1.400e+05, 2.000e+00, 0.000e+00, ..., 0.000e+00, 0.000e+00,
        1.000e+00],
       ...,
       [3.500e+04, 0.000e+00, 0.000e+00, ..., 1.000e+00, 0.000e+00,
        0.000e+00],
       [4.300e+04, 0.000e+00, 0.000e+00, ..., 1.000e+00, 0.000e+00,
        0.000e+00],
       [6.000e+04, 0.000e+00, 0.000e+00, ..., 0.000e+00, 0.000e+00,
        1.000e+00]])

In [16]:
# Constructing columns 

In [17]:
columns = [] 

In [18]:
columns.extend(["encoded_km_driven","encoded_owner"])
columns = columns + list(ct.named_transformers_['imputer3'].get_feature_names_out())
print(len(columns))

37


In [19]:
ct_arrays = ct_array.astype(int)

In [20]:
ct_array.shape

(6502, 37)

In [21]:
ct_arrays

array([[145500,      0,      0, ...,      1,      0,      0],
       [120000,      1,      0, ...,      1,      0,      0],
       [140000,      2,      0, ...,      0,      0,      1],
       ...,
       [ 35000,      0,      0, ...,      1,      0,      0],
       [ 43000,      0,      0, ...,      1,      0,      0],
       [ 60000,      0,      0, ...,      0,      0,      1]])

In [22]:
print(len(data["brand"].unique()))
brand = data["brand"].unique()

32


In [23]:
print(len(data["fuel"].unique()))
data["fuel"].unique()

4


array(['Diesel', 'Petrol', 'LPG', 'CNG'], dtype=object)

In [24]:
new_data = data
new_data

Unnamed: 0,brand,km_driven,fuel,owner,selling_price
0,Maruti,145500.0,Diesel,First Owner,450000
1,Skoda,120000.0,Diesel,Second Owner,370000
2,Honda,140000.0,Petrol,Third Owner,158000
3,Hyundai,127000.0,Diesel,First Owner,225000
4,Maruti,120000.0,Petrol,First Owner,130000
...,...,...,...,...,...
8123,Hyundai,110000.0,Petrol,First Owner,320000
8124,Hyundai,119000.0,Diesel,Fourth & Above Owner,135000
8125,Maruti,120000.0,Diesel,First Owner,382000
8126,Tata,25000.0,Diesel,First Owner,290000


In [25]:
new_df =  pd.DataFrame(ct_array, columns=columns)

In [28]:
data.head()

Unnamed: 0,brand,km_driven,fuel,owner,selling_price
0,Maruti,145500.0,Diesel,First Owner,450000
1,Skoda,120000.0,Diesel,Second Owner,370000
2,Honda,140000.0,Petrol,Third Owner,158000
3,Hyundai,127000.0,Diesel,First Owner,225000
4,Maruti,120000.0,Petrol,First Owner,130000


In [27]:
new_df.head()

Unnamed: 0,encoded_km_driven,encoded_owner,brand_Ambassador,brand_Ashok,brand_Audi,brand_BMW,brand_Chevrolet,brand_Daewoo,brand_Datsun,brand_Fiat,...,brand_Renault,brand_Skoda,brand_Tata,brand_Toyota,brand_Volkswagen,brand_Volvo,fuel_CNG,fuel_Diesel,fuel_LPG,fuel_Petrol
0,145500.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,120000.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,140000.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,127000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,120000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
