In [4]:
import numpy as np
import pandas as pd

np.random.seed(42)
n = 15000  # > 10k


def _lookup(values, table):
    """Map every entry of `values` through the dict `table`; return a NumPy array."""
    return pd.Series(values).map(table).to_numpy()


# One row per brand: (sampling weight, base price in INR, depreciation in INR per km).
# Mainstream brands come first, then luxury brands (kept rare; they lose more per km).
_brand_specs = {
    "Maruti":        (0.20,    450000,  2.2),
    "Hyundai":       (0.16,    520000,  2.6),
    "Honda":         (0.10,    600000,  2.8),
    "Tata":          (0.10,    500000,  2.5),
    "Mahindra":      (0.08,    650000,  3.0),
    "Toyota":        (0.08,    800000,  2.4),
    "Ford":          (0.06,    520000,  2.7),
    "Renault":       (0.05,    480000,  2.4),
    "Kia":           (0.05,    750000,  3.1),
    "Skoda":         (0.04,    900000,  3.4),
    "Volkswagen":    (0.035,   950000,  3.6),
    "Nissan":        (0.025,   520000,  2.6),
    "BMW":           (0.010,  5200000, 18.0),
    "Mercedes-Benz": (0.010,  5500000, 19.0),
    "Audi":          (0.009,  5000000, 17.0),
    "Jaguar":        (0.003,  6200000, 21.0),
    "Land Rover":    (0.003,  7000000, 23.0),
    "Volvo":         (0.004,  4800000, 16.0),
    "Lexus":         (0.003,  6500000, 18.5),
    "Porsche":       (0.002, 11000000, 28.0),
}

brands = np.array(list(_brand_specs))
brand_probs = np.array([spec[0] for spec in _brand_specs.values()])
# The raw weights do not sum to exactly 1, so normalise before sampling.
brand_probs = brand_probs / brand_probs.sum()
brand_base_map = {name: spec[1] for name, spec in _brand_specs.items()}
dep_map = {name: spec[2] for name, spec in _brand_specs.items()}

fuel_types = np.array(["Petrol", "Diesel", "CNG", "Electric"])
fuel_probs = np.array([0.58, 0.33, 0.06, 0.03])

owners = np.array(["First Owner", "Second Owner", "Third Owner", "Fourth & Above Owner"])
owner_probs = np.array([0.70, 0.20, 0.08, 0.02])

# Categorical draws. The order of the np.random calls in this cell is significant:
# with the fixed seed it determines every generated value.
brand = np.random.choice(brands, size=n, p=brand_probs)
fuel = np.random.choice(fuel_types, size=n, p=fuel_probs)
owner = np.random.choice(owners, size=n, p=owner_probs)

# Kilometres driven: log-normal (right-skewed), limited to [1500, 300000], whole km.
km_driven = (
    np.random.lognormal(mean=np.log(45000), sigma=0.60, size=n)
    .clip(1500, 300000)
    .round()
    .astype(int)
)

# Per-row price components (INR).
brand_base = _lookup(brand, brand_base_map)
fuel_adj = _lookup(fuel, {"Petrol": 0, "Diesel": 50000, "CNG": -35000, "Electric": 300000})
owner_adj = _lookup(
    owner,
    {
        "First Owner": 0,
        "Second Owner": -60000,
        "Third Owner": -120000,
        "Fourth & Above Owner": -180000,
    },
)
dep_per_km = _lookup(brand, dep_map)

# Gaussian noise; luxury rows get a larger standard deviation (140000 vs 50000).
luxury_set = {"BMW", "Mercedes-Benz", "Audi", "Jaguar", "Land Rover", "Volvo", "Lexus", "Porsche"}
is_lux = pd.Series(brand).isin(luxury_set).to_numpy().astype(int)
noise = np.random.normal(0, 50000 + 90000 * is_lux, size=n)

# Final price: base + adjustments - mileage depreciation + noise, bounded and rounded.
selling_price = brand_base + fuel_adj + owner_adj - (dep_per_km * km_driven) + noise
selling_price = np.clip(selling_price, 70000, 25000000).round().astype(int)

df_cars = pd.DataFrame(
    dict(
        brand=brand,
        km_driven=km_driven,
        fuel=fuel,
        owner=owner,
        selling_price=selling_price,
    )
)

df_cars.head()


Unnamed: 0,brand,km_driven,fuel,owner,selling_price
0,Honda,82589,Petrol,First Owner,348850
1,Nissan,32480,Diesel,First Owner,477855
2,Ford,98295,Petrol,Third Owner,150760
3,Mahindra,25473,Petrol,First Owner,666616
4,Maruti,108887,Petrol,First Owner,163105


In [7]:
# Work on an independent copy: a plain `df = df_cars` only aliases the same object,
# so any in-place change made through `df` would silently alter the generated frame.
df = df_cars.copy()
# Row count per ownership category.
df['owner'].value_counts()

Unnamed: 0_level_0,count
owner,Unnamed: 1_level_1
First Owner,10554
Second Owner,2922
Third Owner,1212
Fourth & Above Owner,312


In [8]:
# One-hot encode the two categorical columns, keeping every category level.
# The result is only displayed, not stored; `brand` is left as a string column.
pd.get_dummies(
    data=df,
    columns=["fuel", "owner"],
)

Unnamed: 0,brand,km_driven,selling_price,fuel_CNG,fuel_Diesel,fuel_Electric,fuel_Petrol,owner_First Owner,owner_Fourth & Above Owner,owner_Second Owner,owner_Third Owner
0,Honda,82589,348850,False,False,False,True,True,False,False,False
1,Nissan,32480,477855,False,True,False,False,True,False,False,False
2,Ford,98295,150760,False,False,False,True,False,False,False,True
3,Mahindra,25473,666616,False,False,False,True,True,False,False,False
4,Maruti,108887,163105,False,False,False,True,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...
14995,Toyota,27246,706029,True,False,False,False,True,False,False,False
14996,Maruti,34171,459287,False,True,False,False,True,False,False,False
14997,Maruti,34723,390530,False,False,False,True,True,False,False,False
14998,Toyota,28960,808165,False,False,False,True,True,False,False,False


In [9]:
# Same encoding, but drop the first level of each column so the remaining
# indicator columns are not perfectly collinear. The result is only displayed.
pd.get_dummies(
    data=df,
    columns=["fuel", "owner"],
    drop_first=True,
)

Unnamed: 0,brand,km_driven,selling_price,fuel_Diesel,fuel_Electric,fuel_Petrol,owner_Fourth & Above Owner,owner_Second Owner,owner_Third Owner
0,Honda,82589,348850,False,False,True,False,False,False
1,Nissan,32480,477855,True,False,False,False,False,False
2,Ford,98295,150760,False,False,True,False,False,True
3,Mahindra,25473,666616,False,False,True,False,False,False
4,Maruti,108887,163105,False,False,True,False,False,False
...,...,...,...,...,...,...,...,...,...
14995,Toyota,27246,706029,False,False,False,False,False,False
14996,Maruti,34171,459287,True,False,False,False,False,False
14997,Maruti,34723,390530,False,False,True,False,False,False
14998,Toyota,28960,808165,False,False,True,False,False,False


In [10]:
# Select features and target by column name rather than by position, so adding or
# reordering columns in `df` cannot silently change what the model is trained on.
x = df[["brand", "km_driven", "fuel", "owner"]]
y = df["selling_price"]


In [11]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=41,
)


In [22]:
from sklearn.preprocessing import OneHotEncoder

# Learn the category levels from the training rows only, then encode those rows.
# drop="first" leaves out the first level of each feature.
ohe = OneHotEncoder(drop="first")
ohe.fit(x_train[["fuel", "owner"]])
x_train_new = ohe.transform(x_train[["fuel", "owner"]])

In [23]:
# Convert the encoder output to a dense NumPy array to inspect the indicator columns.
x_train_new.toarray()

array([[0., 0., 1., 0., 1., 0.],
       [0., 0., 1., 0., 0., 0.],
       [1., 0., 0., 0., 0., 1.],
       ...,
       [0., 0., 1., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0.],
       [0., 0., 1., 0., 1., 0.]])

In [24]:
# Encode the test rows with the encoder already fitted on the training rows
# (transform only, so the test data never influences the learned categories).
x_test_new = ohe.transform(x_test.loc[:, ["fuel", "owner"]])

In [25]:
# Put the untouched columns next to the one-hot block, column-wise.
# NOTE(review): `brand` is still a raw string here, so the combined array is
# dtype=object rather than numeric; it presumably needs encoding before modelling.
d = np.concatenate(
    [x_train[["brand", "km_driven"]].to_numpy(), x_train_new.toarray()],
    axis=1,
)

In [26]:
# Inspect the combined feature array (brand, km_driven, then the one-hot columns).
d

array([['Tata', 56048, 0.0, ..., 0.0, 1.0, 0.0],
       ['Maruti', 70405, 0.0, ..., 0.0, 0.0, 0.0],
       ['Tata', 50651, 1.0, ..., 0.0, 0.0, 1.0],
       ...,
       ['Maruti', 26676, 0.0, ..., 0.0, 0.0, 0.0],
       ['Kia', 37621, 0.0, ..., 0.0, 0.0, 0.0],
       ['Maruti', 38393, 0.0, ..., 0.0, 1.0, 0.0]], dtype=object)