In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

In [2]:
# Read the EV specification table from the Kaggle input directory and preview it.
csv_path = "/kaggle/input/electric-vehicle-specifications-dataset-2025/electric_vehicles_spec_2025.csv.csv"
df = pd.read_csv(csv_path)
df.head()

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0,brand,model,top_speed_kmh,battery_capacity_kWh,battery_type,number_of_cells,torque_nm,efficiency_wh_per_km,range_km,acceleration_0_100_s,...,towing_capacity_kg,cargo_volume_l,seats,drivetrain,segment,length_mm,width_mm,height_mm,car_body_type,source_url
0,Abarth,500e Convertible,155,37.8,Lithium-ion,192.0,235.0,156,225,7.0,...,0.0,185,4,FWD,B - Compact,3673,1683,1518,Hatchback,https://ev-database.org/car/1904/Abarth-500e-C...
1,Abarth,500e Hatchback,155,37.8,Lithium-ion,192.0,235.0,149,225,7.0,...,0.0,185,4,FWD,B - Compact,3673,1683,1518,Hatchback,https://ev-database.org/car/1903/Abarth-500e-H...
2,Abarth,600e Scorpionissima,200,50.8,Lithium-ion,102.0,345.0,158,280,5.9,...,0.0,360,5,FWD,JB - Compact,4187,1779,1557,SUV,https://ev-database.org/car/3057/Abarth-600e-S...
3,Abarth,600e Turismo,200,50.8,Lithium-ion,102.0,345.0,158,280,6.2,...,0.0,360,5,FWD,JB - Compact,4187,1779,1557,SUV,https://ev-database.org/car/3056/Abarth-600e-T...
4,Aiways,U5,150,60.0,Lithium-ion,,310.0,156,315,7.5,...,,496,5,FWD,JC - Medium,4680,1865,1700,SUV,https://ev-database.org/car/1678/Aiways-U5


In [3]:
# Keep only the first run of digits of each cargo-volume entry and store it as a number.
# - astype(str) keeps this cell re-runnable: after the first run the column is already
#   numeric, and the .str accessor would otherwise raise AttributeError.
# - The raw string avoids Python's invalid-escape warning for "\d".
# - expand=False makes extract return a Series instead of a one-column DataFrame.
# Entries that contain no digits (including missing values) end up as NaN.
cargo_digits = df['cargo_volume_l'].astype(str).str.extract(r'(\d+)', expand=False)
df['cargo_volume_l'] = pd.to_numeric(cargo_digits, errors='coerce')

In [4]:
# The target is the body type; the columns listed below (target included) are
# excluded from the feature matrix.
excluded_cols = ["brand", "model", "segment", "source_url", "car_body_type"]
y = df["car_body_type"]
X = df.drop(columns=excluded_cols)

In [5]:
# Hold out 20% of the rows for testing; random_state pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=2,
)
X_train.head()

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0,top_speed_kmh,battery_capacity_kWh,battery_type,number_of_cells,torque_nm,efficiency_wh_per_km,range_km,acceleration_0_100_s,fast_charging_power_kw_dc,fast_charge_port,towing_capacity_kg,cargo_volume_l,seats,drivetrain,length_mm,width_mm,height_mm
289,150,46.3,Lithium-ion,216.0,260.0,131,290,8.7,78.0,CCS,0.0,267.0,5,FWD,4061,1765,1435
7,200,50.8,Lithium-ion,102.0,345.0,164,310,6.0,85.0,CCS,0.0,400.0,5,FWD,4173,1781,1505
146,200,76.5,Lithium-ion,,600.0,193,320,6.5,112.0,CCS,1500.0,438.0,5,AWD,5209,2010,1731
307,150,50.8,Lithium-ion,102.0,260.0,127,300,10.0,80.0,CCS,,608.0,5,FWD,4636,1852,1442
453,180,79.0,Lithium-ion,324.0,670.0,163,385,4.8,125.0,CCS,1800.0,410.0,5,AWD,4440,1863,1652


In [6]:
# Feature columns grouped by how they are preprocessed.
num_cols = [
    "top_speed_kmh",
    "battery_capacity_kWh",
    "number_of_cells",
    "torque_nm",
    "efficiency_wh_per_km",
    "range_km",
    "acceleration_0_100_s",
    "fast_charging_power_kw_dc",
    "towing_capacity_kg",
    "cargo_volume_l",
    "seats",
    "length_mm",
    "width_mm",
    "height_mm",
]
cat_cols = [
    "battery_type",
    "fast_charge_port",
    "drivetrain",
]

# Integer position of every numeric column inside X_train.
positions = list(map(X_train.columns.get_loc, num_cols))
print(positions)

[0, 1, 3, 4, 5, 6, 7, 8, 10, 11, 12, 14, 15, 16]


In [7]:
# Positional indices of the numeric / categorical feature columns. They are derived
# from the column names instead of being typed in by hand, so they cannot drift out
# of sync with X_train's column order.
NumCols = [X_train.columns.get_loc(c) for c in num_cols]
CatCols = [X_train.columns.get_loc(c) for c in cat_cols]

# Numeric features: fill gaps with KNNImputer, then standardise.
numeric_steps = Pipeline([
    ("imputernum", KNNImputer()),
    ("scaler", StandardScaler())
])

# Categorical features: fill gaps with the most frequent value, then one-hot encode.
# The encoder's `sparse=False` keyword was deprecated in scikit-learn 1.2 and removed
# in 1.4 (renamed to `sparse_output`), so it is not passed here; dense output is
# enforced on the ColumnTransformer below, which works on old and new versions alike.
categorical_steps = Pipeline([
    ("imputercat", SimpleImputer(strategy="most_frequent")),
    ("OHE", OneHotEncoder(handle_unknown='ignore'))
])

# sparse_threshold=0 makes the combined output always a dense array, matching what
# the original `sparse=False` encoder setting produced.
transformer = ColumnTransformer([
    ("numCols", numeric_steps, NumCols),
    ("catCols", categorical_steps, CatCols)
], remainder='passthrough', sparse_threshold=0)

# Single-step pipeline wrapping the preprocessing.
pipe = Pipeline([
    ("tnf", transformer)
])

In [8]:
# Learn the preprocessing from the training split only, then apply it to the test split.
X_train_tnf = pipe.fit_transform(X_train)
X_test_tnf = pipe.transform(X_test)

# Encode the class labels as integers; the mapping is learned from the training labels.
le = LabelEncoder()
le.fit(y_train)
y_train_tnf = le.transform(y_train)
y_test_tnf = le.transform(y_test)

# Wrap the transformed training matrix in a DataFrame for inspection.
df0 = pd.DataFrame(data=X_train_tnf)
df0




Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,-0.983240,-1.340147,-0.344646,-0.961766,-0.940735,-0.976779,0.608139,-0.793665,-1.392446,-1.159610,-0.271941,-1.651519,-1.656367,-1.290609,1.0,1.0,0.0,0.0,1.0,0.0
1,0.438402,-1.123344,-0.438533,-0.615297,0.019189,-0.783673,-0.351669,-0.673248,-1.392446,-0.491373,-0.271941,-1.351937,-1.440399,-0.763733,1.0,1.0,0.0,0.0,1.0,0.0
2,0.438402,0.114847,-0.363423,0.424109,0.862759,-0.687120,-0.173927,-0.208781,0.656363,-0.300448,-0.271941,1.419200,1.650643,0.937323,1.0,1.0,0.0,1.0,0.0,0.0
3,-0.983240,-1.123344,-0.438533,-0.961766,-1.057090,-0.880226,1.070269,-0.759260,-1.050978,0.553690,-0.271941,-0.113485,-0.482041,-1.237922,1.0,1.0,0.0,0.0,1.0,0.0
4,-0.130255,0.235293,-0.255700,0.709436,-0.009899,-0.059524,-0.778251,0.014852,1.066124,-0.441129,-0.271941,-0.637754,-0.333563,0.342706,1.0,1.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
377,-0.983240,-1.340147,-0.344646,-0.961766,-0.795292,-1.169886,1.034721,-0.793665,-1.392446,-0.320545,-0.271941,-1.001532,-1.521387,-0.628251,1.0,1.0,0.0,0.0,1.0,0.0
378,-0.130255,0.138936,-0.285349,0.746121,0.222810,0.037029,-0.138378,-0.071161,0.246601,0.111548,-0.271941,-0.241877,-0.306567,0.192170,1.0,1.0,0.0,1.0,0.0,0.0
379,0.864894,0.403918,-0.380883,-0.472634,0.222810,0.230135,-0.102830,-0.157173,-0.368042,0.111548,-0.271941,0.405435,0.503313,0.101848,1.0,1.0,0.0,0.0,0.0,1.0
380,0.722730,1.001333,-0.374294,1.463515,-0.475317,1.919816,-0.813799,1.305039,1.475886,0.021110,-0.271941,0.667569,0.476317,-1.140073,1.0,1.0,0.0,1.0,0.0,0.0


In [9]:
# Fit a decision tree on the preprocessed training data.
# random_state is fixed (same seed as the train/test split) so the fitted tree and the
# accuracy reported below are reproducible across runs; without it the result can vary.
# NOTE: the variable is called `lr` although it holds a DecisionTreeClassifier; the name
# is kept because the prediction cell refers to it.
lr = DecisionTreeClassifier(random_state=2)
lr.fit(X_train_tnf, y_train_tnf)

In [10]:
# Predict on the held-out split and report the fraction of correct labels.
y_pred = lr.predict(X_test_tnf)
accuracy_score(y_true=y_test_tnf, y_pred=y_pred)

0.875