In [1]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [2]:
# Load the dataset
# Assuming your dataset is in a CSV file named 'car_data.csv'
df = pd.read_csv('Australian Vehicle Price.csv', na_values='-')

In [3]:
df.isnull().sum()

Brand                2293
Year                 2293
Model                2293
Car/Suv              2320
Title                2293
UsedOrNew            2293
Transmission         2515
Engine               3684
DriveType            2293
FuelType             2840
FuelConsumption      3704
Kilometres           2430
ColourExtInt         2293
Location             2742
CylindersinEngine    3684
BodyType             2527
Doors                3615
Seats                3711
Price                2295
dtype: int64

In [4]:
df.dropna(inplace=True)

In [5]:
df.isnull().sum()

Brand                0
Year                 0
Model                0
Car/Suv              0
Title                0
UsedOrNew            0
Transmission         0
Engine               0
DriveType            0
FuelType             0
FuelConsumption      0
Kilometres           0
ColourExtInt         0
Location             0
CylindersinEngine    0
BodyType             0
Doors                0
Seats                0
Price                0
dtype: int64

In [6]:
# Select features and target variable
features = ['Brand', 'Year', 'Model', 'Transmission', 'Engine', 'FuelType']
target = 'UsedOrNew'

In [7]:
# Extract features and target variable
X = df[features]
y = df[target]

In [8]:
# Convert categorical features to numerical using Label Encoding
label_encoder = LabelEncoder()
X_encoded = X.apply(label_encoder.fit_transform)

In [9]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

In [10]:
# Create a decision tree classifier
model = DecisionTreeClassifier()

In [11]:
# Train the model
model.fit(X_train, y_train)

In [12]:
# Make predictions on the test set
y_pred = model.predict(X_test)

In [14]:
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

In [15]:
print(f"Accuracy: {accuracy}")
print(f"Confusion Matrix:\n{conf_matrix}")
print(f"Classification Report:\n{class_report}")

Accuracy: 0.959026369168357
Confusion Matrix:
[[  36   23   22]
 [  16   53    8]
 [  20   12 2275]]
Classification Report:
              precision    recall  f1-score   support

        DEMO       0.50      0.44      0.47        81
         NEW       0.60      0.69      0.64        77
        USED       0.99      0.99      0.99      2307

    accuracy                           0.96      2465
   macro avg       0.70      0.71      0.70      2465
weighted avg       0.96      0.96      0.96      2465

