In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Read the cleaned dataset from the CSV file in the working directory
df = pd.read_csv(filepath_or_buffer='cleaned_data.csv')
# NOTE(review): the saved cell output lists a leading 'Unnamed: 0' column - confirm
# whether that is only the rendered row index or an index column stored in the CSV
# (in which case it would end up among the model features).
# Peek at the first rows of the raw frame
df.head()

from sklearn.preprocessing import StandardScaler

# Categorical columns that are expanded into 0/1 indicator (dummy) columns
categorical_columns = ['Owner', 'Fuel_Type', 'Transmission', 'Insurance_Type', 'city']

# One-hot encode the categorical columns.
# - drop_first=True omits the first level of each column.
# - dtype=int makes the generated indicator columns integers (0 and 1) directly.
df_encoded = pd.get_dummies(df, columns=categorical_columns, drop_first=True, dtype=int)

# Convert any remaining Boolean columns to integers (0 and 1).
# Only Boolean columns are cast: a frame-wide `astype(int)` would also truncate the
# fractional part of every other numeric column before it is scaled.
bool_columns = df_encoded.select_dtypes(include='bool').columns
df_encoded = df_encoded.astype({column: int for column in bool_columns})

# Display the encoded DataFrame
df_encoded.head()

# Continuous columns that are standardised
numerical_columns = ['Price', 'Rating', 'Kilometers', 'Car_Age']

# Fit a StandardScaler on those columns of the encoded frame ...
# NOTE(review): 'Price' (used as the model target later) is scaled here too, and the
# scaler is fitted on the full frame before any train/test split - confirm this is intended.
scaler = StandardScaler().fit(df_encoded[numerical_columns])

# ... and overwrite the columns with their standardised values
df_encoded[numerical_columns] = scaler.transform(df_encoded[numerical_columns])

# Preview the scaled frame
df_encoded.head()

Unnamed: 0,Price,Rating,Kilometers,Car_Age,Owner_OwnerFifth Owner,Owner_OwnerFirst Owner,Owner_OwnerFourth Owner,Owner_OwnerNinth Owner,Owner_OwnerSecond Owner,Owner_OwnerSeventh Owner,...,Insurance_Type_Insurance Typenull,city_769,city_777,city_1692,city_2130,city_2378,city_2423,city_3686,city_4709,city_5732
0,-0.666593,0.638867,-0.7816,0.52741,0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,-0.31427,0.638867,-0.693215,-0.137801,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
2,-0.271806,0.638867,-1.000741,-0.359539,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
3,-0.526814,0.638867,-0.9156,-0.137801,0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,-0.387919,0.638867,-1.208079,-1.024751,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0


In [2]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Relies on `df_encoded`, the encoded and scaled DataFrame built in the previous cell.

# Step 1: Separate the predictors (X) from the target (y)
X = df_encoded.drop('Price', axis=1)  # every column except 'Price'
y = df_encoded['Price']  # target; it was standardised together with the other numerical columns

# Step 2: Hold out 20% of the rows for testing (fixed seed for a reproducible split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 3: Initialize the Decision Tree Regressor model (fixed seed for reproducibility)
dt_regressor = DecisionTreeRegressor(random_state=42)

# Step 4: Train the model on the training split
dt_regressor.fit(X_train, y_train)

# Step 5: Make predictions on the held-out test set
y_pred = dt_regressor.predict(X_test)

# Step 6: Evaluate the model.
# Because 'Price' was standardised, MAE/MSE/RMSE are in units of the scaled target,
# not in the original price units.
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
# RMSE is derived from the MSE instead of passing `squared=False`: that keyword was
# deprecated in scikit-learn 1.4 and removed in 1.6, where it raises a TypeError.
rmse = np.sqrt(mse)

# Step 7: Display the results
print(f"R-squared: {r2:.4f}")
print(f"Mean Absolute Error: {mae:.4f}")
print(f"Mean Squared Error: {mse:.4f}")
print(f"Root Mean Squared Error: {rmse:.4f}")


R-squared: -0.0104
Mean Absolute Error: 0.5182
Mean Squared Error: 1.0019
Root Mean Squared Error: 1.0010


