In [2]:
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
import xgboost as xgb
import lightgbm as lgb



Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [5]:
# Load the data from CSV file to pandas DataFrame
data_file = "C:/Users/Piyusha/Car Price/Car_Info.csv"
car_dataset = pd.read_csv(data_file)


# Clean the data (handle missing values, duplicates, etc.)
car_dataset.dropna(inplace=True)
car_dataset.drop_duplicates(inplace=True)


In [6]:
car_dataset.head()

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,mileage,engine,max_power
0,Maruti Swift Dzire VDI,2014,450000,145500,Diesel,Individual,Manual,First Owner,23.4 kmpl,1248 CC,74 bhp
1,Skoda Rapid 1.5 TDI Ambition,2014,370000,120000,Diesel,Individual,Manual,Second Owner,21.14 kmpl,1498 CC,103.52 bhp
2,Honda City 2017-2020 EXi,2006,158000,140000,Petrol,Individual,Manual,Third Owner,17.7 kmpl,1497 CC,78 bhp
3,Hyundai i20 Sportz Diesel,2010,225000,127000,Diesel,Individual,Manual,First Owner,23.0 kmpl,1396 CC,90 bhp
4,Maruti Swift VXI BSIII,2007,130000,120000,Petrol,Individual,Manual,First Owner,16.1 kmpl,1298 CC,88.2 bhp


In [3]:
def clean_data(value):
    if isinstance(value, str):
        value = value.split(' ')[0].strip()
        if value == '' or value == 'NaN':
            value = 0
    return float(value)

columns_to_clean = ['mileage', 'max_power', 'engine']
car_dataset[columns_to_clean] = car_dataset[columns_to_clean].applymap(clean_data)

In [4]:
# Encode categorical columns
car_dataset.replace({'fuel': {'Petrol': 0, 'Diesel': 1, 'CNG': 2, 'LPG': 3}}, inplace=True)
car_dataset.replace({'seller_type': {'Dealer': 0, 'Individual': 1, 'Trustmark Dealer': 2}}, inplace=True)
car_dataset.replace({'transmission': {'Manual': 0, 'Automatic': 1}}, inplace=True)
car_dataset.replace({'owner': {'First Owner': 0, 'Second Owner': 1, 'Third Owner': 2, 'Fourth & Above Owner': 3, 'Test Drive Car': 4}}, inplace=True)


In [5]:
# Define features and target variable
X = car_dataset.drop(columns=['selling_price', 'name'], axis=1)
y = car_dataset['selling_price']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [6]:
# Initialize and train models
models = {
    "Linear Regression": LinearRegression(),
    "Random Forest": RandomForestRegressor(n_estimators=100, random_state=42),
    "Decision Tree": DecisionTreeRegressor(random_state=42),
    "KNN": KNeighborsRegressor(n_neighbors=5),
    "Gradient Boosting": GradientBoostingRegressor(n_estimators=100, random_state=42),
    "XGBoost": xgb.XGBRegressor(n_estimators=100, random_state=42),
    "LightGBM": lgb.LGBMRegressor(n_estimators=100, random_state=42),
}


In [7]:
predictions = {}

for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    mse = mean_squared_error(y_test, y_pred)
    rmse = mse ** 0.5
    r2 = r2_score(y_test, y_pred)
    
    print(f"{model_name} Performance:")
    print(f"Mean Squared Error (MSE): {mse}")
    print(f"Root Mean Squared Error (RMSE): {rmse}")
    print(f"R-squared (R²): {r2}\n")
    
    # Save the trained model
    with open(f'{model_name.lower().replace(" ", "_")}_model.pkl', 'wb') as f:
        pickle.dump(model, f)

Linear Regression Performance:
Mean Squared Error (MSE): 58884243380.3906
Root Mean Squared Error (RMSE): 242660.757808902
R-squared (R²): 0.6658477667371465

Random Forest Performance:
Mean Squared Error (MSE): 19415632216.693317
Root Mean Squared Error (RMSE): 139339.98785952767
R-squared (R²): 0.8898215126327188

Decision Tree Performance:
Mean Squared Error (MSE): 40626403229.3637
Root Mean Squared Error (RMSE): 201559.92466103897
R-squared (R²): 0.7694560957363017

KNN Performance:
Mean Squared Error (MSE): 79939191111.64719
Root Mean Squared Error (RMSE): 282735.1960963601
R-squared (R²): 0.5463666050249627

Gradient Boosting Performance:
Mean Squared Error (MSE): 23320934872.16933
Root Mean Squared Error (RMSE): 152711.93428206365
R-squared (R²): 0.8676599711238193

XGBoost Performance:
Mean Squared Error (MSE): 20340904230.888535
Root Mean Squared Error (RMSE): 142621.5419594408
R-squared (R²): 0.8845708429769732

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the ov