In [None]:
import pandas as pd

# Replace 'file_path.csv' with the path to your CSV file
file_path = 'pakwheeldataset.csv'

# Read the CSV file into a DataFrame
data = pd.read_csv(file_path)

# Droping columns
data = data.drop(columns=['Offer URL', 'Last Updated Date'])

# Drop rows with null values in selected columns
selected_columns = ['AC/Heater', 'Rating', 'Exterior & Body', 'Engine/Transmission/Clutch',
                    'Suspension/Steering', 'Interior', 'Price', 'Model Date', 'Mileage', 'Fuel Type']
data.dropna(subset=selected_columns, inplace=True)

# Convert 'Rating' column to numeric
data['Rating'] = pd.to_numeric(data['Rating'].str.split('/').str[0], errors='coerce') * 10

# Replace missing values in 'Transmission' column with 'Manual'
data['Transmission'] = data['Transmission'].fillna('Manual')

# Replace missing values in 'Fuel Type' column with 'Electric'
data['Fuel Type'] = data['Fuel Type'].fillna('Electric')

# Remove 'PKR' and 'lacs'/'crore', then convert to numerical form
data['Price'] = data['Price'].str.replace('PKR', '').str.replace(' lacs', 'e4').str.replace(' crore', 'e7').astype(float)

# Remove ' km' from 'Mileage' column
data['Mileage'] = data['Mileage'].str.replace(' km', '').str.replace(',', '').astype(int)

# Remove '%' from 'Interior' column
data['Interior'] = data['Interior'].str.replace('%', '').str.replace(',', '').astype(int)

# Remove '%' from 'Suspension/Steering' column
data['Suspension/Steering'] = data['Suspension/Steering'].str.replace('%', '').str.replace(',', '').astype(int)

# Remove '%' from 'Engine/Transmission/Clutch' column
data['Engine/Transmission/Clutch'] = data['Engine/Transmission/Clutch'].str.replace('%', '').str.replace(',', '').astype(int)

# Remove '%' from 'Exterior & Body' column
data['Exterior & Body'] = data['Exterior & Body'].str.replace('%', '').str.replace(',', '').astype(int)

# Remove '%' from 'AC/Heater' column
data['AC/Heater'] = data['AC/Heater'].str.replace('%', '').str.replace(',', '').astype(int)

# Create a new column 'Company' containing the first word of 'Car Name'
data['Company'] = data['Car Name'].str.split().str[0]

# Create a new column 'Variant' containing data from 'Car Name' without first and last words
data['Variant'] = data['Car Name'].str.split().apply(lambda x: ' '.join(x[1:]))

# Apply a mapping function to convert 'Manual' to 0 and other values to 1
data['Transmission'] = data['Transmission'].map(lambda x: 0 if x == 'Manual' else 1)

# Drop columns that are no longer needed
data = data.drop(columns=['Car Name'])

# Convert 'Model Date' to integer
data['Model Date'] = data['Model Date'].astype(int)

# Convert 'Price' to integer
data['Price'] = data['Price'].astype(int)

# Display the first few rows of the DataFrame
print(data.info())

# Save cleaned data to a separate file
data.to_csv('cleaned_data_no_encoding.csv', index=False)


<class 'pandas.core.frame.DataFrame'>
Index: 3107 entries, 0 to 3345
Data columns (total 13 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Rating                      2881 non-null   float64
 1   Exterior & Body             3107 non-null   int64  
 2   Engine/Transmission/Clutch  3107 non-null   int64  
 3   Suspension/Steering         3107 non-null   int64  
 4   Interior                    3107 non-null   int64  
 5   AC/Heater                   3107 non-null   int64  
 6   Price                       3107 non-null   int64  
 7   Model Date                  3107 non-null   int64  
 8   Mileage                     3107 non-null   int64  
 9   Fuel Type                   3107 non-null   object 
 10  Transmission                3107 non-null   int64  
 11  Company                     3107 non-null   object 
 12  Variant                     3107 non-null   object 
dtypes: float64(1), int64(9), object(3)
mem

In [None]:
#XGboost
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from xgboost import XGBRegressor
from sklearn.preprocessing import LabelEncoder

# Read the cleaned data
data = pd.read_csv('cleaned_data_no_encoding.csv')

# Perform binary encoding for categorical columns
encoder = LabelEncoder()
data['Company'] = encoder.fit_transform(data['Company'])
data['Variant'] = encoder.fit_transform(data['Variant'])
data['Model Date'] = encoder.fit_transform(data['Model Date'])
data['Fuel Type'] = encoder.fit_transform(data['Fuel Type'])

# Split the data into features (X) and target variable (y)
X = data.drop(columns=['Price'])
y = data['Price']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Instantiate an XGBoost regressor
xgb_model = XGBRegressor()

# Train the XGBoost model
xgb_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = xgb_model.predict(X_test)

# Calculate Mean Squared Error (MSE)
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error (MSE):", mse)

# Calculate Normalized Mean Squared Error (NMSE)
y_mean = y_test.mean()
nmse = ((y_test - y_pred) ** 2).mean() / y_mean ** 2
print("Normalized Mean Squared Error (NMSE):", nmse)

# Calculate R-squared
r_squared = r2_score(y_test, y_pred)
print("R-squared (R2):", r_squared)


Mean Squared Error (MSE): 1112582952585.087
Normalized Mean Squared Error (NMSE): 0.07005130246324502
R-squared (R2): 0.9910412498625102


In [None]:
#Decision tree

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder

# Read the cleaned data
data = pd.read_csv('cleaned_data_no_encoding.csv')

# Separate numeric and categorical columns
numeric_columns = data.select_dtypes(include=['float64', 'int64']).columns
categorical_columns = data.select_dtypes(include=['object']).columns

# Handle missing values for numeric columns
imputer = SimpleImputer(strategy='mean')
data[numeric_columns] = imputer.fit_transform(data[numeric_columns])

# Perform binary encoding for categorical columns
encoder = LabelEncoder()
data[categorical_columns] = data[categorical_columns].apply(encoder.fit_transform)

# Split the data into features (X) and target variable (y)
X = data.drop(columns=['Price'])
y = data['Price']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Instantiate a Decision Tree Regressor
dt_regressor = DecisionTreeRegressor()

# Train the model
dt_regressor.fit(X_train, y_train)

# Make predictions on the test set
y_pred = dt_regressor.predict(X_test)

# Calculate Mean Squared Error (MSE)
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error (MSE):", mse)

# Calculate Normalized Mean Squared Error (NMSE)
y_mean = y_test.mean()
nmse = ((y_test - y_pred) ** 2).mean() / y_mean ** 2
print("Normalized Mean Squared Error (NMSE):", nmse)

# Calculate R-squared
r_squared = r2_score(y_test, y_pred)
print("R-squared (R2):", r_squared)


Mean Squared Error (MSE): 70678157879849.05
Normalized Mean Squared Error (NMSE): 4.4500924660785195
R-squared (R2): 0.43088472176172543


In [None]:
#Random FOrest
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder

# Read the cleaned data
data = pd.read_csv('cleaned_data_no_encoding.csv')

# Separate numeric and categorical columns
numeric_columns = data.select_dtypes(include=['float64', 'int64']).columns
categorical_columns = data.select_dtypes(include=['object']).columns

# Handle missing values for numeric columns
imputer = SimpleImputer(strategy='mean')
data[numeric_columns] = imputer.fit_transform(data[numeric_columns])

# Perform binary encoding for categorical columns
encoder = LabelEncoder()
data[categorical_columns] = data[categorical_columns].apply(encoder.fit_transform)

# Split the data into features (X) and target variable (y)
X = data.drop(columns=['Price'])
y = data['Price']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Instantiate a Random Forest Regressor
rf_regressor = RandomForestRegressor()

# Train the model
rf_regressor.fit(X_train, y_train)

# Make predictions on the test set
y_pred = rf_regressor.predict(X_test)

# Calculate Mean Squared Error (MSE)
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error (MSE):", mse)

# Calculate Normalized Mean Squared Error (NMSE)
y_mean = y_test.mean()
nmse = ((y_test - y_pred) ** 2).mean() / y_mean ** 2
print("Normalized Mean Squared Error (NMSE):", nmse)

# Calculate R-squared
r_squared = r2_score(y_test, y_pred)
print("R-squared (R2):", r_squared)



Mean Squared Error (MSE): 5123545764981.261
Normalized Mean Squared Error (NMSE): 0.32259262397743055
R-squared (R2): 0.9587441401831557


In [None]:
#Gradient Boost
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder

# Read the cleaned data
data = pd.read_csv('cleaned_data_no_encoding.csv')

# Separate numeric and categorical columns
numeric_columns = data.select_dtypes(include=['float64', 'int64']).columns
categorical_columns = data.select_dtypes(include=['object']).columns

# Handle missing values for numeric columns
imputer = SimpleImputer(strategy='mean')
data[numeric_columns] = imputer.fit_transform(data[numeric_columns])

# Perform binary encoding for categorical columns
encoder = LabelEncoder()
data[categorical_columns] = data[categorical_columns].apply(encoder.fit_transform)

# Split the data into features (X) and target variable (y)
X = data.drop(columns=['Price'])
y = data['Price']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Instantiate a Gradient Boosting Regressor
gb_regressor = GradientBoostingRegressor()

# Train the model
gb_regressor.fit(X_train, y_train)

# Make predictions on the test set
y_pred = gb_regressor.predict(X_test)

# Calculate Mean Squared Error (MSE)
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error (MSE):", mse)

# Calculate Normalized Mean Squared Error (NMSE)
y_mean = y_test.mean()
nmse = ((y_test - y_pred) ** 2).mean() / y_mean ** 2
print("Normalized Mean Squared Error (NMSE):", nmse)

# Calculate R-squared
r_squared = r2_score(y_test, y_pred)
print("R-squared (R2):", r_squared)



Mean Squared Error (MSE): 10085069034747.889
Normalized Mean Squared Error (NMSE): 0.6349838631576571
R-squared (R2): 0.9187929193129246


In [None]:
#K-Nearest Neighbours
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

# Read the cleaned data
data = pd.read_csv('cleaned_data_no_encoding.csv')

# Separate numeric and categorical columns
numeric_columns = data.select_dtypes(include=['float64', 'int64']).columns
categorical_columns = data.select_dtypes(include=['object']).columns

# Handle missing values for numeric columns
imputer = SimpleImputer(strategy='mean')
data[numeric_columns] = imputer.fit_transform(data[numeric_columns])

# Perform binary encoding for categorical columns
encoder = LabelEncoder()
data[categorical_columns] = data[categorical_columns].apply(encoder.fit_transform)

# Split the data into features (X) and target variable (y)
X = data.drop(columns=['Price'])
y = data['Price']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Instantiate a KNeighborsRegressor
knn_regressor = KNeighborsRegressor()

# Train the model
knn_regressor.fit(X_train_scaled, y_train)

# Make predictions on the test set
y_pred = knn_regressor.predict(X_test_scaled)

# Calculate Mean Squared Error (MSE)
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error (MSE):", mse)

# Calculate Normalized Mean Squared Error (NMSE)
y_mean = y_test.mean()
nmse = ((y_test - y_pred) ** 2).mean() / y_mean ** 2
print("Normalized Mean Squared Error (NMSE):", nmse)

# Calculate R-squared
r_squared = r2_score(y_test, y_pred)
print("R-squared (R2):", r_squared)


Mean Squared Error (MSE): 32815698860776.207
Normalized Mean Squared Error (NMSE): 2.0661672382250487
R-squared (R2): 0.7357611439239456


In [None]:
#Simple regression
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder

# Read the cleaned data
data = pd.read_csv('cleaned_data_no_encoding.csv')

# Separate numeric and categorical columns
numeric_columns = data.select_dtypes(include=['float64', 'int64']).columns
categorical_columns = data.select_dtypes(include=['object']).columns

# Handle missing values for numeric columns
imputer = SimpleImputer(strategy='mean')
data[numeric_columns] = imputer.fit_transform(data[numeric_columns])

# Perform binary encoding for categorical columns
encoder = LabelEncoder()
data[categorical_columns] = data[categorical_columns].apply(encoder.fit_transform)

# Split the data into features (X) and target variable (y)
X = data.drop(columns=['Price'])
y = data['Price']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Instantiate a Linear Regression model
linear_regressor = LinearRegression()

# Train the model
linear_regressor.fit(X_train, y_train)

# Make predictions on the test set
y_pred = linear_regressor.predict(X_test)

# Calculate Mean Squared Error (MSE)
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error (MSE):", mse)

# Calculate Normalized Mean Squared Error (NMSE)
y_mean = y_test.mean()
nmse = ((y_test - y_pred) ** 2).mean() / y_mean ** 2
print("Normalized Mean Squared Error (NMSE):", nmse)

# Calculate R-squared
r_squared = r2_score(y_test, y_pred)
print("R-squared (R2):", r_squared)



Mean Squared Error (MSE): 108462797793820.33
Normalized Mean Squared Error (NMSE): 6.8291179876618004
R-squared (R2): 0.1266349152751347


In [None]:
#Support Vector regression
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

# Read the cleaned data
data = pd.read_csv('cleaned_data_no_encoding.csv')

# Separate numeric and categorical columns
numeric_columns = data.select_dtypes(include=['float64', 'int64']).columns
categorical_columns = data.select_dtypes(include=['object']).columns

# Handle missing values for numeric columns
imputer = SimpleImputer(strategy='mean')
data[numeric_columns] = imputer.fit_transform(data[numeric_columns])

# Perform binary encoding for categorical columns
encoder = LabelEncoder()
data[categorical_columns] = data[categorical_columns].apply(encoder.fit_transform)

# Split the data into features (X) and target variable (y)
X = data.drop(columns=['Price'])
y = data['Price']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Instantiate a Support Vector Regressor
svr_regressor = SVR(kernel='rbf')  # You can choose different kernels like 'linear', 'poly', 'rbf', etc.

# Train the model
svr_regressor.fit(X_train_scaled, y_train)

# Make predictions on the test set
y_pred = svr_regressor.predict(X_test_scaled)

# Calculate Mean Squared Error (MSE)
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error (MSE):", mse)

# Calculate Normalized Mean Squared Error (NMSE)
y_mean = y_test.mean()
nmse = ((y_test - y_pred) ** 2).mean() / y_mean ** 2
print("Normalized Mean Squared Error (NMSE):", nmse)

# Calculate R-squared
r_squared = r2_score(y_test, y_pred)
print("R-squared (R2):", r_squared)



Mean Squared Error (MSE): 135514608284740.98
Normalized Mean Squared Error (NMSE): 8.532374858958274
R-squared (R2): -0.09119190868597471


In [None]:
#Artificial Neural Network
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, StandardScaler
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Read the cleaned data
data = pd.read_csv('cleaned_data_no_encoding.csv')

# Separate numeric and categorical columns
numeric_columns = data.select_dtypes(include=['float64', 'int64']).columns
categorical_columns = data.select_dtypes(include=['object']).columns

# Handle missing values for numeric columns
imputer = SimpleImputer(strategy='mean')
data[numeric_columns] = imputer.fit_transform(data[numeric_columns])

# Perform binary encoding for categorical columns
encoder = LabelEncoder()
data[categorical_columns] = data[categorical_columns].apply(encoder.fit_transform)

# Split the data into features (X) and target variable (y)
X = data.drop(columns=['Price'])
y = data['Price']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Build the ANN model
model = Sequential([
    Dense(64, activation='relu', input_shape=(X_train_scaled.shape[1],)),
    Dense(64, activation='relu'),
    Dense(1)
])

# Compile the model
model.compile(optimizer='adam', loss='mean_squared_error')

# Train the model
model.fit(X_train_scaled, y_train, epochs=50, batch_size=32, verbose=0)

# Make predictions on the test set
y_pred = model.predict(X_test_scaled).flatten()

# Calculate Mean Squared Error (MSE)
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error (MSE):", mse)

# Calculate Normalized Mean Squared Error (NMSE)
y_mean = y_test.mean()
nmse = ((y_test - y_pred) ** 2).mean() / y_mean ** 2
print("Normalized Mean Squared Error (NMSE):", nmse)

# Calculate R-squared
r_squared = r2_score(y_test, y_pred)
print("R-squared (R2):", r_squared)



Mean Squared Error (MSE): 128022998936902.08
Normalized Mean Squared Error (NMSE): 8.060682396708541
R-squared (R2): -0.030867906669739487
