In [1]:
import os

import pandas as pd
from scipy.optimize import nnls
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sqlalchemy import create_engine

In [2]:
# Connection string to the PostgreSQL database.
# The password is deliberately NOT stored in the notebook. Either set the
# VEHICLE_DB_URL environment variable to a full database URL, or keep the
# password-free default below and supply the password through PostgreSQL's
# standard client mechanisms (the PGPASSWORD environment variable or ~/.pgpass).
connection_string = os.environ.get(
    'VEHICLE_DB_URL',
    'postgresql://postgres@localhost:5433/vehicle4',
)

In [3]:
# Create a SQLAlchemy engine to connect to the database.
# The database URL comes from `connection_string`, defined in the previous cell.
engine = create_engine(connection_string)

In [4]:
# Load every row and column of the vehicle_listings table into a pandas DataFrame
query = 'SELECT * FROM vehicle_listings;'
vehicle_df = pd.read_sql(sql=query, con=engine)

In [5]:
# Close the database connection: the listings are already loaded into
# `vehicle_df`, which is all the visible cells below work with.
engine.dispose()

In [6]:
# Preprocess the data - select relevant features
X = vehicle_df[['Province', 'Year', 'Make', 'Model', 'Mileage']]
y = vehicle_df['Price']

In [8]:
# Use LabelEncoder to convert categorical columns to numerical values
label_encoder = LabelEncoder()
X['Province'] = label_encoder.fit_transform(X['Province'])
X['Make'] = label_encoder.fit_transform(X['Make'])
X['Model'] = label_encoder.fit_transform(X['Model'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['Province'] = label_encoder.fit_transform(X['Province'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['Make'] = label_encoder.fit_transform(X['Make'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['Model'] = label_encoder.fit_transform(X['Model'])


In [9]:
# Preview 10 random rows of the encoded features; the fixed seed (the same value
# used for the train/test split) makes the preview identical on every run.
X.sample(10, random_state=42)

Unnamed: 0,Province,Year,Make,Model,Mileage
290,1,2013,0,0,157000
304,1,2012,0,0,200000
0,3,2016,0,0,136000
57,7,2013,0,0,100000
244,0,2016,0,0,89000
156,5,2018,0,0,101000
27,7,2011,0,0,150000
240,0,2012,0,0,198000
10,3,2015,0,0,114000
147,5,2016,0,0,172000


In [11]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [12]:
# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the StandardScaler to the training data and transform both training and testing data
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [13]:
# Create and train the Linear Regression model
model = LinearRegression()
model.fit(X_train_scaled, y_train)

LinearRegression()

In [14]:
# Predict prices for the test split from its scaled features
y_pred = model.predict(X_test_scaled)

In [15]:
# Create a DataFrame to display y_test and y_pred side by side
# (rows keep the index of y_test, i.e. the original listing index)
result_df = pd.DataFrame({
    'Actual Price (y_test)': y_test,
    'Predicted Price (y_pred)': y_pred,
})

# Preview 10 random rows; the fixed seed makes the preview reproducible
result_df.sample(10, random_state=42)

Unnamed: 0,Actual Price (y_test),Predicted Price (y_pred)
126,26399.0,23139.191498
308,6000.0,1177.187809
232,25000.0,23514.435326
93,4000.0,5003.687125
77,22449.0,22818.668278
185,25995.0,23766.959779
25,17000.0,18571.409913
239,18995.0,17385.788796
75,13999.0,14470.759431
177,12800.0,13596.831714


In [16]:
# Keep just the rows where the model predicted a price below zero
negative_predictions_df = result_df.loc[result_df['Predicted Price (y_pred)'].lt(0)]

# Show those rows as the cell output
negative_predictions_df

Unnamed: 0,Actual Price (y_test),Predicted Price (y_pred)
42,5000.0,-3708.276625


In [17]:
# Evaluate the model: mean squared error between actual and predicted test prices
mse = mean_squared_error(
    y_true=result_df['Actual Price (y_test)'],
    y_pred=result_df['Predicted Price (y_pred)'],
)
print(f"Mean Squared Error: {mse}")

Mean Squared Error: 17548935.361846693


In [18]:
# Absolute prediction error of each row, expressed as a percentage of the actual price
actual_price = result_df['Actual Price (y_test)']
predicted_price = result_df['Predicted Price (y_pred)']
result_df['Percentage-wise Absolute Difference'] = (actual_price - predicted_price).abs() / actual_price * 100

# Order the rows from the largest percentage error to the smallest (descending)
sorted_result_df = result_df.sort_values(
    by='Percentage-wise Absolute Difference',
    ascending=False,
)

# Show the sorted DataFrame as the cell output
sorted_result_df

Unnamed: 0,Actual Price (y_test),Predicted Price (y_pred),Percentage-wise Absolute Difference
108,1111.0,23609.538539,2025.070976
202,2000.0,15948.566921,697.428346
42,5000.0,-3708.276625,174.165533
46,2500.0,4741.957285,89.678291
308,6000.0,1177.187809,80.380203
...,...,...,...
114,20700.0,20964.212000,1.276386
76,21990.0,21775.273564,0.976473
175,13499.0,13376.544322,0.907146
184,24500.0,24325.496890,0.712258


In [19]:
# Keep only the rows whose prediction is off by more than 25%
over25_df = sorted_result_df.loc[sorted_result_df['Percentage-wise Absolute Difference'].gt(25)]

# Per-column count of non-null values among the rows over the 25% threshold
over25_df.count()

Actual Price (y_test)                  19
Predicted Price (y_pred)               19
Percentage-wise Absolute Difference    19
dtype: int64

In [20]:
# Keep only the rows whose prediction is off by less than 15%
under15_df = sorted_result_df.loc[sorted_result_df['Percentage-wise Absolute Difference'].lt(15)]

# Per-column count of non-null values among the rows under the 15% threshold
under15_df.count()

Actual Price (y_test)                  49
Predicted Price (y_pred)               49
Percentage-wise Absolute Difference    49
dtype: int64