In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sqlalchemy import create_engine
from scipy.optimize import nnls

In [2]:
# Connection string to the PostgreSQL database
connection_string = 'postgresql://postgres:140494@localhost:5433/vehicle4'

In [3]:
# Create a SQLAlchemy engine to connect to the database
engine = create_engine(connection_string)

In [4]:
# SQL query to fetch the data from the database table
query = 'SELECT * FROM vehicle_listings;' 
# Read the data from the database into a pandas DataFrame
vehicle_df = pd.read_sql(query, engine)

In [5]:
# Close the database connection
engine.dispose()

In [6]:
# Preprocess the data - select relevant features
X = vehicle_df[['City', 'Province', 'Year', 'Make', 'Model', 'Mileage']]
y = vehicle_df['Price']

In [7]:
# Use LabelEncoder to convert categorical columns to numerical values
label_encoder = LabelEncoder()
X['City'] = label_encoder.fit_transform(X['City'])
X['Province'] = label_encoder.fit_transform(X['Province'])
X['Make'] = label_encoder.fit_transform(X['Make'])
X['Model'] = label_encoder.fit_transform(X['Model'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['City'] = label_encoder.fit_transform(X['City'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['Province'] = label_encoder.fit_transform(X['Province'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['Make'] = label_encoder.fit_transform(X['Make'])
A value is trying to be set on a copy of a s

In [8]:
X.sample(10)

Unnamed: 0,City,Province,Year,Make,Model,Mileage
32,75,7,2009,0,0,156000
230,13,0,2006,0,0,145000
130,8,5,2020,0,0,148000
117,8,5,2018,0,0,87000
37,45,7,2017,0,0,57000
289,83,1,2012,0,0,102000
93,82,5,2007,0,0,182000
66,36,7,2014,0,0,102000
121,1,5,2008,0,0,178000
277,83,1,2008,0,0,85000


In [9]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [10]:
# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the StandardScaler to the training data and transform both training and testing data
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [11]:
# Create and train the Linear Regression model
model = LinearRegression()
model.fit(X_train_scaled, y_train)

LinearRegression()

In [12]:
y_pred = model.predict(X_test_scaled)

In [13]:
# Create a DataFrame to display y_test and y_pred_non_negative side by side
result_df = pd.DataFrame({'Actual Price (y_test)': y_test, 'Predicted Price (y_pred)': y_pred})
result_df.sample(10)

Unnamed: 0,Actual Price (y_test),Predicted Price (y_pred)
260,5400.0,6095.842388
57,13995.0,14300.615628
213,28500.0,19513.480023
7,12499.0,13969.760001
218,16900.0,16685.63363
137,25000.0,18880.079445
220,16500.0,15690.530317
132,7000.0,9193.188585
177,12800.0,14268.580844
76,21990.0,21615.293258


In [14]:
# Get only the rows where 'Predicted Price (y_pred)' is negative
negative_predictions_df = result_df[result_df['Predicted Price (y_pred)'] < 0]

# Display the DataFrame with negative predicted prices
negative_predictions_df

Unnamed: 0,Actual Price (y_test),Predicted Price (y_pred)
42,5000.0,-4306.938479


In [15]:
# Calculate the Mean Squared Error to evaluate the model's performance
mse = mean_squared_error(result_df['Actual Price (y_test)'], result_df['Predicted Price (y_pred)'])
print(f"Mean Squared Error: {mse}")

Mean Squared Error: 17691445.057546675


In [16]:
# Calculate the percentage-wise absolute difference for each row
result_df['Percentage-wise Absolute Difference'] = (abs(result_df['Actual Price (y_test)'] - result_df['Predicted Price (y_pred)']) / result_df['Actual Price (y_test)']) * 100

# Sort the DataFrame based on the percentage-wise absolute difference in ascending order
sorted_result_df = result_df.sort_values(by='Percentage-wise Absolute Difference', ascending=False)

# Display the sorted DataFrame
sorted_result_df

Unnamed: 0,Actual Price (y_test),Predicted Price (y_pred),Percentage-wise Absolute Difference
108,1111.0,24030.425582,2062.954598
202,2000.0,15140.988251,657.049413
42,5000.0,-4306.938479,186.138770
46,2500.0,4737.976525,89.519061
308,6000.0,1664.886682,72.251889
...,...,...,...
218,16900.0,16685.633630,1.268440
258,13000.0,12869.985207,1.000114
222,16000.0,16151.461197,0.946632
118,12299.0,12196.758502,0.831299


In [17]:
#Show only lines that are over 25% difference
over25_df = sorted_result_df[sorted_result_df['Percentage-wise Absolute Difference']>25]

#Count how many lines are over 25% difference
over25_df.count()

Actual Price (y_test)                  18
Predicted Price (y_pred)               18
Percentage-wise Absolute Difference    18
dtype: int64

In [18]:
#Show only lines that are under 15% difference
under15_df = sorted_result_df[sorted_result_df['Percentage-wise Absolute Difference']<15]

#Count how many lines are under 15% difference
under15_df.count()

Actual Price (y_test)                  50
Predicted Price (y_pred)               50
Percentage-wise Absolute Difference    50
dtype: int64