In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sqlalchemy import create_engine

In [2]:
# Connection string to the PostgreSQL database
connection_string = 'postgresql://postgres:140494@localhost:5433/vehicle4'

In [3]:
# Create a SQLAlchemy engine to connect to the database
engine = create_engine(connection_string)

In [4]:
# SQL query to fetch the data from the database table
query = 'SELECT * FROM vehicle_listings;'  
# Read the data from the database into a pandas DataFrame
vehicle_df = pd.read_sql(query, engine)

vehicle_df.head()

Unnamed: 0,id,City,Province,Year,Make,Model,Price,Mileage,URL
0,1,Sussex,NB,2016,Honda,Civic,16777.0,136000,facebook.com/marketplace/item/834601531421503/...
1,2,Shediac,NB,2017,Honda,Civic,24000.0,138000,facebook.com/marketplace/item/192129936942166/...
2,3,Balmoral,NB,2018,Honda,Civic,13900.0,175000,facebook.com/marketplace/item/294296019846936/...
3,4,Halifax,NS,2008,Honda,Civic,8475.0,176000,facebook.com/marketplace/item/980612973081374/...
4,5,Charlottetown,PE,2016,Honda,Civic,16995.0,175000,facebook.com/marketplace/item/565499695587166/...


In [5]:
X = vehicle_df[['City', 'Province', 'Year', 'Make', 'Model', 'Mileage']]
y = vehicle_df['Price']

In [6]:
# Use pd.get_dummies() to convert 'City', 'Province', 'Make', 'Model'  columns to numerical values
X = pd.get_dummies(X)

X.head()

Unnamed: 0,Year,Mileage,City_Abbotsford,City_Ajax,City_Altona,City_Amherst,City_Baie-Comeau,City_Balmoral,City_Beaumont,City_Blainville,...,Province_BC,Province_MB,Province_NB,Province_NS,Province_ON,Province_PE,Province_QC,Province_SK,Make_Honda,Model_Civic
0,2016,136000,False,False,False,False,False,False,False,False,...,False,False,True,False,False,False,False,False,True,True
1,2017,138000,False,False,False,False,False,False,False,False,...,False,False,True,False,False,False,False,False,True,True
2,2018,175000,False,False,False,False,False,True,False,False,...,False,False,True,False,False,False,False,False,True,True
3,2008,176000,False,False,False,False,False,False,False,False,...,False,False,False,True,False,False,False,False,True,True
4,2016,175000,False,False,False,False,False,False,False,False,...,False,False,False,False,False,True,False,False,True,True


In [7]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [8]:
# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the StandardScaler to the training data and transform both training and testing data
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [9]:
# Create and train the Linear Regression model
model = LinearRegression()
model.fit(X_train_scaled, y_train)

LinearRegression()

In [10]:
y_pred = model.predict(X_test_scaled)

In [11]:
# Create a DataFrame to display y_test and y_pred side by side
result_df = pd.DataFrame({'Actual Price (y_test)': y_test, 'Predicted Price (y_pred)': y_pred})
result_df.sample(20)

Unnamed: 0,Actual Price (y_test),Predicted Price (y_pred)
308,6000.0,928.4459
5,2500.0,1506.298
108,1111.0,21953.45
238,20990.0,16808.45
316,24728.0,21348.45
76,21990.0,21042.3
93,4000.0,4065.446
33,6700.0,6018.298
31,12990.0,12322.3
185,25995.0,26330.45


In [12]:
# Get only the rows where 'Predicted Price (y_pred)' is negative
negative_predictions_df = result_df[result_df['Predicted Price (y_pred)'] < 0]

# Display the DataFrame with negative predicted prices
negative_predictions_df

Unnamed: 0,Actual Price (y_test),Predicted Price (y_pred)
202,2000.0,-3.270218e+16
42,5000.0,-2381.702
7,12499.0,-3.532181e+17
140,10750.0,-3.180892e+16
242,11500.0,-6.649861e+16
17,29988.0,-2.770002e+17


In [13]:
# Calculate the Mean Squared Error to evaluate the model's performance
mse = mean_squared_error(result_df['Actual Price (y_test)'], result_df['Predicted Price (y_pred)'])
print(f"Mean Squared Error: {mse}")

Mean Squared Error: 3.2552633973268354e+33


In [14]:
# Calculate the percentage-wise absolute difference for each row
result_df['Percentage-wise Absolute Difference'] = (abs(result_df['Actual Price (y_test)'] - result_df['Predicted Price (y_pred)']) / result_df['Actual Price (y_test)']) * 100

# Sort the DataFrame based on the percentage-wise absolute difference in ascending order
sorted_result_df = result_df.sort_values(by='Percentage-wise Absolute Difference', ascending=False)

# Display the sorted DataFrame
sorted_result_df

Unnamed: 0,Actual Price (y_test),Predicted Price (y_pred),Percentage-wise Absolute Difference
7,12499.0,-3.532181e+17,2.825971e+15
24,4450.0,9.513045e+16,2.137763e+15
202,2000.0,-3.270218e+16,1.635109e+15
17,29988.0,-2.770002e+17,9.237036e+14
213,28500.0,1.657274e+17,5.814995e+14
...,...,...,...
109,17999.0,1.831345e+04,1.747019e+00
93,4000.0,4.065446e+03,1.636149e+00
185,25995.0,2.633045e+04,1.290425e+00
260,5400.0,5.348446e+03,9.547047e-01


In [15]:
#Show only lines that are over 25% difference
over25_df = sorted_result_df[sorted_result_df['Percentage-wise Absolute Difference']>25]

#Count how many lines are over 25% difference
over25_df.count()

Actual Price (y_test)                  27
Predicted Price (y_pred)               27
Percentage-wise Absolute Difference    27
dtype: int64

In [16]:
#Show only lines that are under 15% difference
under15_df = sorted_result_df[sorted_result_df['Percentage-wise Absolute Difference']<15]

#Count how many lines are under 15% difference
under15_df.count()

Actual Price (y_test)                  44
Predicted Price (y_pred)               44
Percentage-wise Absolute Difference    44
dtype: int64