In [2]:
#importing the relevant libraries:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split # for splitting the data into training and testing sets
from sklearn.linear_model import LinearRegression # models we are going to use
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error # for comparing the predicted and test values
import seaborn as sns

In [4]:
# Load the data and discard the 'vin' column.
df = pd.read_csv('car_data.csv').drop(columns='vin')

# 'saledate' is a space-separated string such as
# "Tue Dec 16 2014 12:30:00 GMT-0800 (PST)"; split it into its parts.
parts = df['saledate'].str.split(' ', expand=True)

# Keep the pieces of the sale date as separate columns.
df['dow'] = parts[0]  # "Tue"
df['date'] = parts[1] + ' ' + parts[2] + ' ' + parts[3]  # "Dec 16 2014"
df['time'] = parts[4]  # "12:30:00"
# "GMT-0800" -> "0800": only the literal 'GMT-' prefix is removed.
df['GMT'] = parts[5].str.replace('GMT-', '', regex=False)
# The trailing "(PST)" abbreviation is not kept, so no column is built for it;
# the raw 'saledate' string is no longer needed once it has been split.
df = df.drop(columns='saledate')

# Convert the free-text categorical columns to lowercase.
for text_col in ('make', 'body', 'model'):
    df[text_col] = df[text_col].str.lower()

# New column "difference" (selling price - mmr).
df['difference'] = df['sellingprice'] - df['mmr']

# Drop every row that has a missing value in any column.
df = df.dropna()

# Drop rows whose "condition" column has a 0 entry.
cleaned_df = df[df["condition"] != 0]

In [6]:
# Multivariate linear regression: predict the selling price from the
# vehicle's year, odometer reading, condition and mmr value.

# Define features and target variable
X = cleaned_df[['year', 'odometer', 'condition', 'mmr']]
y = cleaned_df['sellingprice']

# Split data into train (80%) and test (20%) sets; the seeded RandomState
# makes the split reproducible every time this cell is run.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, y, test_size=0.2, random_state=np.random.RandomState(31287)
)

# Instantiate and fit the model
linear = LinearRegression()
linear.fit(X_train, Y_train)

# Get predictions for training and testing data
training_predictions = linear.predict(X_train)
testing_predictions = linear.predict(X_test)

# Create a table of the various scores (rows: train/test, columns: metric).
# RMSE is computed as sqrt(MSE) explicitly: the `squared=False` shortcut of
# mean_squared_error was deprecated in scikit-learn 1.4 and later removed,
# whereas np.sqrt(...) yields the same value on every version.
pd.DataFrame({
    "R^2": {
        "train": r2_score(Y_train, training_predictions),
        "test": r2_score(Y_test, testing_predictions),
    },
    "RMSE": {
        "train": np.sqrt(mean_squared_error(Y_train, training_predictions)),
        "test": np.sqrt(mean_squared_error(Y_test, testing_predictions)),
    },
    "MAE": {
        "train": mean_absolute_error(Y_train, training_predictions),
        "test": mean_absolute_error(Y_test, testing_predictions),
    },
})

Unnamed: 0,R^2,RMSE,MAE
train,0.969275,1685.195059,1044.834592
test,0.971178,1631.373731,1043.920997


In [7]:
# Pair each fitted regression coefficient with the name of its feature column.
feature_names = list(X.columns)
coefficients = pd.Series(linear.coef_.T, index=feature_names)
coefficients

year        -42.633078
odometer     -0.001016
condition    37.318204
mmr           0.984331
dtype: float64

In [8]:
# Intercept of the fitted regression, rounded to three decimal places.
intercept = np.round(linear.intercept_, decimals=3)
intercept

84690.925