# Regression Model

In [22]:
# Imports
import pandas as pd
from data_cleaning import clean_car_data
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler
from scipy.stats import boxcox

In [23]:
# Import the Data
# Read the USA cars dataset CSV into a DataFrame. The path is relative, so it
# is resolved against the kernel's current working directory.
cars_raw = pd.read_csv("USA_cars_datasets.csv")

In [24]:
# Clean the data
# clean_car_data is a project-local helper imported from data_cleaning; its
# cleaning rules are not visible in this notebook.
# NOTE(review): later cells select 'category'-dtype columns from the result,
# so it presumably returns a DataFrame with categorical columns - confirm.
cars = clean_car_data(cars_raw)

In [32]:
# One-hot encode the categorical variables

# Collect the names of every column stored with the pandas 'category' dtype
category_columns = cars.select_dtypes(include='category').columns

# Replace each categorical column with indicator (dummy) columns. With
# drop_first=True the first level of every variable gets no column of its own.
cars = pd.get_dummies(data=cars, columns=category_columns, drop_first=True)

In [33]:
# Inspect the encoded frame: column names, non-null counts and dtypes
cars.info()

<class 'pandas.core.frame.DataFrame'>
Index: 745 entries, 9 to 2203
Data columns (total 43 columns):
 #   Column            Non-Null Count  Dtype
---  ------            --------------  -----
 0   price             745 non-null    int64
 1   mileage           745 non-null    int64
 2   age               745 non-null    int64
 3   model_cab         745 non-null    bool 
 4   model_connect     745 non-null    bool 
 5   model_cutaway     745 non-null    bool 
 6   model_d           745 non-null    bool 
 7   model_dr          745 non-null    bool 
 8   model_drw         745 non-null    bool 
 9   model_ecosport    745 non-null    bool 
 10  model_edge        745 non-null    bool 
 11  model_el          745 non-null    bool 
 12  model_energi      745 non-null    bool 
 13  model_escape      745 non-null    bool 
 14  model_expedition  745 non-null    bool 
 15  model_explorer    745 non-null    bool 
 16  model_f-150       745 non-null    bool 
 17  model_f-650       745 non-null    bool 

In [34]:
# Split the frame into the target (price) and the predictors (every other column)
y = cars['price']
x = cars.drop('price', axis='columns')

In [35]:
# Min-max scale the numeric variables
# (MinMaxScaler rescales each column to the [0, 1] range of the data it was
# fitted on; it does not standardize to zero mean / unit variance.)
scaler = MinMaxScaler()

# Identify numeric columns (the bool dummy columns are not matched by 'number')
number_columns = x.select_dtypes('number').columns

# Fit the scaler on the training rows only, so the held-out rows do not leak
# their min/max into preprocessing. train_test_split chooses rows from the row
# count and random_state alone, so repeating the arguments of the partition
# step (train_size=0.8, random_state=42) selects exactly the rows that end up
# in x_train. Keep the two calls in sync if either argument changes.
numeric_train = train_test_split(cars[number_columns], train_size=0.8, random_state=42)[0]
scaler.fit(numeric_train)

# Apply the training-fitted scaling to every row. Reading from `cars` (never
# overwritten here) keeps the cell idempotent on re-run. Held-out values may
# fall slightly outside [0, 1].
x[number_columns] = scaler.transform(cars[number_columns])

In [36]:
# Partition the Data
# 80% of the rows go to the training set (train_size=0.8) and the rest are
# held out for testing; random_state=42 fixes the shuffle so the same
# partition is produced on every run.
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.8, random_state=42)

In [9]:
# TODO: Implement cross-validation (placeholder - this cell contains no code yet)

In [37]:
# Fit a linear regression on the training partition
model = LinearRegression()
model.fit(X=x_train, y=y_train)

In [38]:
# Predict values using the model
# NOTE(review): these are in-sample predictions - they are made on x_train,
# the same rows the model was fitted on; x_test is not scored in this cell.
# Confirm whether out-of-sample predictions were intended before reporting
# error metrics from `pred`.
pred = model.predict(x_train)

In [39]:
# Pull the fitted parameters off the estimator (these are learned weights,
# not evaluation metrics)
intercept = model.intercept_
coefficients = model.coef_

In [40]:
# Tabulate the fitted parameters, one row per feature column of x.
# `intercept` is a single scalar, so pandas repeats it on every row of the
# "Intercept" column; it is one model-wide value, not a per-feature quantity.
pd.DataFrame(
    data={"Intercept": intercept, "Coefficient": coefficients},
    index=x.columns,
)

Unnamed: 0,Intercept,Coefficient
mileage,36508.728127,-16713.27
age,36508.728127,-13080.61
model_cab,36508.728127,1626.465
model_connect,36508.728127,-1615.31
model_cutaway,36508.728127,-2651.614
model_d,36508.728127,-17774.17
model_dr,36508.728127,1.273293e-11
model_drw,36508.728127,21959.37
model_ecosport,36508.728127,-15286.67
model_edge,36508.728127,-4618.38
