In [None]:
# Import the required modules
import pandas as pd
pd.set_option('display.max_columns', None)
import numpy as np

# visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Machine Learning
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, ExtraTreesRegressor, GradientBoostingRegressor
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.preprocessing import StandardScaler

# suppress warnings
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load 'used_clean.csv' (path is relative to the notebook's working directory)
df_full = pd.read_csv(filepath_or_buffer='used_clean.csv')

# Peek at the first five rows
df_full.head(5)

In [None]:
# Columns carried forward into the analysis
col = [
    'price', 'miles', 'year', 'make', 'model', 'body_type',
    'vehicle_type', 'drivetrain', 'transmission', 'fuel_type',
    'engine_size', 'engine_block', 'state',
]

# Work on an independent copy so later edits never touch df_full
df = df_full.loc[:, col].copy()

In [None]:
# Display the column index of the working frame
df.columns

In [None]:
columns = ['price', 'miles', 'year', 'make', 'model', 'body_type',
       'vehicle_type', 'drivetrain', 'transmission', 'fuel_type',
       'engine_size', 'engine_block', 'state']

# For every column print its name, its number of distinct values and its
# frequency table. The loop variable is deliberately not named `col`, so the
# `col` list used to build `df` is not silently overwritten with a string.
for column_name in columns:
    print(column_name)
    print(df[column_name].nunique())
    print(df[column_name].value_counts())

In [None]:
# Swap the 'E' placeholder for 0 and leave every other value untouched
# (presumably 'E' marks vehicles without a displacement figure — TODO confirm)
df["engine_size"] = df["engine_size"].map(lambda size: size if size != 'E' else 0)
df.head()

In [None]:
# Cast engine_size to a float column
df["engine_size"] = df["engine_size"].astype("float64")

In [None]:
def label_engine_size(engine):
    """Map a numeric engine size onto an integer bucket.

    Buckets:
      * 0.1 <= engine < 2  -> 1
      * k <= engine < k+1  -> k, for k = 2 .. 7
      * anything else (below 0.1, 8 and above, NaN) -> 0

    Parameters
    ----------
    engine : float
        Numeric engine size.

    Returns
    -------
    int
        Bucket label between 0 and 7.
    """
    # The smallest bucket is wider than the rest, so handle it on its own.
    if 0.1 <= engine < 2:
        return 1

    # Every remaining bucket is one unit wide and is labelled by its lower bound.
    for lower_bound in range(2, 8):
        if lower_bound <= engine < lower_bound + 1:
            return lower_bound

    # Out-of-range values (and NaN, which fails every comparison) land here.
    return 0

# Replace every engine_size value with its bucket label
df['engine_size'] = df['engine_size'].map(label_engine_size)

# Preview the first ten rows of the updated frame
df.head(n=10)

In [None]:
# Rows per engine-size bucket
df["engine_size"].value_counts()

In [None]:
# Column dtypes and non-null counts of the working frame
df.info()

In [None]:
# Summary statistics of the working frame
df.describe()

In [None]:
# Encoding, separate out features
# (The unassigned column list that used to open this cell was removed: it had
# no effect, a missing comma fused 'model' and 'body_type' into one string,
# and it named a 'city' column that is not part of df.)
meta = ['price']  # target column
num_features = ['miles', 'year', 'engine_size']  # passed through StandardScaler
cat_features = ['make', 'model', 'body_type', 'vehicle_type', 'drivetrain', 'transmission', 'fuel_type', 'engine_block', 'state']  # encoded separately

In [None]:
# Numeric features first: inspect their ranges to decide whether a scaler is needed
df_num = df[num_features]
df_num.describe()

In [None]:
# YES we need a scaler

# initialize
scaler = StandardScaler()

# fit
# NOTE(review): the scaler is fitted on every row, before the train/test
# split, so test-set statistics influence the transform — confirm this is
# acceptable for the analysis.
scaler.fit(df_num)

# predict/transform
scaled_data = scaler.transform(df_num)

# Carry df_num's index over: scaler.transform returns a bare array, and without
# an explicit index the frame would get a fresh 0..n-1 index that only lines up
# with df in a later column-wise concat if df also has a default index.
df_scaled = pd.DataFrame(scaled_data, columns=num_features, index=df_num.index)

df_scaled.head()

In [None]:
# Categorical Up Next
# Take an explicit copy (as is done when df is built from df_full): later cells
# assign new values into df_cat's columns, and those writes must never be
# ambiguous about whether they target df or a derived frame.
df_cat = df.loc[:, cat_features].copy()
df_cat.head()

In [None]:
# Binary-encode vehicle_type: 0 for "Truck", 1 for everything else.
# The flag is derived from the untouched df column (not from df_cat itself), so
# re-running this cell gives the same result instead of turning every row into
# 1 once the column already holds 0/1. The comparison is vectorised.
df_cat["vehicle_type"] = (df["vehicle_type"] != "Truck").astype(int)
df_cat.head()

In [None]:
# Binary-encode transmission: 0 for "Automatic", 1 for everything else.
# The flag is derived from the untouched df column (not from df_cat itself), so
# re-running this cell gives the same result instead of turning every row into
# 1 once the column already holds 0/1. The comparison is vectorised.
df_cat["transmission"] = (df["transmission"] != "Automatic").astype(int)
df_cat.head()

In [None]:
# Models that appear fewer than `cutoff` times are pooled into "Other"
cutoff = 1000

# Count once and reuse (value_counts skips missing values, so they are never
# selected for replacement).
model_counts = df_cat['model'].value_counts()
application_types_to_replace = model_counts[model_counts < cutoff].index.tolist()

# Replace in dataframe: one vectorised membership test instead of one
# full-column replace() per rare model.
df_cat.loc[df_cat['model'].isin(application_types_to_replace), 'model'] = "Other"

# Check to make sure replacement was successful
df_cat['model'].value_counts()

In [None]:
# Preview the categorical frame
df_cat.head()

In [None]:
#cutoff = 500
#application_types_to_replace = df_cat.trim.value_counts()[df_cat.trim.value_counts() < cutoff].index.tolist()

# Replace in dataframe
#for app in application_types_to_replace:
    #df_cat['trim'] = df_cat['trim'].replace(app,"Other")

# Check to make sure replacement was successful
#df_cat['trim'].value_counts()

In [None]:
def label_fuel_type(fuel):
    """Collapse a raw fuel description into one of four labels.

    Matching is case-insensitive and ignores surrounding whitespace. Rules are
    checked in this order:

      1. contains 'diesel'                                 -> 'Diesel'
      2. contains 'hybrid', or both 'electric' and a '/'   -> 'Hybrid'
      3. is exactly 'electric'                             -> 'Electric'
      4. anything else                                     -> 'Gas'

    Recognising 'hybrid' makes the function stable on its own output: feeding
    it any of the four labels returns that label unchanged, so applying it
    twice to a column cannot turn 'Hybrid' into 'Gas'.

    Parameters
    ----------
    fuel : str
        Raw fuel description. Non-string input (e.g. NaN) raises
        AttributeError.

    Returns
    -------
    str
        One of 'Diesel', 'Hybrid', 'Electric', 'Gas'.
    """
    fuel = fuel.strip().lower()  # normalise case and padding before matching
    if 'diesel' in fuel:
        return 'Diesel'
    if 'hybrid' in fuel or ('electric' in fuel and '/' in fuel):
        return 'Hybrid'
    if fuel == 'electric':
        return 'Electric'
    return 'Gas'

# Apply the custom function to the 'fuel_type' column.
# The raw values are read from df rather than from df_cat, so re-running this
# cell always relabels the original descriptions instead of relabelling labels
# that an earlier run already wrote into df_cat.
df_cat['fuel_type'] = df['fuel_type'].apply(label_fuel_type)

# Show the modified DataFrame
df_cat.head(10)

In [None]:
# Rows per fuel label
df_cat["fuel_type"].value_counts()

In [None]:
def label_body_type(body):
    """Merge some body-type descriptions into broader groups.

    The first group with a keyword that occurs as a substring of `body` wins
    (matching is case-sensitive); a description matching no keyword is
    returned unchanged.

      'Car Van', 'Combi'        -> 'Cargo Van'
      'Targa', 'Roadster'       -> 'Coupe'
      'Cutaway', 'Chassis Cab'  -> 'Pickup'
      'Mini Mpv'                -> 'Hatchback'
      'Passenger Van'           -> 'Minivan'

    An earlier version carried a second 'Targa' -> 'Roadster' rule after the
    'Pickup' rule. It could never fire because 'Targa' is already claimed by
    the 'Coupe' rule, so it has been removed; results are unchanged.

    Parameters
    ----------
    body : str
        Raw body-type description. Non-string input (e.g. NaN) raises
        TypeError.

    Returns
    -------
    str
        The group label, or `body` itself when nothing matches.
    """
    # Ordered (keywords, label) rules; order matters because the first hit wins.
    groups = (
        (('Car Van', 'Combi'), 'Cargo Van'),
        (('Targa', 'Roadster'), 'Coupe'),
        (('Cutaway', 'Chassis Cab'), 'Pickup'),
        (('Mini Mpv',), 'Hatchback'),
        (('Passenger Van',), 'Minivan'),
    )
    for keywords, label in groups:
        if any(keyword in body for keyword in keywords):
            return label
    return body

# Regroup the body_type column with label_body_type
df_cat['body_type'] = df_cat['body_type'].map(label_body_type)

# Preview the first ten rows of the updated frame
df_cat.head(n=10)

In [None]:
# Rows per body-type group
df_cat["body_type"].value_counts()

In [None]:
# One-hot encode the categorical frame, emitting integer indicator columns
df_cat = pd.get_dummies(data=df_cat, dtype=int)
df_cat.head(5)

In [None]:
# Pull every listing priced at exactly 349900 for a closer look
price_is_349900 = df['price'].eq(349900)
roadster_rows = df.loc[price_is_349900]
roadster_rows

In [None]:
# Assemble the modelling frame column-wise: target, then the scaled numeric
# features, then the encoded categorical features
df_final = pd.concat([df.loc[:, meta], df_scaled, df_cat], axis=1)
df_final.head()

In [None]:
# How often does each distinct price occur?
price_counts = df_final['price'].value_counts()

# Keep only the prices that occur more than once
filtered_price_counts = price_counts.loc[price_counts.gt(1)]

# Show them
print(filtered_price_counts)

In [None]:
# Step 1: Get the data
X = df_final.drop(columns=["price"])
y = df_final.price

# The target is modelled as log(price). The log of a zero, negative or missing
# price is -inf/NaN, and with warnings suppressed that would only surface later
# as an obscure error inside model fitting, so fail here with a clear message.
if not (y > 0).all():
    raise ValueError("price must be strictly positive (and non-missing) before taking its log")
y = np.log(y)

X = X.astype('float32')

# Plain random 75/25 split with a fixed seed; no stratification is applied.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1)
print(X_train.shape, X_test.shape)

In [None]:
# Function for Regression
def _report_metrics(label, y_true, y_pred):
    """Print R2, MSE, RMSE and MAE for one data split under the heading `label`."""
    r2 = r2_score(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    rmse = np.sqrt(mse)

    print(f"""{label} METRICS
    R2: {r2}
    MSE: {mse}
    RMSE: {rmse}
    MAE: {mae}
    """)


def doRegression(model, X_train, X_test, y_train, y_test):
    """Fit `model`, print train/test metrics and plot test-set diagnostics.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``; it is fitted in place.
    X_train, X_test : feature matrices for the training and test splits.
    y_train, y_test : targets for the training and test splits.

    Returns
    -------
    None
        Results are reported through printed text and two matplotlib figures
        (predicted vs actual, and residuals vs predictions).
    """
    # Step 3: Fit the model
    model.fit(X_train, y_train)

    # Step 4: Evaluate the model
    train_preds = model.predict(X_train)
    test_preds = model.predict(X_test)

    _report_metrics("TRAIN", y_train, train_preds)
    _report_metrics("TEST", y_test, test_preds)

    # VISUALIZE TEST RESULTS
    # Predicted vs Actual Plot. The y = x reference line is drawn from its two
    # end points rather than through every (unsorted) test observation.
    actual_lo, actual_hi = np.min(y_test), np.max(y_test)
    plt.scatter(y_test, test_preds)
    plt.plot([actual_lo, actual_hi], [actual_lo, actual_hi])
    plt.title("Predicted vs Actual Plot")
    plt.xlabel("Actual")
    plt.ylabel("Predicted")
    plt.show()

    # Residual Plot (residual = prediction - actual). The zero line's extent is
    # taken with vectorised min/max instead of Python's element-wise builtins.
    resids = test_preds - y_test
    plt.scatter(test_preds, resids)
    plt.hlines(0, np.min(test_preds), np.max(test_preds))
    plt.title("Residual Plot")
    plt.xlabel("Predictions")
    plt.ylabel("Residuals")
    plt.show()

In [None]:
# Linear regression with default settings
lr = LinearRegression()

# Fit, score and plot through the shared helper
doRegression(model=lr, X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test)

In [None]:
# Ridge regression with default settings
ridge = Ridge()

# Fit, score and plot through the shared helper
doRegression(model=ridge, X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test)

In [None]:
# Lasso regression with default settings
lasso = Lasso()

# Fit, score and plot through the shared helper
doRegression(model=lasso, X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test)

In [None]:
# Elastic-net regression with default settings
en = ElasticNet()

# Fit, score and plot through the shared helper
doRegression(model=en, X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test)

In [None]:
# Decision-tree regressor, seeded with random_state=42
dt = DecisionTreeRegressor(random_state=42)

# Fit, score and plot through the shared helper
doRegression(model=dt, X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test)

In [None]:
# Random-forest regressor, seeded with random_state=42
rf = RandomForestRegressor(random_state=42)

# Fit, score and plot through the shared helper
doRegression(model=rf, X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test)

In [None]:
# AdaBoost regressor, seeded with random_state=42
ada = AdaBoostRegressor(random_state=42)

# Fit, score and plot through the shared helper
doRegression(model=ada, X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test)

In [None]:
# Extra-trees regressor, seeded with random_state=42
et = ExtraTreesRegressor(random_state=42)

# Fit, score and plot through the shared helper
doRegression(model=et, X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test)

In [None]:
# Gradient-boosting regressor, seeded with random_state=42
gb = GradientBoostingRegressor(random_state=42)

# Fit, score and plot through the shared helper
doRegression(model=gb, X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test)