import time
import warnings

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.metrics import precision_score, recall_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Silence every warning category for the rest of the run (keeps the console
# output limited to the results printed below).
warnings.filterwarnings('ignore')

def print_results(results):
    """Print the best parameters and every candidate's CV score of a search.

    Args:
        results: A fitted search object exposing ``best_params_`` and a
            ``cv_results_`` mapping with the ``'mean_test_score'``,
            ``'std_test_score'`` and ``'params'`` entries (for example a
            fitted ``GridSearchCV``).

    Returns:
        None. One line is printed for the best parameters, followed by one
        line per evaluated parameter combination.
    """
    print("BEST PARAMS: {}".format(results.best_params_))

    means = results.cv_results_['mean_test_score']
    stds = results.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, results.cv_results_['params']):
        # The spread is reported as two standard deviations.
        print("{} (+/-{}) for {}".format(round(mean, 3), round(std * 2, 3), params))

def evaluate_model(model, features, labels):
    """Predict on ``features`` and print accuracy, precision, recall and latency.

    Args:
        model: A fitted estimator with a ``predict`` method. When it exposes
            ``named_steps`` (a scikit-learn ``Pipeline``), the step named
            ``'model'`` supplies the class name shown in the output;
            otherwise the estimator's own class name is used.
        features: Feature matrix passed to ``model.predict``.
        labels: True labels compared against the predictions.

    Returns:
        None. A single summary line is printed.
    """
    # perf_counter is a monotonic, high-resolution clock, so the measured
    # latency cannot be skewed by system clock adjustments.
    start = time.perf_counter()
    predictions = model.predict(features)
    end = time.perf_counter()

    accuracy = round(accuracy_score(labels, predictions), 3)
    # Macro averaging weights every class equally.
    precision = round(precision_score(labels, predictions, average='macro'), 3)
    recall = round(recall_score(labels, predictions, average='macro'), 3)
    latency = round((end - start) * 1000, 1)  # milliseconds

    # Fall back to the model itself when it is not a pipeline.
    named_steps = getattr(model, 'named_steps', None)
    estimator = named_steps['model'] if named_steps is not None else model

    print(f"{type(estimator).__name__} -- "
          f"Accuracy: {accuracy} / Precision: {precision} / Recall: {recall} / Latency: {latency}ms")

# Load the preprocessed dataset.
df = pd.read_csv("processed_mpg_data.csv")

# Encode the target column as integer class ids; the fitted encoder is kept
# so the original class names can be used when reporting results.
label_encoder = LabelEncoder()
df['mpg_class'] = label_encoder.fit_transform(df['mpg_class'])

X = df.drop(columns=['mpg_class'])
y = df['mpg_class']

# Stratified 60/20/20 split: first hold out 40% of the rows, then halve that
# hold-out into validation and test sets.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.4, random_state=42, stratify=y
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp
)

print("Train size:", X_train.shape[0])
print("Validation size:", X_val.shape[0])
print("Test size:", X_test.shape[0])

# Scaling followed by a random forest; the step names are what the
# 'model__' prefixes in the parameter grid refer to.
rf_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('model', RandomForestClassifier(random_state=42)),
])

# Hyper-parameter grid searched exhaustively (3 * 4 * 3 = 36 combinations).
rf_params = {
    'model__n_estimators': [50, 100, 200],
    'model__max_depth': [None, 4, 8, 12],
    'model__min_samples_split': [2, 5, 10],
}

# 5-fold cross-validated search on the training split only.
rf_cv = GridSearchCV(rf_pipeline, rf_params, cv=5, scoring='accuracy')
rf_cv.fit(X_train, y_train)

print("Forest Tuning Results:")
print_results(rf_cv)

# Persist the best pipeline found by the search.
joblib.dump(rf_cv.best_estimator_, 'RF_model_mpg.pkl')

rf_best = rf_cv.best_estimator_

# The two headings name the split being scored so the otherwise identical
# summary lines can be told apart in the output.
print("Validation Set Results (Random Forest):")
evaluate_model(rf_best, X_val, y_val)

print("Test Set Results (Random Forest):")
evaluate_model(rf_best, X_test, y_test)

# Per-class report on the test split, labelled with the original class names.
rf_predictions = rf_best.predict(X_test)
print("Report (Random Forest on Test Set):")
print(classification_report(y_test, rf_predictions,
                            target_names=label_encoder.classes_))

# Confusion matrix heatmap: rows are actual classes, columns are predictions.
cm = confusion_matrix(y_test, rf_predictions)
plt.figure(figsize=(6, 5))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=label_encoder.classes_,
            yticklabels=label_encoder.classes_)
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Confusion Matrix - Random Forest (Test Set)")
plt.show()

from sklearn.ensemble import RandomForestClassifier

# Stand-alone (unfitted) random forest with fixed, more conservative settings.
rf = RandomForestClassifier(
    n_estimators=200,
    max_depth=6,  # Reduce depth to avoid overfitting
    min_samples_split=5,
    max_features='sqrt',  # Randomly select subset of features per split
    random_state=42,
)