In [1]:
# Imports and path setup
import sys
from pathlib import Path
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

ROOT = Path.cwd()
DATA_PROCESSED = ROOT / 'data' / 'processed'
MODELS = ROOT / 'models'
OUTPUTS = ROOT / 'outputs'
RESULTS = OUTPUTS / 'results'
GRAPHS = OUTPUTS / 'graphs'
for p in [DATA_PROCESSED, MODELS, RESULTS, GRAPHS]:
    p.mkdir(parents=True, exist_ok=True)

X_TRAIN_IN = DATA_PROCESSED / 'X_train.csv'
X_TEST_IN = DATA_PROCESSED / 'X_test.csv'
Y_TRAIN_IN = DATA_PROCESSED / 'y_train.csv'
Y_TEST_IN = DATA_PROCESSED / 'y_test.csv'
MODEL_OUT = MODELS / 'car_price_model.pkl'
METRICS_OUT = RESULTS / 'model_metrics.csv'
IMPORTANCES_OUT = RESULTS / 'feature_importances.csv'
PREDICTIONS_OUT = RESULTS / 'predictions_test.csv'

print('Training input:', X_TRAIN_IN)

Training input: d:\Livstream\ Car Price Prediction with Machine Learning\notebooks\data\processed\X_train.csv


In [2]:
# Load train/test data with error handling
try:
    X_train = pd.read_csv(X_TRAIN_IN)
    X_test = pd.read_csv(X_TEST_IN)
    y_train = pd.read_csv(Y_TRAIN_IN).squeeze()
    y_test = pd.read_csv(Y_TEST_IN).squeeze()
    print('Loaded shapes - X_train, X_test, y_train, y_test:', X_train.shape, X_test.shape, y_train.shape, y_test.shape)
except FileNotFoundError as e:
    sys.exit(f'ERROR: Required train/test files missing: {e}')
except Exception as e:
    sys.exit(f'ERROR loading train/test files: {e}')

# Basic sanity checks
if X_train.shape[0] != y_train.shape[0]:
    sys.exit('ERROR: Mismatch between X_train and y_train row counts')
if X_test.shape[0] != y_test.shape[0]:
    sys.exit('ERROR: Mismatch between X_test and y_test row counts')

Loaded shapes - X_train, X_test, y_train, y_test: (239, 53) (60, 53) (239,) (60,)


In [3]:
# Train RandomForestRegressor
try:
    model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
    model.fit(X_train, y_train)
    print('Model training complete')
except Exception as e:
    sys.exit(f'ERROR during model training: {e}')

# Save trained model
try:
    with open(MODEL_OUT, 'wb') as f:
        pickle.dump(model, f)
    print('Saved trained model to', MODEL_OUT)
except Exception as e:
    print('Could not save model:', e)

SystemExit: ERROR during model training: could not convert string to float: 'ciaz'

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [None]:
# Predict and evaluate on test set
try:
    preds = model.predict(X_test)
    r2 = r2_score(y_test, preds)
    mae = mean_absolute_error(y_test, preds)
    mse = mean_squared_error(y_test, preds)
    metrics_df = pd.DataFrame([{'r2': r2, 'mae': mae, 'mse': mse}])
    metrics_df.to_csv(METRICS_OUT, index=False)
    print('Saved metrics to', METRICS_OUT)
    display(metrics_df)
except Exception as e:
    sys.exit(f'ERROR during prediction/evaluation: {e}')

In [None]:
# Save predictions with true values for inspection
try:
    pred_df = pd.DataFrame({'y_true': y_test, 'y_pred': preds})
    pred_df.to_csv(PREDICTIONS_OUT, index=False)
    print('Saved test predictions to', PREDICTIONS_OUT)
except Exception as e:
    print('Could not save predictions:', e)

In [None]:
# Feature importances (if feature names available)
try:
    fi = model.feature_importances_
    features = X_train.columns.tolist()
    fi_df = pd.DataFrame({'feature': features, 'importance': fi}).sort_values('importance', ascending=False)
    fi_df.to_csv(IMPORTANCES_OUT, index=False)
    print('Saved feature importances to', IMPORTANCES_OUT)
except Exception as e:
    print('Could not compute/save feature importances:', e)

In [None]:
# Plots: predicted vs actual and residuals
try:
    plt.figure(figsize=(7,5))
    plt.scatter(y_test, preds, alpha=0.6)
    plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--')
    plt.xlabel('Actual Selling_Price')
    plt.ylabel('Predicted Selling_Price')
    plt.title('Predicted vs Actual (Test)')
    out1 = GRAPHS / 'predicted_vs_actual.png'
    plt.tight_layout()
    plt.savefig(out1, dpi=150)
    plt.show()
    print('Saved plot to', out1)

    residuals = y_test - preds
    plt.figure(figsize=(7,4))
    plt.hist(residuals, bins=40, edgecolor='k')
    plt.title('Residuals Distribution (Test)')
    plt.xlabel('Residual')
    out2 = GRAPHS / 'residuals_histogram.png'
    plt.tight_layout()
    plt.savefig(out2, dpi=150)
    plt.show()
    print('Saved plot to', out2)
except Exception as e:
    print('Could not create/save plots:', e)

## End of Training

Saved outputs:
- `models/car_price_model.pkl` (trained RandomForestRegressor)
- `outputs/results/model_metrics.csv` (R2, MAE, MSE)
- `outputs/results/feature_importances.csv`
- `outputs/results/predictions_test.csv`
- Plots in `outputs/graphs/`: `predicted_vs_actual.png`, `residuals_histogram.png`

Next: run `05_model_evaluation.ipynb` to produce additional evaluation plots and a final report.