In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [2]:
# data load - Using your cleaned car dataset as alternative to house dataset
df = pd.read_csv("Car_Clean.csv")
print(f"Dataset shape: {df.shape}")
print("\nFirst 5 rows:")
print(df.head())
print("\nColumn information:")
print(df.info())

Dataset shape: (140, 15)

First 5 rows:
    Price  Odometer_km     Doors  Accidents      Year  Location_City  \
0  1500.0     0.128390  0.254091   0.316968  1.686714              1   
1  4171.0    -0.044709  0.254091  -0.820867 -0.794617              0   
2  5331.0    -0.440923  0.254091  -0.820867 -0.518913              0   
3  1500.0     0.203135  0.254091   0.316968  1.548862              0   
4  1500.0    -0.044709 -0.931668  -0.820867 -1.621727              1   

   Location_Rural  Location_Subrb  Price_per_km  Had_Accident  Small_Car  \
0               0               0     -0.233324             1          0   
1               1               0     -0.133663             0          0   
2               0               1     -0.054008             0          0   
3               0               1     -0.234745             1          0   
4               0               0     -0.229692             0          1   

   Medium_Car  Large_Car  Is_City  LogPrice  
0           1          0

In [3]:
# Prepare Features & Target
# Target (y) = Price
# Features (X) = all other columns except Price and LogPrice

# Define target variable
y = df['Price']

# Define features (exclude Price and LogPrice)
exclude_columns = ['Price', 'LogPrice']
X = df.drop(columns=exclude_columns)

print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")
print(f"\nFeatures: {list(X.columns)}")

Features shape: (140, 13)
Target shape: (140,)

Features: ['Odometer_km', 'Doors', 'Accidents', 'Year', 'Location_City', 'Location_Rural', 'Location_Subrb', 'Price_per_km', 'Had_Accident', 'Small_Car', 'Medium_Car', 'Large_Car', 'Is_City']


In [4]:
# Split Data
# Split into 80% training and 20% testing
# Use random_state=42 for reproducibility

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print(f"Training samples: {len(X_train)}")
print(f"Test samples: {len(X_test)}")
print(f"Training split: {len(X_train)/(len(X_train)+len(X_test)):.1%}")
print(f"Test split: {len(X_test)/(len(X_train)+len(X_test)):.1%}")

Training samples: 112
Test samples: 28
Training split: 80.0%
Test split: 20.0%


In [5]:
# Train Models

# Train Linear Regression model
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)

# Train Random Forest Regressor
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

print('✅ Models trained successfully!')
print(f"Linear Regression trained on {len(X_train)} samples")
print(f"Random Forest trained on {len(X_train)} samples")

✅ Models trained successfully!
Linear Regression trained on 112 samples
Random Forest trained on 112 samples


In [6]:
# Helper function to evaluate model performance
def evaluate_model(y_true, y_pred, model_name):
    """
    Calculate and print model performance metrics
    """
    r2 = r2_score(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    
    print(f"{model_name} Performance:")
    print(f"  R²   : {r2:.3f}")
    print(f"  MAE  : {mae:,.0f}")
    print(f"  MSE  : {mse:,.0f}")
    print(f"  RMSE : {rmse:,.0f}")
    print()
    
    return {'R2': r2, 'MAE': mae, 'MSE': mse, 'RMSE': rmse}

In [7]:
# Evaluate Performance
# Make predictions on test set
lr_predictions = lr_model.predict(X_test)
rf_predictions = rf_model.predict(X_test)

# Print performance metrics for both models
print('=' * 50)
print('MODEL PERFORMANCE COMPARISON')
print('=' * 50)

lr_metrics = evaluate_model(y_test, lr_predictions, 'Linear Regression')
rf_metrics = evaluate_model(y_test, rf_predictions, 'Random Forest')

# Compare models
print('COMPARISON:')
if rf_metrics['R2'] > lr_metrics['R2']:
    print(f"🏆 Random Forest has better R² ({rf_metrics['R2']:.3f} vs {lr_metrics['R2']:.3f})")
else:
    print(f"🏆 Linear Regression has better R² ({lr_metrics['R2']:.3f} vs {rf_metrics['R2']:.3f})")

if rf_metrics['MAE'] < lr_metrics['MAE']:
    print(f"🏆 Random Forest has lower MAE ({rf_metrics['MAE']:.0f} vs {lr_metrics['MAE']:.0f})")
else:
    print(f"🏆 Linear Regression has lower MAE ({lr_metrics['MAE']:.0f} vs {rf_metrics['MAE']:.0f})")

MODEL PERFORMANCE COMPARISON
Linear Regression Performance:
  R²   : 0.515
  MAE  : 1,300
  MSE  : 3,226,652
  RMSE : 1,796

Random Forest Performance:
  R²   : 0.814
  MAE  : 562
  MSE  : 1,241,031
  RMSE : 1,114

COMPARISON:
🏆 Random Forest has better R² (0.814 vs 0.515)
🏆 Random Forest has lower MAE (562 vs 1300)


In [8]:
# Single-row Sanity Check
# Pick one row from the test set and compare actual price with predictions

# Select a random test sample
sample_idx = 5  # You can change this index
sample_features = X_test.iloc[[sample_idx]]
actual_price = y_test.iloc[sample_idx]

# Make predictions for this single sample
lr_single_pred = lr_model.predict(sample_features)[0]
rf_single_pred = rf_model.predict(sample_features)[0]

print('=' * 60)
print('SINGLE SAMPLE PREDICTION COMPARISON')
print('=' * 60)

print(f"Sample features:")
for feature, value in sample_features.iloc[0].items():
    print(f"  {feature}: {value}")

print(f"\n💰 ACTUAL PRICE: ${actual_price:,.2f}")
print(f"🤖 Linear Regression Prediction: ${lr_single_pred:,.2f}")
print(f"🌲 Random Forest Prediction: ${rf_single_pred:,.2f}")

print(f"\nPrediction Errors:")
lr_error = abs(actual_price - lr_single_pred)
rf_error = abs(actual_price - rf_single_pred)
print(f"  Linear Regression Error: ${lr_error:,.2f}")
print(f"  Random Forest Error: ${rf_error:,.2f}")

if rf_error < lr_error:
    print(f"\n🏆 Random Forest is more accurate for this sample!")
else:
    print(f"\n🏆 Linear Regression is more accurate for this sample!")

SINGLE SAMPLE PREDICTION COMPARISON
Sample features:
  Odometer_km: -0.898399628492121
  Doors: 0.2540913424931785
  Accidents: 0.3169684327872081
  Year: 0.03249361724608
  Location_City: 1.0
  Location_Rural: 0.0
  Location_Subrb: 0.0
  Price_per_km: -0.0813819121467454
  Had_Accident: 1.0
  Small_Car: 0.0
  Medium_Car: 1.0
  Large_Car: 0.0
  Is_City: 1.0

💰 ACTUAL PRICE: $3,622.00
🤖 Linear Regression Prediction: $4,422.06
🌲 Random Forest Prediction: $4,737.43

Prediction Errors:
  Linear Regression Error: $800.06
  Random Forest Error: $1,115.43

🏆 Linear Regression is more accurate for this sample!


In [9]:
# Additional Analysis: Feature Importance (Random Forest)
print('=' * 50)
print('RANDOM FOREST FEATURE IMPORTANCE')
print('=' * 50)

# Get feature importance from Random Forest
feature_importance = pd.DataFrame({
    'feature': X.columns,
    'importance': rf_model.feature_importances_
}).sort_values('importance', ascending=False)

print('Top 10 Most Important Features:')
for i, (_, row) in enumerate(feature_importance.head(10).iterrows(), 1):
    print(f"{i:2}. {row['feature']:<20} {row['importance']:.4f}")

print(f"\nTotal features used: {len(X.columns)}")

RANDOM FOREST FEATURE IMPORTANCE
Top 10 Most Important Features:
 1. Price_per_km         0.9454
 2. Year                 0.0251
 3. Odometer_km          0.0166
 4. Doors                0.0023
 5. Accidents            0.0018
 6. Medium_Car           0.0016
 7. Location_Rural       0.0016
 8. Small_Car            0.0013
 9. Location_Subrb       0.0011
10. Had_Accident         0.0011

Total features used: 13
