In [14]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
import joblib

In [None]:
# --- 1. LOAD & MERGE DATA ---
# Assuming you have downloaded the files from Kaggle
# You might need to change the file paths depending on where you saved them
try:
    audi = pd.read_csv('audi.csv'); audi['brand'] = 'Audi'
    bmw = pd.read_csv('bmw.csv'); bmw['brand'] = 'BMW'
    ford = pd.read_csv('ford.csv'); ford['brand'] = 'Ford'
    vw = pd.read_csv('vw.csv'); vw['brand'] = 'Volkswagen'
    toyota = pd.read_csv('toyota.csv'); toyota['brand'] = 'Toyota'
    merc = pd.read_csv('merc.csv'); merc['brand'] = 'Mercedes'
    
    # Combine them all into one big dataset
    df = pd.concat([audi, bmw, ford, vw, toyota, merc], axis=0)
    print(f"Dataset loaded successfully with {len(df)} cars!")

except FileNotFoundError:
    print("Error: CSV files not found. Make sure separate brand files (audi.csv, etc.) are in the folder.")
    # Stop here if you don't have data
    raise

In [4]:
df.head()

Unnamed: 0,model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize,brand
0,A1,2017,12500,Manual,15735,Petrol,150,55.4,1.4,Audi
1,A6,2016,16500,Automatic,36203,Diesel,20,64.2,2.0,Audi
2,A1,2016,11000,Manual,29946,Petrol,30,55.4,1.4,Audi
3,A4,2017,16800,Automatic,25952,Diesel,145,67.3,2.0,Audi
4,A3,2019,17300,Manual,1998,Petrol,145,49.6,1.0,Audi


In [6]:
df.tail()

Unnamed: 0,model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize,brand
13114,C Class,2020,35999,Automatic,500,Diesel,145,55.4,2.0,Mercedes
13115,B Class,2020,24699,Automatic,2500,Diesel,145,55.4,2.0,Mercedes
13116,GLC Class,2019,30999,Automatic,11612,Diesel,145,41.5,2.1,Mercedes
13117,CLS Class,2019,37990,Automatic,2426,Diesel,145,45.6,2.0,Mercedes
13118,S Class,2019,54999,Automatic,2075,Diesel,145,52.3,2.9,Mercedes


In [None]:
# --- 2. CLEANING ---
# Drop rows with errors
df = df.dropna()

# --- 3. DEFINE FEATURES ---
# UK Dataset columns: model, year, price, transmission, mileage, fuelType, tax, mpg, engineSize
X = df[['brand', 'model', 'year', 'transmission', 'mileage', 'fuelType', 'tax', 'mpg', 'engineSize']]
y = df['price'] # This is in Pounds (£)

# --- 4. CREATE PIPELINE (OPTIMIZED) ---
categorical_features = ['brand', 'model', 'transmission', 'fuelType']
numerical_features = ['year', 'mileage', 'tax', 'mpg', 'engineSize']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        # sparse_output=True is the default and SAVES MEMORY
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features) 
    ])

# We use 50 trees (balance between 100 and 30)
# We limit max_depth to 20 to prevent "infinite" growth
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(
        n_estimators=50,      
        max_depth=20,         
        random_state=42,
        n_jobs=-1
    ))
])

# --- 5. TRAIN ---
print("Training Optimized Model...")
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
pipeline.fit(X_train, y_train)

print(f"Model Training Complete. Accuracy: {pipeline.score(X_test, y_test):.4f}")

In [16]:
# --- 6. SAVE MODEL (USING JOBLIB) ---
# COMPRESS=3 helps reduce file size significantly
try:
    joblib.dump(pipeline, 'uk_car_model.pkl', compress=3)
    print("✅ Success! Model saved as 'uk_car_model.pkl'")
except Exception as e:
    print(f"Error saving: {e}")

✅ Success! Model saved as 'uk_car_model.pkl'
