In [1]:
import pandas as pd
import re
from sklearn.preprocessing import LabelEncoder
import pickle
import sys
import os

# Add the deployment folder to the system path so we can import utils
sys.path.append(os.path.abspath('../deployment'))

from utils import engineer_sharp_features, engineer_date_features

def clean_currency(x):
    """
    Parse a price such as '$8,400' into the float 8400.0.

    Missing values (anything ``pd.isna`` accepts) yield None. Strings have
    '$' and ',' removed and surrounding whitespace stripped before being
    parsed; a string that still is not a number (e.g. 'Porsche') yields
    None instead of raising. Non-string values are passed straight to
    ``float``.
    """
    if pd.isna(x):
        return None

    # Already numeric (or at least float()-able): no symbol stripping needed.
    if not isinstance(x, str):
        return float(x)

    # Delete the currency symbol and thousands separators in a single pass.
    stripped = x.translate({ord('$'): None, ord(','): None}).strip()
    try:
        amount = float(stripped)
    except ValueError:
        # Free text that is not a number.
        amount = None
    return amount

def clean_mileage(x):
    """
    Parse a mileage such as '53,700' into the float 53700.0.

    Missing values yield None. Strings have their commas removed and
    surrounding whitespace stripped; if the remainder is not a number the
    result is None. Non-string values are converted with ``float``.
    """
    if pd.isna(x):
        return None

    if not isinstance(x, str):
        return float(x)

    # Drop thousands separators by splitting on ',' and gluing the pieces back.
    candidate = ''.join(x.split(',')).strip()
    try:
        return float(candidate)
    except ValueError:
        # Text that does not parse as a number.
        return None

def clean_model(x):
    """
    Tidy a scraped model name.

    Removes every occurrence of the literal '\\nSave' and strips
    surrounding whitespace. Missing values are returned unchanged.

    The value is coerced to ``str`` first, matching the other cleaners in
    this notebook, so a non-string cell (e.g. a model parsed as the number
    911) yields '911' instead of raising ``AttributeError``.
    """
    if pd.isna(x):
        return x
    return str(x).replace('\nSave', '').strip()

# def clean_title(x):
#     if pd.isna(x):
#         return x
#     return x.split('(')[0].strip()

import pandas as pd

def clean_and_group_title(x):
    """
    Collapse a raw title-status string into a coarse category.

    The text before the first '(' is stripped and lower-cased, then matched
    against the rules below in order (first hit wins):

    * exactly 'clean' or 'clen'                      -> 'Clean'
    * contains a rebuilt/salvage-style keyword       -> 'Rebuilt/Salvage'
    * contains a mileage/odometer-style keyword      -> 'Mileage Issue'
    * contains 'buyback'                             -> 'Buyback'
    * exactly 'bill of sale', 'no title', 'registered' -> 'Alternate Doc'
    * anything else                                  -> 'Other'

    Missing values return 'Unknown'.
    """
    if pd.isna(x):
        return 'Unknown'

    status = str(x).split('(')[0].strip().lower()

    # Exact matches for a clean title (including the 'clen' misspelling).
    if status in ('clean', 'clen'):
        return 'Clean'

    # Substring rules; their order is significant.
    substring_rules = (
        (('rebuilt', 'salvage', 'reconstructed', 'totaled', 'restored'), 'Rebuilt/Salvage'),
        (('mileage', 'odometer', 'mechanical limits'), 'Mileage Issue'),
        (('buyback',), 'Buyback'),
    )
    for keywords, bucket in substring_rules:
        for keyword in keywords:
            if keyword in status:
                return bucket

    # Alternate documentation is matched on the whole value, not a substring.
    if status in ('bill of sale', 'no title', 'registered'):
        return 'Alternate Doc'

    return 'Other'

# def clean_transmission_type(x):
#     if pd.isna(x):
#         return x
#     return x.split('(')[0].strip()

# def extract_gears(x):
#     if pd.isna(x):
#         return None
#     match = re.search(r'\((\d+)-Speed\)', x)
#     if match:
#         return int(match.group(1))
#     return None

def clean_transmission_type(x):
    """
    Reduce a raw transmission description to 'Automatic', 'Manual' or 'Other'.

    Only the text before the first '(' is considered, with surrounding
    whitespace stripped. The comparison is exact and case-sensitive, so any
    other value (including long free-text descriptions) becomes 'Other'.
    Missing values return "Unknown".
    """
    if pd.isna(x):
        return "Unknown"

    # Keep what precedes the first parenthesis, e.g. 'Automatic (9-Speed)'.
    prefix, _, _ = str(x).partition('(')
    kind = prefix.strip()

    return kind if kind in ('Automatic', 'Manual') else 'Other'

def extract_gears(x):
    """
    Extract the gear count from a transmission description as a float.

    Looks for '<digits>' followed by a hyphen or whitespace character and
    the word 'speed' (case-insensitive), e.g. '9-Speed' or '6 speed'. When
    no such pattern exists but the text contains 'CVT' (any casing), 1.0 is
    returned. Missing values and everything else return None.
    """
    if pd.isna(x):
        return None

    description = str(x)

    speed_match = re.search(r'(?i)(\d+)[-\s]speed', description)
    if speed_match is not None:
        return float(speed_match.group(1))

    # CVTs carry no gear count in the text; they are encoded as a single gear.
    return 1.0 if 'CVT' in description.upper() else None

# def extract_engine_info(x):
#     if pd.isna(x):
#         return None, None
#     disp = re.search(r'(\d+\.\d+)L', x)
#     cyl = re.search(r'([V|I|H|W]\d+)', x)
#     d_val = float(disp.group(1)) if disp else None
#     c_val = cyl.group(1) if cyl else None
#     return d_val, c_val

def extract_engine_info(x):
    """
    Parse an engine description into ``(displacement_litres, layout_code)``.

    Displacement (first element, float or None), tried in this order:
      * '<number>L'   -> the number as litres
      * '<digits>cc'  -> cubic centimetres / 1000, rounded to 1 decimal
      * '<digits>ci'  -> cubic inches / 61.0237, rounded to 1 decimal
      * otherwise None

    Layout code (second element, str), tried in this order:
      * V/I/W followed by an optional '-' or ' ' and digits -> e.g. 'V8', 'I6'
      * 'Inline<n>' -> 'I<n>'
      * 'Flat<n>'   -> 'H<n>'
      * lowercase 'l' directly followed by digits (typo for 'I') -> 'I<n>'
      * contains 'Rotary' -> 'Rotary'
      * contains 'Electric' or 'Motor' -> 'Electric'
      * otherwise 'Other'

    Missing or blank input returns ``(None, "Unknown")``.

    The value is converted to text once, so a non-string cell no longer
    raises ``TypeError`` inside ``re.search`` (the blank check already
    used ``str(x)``; the pattern matching did not).
    """
    # 1. Handle entirely missing/blank values
    if pd.isna(x) or str(x).strip() == '':
        return None, "Unknown"  # None for numeric, "Unknown" for categorical

    text = str(x)

    # --- Displacement ---
    disp_l = re.search(r'(\d+\.?\d*)\s*L', text, re.IGNORECASE)
    disp_cc = re.search(r'(\d+)\s*cc', text, re.IGNORECASE)
    disp_ci = re.search(r'(\d+)\s*ci', text, re.IGNORECASE)

    # Default to None for missing numeric values
    d_val = None
    if disp_l:
        d_val = float(disp_l.group(1))
    elif disp_cc:
        d_val = round(float(disp_cc.group(1)) / 1000.0, 1)
    elif disp_ci:
        d_val = round(float(disp_ci.group(1)) / 61.0237, 1)

    # --- Cylinders ---
    c_val = "Other"

    # NOTE: this pattern is unanchored and case-insensitive, so it can also
    # match a v/i/w at the end of a word when digits follow it.
    cyl = re.search(r'([VIW])[- ]?(\d+)', text, re.IGNORECASE)
    flat = re.search(r'Flat[- ]?(\d+)', text, re.IGNORECASE)
    inline = re.search(r'Inline[- ]?(\d+)', text, re.IGNORECASE)
    # Case-sensitive on purpose: a lowercase 'l' typed in place of 'I'.
    l_typo = re.search(r'(l)(\d+)', text)

    if cyl:
        c_val = cyl.group(1).upper() + cyl.group(2)
    elif inline:
        c_val = 'I' + inline.group(1)
    elif flat:
        c_val = 'H' + flat.group(1)
    elif l_typo:
        c_val = 'I' + l_typo.group(2)
    elif 'Rotary' in text:
        c_val = 'Rotary'
    elif 'Electric' in text or 'Motor' in text:
        c_val = 'Electric'

    return d_val, c_val

def get_main_color(x):
    """
    Map a free-text colour name onto a small set of base colours.

    Steps:
      1. Missing values return "Other".
      2. Only the primary colour is kept: the text before the first '/' and
         before the first ' and ', stripped and lower-cased.
      3. The marketing-name table is scanned in insertion order; the first
         keyword found as a substring decides the result. This runs before
         the plain colour names so that e.g. 'titanium' is not picked up by
         the 'tan' check.
      4. Otherwise the first plain colour name found as a substring is
         returned capitalised ('grey' is reported as 'Gray').
      5. Anything else returns "Other".
    """
    if pd.isna(x):
        return "Other"

    primary = str(x).split('/')[0].split(' and ')[0].strip().lower()

    # Keyword -> base colour. Order matters: the first matching key wins.
    special_map = {
        # Edge cases, Collisions & Exterior Bleed-over
        'titanium': 'Gray', 'titan': 'Black', 'mustang': 'Brown', 'tanzanite': 'Blue',
        'stainless': 'Silver', 'mercury': 'Silver', 'magnetic': 'Gray', 'thunder': 'Gray',
        
        # Blacks / Darks
        'ebony': 'Black', 'nero': 'Black', 'carbon': 'Black', 'onyx': 'Black', 
        'jet': 'Black', 'obsidian': 'Black', 'beluga': 'Black', 'panther': 'Black',
        'amido': 'Black', 'midnight': 'Black', 'anthracite': 'Gray', 'zebra': 'Black',
        
        # Grays / Silvers
        'granite': 'Gray', 'charcoal': 'Gray', 'graphite': 'Gray', 'slate': 'Gray', 
        'ash': 'Gray', 'agate': 'Gray', 'stone': 'Gray', 'shale': 'Gray', 
        'platinum': 'Gray', 'pewter': 'Gray', 'palladium': 'Gray', 'meteor': 'Gray',
        'flint': 'Gray', 'ocean': 'Gray',
        
        # Whites / Lights
        'chalk': 'White', 'ivory': 'White', 'pearl': 'White', 'porcelain': 'White', 
        'alabaster': 'White', 'bianco': 'White', 'magnolia': 'White', 'oyster': 'White',
        'ice': 'White', 'ceramic': 'White',
        
        # Beiges / Tans / Browns
        'parchment': 'Beige', 'linen': 'Beige', 'cream': 'Beige', 'ecru': 'Beige', 
        'luxor': 'Beige', 'cashmere': 'Beige', 'savanna': 'Beige', 'almond': 'Beige', 
        'bamboo': 'Beige', 'wheat': 'Beige', 'champagne': 'Beige', 'kalahari': 'Beige', 
        'gobi': 'Beige', 'macchiato': 'Beige', 'taupe': 'Beige', 'sand': 'Beige', 
        'dune': 'Beige', 'saddle': 'Brown', 'oak': 'Brown', 'cocoa': 'Brown', 
        'cognac': 'Brown', 'caramel': 'Brown', 'cuoio': 'Brown', 'cinnamon': 'Brown', 
        'java': 'Brown', 'havanna': 'Brown', 'havana': 'Brown', 'mocha': 'Brown', 
        'espresso': 'Brown', 'nougat': 'Brown', 'chestnut': 'Brown', 'amaro': 'Brown', 
        'sepia': 'Brown', 'truffle': 'Brown', 'walnut': 'Brown', 'tartufo': 'Brown', 
        'terra': 'Brown', 'natural': 'Brown', 'palomino': 'Tan', 'camel': 'Tan', 
        'khaki': 'Tan', 'atacama': 'Tan',
        
        # Reds / Oranges
        'salsa': 'Red', 'coral': 'Red', 'imola': 'Red', 'fox': 'Red', 
        'burgundy': 'Red', 'magma': 'Red', 'carrera': 'Red', 'maroon': 'Red', 
        'chateau': 'Red', 'bordeaux': 'Red', 'fiona': 'Red', 'scarlet': 'Red', 
        'garnet': 'Red', 'crimson': 'Red', 'ruby': 'Red', 'cabernet': 'Red', 
        'rosso': 'Red', 'sakhir': 'Orange', 'kyalami': 'Orange',
        
        # Greens / Blues
        'jade': 'Green', 'cypress': 'Green', 'forest': 'Green', 'nordkap': 'Blue', 
        'nautic': 'Blue', 'yachting': 'Blue', 'estoril': 'Blue', 'marina': 'Blue'
    }

    special_hit = next(
        (bucket for keyword, bucket in special_map.items() if keyword in primary),
        None,
    )
    if special_hit is not None:
        return special_hit

    # Plain colour names, checked in this order.
    std_colors = [
        'black', 'white', 'gray', 'grey', 'silver', 'red', 'blue', 
        'green', 'brown', 'beige', 'yellow', 'orange', 'gold', 'purple', 'tan'
    ]

    base_hit = next((name for name in std_colors if name in primary), None)
    if base_hit is None:
        return "Other"

    # Normalise the British spelling to the American one.
    if base_hit == 'grey':
        return 'Gray'
    return base_hit.capitalize()

def clean_seller_type(x):
    """
    Normalise the seller description to 'Dealer', 'Private Party' or 'Other'.

    The match is a case-sensitive substring test, so trailing details in the
    raw text (fees, lien notes, line breaks, ...) are ignored. 'Dealer' is
    checked before 'Private Party'. Missing values return "Unknown".
    """
    if pd.isna(x):
        return "Unknown"

    text = str(x)

    # Checked in priority order; the first label found in the text wins.
    for label in ('Dealer', 'Private Party'):
        if label in text:
            return label

    return 'Other'



# --- Main Execution ---
# Loads the raw auction export, cleans the target and feature columns with the
# helpers defined above, derives transmission/engine features, applies the
# shared feature-engineering helpers imported from `utils`, mean-imputes the
# continuous columns and drops the raw text columns. The result is `df_cleaned`.

# 1. Load Data
df = pd.read_csv("../data/cars_and_bids_full_history_v3.csv") # Replace with your new file name

# 2. Clean Target Variable (Price)
df['Sold_Price'] = df['Sold_Price'].apply(clean_currency)

# CRITICAL STEP: This will now drop the rows where 'Sold_Price' became None (e.g. the "Porsche" row)
df = df.dropna(subset=['Sold_Price'])

# 3. Clean Numerical Features
df['Mileage'] = df['Mileage'].apply(clean_mileage)

# 4. Clean Text Features
df['Model'] = df['Model'].apply(clean_model)
df['Title Status'] = df['Title Status'].apply(clean_and_group_title)
df['Seller Type'] = df['Seller Type'].apply(clean_seller_type)

# 5. Apply Color Cleaning
df['Exterior Color'] = df['Exterior Color'].apply(get_main_color)
df['Interior Color'] = df['Interior Color'].apply(get_main_color)

# 6. Feature Engineering
# Both transmission features are derived from the same raw 'Transmission' column.
df['Transmission_Type'] = df['Transmission'].apply(clean_transmission_type)
df['Gears'] = df['Transmission'].apply(extract_gears)

# extract_engine_info returns a (displacement, cylinders) tuple per row;
# split that into two columns.
engine_data = df['Engine'].apply(extract_engine_info)
df['Engine_Displacement_L'] = [x[0] for x in engine_data]
df['Engine_Cylinders'] = [x[1] for x in engine_data]

# APPLY TEXT FEATURE ENGINEERING
# Helpers live in ../deployment/utils.py; presumably they add the indicator
# columns plus 'auction_year' / 'auction_month' / 'car_age' - verify in utils.
df = engineer_sharp_features(df)
df = engineer_date_features(df, is_inference=False)
df = df.dropna(subset=['auction_year'])

# Specify only the continuous numeric columns
# ('Gears' used to be listed twice; the second pass was a no-op.)
cols_to_impute = ['Bids', 'Views', 'Gears', 'Mileage', 'Watchers', 'Engine_Displacement_L', 'car_age']

# Fill NaNs with the mean for just these columns
# NOTE(review): the means are taken over the whole frame and are not stored
# anywhere in this cell - confirm that inference fills gaps with the same values.
# NOTE(review): rows with no parsable displacement (e.g. engines classified as
# 'Electric') also receive the mean displacement here - confirm this is intended.
for col in cols_to_impute:
    df[col] = df[col].fillna(df[col].mean())

# 7. Drop Unused Columns
cols_to_drop = ['URL', 'Transmission', 'Engine', 'Auction_Date', 'VIN', 'Location', 'Highlights', 'Equipment', 'Modifications', 'Known Flaws', 'Recent Service History', 'Ownership History', 'Other Items Included in Sale', 'Seller Notes'] 
df_cleaned = df.drop(columns=cols_to_drop)
# Display every column; this setting is global and stays in effect for later cells.
pd.set_option('display.max_columns', None)
df_cleaned.head()



Unnamed: 0,Sold_Price,Bids,Views,Watchers,Make,Model,Mileage,Title Status,Seller Type,Drivetrain,Body Style,Exterior Color,Interior Color,Transmission_Type,Gears,Engine_Displacement_L,Engine_Cylinders,2_keys_ind,is_dry_climate_car,is_project_car,has_new_tires,has_sport_seats,emissions_ind,loan_ind,one_owner_ind,carfax_ind,flaw_severity_score,recent_major_service,mod_status,auction_year,auction_month,car_age
0,226000.0,21.0,19077.0,538.5,Mercedes-Benz,G Wagen,60.0,Clean,Private Party,4WD/AWD,SUV/Crossover,Blue,Gray,Automatic,9.0,4.0,V8,1,0,0,0,0,0,1,0,0,0,1,unknown_mod,2026,2,0.0
1,76500.0,11.0,9834.0,538.5,BMW,G8X M4,2300.0,Clean,Private Party,Rear-wheel drive,Coupe,Blue,Gray,Manual,6.0,3.0,I6,1,0,0,0,0,0,1,0,0,0,1,unknown_mod,2026,2,0.0
2,215485.0,14.0,12902.0,538.5,Mercedes-Benz,G Wagen,300.0,Clean,Private Party,4WD/AWD,SUV/Crossover,White,Red,Automatic,9.0,4.0,V8,1,0,0,0,0,0,1,0,0,0,1,unknown_mod,2026,1,0.0
3,356000.0,52.0,40939.0,538.5,Porsche,992 911,70.0,Clean,Dealer,Rear-wheel drive,Coupe,Gray,Black,Manual,6.0,4.0,H6,1,0,0,0,0,0,0,0,0,0,1,unknown_mod,2026,1,0.0
7,66000.0,20.0,8266.0,538.5,Rivian,R1S,2300.0,Clean,Private Party,4WD/AWD,SUV/Crossover,Silver,Black,Automatic,5.716151,3.692453,Electric,0,0,0,0,0,0,1,0,0,0,1,unknown_mod,2025,12,0.0


In [2]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Start from a copy so the cleaned frame from the previous cell is left untouched.
df = df_cleaned.copy()

# --- Encoding strategy ---
# Low-cardinality categoricals are one-hot encoded.
one_hot_cols = [
    'Title Status',
    'Seller Type',
    'Drivetrain',
    'Transmission_Type',
    'Body Style',
    'Engine_Cylinders',
    'mod_status',
    'auction_month',
]

# Higher-cardinality categoricals are label encoded.
# 'Model' is deliberately absent: it is target encoded further down.
label_cols = ['Make', 'Exterior Color', 'Interior Color']

# --- One-hot encoding (all levels kept, no reference level dropped) ---
df_encoded = pd.get_dummies(df, columns=one_hot_cols, drop_first=False)

# --- Label encoding ---
# One fitted encoder per column, kept so the same mapping can be reused later.
label_encoders = {col: LabelEncoder() for col in label_cols}
for col, le in label_encoders.items():
    df_encoded[col] = le.fit_transform(df_encoded[col])

# --- Target encoding for 'Model' ---
# Each model name is replaced by the mean 'Sold_Price' of all rows of that model.
# NOTE(review): the mean includes the row's own price, so a model that occurs
# once is encoded as exactly its own target - consider out-of-fold or smoothed
# encoding before relying on validation scores.
model_target_map = df_encoded.groupby('Model')['Sold_Price'].mean()

# Append the encoded column, then remove the raw categorical 'Model' column.
df_encoded = df_encoded.assign(
    Model_Target_Encoded=df_encoded['Model'].map(model_target_map)
).drop(columns=['Model'])

# --- Final inspection ---
print("New Data Shape:", df_encoded.shape)
print(df_encoded[['Sold_Price', 'Model_Target_Encoded']].head())

# Persist the encoded training data for the modeling step.
df_encoded.to_csv('../data/full_training_data.csv', index=False)
df_encoded

New Data Shape: (30774, 82)
   Sold_Price  Model_Target_Encoded
0    226000.0          70781.803653
1     76500.0          82453.200000
2    215485.0          70781.803653
3    356000.0         176864.934066
7     66000.0          86484.875000


Unnamed: 0,Sold_Price,Bids,Views,Watchers,Make,Mileage,Exterior Color,Interior Color,Gears,Engine_Displacement_L,2_keys_ind,is_dry_climate_car,is_project_car,has_new_tires,has_sport_seats,emissions_ind,loan_ind,one_owner_ind,carfax_ind,flaw_severity_score,recent_major_service,auction_year,car_age,Title Status_Alternate Doc,Title Status_Buyback,Title Status_Clean,Title Status_Mileage Issue,Title Status_Other,Title Status_Rebuilt/Salvage,Title Status_Unknown,Seller Type_Dealer,Seller Type_Private Party,Drivetrain_4WD/AWD,Drivetrain_Front-wheel drive,Drivetrain_Rear-wheel drive,Transmission_Type_Automatic,Transmission_Type_Manual,Body Style_Convertible,Body Style_Coupe,Body Style_Hatchback,Body Style_SUV/Crossover,Body Style_Sedan,Body Style_Truck,Body Style_Van/Minivan,Body Style_Wagon,Engine_Cylinders_Electric,Engine_Cylinders_H12,Engine_Cylinders_H2,Engine_Cylinders_H4,Engine_Cylinders_H6,Engine_Cylinders_I2,Engine_Cylinders_I3,Engine_Cylinders_I4,Engine_Cylinders_I5,Engine_Cylinders_I6,Engine_Cylinders_I8,Engine_Cylinders_Other,Engine_Cylinders_Rotary,Engine_Cylinders_Unknown,Engine_Cylinders_V10,Engine_Cylinders_V12,Engine_Cylinders_V2,Engine_Cylinders_V6,Engine_Cylinders_V8,Engine_Cylinders_W12,Engine_Cylinders_W8,mod_status_heavy_mod,mod_status_light_mod,mod_status_unknown_mod,auction_month_1,auction_month_2,auction_month_3,auction_month_4,auction_month_5,auction_month_6,auction_month_7,auction_month_8,auction_month_9,auction_month_10,auction_month_11,auction_month_12,Model_Target_Encoded
0,226000.0,21.0,19077.0,538.5,94,60.000000,2,5,9.000000,4.000000,1,0,0,0,0,0,1,0,0,0,1,2026,0.0,False,False,True,False,False,False,False,False,True,True,False,False,True,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,True,False,True,False,False,False,False,False,False,False,False,False,False,70781.803653
1,76500.0,11.0,9834.0,538.5,18,2300.000000,2,5,6.000000,3.000000,1,0,0,0,0,0,1,0,0,0,1,2026,0.0,False,False,True,False,False,False,False,False,True,False,False,True,False,True,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,True,False,False,False,False,False,False,False,False,False,False,82453.200000
2,215485.0,14.0,12902.0,538.5,94,300.000000,13,10,9.000000,4.000000,1,0,0,0,0,0,1,0,0,0,1,2026,0.0,False,False,True,False,False,False,False,False,True,True,False,False,True,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,True,True,False,False,False,False,False,False,False,False,False,False,False,70781.803653
3,356000.0,52.0,40939.0,538.5,116,70.000000,5,1,6.000000,4.000000,1,0,0,0,0,0,0,0,0,0,1,2026,0.0,False,False,True,False,False,False,False,True,False,False,False,True,False,True,False,True,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,True,False,False,False,False,False,False,False,False,False,False,False,176864.934066
7,66000.0,20.0,8266.0,538.5,125,2300.000000,11,1,5.716151,3.692453,0,0,0,0,0,0,1,0,0,0,1,2025,0.0,False,False,True,False,False,False,False,False,True,True,False,False,True,False,False,False,False,True,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,True,86484.875000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31132,76500.0,53.0,17772.0,538.5,9,75779.110192,2,1,3.000000,5.200000,0,0,0,0,0,1,0,0,0,0,1,2025,93.0,False,False,True,False,False,False,False,True,False,False,False,True,False,True,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,True,False,False,False,False,False,False,False,76500.000000
31133,6500.0,23.0,12708.0,538.5,49,75779.110192,11,5,3.000000,5.700000,1,0,0,0,0,1,0,0,0,0,1,2025,95.0,False,False,True,False,False,False,False,True,False,False,False,True,True,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False,6500.000000
31134,26600.0,33.0,11921.0,538.5,22,75779.110192,2,10,4.000000,2.000000,0,0,0,0,0,1,0,0,0,0,1,2025,0.0,False,False,True,False,False,False,False,False,True,False,False,True,False,True,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,True,False,False,45143.961538
31135,11135.0,54.0,9329.0,538.5,22,75779.110192,14,3,4.000000,1.300000,0,1,0,0,0,1,0,0,0,0,1,2025,0.0,False,False,True,False,False,False,False,False,True,False,False,True,False,True,True,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,True,False,False,False,False,False,False,45143.961538


In [5]:
# Bundle everything needed to reproduce the encodings outside this notebook:
#   label_encoders     - dict of fitted LabelEncoders, keyed by column name
#   target_encoder_map - Series mapping each Model to its mean Sold_Price
artifacts = dict(
    label_encoders=label_encoders,
    target_encoder_map=model_target_map,
)

# Serialise the bundle next to the deployment code.
with open('../deployment/encoding_artifacts_002.pkl', 'wb') as artifact_file:
    pickle.dump(artifacts, artifact_file)