In [2]:
import pandas as pd
import re
from sklearn.preprocessing import LabelEncoder
import pickle
import sys
import os

# Add the deployment folder to the system path so we can import utils
sys.path.append(os.path.abspath('../deployment'))

from utils import engineer_sharp_features, engineer_date_features

def clean_currency(x):
    """
    Parse a currency value such as '$8,400' into the float 8400.0.

    Returns None for missing values and for strings that are still not
    numeric after '$', ',' and surrounding whitespace are removed
    (e.g. 'Porsche'). Non-string inputs are handed straight to float().
    """
    if pd.isna(x):
        return None
    if not isinstance(x, str):
        return float(x)

    # Strip the currency symbol and thousands separators before parsing.
    stripped = x.replace('$', '').replace(',', '').strip()
    try:
        amount = float(stripped)
    except ValueError:
        # Free text that cannot be parsed as a number is treated as missing.
        amount = None
    return amount

def clean_mileage(x):
    """
    Parse a mileage value such as '53,700' into the float 53700.0.

    Returns None for missing values and for strings that are not numeric
    once commas and surrounding whitespace are removed. Non-string inputs
    are handed straight to float().
    """
    if pd.isna(x):
        return None
    if not isinstance(x, str):
        return float(x)
    try:
        return float(x.replace(',', '').strip())
    except ValueError:
        # Non-numeric text is treated as a missing mileage.
        return None

def clean_model(x):
    """
    Remove every newline-plus-'Save' fragment from a model name and trim
    surrounding whitespace. Missing values are returned unchanged.
    """
    return x if pd.isna(x) else x.replace('\nSave', '').strip()

def clean_title(x):
    """
    Keep only the part of a title status that precedes the first '(' and
    trim whitespace. Missing values are returned unchanged.
    """
    if pd.isna(x):
        return x
    before_paren, _, _ = x.partition('(')
    return before_paren.strip()

def clean_transmission_type(x):
    """
    Return the transmission description up to (not including) the first
    '(' with whitespace trimmed, e.g. 'Automatic (6-Speed)' -> 'Automatic'.
    Missing values are returned unchanged.
    """
    if pd.isna(x):
        return x
    paren_at = x.find('(')
    base = x if paren_at == -1 else x[:paren_at]
    return base.strip()

def extract_gears(x):
    """
    Pull the gear count out of a transmission string containing
    '(<N>-Speed)', e.g. 'Automatic (6-Speed)' -> 6.

    Returns None for missing values or when no such pattern is present.
    """
    if pd.isna(x):
        return None
    speeds = re.findall(r'\((\d+)-Speed\)', x)
    # Only the first occurrence is used.
    return int(speeds[0]) if speeds else None

def extract_engine_info(x):
    """
    Extract displacement and cylinder layout from an engine description.

    Parameters
    ----------
    x : str or missing
        Engine text such as '5.0L V8'.

    Returns
    -------
    tuple
        (displacement, layout) where displacement is a float parsed from the
        first '<digits>.<digits>L' occurrence and layout is the first token
        made of one of the letters V, I, H or W followed by digits
        (e.g. 'V8', 'I4'). Either element is None when not found; both are
        None for missing input.
    """
    if pd.isna(x):
        return None, None
    disp = re.search(r'(\d+\.\d+)L', x)
    # Inside a character class '|' is a literal pipe, not alternation, so the
    # layout letters are listed directly to avoid matching tokens like '|4'.
    cyl = re.search(r'([VIHW]\d+)', x)
    d_val = float(disp.group(1)) if disp else None
    c_val = cyl.group(1) if cyl else None
    return d_val, c_val

# Keyword -> color bucket. Keys are tested in insertion order and the first
# substring hit wins, so a keyword that contains another keyword must be listed
# BEFORE the shorter one (e.g. 'titanium' before 'titan', 'cashmere' before 'ash').
_SPECIAL_COLOR_MAP = {
    # Edge cases, Collisions & Exterior Bleed-over
    'titanium': 'Gray', 'titan': 'Black', 'mustang': 'Brown', 'tanzanite': 'Blue',
    'stainless': 'Silver', 'mercury': 'Silver', 'magnetic': 'Gray', 'thunder': 'Gray',

    # Blacks / Darks
    'ebony': 'Black', 'nero': 'Black', 'carbon': 'Black', 'onyx': 'Black',
    'jet': 'Black', 'obsidian': 'Black', 'beluga': 'Black', 'panther': 'Black',
    'amido': 'Black', 'midnight': 'Black', 'anthracite': 'Gray', 'zebra': 'Black',

    # Grays / Silvers
    'granite': 'Gray', 'charcoal': 'Gray', 'graphite': 'Gray', 'slate': 'Gray',
    # 'cashmere' contains 'ash'; it must be checked first or it is bucketed as Gray.
    'cashmere': 'Beige',
    'ash': 'Gray', 'agate': 'Gray', 'stone': 'Gray', 'shale': 'Gray',
    'platinum': 'Gray', 'pewter': 'Gray', 'palladium': 'Gray', 'meteor': 'Gray',
    'flint': 'Gray', 'ocean': 'Gray',

    # Whites / Lights
    'chalk': 'White', 'ivory': 'White', 'pearl': 'White', 'porcelain': 'White',
    'alabaster': 'White', 'bianco': 'White', 'magnolia': 'White', 'oyster': 'White',
    'ice': 'White', 'ceramic': 'White',

    # Beiges / Tans / Browns
    'parchment': 'Beige', 'linen': 'Beige', 'cream': 'Beige', 'ecru': 'Beige',
    'luxor': 'Beige', 'savanna': 'Beige', 'almond': 'Beige',
    'bamboo': 'Beige', 'wheat': 'Beige', 'champagne': 'Beige', 'kalahari': 'Beige',
    'gobi': 'Beige', 'macchiato': 'Beige', 'taupe': 'Beige', 'sand': 'Beige',
    'dune': 'Beige', 'saddle': 'Brown', 'oak': 'Brown', 'cocoa': 'Brown',
    'cognac': 'Brown', 'caramel': 'Brown', 'cuoio': 'Brown', 'cinnamon': 'Brown',
    'java': 'Brown', 'havanna': 'Brown', 'havana': 'Brown', 'mocha': 'Brown',
    'espresso': 'Brown', 'nougat': 'Brown', 'chestnut': 'Brown', 'amaro': 'Brown',
    'sepia': 'Brown', 'truffle': 'Brown', 'walnut': 'Brown', 'tartufo': 'Brown',
    'terra': 'Brown', 'natural': 'Brown', 'palomino': 'Tan', 'camel': 'Tan',
    'khaki': 'Tan', 'atacama': 'Tan',

    # Reds / Oranges
    'salsa': 'Red', 'coral': 'Red', 'imola': 'Red', 'fox': 'Red',
    'burgundy': 'Red', 'magma': 'Red', 'carrera': 'Red', 'maroon': 'Red',
    'chateau': 'Red', 'bordeaux': 'Red', 'fiona': 'Red', 'scarlet': 'Red',
    'garnet': 'Red', 'crimson': 'Red', 'ruby': 'Red', 'cabernet': 'Red',
    'rosso': 'Red', 'sakhir': 'Orange', 'kyalami': 'Orange',

    # Greens / Blues
    'jade': 'Green', 'cypress': 'Green', 'forest': 'Green', 'nordkap': 'Blue',
    'nautic': 'Blue', 'yachting': 'Blue', 'estoril': 'Blue', 'marina': 'Blue'
}

# Baseline color words, checked (in this order) only when no special keyword matched.
_STANDARD_COLORS = (
    'black', 'white', 'gray', 'grey', 'silver', 'red', 'blue',
    'green', 'brown', 'beige', 'yellow', 'orange', 'gold', 'purple', 'tan'
)


def get_main_color(x):
    """
    Collapse a free-text color description into a coarse color bucket.

    Steps:
      1. Missing values map to "Other".
      2. Only the primary color is considered: the text before the first '/'
         and then before the first ' and '.
      3. Special keywords are matched case-insensitively as substrings, in
         table order; the first hit decides the bucket.
      4. Otherwise the first standard color word found as a substring is
         returned capitalized ('grey' is normalized to 'Gray').
      5. Anything else is "Other".

    Parameters
    ----------
    x : any
        Raw color value; non-missing values are converted with str().

    Returns
    -------
    str
        One of the bucket names used in the keyword table, a capitalized
        standard color, or "Other".
    """
    # 1. Catch missing values and group them into "Other"
    if pd.isna(x):
        return "Other"

    # 2. Grab the primary color before slashes or " and "
    x = str(x).split('/')[0].split(' and ')[0].strip()
    x_lower = x.lower()

    # 3. Check for specific keywords FIRST to prevent substring collisions
    for key, val in _SPECIAL_COLOR_MAP.items():
        if key in x_lower:
            return val

    # 4. Check standard baseline colors
    for color in _STANDARD_COLORS:
        if color in x_lower:
            return 'Gray' if color == 'grey' else color.capitalize()

    # 5. Everything else becomes "Other"
    return "Other"

# --- Main Execution ---

# 1. Load the raw auction export
df = pd.read_csv("../data/cars_and_bids_full_history_v3.csv") # Replace with your new file name

# 2. Parse the target column; prices that cannot be parsed become missing,
#    and rows without a usable 'Sold_Price' are removed.
df['Sold_Price'] = df['Sold_Price'].apply(clean_currency)
df = df.dropna(subset=['Sold_Price'])

# 3-5. Column-wise cleaning (numeric, text, then colors), applied in this order.
column_cleaners = [
    ('Mileage', clean_mileage),
    ('Model', clean_model),
    ('Title Status', clean_title),
    ('Exterior Color', get_main_color),
    ('Interior Color', get_main_color),
]
for column_name, cleaner in column_cleaners:
    df[column_name] = df[column_name].apply(cleaner)

# 6. Feature engineering: split 'Transmission' and 'Engine' into structured columns.
transmission_features = [
    ('Transmission_Type', clean_transmission_type),
    ('Gears', extract_gears),
]
for column_name, extractor in transmission_features:
    df[column_name] = df['Transmission'].apply(extractor)

# Each element of engine_data is a (displacement, cylinder layout) pair.
engine_data = df['Engine'].apply(extract_engine_info)
df['Engine_Displacement_L'] = [pair[0] for pair in engine_data]
df['Engine_Cylinders'] = [pair[1] for pair in engine_data]

# Text- and date-derived features come from the deployment utils module.
df = engineer_sharp_features(df)
df = engineer_date_features(df, is_inference=False)
# Rows with no 'auction_year' after date feature engineering are discarded.
df = df.dropna(subset=['auction_year'])

# 7. Drop the raw columns that are no longer needed
cols_to_drop = ['URL', 'Transmission', 'Engine']
df_cleaned = df.drop(columns=cols_to_drop)
pd.set_option('display.max_columns', None)
df_cleaned.head()



Unnamed: 0,Sold_Price,Bids,Views,Watchers,Auction_Date,Make,Model,Mileage,Title Status,Seller Type,Drivetrain,Body Style,Exterior Color,Interior Color,Highlights,Equipment,Modifications,Known Flaws,Recent Service History,Ownership History,Other Items Included in Sale,Seller Notes,Transmission_Type,Gears,Engine_Displacement_L,Engine_Cylinders,is_single_owner,2_keys_ind,is_dry_climate_car,has_full_service_records,is_no_reserve,is_all_original,is_numbers_matching,is_project_car,has_new_tires,has_sport_seats,emissions_ind,loan_ind,one_owner_ind,carfax_ind,flaw_severity_score,recent_major_service,mod_status,auction_year,auction_month,car_age
0,66000.0,26,6277,447.0,"Feb 18, 2026 1:48 PM MST",Ford,F-150,17600.0,Clean,Dealer ($200 Document Fee),4WD/AWD,Truck,White,Black,this... is a 2017 ford shelby f-150 supercrew ...,"a window sticker is pictured in the gallery, a...",,some chips around the exterior scratches on th...,the attached carfax history report shows that ...,the selling dealer reportedly acquired this f-...,2 keys shelby key tag,the selling dealer charges a $200 documentatio...,Automatic,6.0,5.0,V8,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,unknown_mod,2026,2,9.0
1,51000.0,22,11947,962.0,"Feb 18, 2026 1:46 PM MST",Audi,R8,41000.0,Clean,Private Party,4WD/AWD,Coupe,Blue,Beige,"this... is a 2009 audi r8 coupe, finished in j...",a build sheet is provided in the photo gallery...,,scratches on front end curb rash on passenger-...,a service log and receipts are included in the...,the seller reportedly purchased this r8 in jun...,"2 key fobs, 1 valet key, 1 blank key owner's m...",the seller reports that paint protection film ...,Automatic,6.0,4.2,V8,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,unknown_mod,2026,2,17.0
2,17000.0,49,10966,662.0,"Feb 18, 2026 1:38 PM MST",Toyota,Land Cruiser,310800.0,Clean,Dealer,4WD/AWD,SUV/Crossover,Green,Other,"this… is a 2004 toyota land cruiser, finished ...",a partial list of notable equipment reported b...,notable modifications reported by the selling ...,chips and scratches around the exterior wear o...,the attached carfax history report shows that ...,the selling dealer reports that they acquired ...,1 key owner's manuals,the selling dealer states that the windows hav...,Automatic,5.0,4.7,V8,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,unknown_mod,2026,2,22.0
3,29888.0,43,16565,957.0,"Feb 18, 2026 1:36 PM MST",BMW,E60 M5,,Clean,Dealer,Rear-wheel drive,Sedan,Silver,Black,"this... is a 2007 bmw m5, finished in silverst...",a build sheet is provided in the photo gallery...,notable modifications reported by the selling ...,the attached carfax history report indicates t...,service documentation in the photo gallery ind...,the selling dealer reports that they acquired ...,1 key owner's manual factory idrive display,"due to the modifications performed to this m5,...",Manual,6.0,5.0,V10,0,0,0,0,0,0,0,0,0,0,1,0,0,1,45,1,heavy_mod,2026,2,19.0
4,29995.0,30,8501,585.0,"Feb 18, 2026 1:33 PM MST",Rover,Mini,,Clean,Private Party,Front-wheel drive,Hatchback,Green,Black,"this... is a 1998 rover mini, finished in brit...",a partial list of notable equipment reported b...,this mini has been modified with the john coop...,creases on seats wear on interior touch-points,the seller reports that the following maintena...,the seller reports that they purchased this mi...,2 keys owner's manual,"since this mini is an imported vehicle, it may...",Manual,4.0,1.3,I4,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1,light_mod,2026,2,28.0


In [2]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# 1. Load the cleaned data
df = df_cleaned.copy()

# 2. Pre-Encoding Cleanup
# Consolidate "Dealer ($477 Document Fee)" into just "Dealer".
# The isinstance guard leaves missing (NaN) seller types untouched instead of
# raising "TypeError: argument of type 'float' is not iterable".
df['Seller Type'] = df['Seller Type'].apply(
    lambda x: 'Dealer' if isinstance(x, str) and 'Dealer' in x else x
)

# 3. Define Encoding Strategy
# Low cardinality columns -> One-Hot Encoding
one_hot_cols = [
    'Title Status', 
    'Seller Type', 
    'Drivetrain', 
    'Transmission_Type', 
    'Body Style', 
    'Engine_Cylinders',
    'mod_status'
]

# High cardinality columns -> Label Encoding
label_cols = [
    'Make', 
    'Exterior Color', 
    'Interior Color'
]

# NOTE: 'Model' is removed from label_cols because we will Target Encode it.

# 4. Apply One-Hot Encoding (first level of each column is dropped as the baseline)
df_encoded = pd.get_dummies(df, columns=one_hot_cols, drop_first=True)

# 5. Apply Label Encoding; the fitted encoders are kept so they can be saved later
label_encoders = {}
for col in label_cols:
    le = LabelEncoder()
    df_encoded[col] = le.fit_transform(df_encoded[col])
    label_encoders[col] = le

# 6. Apply Target Encoding for 'Model'
# Calculate the mean 'Sold_Price' for each 'Model'
# NOTE(review): the mean is taken over all rows, so each row's own Sold_Price
# contributes to its encoded value (a model seen once encodes to exactly its
# own price). Consider smoothing or out-of-fold means before training on it.
model_target_map = df_encoded.groupby('Model')['Sold_Price'].mean()

# Map the means to the 'Model' column
df_encoded['Model_Target_Encoded'] = df_encoded['Model'].map(model_target_map)

# Drop the original categorical 'Model' column
df_encoded = df_encoded.drop(columns=['Model'])

# 7. Final Inspection
print("New Data Shape:", df_encoded.shape)
print(df_encoded[['Sold_Price', 'Model_Target_Encoded']].head())

# Save the encoded frame for modeling (this write runs on every execution of the cell)
df_encoded.to_csv('../data/full_training_data.csv', index=False)
df_encoded

New Data Shape: (4319, 84)
   Sold_Price  Model_Target_Encoded
0     66000.0          43545.115385
1     51000.0         108084.769231
2     17000.0          20235.769231
3     29888.0          29194.666667
4     29995.0          29995.000000


Unnamed: 0,Sold_Price,Bids,Views,Watchers,Auction_Date,Make,Mileage,Exterior Color,Interior Color,Highlights,...,Engine_Cylinders_I6,Engine_Cylinders_V10,Engine_Cylinders_V12,Engine_Cylinders_V6,Engine_Cylinders_V8,Engine_Cylinders_W12,Engine_Cylinders_W8,mod_status_light_mod,mod_status_unknown_mod,Model_Target_Encoded
0,66000.0,26,6277,447.0,"Feb 18, 2026 1:48 PM MST",28,17600.0,13,1,this... is a 2017 ford shelby f-150 supercrew ...,...,False,False,False,False,True,False,False,False,True,43545.115385
1,51000.0,22,11947,962.0,"Feb 18, 2026 1:46 PM MST",5,41000.0,2,0,"this... is a 2009 audi r8 coupe, finished in j...",...,False,False,False,False,True,False,False,False,True,108084.769231
2,17000.0,49,10966,662.0,"Feb 18, 2026 1:38 PM MST",85,310800.0,6,8,"this… is a 2004 toyota land cruiser, finished ...",...,False,False,False,False,True,False,False,False,True,20235.769231
3,29888.0,43,16565,957.0,"Feb 18, 2026 1:36 PM MST",9,,11,1,"this... is a 2007 bmw m5, finished in silverst...",...,False,True,False,False,False,False,False,False,False,29194.666667
4,29995.0,30,8501,585.0,"Feb 18, 2026 1:33 PM MST",73,,6,1,"this... is a 1998 rover mini, finished in brit...",...,False,False,False,False,False,False,False,True,False,29995.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4346,10250.0,11,11182,,"Aug 28, 2025 11:21 AM MST",85,210400.0,1,1,this… is a 2012 toyota tundra trd rock warrior...,...,False,False,False,False,True,False,False,False,True,25895.000000
4347,39000.0,63,11503,,"Aug 28, 2025 11:15 AM MST",4,30100.0,11,12,this… is a 2009 aston martin v8 vantage roadst...,...,False,False,False,False,True,False,False,False,True,49512.062500
4348,14000.0,60,10348,,"Aug 28, 2025 11:15 AM MST",45,263300.0,8,8,"this... is a 2004 lexus gx 470, finished in bl...",...,False,False,False,False,True,False,False,True,False,19112.925926
4349,42001.0,34,4531,,"Aug 28, 2025 11:08 AM MST",40,16300.0,8,1,"this... is a 2024 jeep wrangler rubicon x 4xe,...",...,False,False,False,False,False,False,False,False,True,21137.777778


In [5]:
# Bundle everything needed to reproduce the encodings outside this notebook.
artifacts = dict(
    label_encoders=label_encoders,        # The dictionary of fitted LabelEncoders
    target_encoder_map=model_target_map,  # The Series mapping Model -> mean Sold_Price
)

# Serialize the bundle next to the deployment code.
with open('../deployment/encoding_artifacts_002.pkl', 'wb') as f:
    pickle.dump(artifacts, f)