In [1]:
import pandas as pd
import re
from sklearn.preprocessing import LabelEncoder
import pickle
import sys
import os

# Add the deployment folder to the system path so we can import utils
sys.path.append(os.path.abspath('../deployment'))

from utils import engineer_sharp_features, engineer_date_features

def clean_currency(x):
    """
    Parse a currency value such as '$8,400' into a float (8400.0).

    Missing values return None. Strings have '$' and ',' removed and are
    trimmed before conversion; if the remainder is not numeric
    (e.g. 'Porsche') the result is None. Non-string values are passed
    straight to float().
    """
    if pd.isna(x):
        return None
    if not isinstance(x, str):
        return float(x)

    stripped = x.replace('$', '').replace(',', '').strip()
    try:
        amount = float(stripped)
    except ValueError:
        # Free text landed in the price column: report "no price"
        # rather than raising.
        return None
    return amount

def clean_mileage(x):
    """
    Parse a mileage value such as '53,700' into a float (53700.0).

    Missing values return None. Strings have thousands separators removed
    and are trimmed before conversion; non-numeric text yields None.
    Non-string values are passed straight to float().
    """
    if pd.isna(x):
        return None
    if not isinstance(x, str):
        return float(x)

    digits = x.replace(',', '').strip()
    try:
        miles = float(digits)
    except ValueError:
        # Text that is not a number is treated as "unknown mileage".
        return None
    return miles

def clean_model(x):
    """
    Remove every newline-plus-'Save' fragment from a model name and trim
    surrounding whitespace. Missing values are returned unchanged.
    """
    if not pd.isna(x):
        without_save = x.replace('\nSave', '')
        return without_save.strip()
    return x

def clean_title(x):
    """
    Keep only the text before the first '(' in a title-status string,
    trimmed of surrounding whitespace. Missing values are returned unchanged.
    """
    if pd.isna(x):
        return x
    head, _, _ = x.partition('(')
    return head.strip()

def clean_transmission_type(x):
    """
    Return the transmission type, i.e. the text before the first '('
    (the part that precedes a suffix such as '(6-Speed)'), trimmed.
    Missing values are returned unchanged.
    """
    if not pd.isna(x):
        kind = x.split('(', 1)[0]
        return kind.strip()
    return x

def extract_gears(x):
    """
    Return the gear count from text containing '(<N>-Speed)' as an int.

    Returns None for missing values and for text without that pattern.
    """
    if pd.isna(x):
        return None
    found = re.search(r'\((\d+)-Speed\)', x)
    return int(found.group(1)) if found else None

def extract_engine_info(x):
    """
    Split an engine description into displacement and cylinder layout.

    Returns a tuple (displacement_litres, cylinder_code):
      * displacement_litres -- float taken from the first '<digits>.<digits>L'
        occurrence (e.g. 4.3 from '4.3L V8'), or None if absent.
      * cylinder_code -- the first layout letter (V, I, H or W) followed by
        digits (e.g. 'V8', 'I6'), or None if absent.

    Missing values return (None, None).
    """
    if pd.isna(x):
        return None, None
    disp = re.search(r'(\d+\.\d+)L', x)
    # Inside a character class '|' is a literal, not alternation, so the
    # previous '[V|I|H|W]' also accepted text like '|6'. List the letters only.
    cyl = re.search(r'([VIHW]\d+)', x)
    d_val = float(disp.group(1)) if disp else None
    c_val = cyl.group(1) if cyl else None
    return d_val, c_val

def get_main_color(x):
    """
    Reduce a free-text color description to one basic color name.

    Only the part before the first '/' is considered. Matching is
    case-insensitive substring search, in this order:
      1. marketing names (e.g. 'bianco' -> 'White'), in the order listed;
      2. basic color names, in the order listed ('Grey' is reported as 'Gray').

    Returns "Unknown" for missing values and "Other" when nothing matches.
    """
    if pd.isna(x):
        return "Unknown"

    primary = x.split('/')[0].strip().lower()

    # Brand/marketing names mapped to the basic color they stand for.
    marketing_names = {
        'bianco': 'White', 'salsa': 'Red', 'granite': 'Gray', 'anthracite': 'Gray',
        'carbon': 'Black', 'jade': 'Green', 'cypress': 'Green', 'ebony': 'Black',
        'linen': 'Beige', 'cream': 'Beige', 'macchiato': 'Brown', 'charcoal': 'Gray',
        'graphite': 'Gray', 'slate': 'Gray', 'chalk': 'White', 'cocoa': 'Brown'
    }
    for fragment in marketing_names:
        if fragment in primary:
            return marketing_names[fragment]

    basic_colors = ('Black', 'White', 'Gray', 'Grey', 'Silver', 'Red', 'Blue',
                    'Green', 'Brown', 'Beige', 'Yellow', 'Orange', 'Gold', 'Purple', 'Tan')
    hit = next((name for name in basic_colors if name.lower() in primary), None)
    if hit is None:
        return "Other"
    # Normalise the British spelling so both variants share one category.
    return 'Gray' if hit == 'Grey' else hit

# --- Main Execution ---
# Cleans the raw auction history into `df_cleaned`, which the encoding cell
# below copies as its starting point. Steps are order-dependent: the helper
# columns derived from 'Transmission' and 'Engine' must exist before those
# source columns are dropped in step 7.

# 1. Load Data
df = pd.read_csv("../data/cars_and_bids_full_history.csv") # Path is relative to the notebook; point it at the latest export

# 2. Clean Target Variable (Price)
# clean_currency returns None for anything that is not a parseable amount.
df['Sold_Price'] = df['Sold_Price'].apply(clean_currency)

# Drop rows with no usable target (missing price, or text such as "Porsche"
# that clean_currency turned into None). The index is NOT reset afterwards,
# so row labels keep gaps where rows were removed.
df = df.dropna(subset=['Sold_Price'])

# 3. Clean Numerical Features
# Unparseable mileage becomes missing; those rows are kept.
df['Mileage'] = df['Mileage'].apply(clean_mileage)

# 4. Clean Text Features
df['Model'] = df['Model'].apply(clean_model)
df['Title Status'] = df['Title Status'].apply(clean_title)

# 5. Apply Color Cleaning
# Free-text colors are collapsed to a basic color, "Unknown" or "Other".
df['Exterior Color'] = df['Exterior Color'].apply(get_main_color)
df['Interior Color'] = df['Interior Color'].apply(get_main_color)

# 6. Feature Engineering
# Both transmission features are read from the same raw 'Transmission' text.
df['Transmission_Type'] = df['Transmission'].apply(clean_transmission_type)
df['Gears'] = df['Transmission'].apply(extract_gears)

# extract_engine_info returns a (displacement, cylinders) tuple per row;
# unpack it positionally into two columns.
engine_data = df['Engine'].apply(extract_engine_info)
df['Engine_Displacement_L'] = [x[0] for x in engine_data]
df['Engine_Cylinders'] = [x[1] for x in engine_data]

# APPLY TEXT FEATURE ENGINEERING
# Helper imported from ../deployment/utils; its behavior is not visible here.
# NOTE(review): presumably this adds the 'mod_status' column that the encoding
# cell one-hot encodes -- confirm in utils.
df = engineer_sharp_features(df)

# 7. Drop Unused Columns
# 'Transmission' and 'Engine' have been replaced by the derived columns above.
cols_to_drop = ['URL', 'Transmission', 'Engine'] 
df_cleaned = df.drop(columns=cols_to_drop)
df_cleaned.head()



Unnamed: 0,Sold_Price,Make,Model,Mileage,Title Status,Seller Type,Drivetrain,Body Style,Exterior Color,Interior Color,Transmission_Type,Gears,Engine_Displacement_L,Engine_Cylinders
0,356000.0,Porsche,992 911,70.0,Clean,Dealer,Rear-wheel drive,Coupe,Gray,Black,Manual,6.0,4.0,
1,12100.0,BMW,3 Series,138200.0,Clean,Private Party,Rear-wheel drive,Wagon,White,Black,Automatic,5.0,2.5,I6
2,42000.0,Toyota,Supra,21700.0,Clean,Private Party,Rear-wheel drive,Coupe,Black,Black,Automatic,8.0,3.0,I6
3,26250.0,BMW,F8X M4,91600.0,Clean,Dealer ($250 Doc Fee),Rear-wheel drive,Coupe,Gray,Orange,Manual,6.0,3.0,I6
4,58500.0,Ferrari,California,77800.0,Clean,Private Party,Rear-wheel drive,Convertible,White,Black,Automatic,7.0,4.3,V8


In [2]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Encodes the cleaned frame for modeling: one-hot for low-cardinality columns,
# label encoding for 'Make' and the two color columns, and a mean-price
# (target) encoding for 'Model'. Produces `df_encoded`, `label_encoders` and
# `model_target_map`, and writes the encoded frame to CSV.

# 1. Load the cleaned data
# Work on a copy so re-running this cell never mutates `df_cleaned`.
df = df_cleaned.copy()

# 2. Pre-Encoding Cleanup
# Consolidate "Dealer ($477 Document Fee)" into just "Dealer".
# The isinstance guard lets missing values (float NaN) pass through unchanged;
# without it `'Dealer' in x` raises TypeError on a NaN seller type.
df['Seller Type'] = df['Seller Type'].apply(
    lambda x: 'Dealer' if isinstance(x, str) and 'Dealer' in x else x
)

# 3. Define Encoding Strategy
# Low cardinality columns -> One-Hot Encoding
one_hot_cols = [
    'Title Status',
    'Seller Type',
    'Drivetrain',
    'Transmission_Type',
    'Body Style',
    'Engine_Cylinders',
    'mod_status'
]

# High cardinality columns -> Label Encoding
label_cols = [
    'Make',
    'Exterior Color',
    'Interior Color'
]

# NOTE: 'Model' is removed from label_cols because we will Target Encode it.

# 4. Apply One-Hot Encoding
# drop_first=True removes the first category of each column to avoid a
# redundant indicator.
df_encoded = pd.get_dummies(df, columns=one_hot_cols, drop_first=True)

# 5. Apply Label Encoding
# Keep each fitted encoder so the same mapping can be reused later.
label_encoders = {}
for col in label_cols:
    le = LabelEncoder()
    df_encoded[col] = le.fit_transform(df_encoded[col])
    label_encoders[col] = le

# 6. Apply Target Encoding for 'Model'
# Calculate the mean 'Sold_Price' for each 'Model'.
# NOTE(review): the mean is taken over every row, including each row's own
# Sold_Price, so this feature carries target information (strongest for
# models with few listings). If a train/test split happens downstream,
# consider fitting this map on the training portion only.
model_target_map = df_encoded.groupby('Model')['Sold_Price'].mean()

# Map the means to the 'Model' column
df_encoded['Model_Target_Encoded'] = df_encoded['Model'].map(model_target_map)

# Drop the original categorical 'Model' column
df_encoded = df_encoded.drop(columns=['Model'])

# 7. Final Inspection
print("New Data Shape:", df_encoded.shape)
print(df_encoded[['Sold_Price', 'Model_Target_Encoded']].head())

# Save for modeling
df_encoded.to_csv('../data/full_training_data.csv', index=False)
df_encoded

New Data Shape: (2918, 45)
   Sold_Price  Model_Target_Encoded
0    356000.0         191562.875000
1     12100.0          14453.783133
2     42000.0          54650.833333
3     26250.0          39381.812500
4     58500.0          66166.666667


Unnamed: 0,Sold_Price,Make,Mileage,Exterior Color,Interior Color,Gears,Engine_Displacement_L,Title Status_Bill of Sale,Title Status_Clean,Title Status_Exceeds Mechanical Limits,...,Engine_Cylinders_I4,Engine_Cylinders_I5,Engine_Cylinders_I6,Engine_Cylinders_V10,Engine_Cylinders_V12,Engine_Cylinders_V6,Engine_Cylinders_V8,Engine_Cylinders_W12,Engine_Cylinders_W8,Model_Target_Encoded
0,356000.0,60,70.0,5,1,6.0,4.0,False,True,False,...,False,False,False,False,False,False,False,False,False,191562.875000
1,12100.0,8,138200.0,13,1,5.0,2.5,False,True,False,...,False,False,True,False,False,False,False,False,False,14453.783133
2,42000.0,76,21700.0,1,1,8.0,3.0,False,True,False,...,False,False,True,False,False,False,False,False,False,54650.833333
3,26250.0,8,91600.0,5,6,6.0,3.0,False,True,False,...,False,False,True,False,False,False,False,False,False,39381.812500
4,58500.0,24,77800.0,13,1,7.0,4.3,False,True,False,...,False,False,False,False,False,False,True,False,False,66166.666667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3295,60000.0,2,2300.0,5,1,8.0,2.9,False,True,False,...,False,False,False,False,False,True,False,False,False,38364.875000
3296,16250.0,30,18300.0,7,9,6.0,2.4,False,True,False,...,True,False,False,False,False,False,False,False,False,11090.909091
3297,14000.0,79,78500.0,1,7,6.0,1.8,False,True,False,...,True,False,False,False,False,False,False,False,False,12745.833333
3298,15000.0,8,93700.0,1,1,6.0,3.0,False,True,False,...,False,False,True,False,False,False,False,False,False,19582.080000


In [5]:
# Bundle the fitted encoders so they can be loaded from the deployment folder.
artifacts = dict(
    # Column name -> fitted LabelEncoder
    label_encoders=label_encoders,
    # Series mapping each Model to its mean Sold_Price
    target_encoder_map=model_target_map,
)

# Binary mode is required for pickle output.
with open('../deployment/encoding_artifacts_002.pkl', 'wb') as f:
    pickle.dump(artifacts, f)