# Data Cleaning and Preprocessing for `car_prices.csv`

In [None]:
import pandas as pd

# Categorical columns whose gaps are replaced by the literal 'unknown'.
CATEGORICAL_FILL_COLUMNS = [
    'make', 'model', 'trim', 'body', 'transmission', 'color', 'interior', 'seller',
]
# Numeric columns whose gaps are replaced by the column median.
MEDIAN_FILL_COLUMNS = ['condition', 'odometer', 'mmr', 'sellingprice']
# Free-text columns normalised to lowercase with surrounding whitespace removed.
TEXT_COLUMNS = ['make', 'model', 'trim', 'body', 'transmission', 'color', 'interior']


def clean_car_prices(df: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned copy of the raw car-prices table.

    Steps, in order: fill categorical gaps with ``'unknown'``, fill numeric
    gaps with the column median, fill ``state`` with its most frequent
    value, parse ``saledate`` (unparseable values are forward-filled from
    the previous row), drop rows that still contain nulls, normalise the
    text columns, drop duplicate rows, and rename columns to snake_case.

    Args:
        df: Raw table. Must contain the columns named in
            ``CATEGORICAL_FILL_COLUMNS``, ``MEDIAN_FILL_COLUMNS``, plus
            ``state`` and ``saledate``.

    Returns:
        A new DataFrame; the input is not modified.

    Raises:
        KeyError: If any of the required columns is missing.
    """
    # Work on a copy so the column assignments below never touch the caller's frame.
    df = df.copy()

    # Fill missing categorical values with 'unknown'
    df[CATEGORICAL_FILL_COLUMNS] = df[CATEGORICAL_FILL_COLUMNS].fillna('unknown')

    # Fill numerical columns with median
    for col in MEDIAN_FILL_COLUMNS:
        df[col] = df[col].fillna(df[col].median())

    # Fill small gaps with mode. mode() is empty when every value is missing;
    # in that case leave the gaps for the dropna() below instead of raising.
    state_mode = df['state'].mode()
    if not state_mode.empty:
        df['state'] = df['state'].fillna(state_mode.iloc[0])

    # Convert 'saledate' to datetime; unparseable values become NaT and are
    # then forward-filled. Series.ffill() replaces the deprecated
    # fillna(method='ffill') spelling.
    df['saledate'] = pd.to_datetime(df['saledate'], errors='coerce')
    df['saledate'] = df['saledate'].ffill()

    # Drop any remaining nulls (columns not filled above, or a leading NaT
    # in 'saledate' that had no earlier row to fill from).
    df = df.dropna()

    # Standardize text: lowercase, surrounding whitespace removed.
    for col in TEXT_COLUMNS:
        df[col] = df[col].str.lower().str.strip()

    # Remove duplicate rows. This must come after text standardization,
    # otherwise rows differing only by case or padding are kept as distinct.
    df = df.drop_duplicates()

    # Rename columns to snake_case
    df.columns = df.columns.str.lower().str.replace(' ', '_')

    return df


# The guard is true when this runs as a script or a notebook cell, so `df`
# is still defined at top level there; importing the file no longer does I/O.
if __name__ == "__main__":
    # Load the dataset
    df = clean_car_prices(pd.read_csv('car_prices.csv'))

    # Save cleaned data
    df.to_csv('car_prices_cleaned.csv', index=False)

    # Final check
    print("Cleaning complete. Missing values summary:\n")
    print(df.isnull().sum())