In [1]:
# Final model: cleaned-up training script
import pandas as pd
import numpy as np
import joblib
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import Lasso

# 1. Load and Clean Data
# Read the training set, then discard the documented outliers: rows with
# GrLivArea above 4000 whose SalePrice is nevertheless below 300000.
df = pd.read_csv('train.csv')
is_outlier = (df['GrLivArea'] > 4000) & (df['SalePrice'] < 300000)
df = df.drop(index=df.index[is_outlier])

# 2. Separate Target and Features
# CRITICAL: the target is log1p(SalePrice), so anything fitted on `y` predicts
# on that log scale (np.expm1 is the inverse transform).
X = df.drop(columns=['SalePrice'])
y = np.log1p(df['SalePrice'])

# 3. Define the Pipeline (Same as before)
# Split the columns by dtype family instead of by exact dtype name. The
# ColumnTransformer below keeps its default remainder='drop', so a column that
# lands in neither list is silently discarded; matching only 'int64',
# 'float64' and 'object' would lose e.g. int32/float32 or category columns.
# For int64/float64/object data the two lists are unchanged.
num_features = X.select_dtypes(include='number').columns
cat_features = X.select_dtypes(exclude='number').columns

# Numeric columns: fill gaps with the column median, then standardize.
num_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Categorical columns: a missing value becomes its own 'None' category before
# one-hot encoding; categories not seen during fit are ignored
# (handle_unknown='ignore') instead of raising.
cat_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='None')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

# Route each column group through its own transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_transformer, num_features),
        ('cat', cat_transformer, cat_features)
    ])

# 4. Create and Train the Final Pipeline
# Preprocessing followed by a Lasso regressor using the best alpha we
# found (0.0005), so raw feature frames can be passed straight to the model.
final_steps = [
    ('preprocessor', preprocessor),
    ('regressor', Lasso(alpha=0.0005)),
]
final_model = Pipeline(steps=final_steps)

print("Training final model on full dataset...")
final_model.fit(X, y)

# 5. Calculate Default Values (The "Average" House)
# We need this to fill in the 75 features the user doesn't see
defaults = {}
for col in X.columns:
    column = X[col]
    if pd.api.types.is_numeric_dtype(column):
        # Median number (median() skips missing values).
        defaults[col] = column.median()
        continue
    # Most common value, counting "missing" as a candidate. mode() drops NaN
    # by default, which would promote a rare real label to the "typical"
    # value of a mostly-empty column and raise on an entirely empty one.
    most_common = column.mode(dropna=False)
    top = most_common.iloc[0] if len(most_common) else np.nan
    # Store missing as 'None', the same constant the pipeline's categorical
    # imputer substitutes for NaN, so the default maps to that category.
    defaults[col] = 'None' if pd.isna(top) else top

# 6. Save BOTH files
# Persist the fitted pipeline and the defaults dict, then report the paths.
for artifact, target_path in (
    (final_model, 'house_price_model.pkl'),
    (defaults, 'house_price_defaults.pkl'),
):
    joblib.dump(artifact, target_path)

print(
    "SUCCESS!",
    "1. Model saved as 'house_price_model.pkl'",
    "2. Defaults saved as 'house_price_defaults.pkl'",
    sep="\n",
)

Training final model on full dataset...
SUCCESS!
1. Model saved as 'house_price_model.pkl'
2. Defaults saved as 'house_price_defaults.pkl'
