In [9]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


#import the data
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

print(f"\nOriginal train shape: {train_df.shape}")
print(f"Original test shape: {test_df.shape}")




Original train shape: (1460, 81)
Original test shape: (1459, 80)


In [10]:
# Save target and IDs
y_train = train_df['SalePrice']
train_df = train_df.drop('SalePrice', axis=1)
test_ids = test_df['Id']

# Combine for preprocessing
all_data = pd.concat([train_df, test_df], axis=0, ignore_index=True)
print(f"Combined data shape: {all_data.shape}")

Combined data shape: (2919, 80)


In [11]:
print("\n" + "=" * 60)
print("HANDLING MISSING VALUES")
print("=" * 60)

# Categorical: NaN means "None"
none_cols = ['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu',
             'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond',
             'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
             'MasVnrType']
for col in none_cols:
    if col in all_data.columns:
        all_data[col] = all_data[col].fillna('None')

# Numerical: NaN means 0
zero_cols = ['GarageYrBlt', 'GarageArea', 'GarageCars', 'BsmtFinSF1', 
             'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'BsmtFullBath', 
             'BsmtHalfBath', 'MasVnrArea']
for col in zero_cols:
    if col in all_data.columns:
        all_data[col] = all_data[col].fillna(0)

# LotFrontage: Fill with median by neighborhood
if 'LotFrontage' in all_data.columns:
    all_data['LotFrontage'] = all_data.groupby('Neighborhood')['LotFrontage'].transform(
        lambda x: x.fillna(x.median()))

# Fill remaining with mode
for col in all_data.columns:
    if all_data[col].isnull().sum() > 0:
        if all_data[col].dtype == 'object':
            all_data[col] = all_data[col].fillna(all_data[col].mode()[0])
        else:
            all_data[col] = all_data[col].fillna(all_data[col].median())

print(f"Missing values after cleaning: {all_data.isnull().sum().sum()}")


HANDLING MISSING VALUES
Missing values after cleaning: 0


In [12]:
print("\n" + "=" * 60)
print("FEATURE ENGINEERING")
print("=" * 60)

# Total square footage
all_data['TotalSF'] = all_data['TotalBsmtSF'] + all_data['1stFlrSF'] + all_data['2ndFlrSF']

# Total bathrooms
all_data['TotalBath'] = (all_data['FullBath'] + 0.5 * all_data['HalfBath'] + 
                          all_data['BsmtFullBath'] + 0.5 * all_data['BsmtHalfBath'])

# House age
all_data['HouseAge'] = all_data['YrSold'] - all_data['YearBuilt']
all_data['RemodAge'] = all_data['YrSold'] - all_data['YearRemodAdd']

# Binary features
all_data['HasPool'] = (all_data['PoolArea'] > 0).astype(int)
all_data['HasGarage'] = (all_data['GarageArea'] > 0).astype(int)
all_data['HasBsmt'] = (all_data['TotalBsmtSF'] > 0).astype(int)
all_data['HasFireplace'] = (all_data['Fireplaces'] > 0).astype(int)
all_data['Has2ndFloor'] = (all_data['2ndFlrSF'] > 0).astype(int)

print("Created features: TotalSF, TotalBath, HouseAge, RemodAge, and binary indicators")



FEATURE ENGINEERING
Created features: TotalSF, TotalBath, HouseAge, RemodAge, and binary indicators


In [13]:
print("\n" + "=" * 60)
print("ENCODING CATEGORICAL VARIABLES")
print("=" * 60)

# Drop Id column
all_data = all_data.drop('Id', axis=1)

# One-hot encode
categorical_cols = all_data.select_dtypes(include=['object']).columns
all_data = pd.get_dummies(all_data, columns=categorical_cols, drop_first=True)

print(f"Shape after encoding: {all_data.shape}")
print(f"Total features: {all_data.shape[1]}")


ENCODING CATEGORICAL VARIABLES
Shape after encoding: (2919, 268)
Total features: 268


In [14]:
train_processed = all_data[:len(train_df)]
test_processed = all_data[len(train_df):]

# Remove outliers
outliers = train_processed[(train_processed['GrLivArea'] > 4000)].index
if len(outliers) > 0:
    train_processed = train_processed.drop(outliers)
    y_train = y_train.drop(outliers)
    print(f"\nRemoved {len(outliers)} outliers")

# Prepare final datasets
X_train = train_processed
y_train = np.log1p(y_train)  # Log transform target for better performance

print(f"\nFinal train shape: {X_train.shape}")
print(f"Final test shape: {test_processed.shape}")



Removed 4 outliers

Final train shape: (1456, 268)
Final test shape: (1459, 268)


In [15]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import warnings

print("\n" + "=" * 60)
print("SPLITTING DATA FOR VALIDATION")
print("=" * 60)

X_tr, X_val, y_tr, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)
print(f"Training set: {X_tr.shape}")
print(f"Validation set: {X_val.shape}")



SPLITTING DATA FOR VALIDATION
Training set: (1164, 268)
Validation set: (292, 268)


In [16]:
scaler = StandardScaler()
X_tr_scaled = scaler.fit_transform(X_tr)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(test_processed)

# ============================================
# STEP 8: Train Multiple Models
# ============================================
print("\n" + "=" * 60)
print("TRAINING MODELS")
print("=" * 60)

models = {
    'Linear Regression': LinearRegression(),
    'Ridge Regression': Ridge(alpha=10),
    'Lasso Regression': Lasso(alpha=0.001, max_iter=10000),
    'Random Forest': RandomForestRegressor(n_estimators=100, max_depth=15, random_state=42, n_jobs=-1),
    'Gradient Boosting': GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42)
}

results = {}

for name, model in models.items():
    print(f"\nTraining {name}...")
    
    # Train
    model.fit(X_tr_scaled, y_tr)
    
    # Predict
    y_pred_val = model.predict(X_val_scaled)
    
    # Calculate metrics
    rmse = np.sqrt(mean_squared_error(y_val, y_pred_val))
    r2 = r2_score(y_val, y_pred_val)
    mae = mean_absolute_error(y_val, y_pred_val)
    
    results[name] = {'RMSE': rmse, 'R2': r2, 'MAE': mae, 'model': model}
    
    print(f"  RMSE: {rmse:.4f}")
    print(f"  R² Score: {r2:.4f}")
    print(f"  MAE: {mae:.4f}")


TRAINING MODELS

Training Linear Regression...
  RMSE: 0.1300
  R² Score: 0.8930
  MAE: 0.0881

Training Ridge Regression...
  RMSE: 0.1293
  R² Score: 0.8942
  MAE: 0.0874

Training Lasso Regression...
  RMSE: 0.1256
  R² Score: 0.9002
  MAE: 0.0847

Training Random Forest...
  RMSE: 0.1442
  R² Score: 0.8684
  MAE: 0.0970

Training Gradient Boosting...
  RMSE: 0.1339
  R² Score: 0.8864
  MAE: 0.0924


In [17]:
print("\n" + "=" * 60)
print("MODEL COMPARISON")
print("=" * 60)

results_df = pd.DataFrame({
    'Model': list(results.keys()),
    'RMSE': [results[m]['RMSE'] for m in results.keys()],
    'R²': [results[m]['R2'] for m in results.keys()],
    'MAE': [results[m]['MAE'] for m in results.keys()]
})
results_df = results_df.sort_values('RMSE')
print(results_df.to_string(index=False))

best_model_name = results_df.iloc[0]['Model']
best_model = results[best_model_name]['model']
print(f"\n🏆 Best Model: {best_model_name}")


MODEL COMPARISON
            Model     RMSE       R²      MAE
 Lasso Regression 0.125552 0.900193 0.084727
 Ridge Regression 0.129287 0.894166 0.087409
Linear Regression 0.129995 0.893005 0.088145
Gradient Boosting 0.133921 0.886444 0.092401
    Random Forest 0.144190 0.868362 0.097021

🏆 Best Model: Lasso Regression


In [18]:
print("\n" + "=" * 60)
print("MAKING PREDICTIONS")
print("=" * 60)

# Retrain on full training data
best_model.fit(scaler.fit_transform(X_train), y_train)

# Predict on test set
test_predictions = best_model.predict(X_test_scaled)
test_predictions = np.expm1(test_predictions) 


MAKING PREDICTIONS


In [19]:
submission = pd.DataFrame({
    'Id': test_ids,
    'SalePrice': test_predictions
})

submission.to_csv('submission.csv', index=False)
print("\n✅ Predictions saved to 'submission.csv'")

# Show sample predictions
print("\nSample predictions:")
print(submission.head(10))

print("\n" + "=" * 60)
print("MODEL BUILDING COMPLETE!")
print("=" * 60)
print(f"Best Model: {best_model_name}")
print(f"Best RMSE: {results[best_model_name]['RMSE']:.4f}")
print(f"Best R² Score: {results[best_model_name]['R2']:.4f}")
print("\nSubmission file ready for Kaggle!")


✅ Predictions saved to 'submission.csv'

Sample predictions:
     Id      SalePrice
0  1461  119832.961635
1  1462  154393.364686
2  1463  180584.702633
3  1464  199502.582029
4  1465  194821.135555
5  1466  171597.223133
6  1467  180924.475161
7  1468  163755.070880
8  1469  195668.725512
9  1470  119001.910842

MODEL BUILDING COMPLETE!
Best Model: Lasso Regression
Best RMSE: 0.1256
Best R² Score: 0.9002

Submission file ready for Kaggle!


In [20]:
# ============================================
# SAVE THE TRAINED MODEL
# ============================================
print("\n" + "=" * 60)
print("SAVING THE MODEL")
print("=" * 60)

import joblib
import os

# Create a model package with everything needed for predictions
model_package = {
    'best_model': best_model,
    'scaler': scaler,
    'feature_names': list(X_train.columns),
    'preprocessing_info': {
        'target_log_transformed': True,  # Remember we used log transform
        'outliers_removed': True,
        'training_date': pd.Timestamp.now().isoformat()
    },
    'performance': {
        'best_model_name': best_model_name,
        'rmse': results[best_model_name]['RMSE'],
        'r2_score': results[best_model_name]['R2']
    }
}

# Save the complete package
joblib.dump(model_package, 'sa_house_price_model.joblib')
print("✅ Model saved as 'sa_house_price_model.joblib'")

# Also save just the model separately for quick access
joblib.dump(best_model, 'best_model_only.joblib')
print("✅ Best model saved separately as 'best_model_only.joblib'")

# Print model details
print(f"\n📊 Model Details:")
print(f"   - Model Type: {type(best_model).__name__}")
print(f"   - Features: {len(X_train.columns)}")
print(f"   - Training Samples: {X_train.shape[0]}")
print(f"   - Validation RMSE: {results[best_model_name]['RMSE']:.4f}")
print(f"   - Validation R²: {results[best_model_name]['R2']:.4f}")

# Verify the save worked
try:
    loaded_package = joblib.load('sa_house_price_model.joblib')
    print("✅ Model verification: Load successful!")
    print(f"✅ Package contains: {list(loaded_package.keys())}")
except Exception as e:
    print(f"❌ Error verifying model: {e}")


SAVING THE MODEL
✅ Model saved as 'sa_house_price_model.joblib'
✅ Best model saved separately as 'best_model_only.joblib'

📊 Model Details:
   - Model Type: Lasso
   - Features: 268
   - Training Samples: 1456
   - Validation RMSE: 0.1256
   - Validation R²: 0.9002
✅ Model verification: Load successful!
✅ Package contains: ['best_model', 'scaler', 'feature_names', 'preprocessing_info', 'performance']
