<a href="https://colab.research.google.com/github/molefimcm/my-colab-notebooks/blob/main/house_prices_advanced_regression_techniques.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# House Prices Project - Advanced Regression Techniques

# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.ensemble import GradientBoostingRegressor
import xgboost as xgb
import lightgbm as lgb
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

# Input data files live on Google Drive under
# "/content/drive/MyDrive/<MAIN_DIR>/<PROJECT_NAME>/<INPUT_SUBDIR>";
# the output folder is built the same way with <OUTPUT_SUBDIR> as the last part.
from google.colab import drive
import os

drive.mount('/content/drive')

# Path components of the project folder on Drive
MAIN_DIR = 'Notebook_Data'
PROJECT_NAME = 'house_price_prediction'
INPUT_SUBDIR = 'input'
OUTPUT_SUBDIR = 'output'

# Both directories share the same project root and differ only in the last part
INPUT_DIR, OUTPUT_DIR = (
    os.path.join('/content/drive/MyDrive', MAIN_DIR, PROJECT_NAME, leaf)
    for leaf in (INPUT_SUBDIR, OUTPUT_SUBDIR)
)

# Sanity check: list what is available in the input folder
print(os.listdir(INPUT_DIR))

# Load the data
train = pd.read_csv(os.path.join(INPUT_DIR, 'train.csv'))
test = pd.read_csv(os.path.join(INPUT_DIR, 'test.csv'))
print(f"Training set shape: {train.shape}")
print(f"Testing set shape: {test.shape}")

# Save the ID column for submission file
train_ID = train['Id']
test_ID = test['Id']

# Remove the ID column from the datasets so it is not used as a feature
train.drop('Id', axis=1, inplace=True)
test.drop('Id', axis=1, inplace=True)

# Log transform the target (log(1 + SalePrice)); predictions are mapped back
# with expm1 before they are written out
y_train = np.log1p(train['SalePrice'])

# Combine train and test for preprocessing.
# ntrain/ntest record the split point so the frames can be separated again.
ntrain = train.shape[0]
ntest = test.shape[0]
# ignore_index=True gives the combined frame a unique 0..n-1 row index.
# Without it the default row labels of train and test overlap (both start at
# 0), and duplicate labels make any label-aligned assignment on all_data
# fragile. Rows stay in train-then-test order, so positional slicing by
# ntrain is unaffected.
all_data = pd.concat([train.drop('SalePrice', axis=1), test], ignore_index=True)

# Handle missing values by feature type.
# The stages below run in order: later stages only see what earlier ones left.

# Categorical features with high missingness get an explicit 'None' category
none_fill_cols = [
    'PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu', 'GarageType',
    'GarageFinish', 'GarageQual', 'GarageCond', 'BsmtQual', 'BsmtCond',
    'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'MasVnrType',
]
all_data[none_fill_cols] = all_data[none_fill_cols].fillna('None')

# Features where NA means 0
zero_fill_cols = [
    'GarageYrBlt', 'GarageArea', 'GarageCars', 'BsmtFinSF1', 'BsmtFinSF2',
    'BsmtUnfSF', 'TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath', 'MasVnrArea',
]
all_data[zero_fill_cols] = all_data[zero_fill_cols].fillna(0)

# Lot Frontage: fill each gap with the median of the same neighborhood
lot_frontage_by_hood = all_data.groupby('Neighborhood')['LotFrontage']
all_data['LotFrontage'] = lot_frontage_by_hood.transform(
    lambda grp: grp.fillna(grp.median()))

# Any numerical NA still left is filled with that column's median
numeric_cols = all_data.select_dtypes(include=[np.number]).columns
column_medians = {c: all_data[c].median() for c in numeric_cols}
all_data[numeric_cols] = all_data[numeric_cols].fillna(column_medians)

# Any categorical NA still left is filled with that column's first mode
categorical_cols = all_data.select_dtypes(include=['object']).columns
column_modes = {c: all_data[c].mode()[0] for c in categorical_cols}
all_data[categorical_cols] = all_data[categorical_cols].fillna(column_modes)

# Feature Engineering
# New columns are appended in a fixed order; keep it stable, since the column
# order carries through to the model input matrix.

# Fixed reference year for the "years since" features. With a constant
# reference these features are plain offsets of YearBuilt / YearRemodAdd.
# NOTE(review): an age relative to the sale year may be more informative -
# confirm before changing, as it alters the model inputs.
REFERENCE_YEAR = 2023

# Combined floor area: basement + first floor + second floor
all_data['TotalSF'] = all_data['TotalBsmtSF'] + all_data['1stFlrSF'] + all_data['2ndFlrSF']

# Bathroom count, with half baths weighted 0.5
all_data['TotalBath'] = (
    all_data['FullBath'] + (0.5 * all_data['HalfBath'])
    + all_data['BsmtFullBath'] + (0.5 * all_data['BsmtHalfBath'])
)

# 0/1 presence flags: 1 when the source column is strictly positive.
# A vectorized comparison replaces the per-row Python lambda; the resulting
# values are the same.
indicator_sources = {
    'HasPool': 'PoolArea',
    'Has2ndFloor': '2ndFlrSF',
    'HasGarage': 'GarageArea',
    'HasBsmt': 'TotalBsmtSF',
    'HasFireplace': 'Fireplaces',
}
for flag_col, source_col in indicator_sources.items():
    all_data[flag_col] = (all_data[source_col] > 0).astype(int)

# Years elapsed between construction / remodel and the reference year
all_data['YearsSinceBuilt'] = REFERENCE_YEAR - all_data['YearBuilt']
all_data['YearsSinceRemod'] = REFERENCE_YEAR - all_data['YearRemodAdd']

# Combined porch area across the four porch columns
all_data['TotalPorchSF'] = (
    all_data['OpenPorchSF'] + all_data['EnclosedPorch']
    + all_data['3SsnPorch'] + all_data['ScreenPorch']
)

# Label encoding for ordinal features: each quality grade becomes an integer rank
ordinal_features = [
    'ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'HeatingQC',
    'KitchenQual', 'FireplaceQu', 'GarageQual', 'GarageCond', 'PoolQC',
]

# Grades listed from lowest to highest; the position in the tuple is the rank
# ('None' -> 0 ... 'Ex' -> 5)
quality_mapping = {
    grade: rank
    for rank, grade in enumerate(('None', 'Po', 'Fa', 'TA', 'Gd', 'Ex'))
}

# Series.map turns any value missing from the mapping into NaN
all_data[ordinal_features] = all_data[ordinal_features].apply(
    lambda column: column.map(quality_mapping))

# Fix skewed numerical features
from scipy.special import boxcox1p

# Features whose skewness exceeds this threshold are transformed
SKEW_THRESHOLD = 0.5
# Fixed Box-Cox lambda applied to every selected feature
BOXCOX_LAMBDA = 0.15

# Select numeric columns by dtype family. `dtypes != 'object'` would also
# admit bool / category / datetime columns, for which skewness and Box-Cox
# are not meaningful; select_dtypes(include=[np.number]) keeps only numbers.
numeric_feats = all_data.select_dtypes(include=[np.number]).columns
skewed_feats = all_data[numeric_feats].apply(lambda x: x.skew()).sort_values(ascending=False)
high_skew = skewed_feats[skewed_feats > SKEW_THRESHOLD]

# Apply the Box-Cox transform of (1 + x) to the highly skewed features
for feature in high_skew.index:
    all_data[feature] = boxcox1p(all_data[feature], BOXCOX_LAMBDA)

# One-hot encode categorical features
all_data = pd.get_dummies(all_data)

# Split the combined frame back into train and test by position:
# the first ntrain rows are the training rows, the rest are the test rows
X_train, X_test = all_data.iloc[:ntrain], all_data.iloc[ntrain:]

# Base Models for Model Averaging
# Built entry by entry; the dict keeps insertion order, so the models are
# iterated in the order they are added here.
models = {}

# Regularized linear models
models['Ridge'] = Ridge(alpha=10)
models['Lasso'] = Lasso(alpha=0.001)
models['ElasticNet'] = ElasticNet(alpha=0.001, l1_ratio=0.5)

# Gradient-boosted tree models (500 trees, learning rate 0.05 each)
models['GBR'] = GradientBoostingRegressor(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=4,
    max_features='sqrt',
    min_samples_leaf=15,
    min_samples_split=10,
    random_state=42,
)
models['XGBoost'] = xgb.XGBRegressor(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=4,
    min_child_weight=0.5,
    gamma=0.9,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='reg:squarederror',
    nthread=-1,
    scale_pos_weight=1,
    seed=42,
)
models['LightGBM'] = lgb.LGBMRegressor(
    objective='regression',
    num_leaves=31,
    learning_rate=0.05,
    n_estimators=500,
    max_bin=255,
    bagging_fraction=0.8,
    bagging_freq=5,
    feature_fraction=0.8,
    feature_fraction_seed=42,
    bagging_seed=42,
    min_data_in_leaf=6,
    min_sum_hessian_in_leaf=11,
)

# Per-model test-set predictions, keyed by model name, on the original price scale
model_preds = {}

# Fit every base model on the full training set and predict the test set
for name, model in models.items():
    model.fit(X_train, y_train)

    # The models were trained on log1p(SalePrice), so their raw output is
    # on the log scale; expm1 undoes that transform
    log_scale_pred = model.predict(X_test)
    model_preds[name] = np.expm1(log_scale_pred)

    print(f"{name} model trained and predictions made")

# Weighted averaging of models
# Blend weight for each base model, keyed by the same names as model_preds
weights = dict(
    Ridge=0.15,
    Lasso=0.15,
    ElasticNet=0.1,
    GBR=0.2,
    XGBoost=0.2,
    LightGBM=0.2,
)

# Accumulate each model's weighted prediction into one vector of length ntest
ensemble_pred = np.zeros(ntest)
for name, pred in model_preds.items():
    weighted_pred = weights[name] * pred
    ensemble_pred += weighted_pred

# Create submission file: one row per test Id with the blended price prediction
submission = pd.DataFrame({
    'Id': test_ID,
    'SalePrice': ensemble_pred
})

# Display the first few rows to verify format
print("\nSubmission file preview:")
print(submission.head())

# For Google Colab - download the file
from google.colab import files

# to_csv does not create missing parent folders, so make sure the output
# directory exists; exist_ok=True makes this a no-op when it already does
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Build the path once so the save and the download refer to the same file
submission_path = os.path.join(OUTPUT_DIR, 'submission.csv')

# Save to CSV
submission.to_csv(submission_path, index=False)
# Download it
files.download(submission_path)
print("\nSubmission file created successfully with shape:", submission.shape)
