In [None]:
import numpy as np
import pandas as pd
from scipy import stats

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.base import clone
from sklearn.compose import make_column_transformer, ColumnTransformer
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, VotingRegressor, StackingRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OrdinalEncoder, StandardScaler, OneHotEncoder

from catboost import CatBoostRegressor
import lightgbm as lgb
from xgboost import XGBRegressor

In [None]:
# Read the train and test CSV files into two DataFrames.
train_path = '/kaggle/input/house-prices-advanced-regression-techniques/train.csv'
test_path = '/kaggle/input/house-prices-advanced-regression-techniques/test.csv'
train_df, test_df = pd.read_csv(train_path), pd.read_csv(test_path)

In [None]:
# Column labels of the training frame.
train_df.columns

In [None]:
# Summary statistics of the training frame as produced by DataFrame.describe().
train_df.describe()

In [None]:
# Dtypes of every column that is not stored as 'object'.
train_df.select_dtypes(exclude='object').dtypes

In [None]:
# Scatter of SalePrice against MSSubClass, with axis labels so the figure reads on its own.
plt.scatter(x='MSSubClass', y='SalePrice', data=train_df)
plt.xlabel('MSSubClass')
plt.ylabel('SalePrice');

In [None]:
# Scatter of SalePrice against LotFrontage, with axis labels so the figure reads on its own.
plt.scatter(x='LotFrontage', y='SalePrice', data=train_df)
plt.xlabel('LotFrontage')
plt.ylabel('SalePrice');

In [None]:
# Rows with LotFrontage above 300.
# Ids noted to drop: 935, 1299
train_df[train_df['LotFrontage'] > 300]

In [None]:
# Scatter of SalePrice against LotArea, with axis labels so the figure reads on its own.
plt.scatter(x='LotArea', y='SalePrice', data=train_df)
plt.xlabel('LotArea')
plt.ylabel('SalePrice');

In [None]:
# Rows with LotArea above 55000.
# Ids noted: 250, 314, 336, 707
# maybe also 1397
train_df[train_df['LotArea'] > 55000]

In [None]:
# Ten largest LotArea z-scores.
# stats.zscore is fed a plain ndarray and the result is wrapped back into a Series
# explicitly: whether zscore hands back a Series or an ndarray for Series input is
# not stable across SciPy versions, and .sort_values() only exists on a Series.
lot_area_z = pd.Series(
    stats.zscore(train_df['LotArea'].to_numpy()),
    index=train_df.index,
    name='LotArea',
)
lot_area_z.sort_values().tail(10)

In [None]:
# Scatter of SalePrice against OverallQual, with axis labels so the figure reads on its own.
plt.scatter(x='OverallQual', y='SalePrice', data=train_df)
plt.xlabel('OverallQual')
plt.ylabel('SalePrice');

In [None]:
# Rows with OverallQual equal to 10.
# Id noted: maybe 524
train_df[train_df['OverallQual'] == 10]

In [None]:
# Scatter of SalePrice against OverallCond, with axis labels so the figure reads on its own.
plt.scatter(x='OverallCond', y='SalePrice', data=train_df)
plt.xlabel('OverallCond')
plt.ylabel('SalePrice');

In [None]:
# Rows with OverallCond equal to 2.
# Id noted: 379
train_df[train_df['OverallCond'] == 2]

In [None]:
# Rows with OverallCond equal to 5 and SalePrice above 700000.
# Id noted: 1183
train_df[(train_df['OverallCond'] == 5) & (train_df['SalePrice'] > 700000)]

In [None]:
# Rows with OverallCond equal to 6 and SalePrice above 700000.
# Id noted: 692
train_df[(train_df['OverallCond'] == 6) & (train_df['SalePrice'] > 700000)]

In [None]:
# Scatter of SalePrice against YearBuilt, with axis labels so the figure reads on its own.
plt.scatter(x='YearBuilt', y='SalePrice', data=train_df)
plt.xlabel('YearBuilt')
plt.ylabel('SalePrice');

In [None]:
# Rows built before 1900 with SalePrice above 400000.
# Id noted: 186
train_df[(train_df['YearBuilt'] < 1900) & (train_df['SalePrice'] > 400000)]

In [None]:
# Scatter of SalePrice against YearRemodAdd, with axis labels so the figure reads on its own.
plt.scatter(x='YearRemodAdd', y='SalePrice', data=train_df)
plt.xlabel('YearRemodAdd')
plt.ylabel('SalePrice');

In [None]:
# Rows with YearRemodAdd before 1970 and SalePrice above 300000.
# Id noted: 314
train_df[(train_df['YearRemodAdd'] < 1970) & (train_df['SalePrice'] > 300000)]

In [None]:
# Scatter of SalePrice against MasVnrArea, with axis labels so the figure reads on its own.
plt.scatter(x='MasVnrArea', y='SalePrice', data=train_df)
plt.xlabel('MasVnrArea')
plt.ylabel('SalePrice');

In [None]:
# Rows with MasVnrArea above 1500.
# Id noted: 298
train_df[train_df['MasVnrArea'] > 1500]

In [None]:
# Scatter of SalePrice against BsmtFinSF1, with axis labels so the figure reads on its own.
plt.scatter(x='BsmtFinSF1', y='SalePrice', data=train_df)
plt.xlabel('BsmtFinSF1')
plt.ylabel('SalePrice');

In [None]:
# Rows with BsmtFinSF1 above 5000.
# Id noted: 1299
train_df[train_df['BsmtFinSF1'] > 5000]

In [None]:
# Scatter of SalePrice against BsmtFinSF2, with axis labels so the figure reads on its own.
plt.scatter(x='BsmtFinSF2', y='SalePrice', data=train_df)
plt.xlabel('BsmtFinSF2')
plt.ylabel('SalePrice');

In [None]:
# Rows with BsmtFinSF2 above 400 and SalePrice above 500000.
# Id noted: 441
train_df[(train_df['BsmtFinSF2'] > 400) & (train_df['SalePrice'] > 500000)]

In [None]:
# Scatter of SalePrice against BsmtUnfSF, with axis labels so the figure reads on its own.
plt.scatter(x='BsmtUnfSF', y='SalePrice', data=train_df)
plt.xlabel('BsmtUnfSF')
plt.ylabel('SalePrice');

In [None]:
# Scatter of SalePrice against TotalBsmtSF, with axis labels so the figure reads on its own.
plt.scatter(x='TotalBsmtSF', y='SalePrice', data=train_df)
plt.xlabel('TotalBsmtSF')
plt.ylabel('SalePrice');

In [None]:
# Rows with TotalBsmtSF above 5000.
# Id noted: 1299
train_df[train_df['TotalBsmtSF'] > 5000]

In [None]:
# Scatter of SalePrice against 1stFlrSF, with axis labels so the figure reads on its own.
plt.scatter(x='1stFlrSF', y='SalePrice', data=train_df)
plt.xlabel('1stFlrSF')
plt.ylabel('SalePrice');

In [None]:
# Scatter of SalePrice against 2ndFlrSF, with axis labels so the figure reads on its own.
plt.scatter(x='2ndFlrSF', y='SalePrice', data=train_df)
plt.xlabel('2ndFlrSF')
plt.ylabel('SalePrice');

In [None]:
# Scatter of SalePrice against LowQualFinSF, with axis labels so the figure reads on its own.
plt.scatter(x='LowQualFinSF', y='SalePrice', data=train_df)
plt.xlabel('LowQualFinSF')
plt.ylabel('SalePrice');

In [None]:
# Rows with LowQualFinSF above 550.
# Id noted: 186
train_df[train_df['LowQualFinSF'] > 550]

In [None]:
# Scatter of SalePrice against GrLivArea, with axis labels so the figure reads on its own.
plt.scatter(x='GrLivArea', y='SalePrice', data=train_df)
plt.xlabel('GrLivArea')
plt.ylabel('SalePrice');

In [None]:
# Rows with GrLivArea above 4400.
# Ids noted: 524, 1299
train_df[train_df['GrLivArea'] > 4400]

In [None]:
# Scatter of SalePrice against BsmtFullBath, with axis labels so the figure reads on its own.
plt.scatter(x='BsmtFullBath', y='SalePrice', data=train_df)
plt.xlabel('BsmtFullBath')
plt.ylabel('SalePrice');

In [None]:
# Rows with BsmtFullBath equal to 3.
# Id noted: 739
train_df[train_df['BsmtFullBath'] == 3]

In [None]:
# Scatter of SalePrice against BsmtHalfBath, with axis labels so the figure reads on its own.
plt.scatter(x='BsmtHalfBath', y='SalePrice', data=train_df)
plt.xlabel('BsmtHalfBath')
plt.ylabel('SalePrice');

In [None]:
# Distinct z-scores taken by BsmtHalfBath, in order of first appearance.
# pd.unique is applied to the ndarray result so the cell does not rely on
# stats.zscore returning a Series (that return type varies across SciPy versions).
pd.unique(stats.zscore(train_df['BsmtHalfBath'].to_numpy()))

In [None]:
# Rows with BsmtHalfBath equal to 2.
# Ids noted: 598, 955
train_df[train_df['BsmtHalfBath'] == 2]

In [None]:
# Scatter of SalePrice against FullBath, with axis labels so the figure reads on its own.
plt.scatter(x='FullBath', y='SalePrice', data=train_df)
plt.xlabel('FullBath')
plt.ylabel('SalePrice');

In [None]:
# Scatter of SalePrice against HalfBath, with axis labels so the figure reads on its own.
plt.scatter(x='HalfBath', y='SalePrice', data=train_df)
plt.xlabel('HalfBath')
plt.ylabel('SalePrice');

In [None]:
# Scatter of SalePrice against BedroomAbvGr, with axis labels so the figure reads on its own.
plt.scatter(x='BedroomAbvGr', y='SalePrice', data=train_df)
plt.xlabel('BedroomAbvGr')
plt.ylabel('SalePrice');

In [None]:
# Rows with BedroomAbvGr equal to 8.
# Id noted: 636
train_df[train_df['BedroomAbvGr'] == 8]

In [None]:
# Scatter of SalePrice against KitchenAbvGr, with axis labels so the figure reads on its own.
plt.scatter(x='KitchenAbvGr', y='SalePrice', data=train_df)
plt.xlabel('KitchenAbvGr')
plt.ylabel('SalePrice');

In [None]:
# Rows with KitchenAbvGr equal to 3.
# Ids noted: 49, 810
train_df[train_df['KitchenAbvGr'] == 3]

In [None]:
# Scatter of SalePrice against TotRmsAbvGrd, with axis labels so the figure reads on its own.
plt.scatter(x='TotRmsAbvGrd', y='SalePrice', data=train_df)
plt.xlabel('TotRmsAbvGrd')
plt.ylabel('SalePrice');

In [None]:
# Rows with TotRmsAbvGrd equal to 14.
# Id noted: 636
train_df[train_df['TotRmsAbvGrd'] == 14]

In [None]:
# Scatter of SalePrice against Fireplaces, with axis labels so the figure reads on its own.
plt.scatter(x='Fireplaces', y='SalePrice', data=train_df)
plt.xlabel('Fireplaces')
plt.ylabel('SalePrice');

In [None]:
# Scatter of SalePrice against GarageYrBlt, with axis labels so the figure reads on its own.
plt.scatter(x='GarageYrBlt', y='SalePrice', data=train_df)
plt.xlabel('GarageYrBlt')
plt.ylabel('SalePrice');

In [None]:
# Rows with GarageYrBlt after 1980 and SalePrice above 700000.
# Ids noted: 692, 1183
train_df[(train_df['GarageYrBlt'] > 1980) & (train_df['SalePrice'] > 700000)]

In [None]:
# Scatter of SalePrice against GarageCars, with axis labels so the figure reads on its own.
plt.scatter(x='GarageCars', y='SalePrice', data=train_df)
plt.xlabel('GarageCars')
plt.ylabel('SalePrice');

In [None]:
# Scatter of SalePrice against GarageArea, with axis labels so the figure reads on its own.
plt.scatter(x='GarageArea', y='SalePrice', data=train_df)
plt.xlabel('GarageArea')
plt.ylabel('SalePrice');

In [None]:
# Rows with GarageArea above 1200.
# Ids noted: 582, 826, 1062, 1191, 1299
train_df[train_df['GarageArea'] > 1200]

In [None]:
# Scatter of SalePrice against WoodDeckSF, with axis labels so the figure reads on its own.
plt.scatter(x='WoodDeckSF', y='SalePrice', data=train_df)
plt.xlabel('WoodDeckSF')
plt.ylabel('SalePrice');

In [None]:
# Rows with WoodDeckSF above 800.
# Id noted: 54
train_df[train_df['WoodDeckSF'] > 800]

In [None]:
# Scatter of SalePrice against OpenPorchSF, with axis labels so the figure reads on its own.
plt.scatter(x='OpenPorchSF', y='SalePrice', data=train_df)
plt.xlabel('OpenPorchSF')
plt.ylabel('SalePrice');

In [None]:
# Rows with OpenPorchSF above 500.
# Ids noted: 496, 584, 1329
train_df[train_df['OpenPorchSF'] > 500]

In [None]:
# Scatter of SalePrice against EnclosedPorch, with axis labels so the figure reads on its own.
plt.scatter(x='EnclosedPorch', y='SalePrice', data=train_df)
plt.xlabel('EnclosedPorch')
plt.ylabel('SalePrice');

In [None]:
# Rows with EnclosedPorch above 500.
# Id noted: 198
train_df[train_df['EnclosedPorch'] > 500]

In [None]:
# Scatter of SalePrice against 3SsnPorch, with axis labels so the figure reads on its own.
plt.scatter(x='3SsnPorch', y='SalePrice', data=train_df)
plt.xlabel('3SsnPorch')
plt.ylabel('SalePrice');

In [None]:
# Scatter of SalePrice against ScreenPorch, with axis labels so the figure reads on its own.
plt.scatter(x='ScreenPorch', y='SalePrice', data=train_df)
plt.xlabel('ScreenPorch')
plt.ylabel('SalePrice');

In [None]:
# Scatter of SalePrice against PoolArea, with axis labels so the figure reads on its own.
plt.scatter(x='PoolArea', y='SalePrice', data=train_df)
plt.xlabel('PoolArea')
plt.ylabel('SalePrice');

In [None]:
# Rows with PoolArea above 500 and SalePrice above 700000.
# Id noted: 1183
train_df[(train_df['PoolArea'] > 500) & (train_df['SalePrice'] > 700000)]

In [None]:
# Ids of the rows to remove as outliers; several Ids are listed more than once.
# NOTE(review): 1338 is not mentioned in any of the outlier notes above, while
# 298, 49 and 810 were noted there but are not listed here - confirm the list.
values = [
    54, 598, 955, 935, 1299, 250, 314, 336, 707, 379,
    1183, 692, 186, 441, 186, 524, 739, 955, 636, 1062,
    1191, 496, 198, 1338, 582, 826, 1191, 1299, 584, 1329,
]

In [None]:
# Keep only the rows whose Id is not in `values`, e.g. 198, 250, 314, 636 and 1299 are removed.
# `~` negates the boolean mask (rather than comparing it to False), and .copy() makes the
# result an independent frame so later column assignments do not raise SettingWithCopyWarning.
train_df = train_df[~train_df.Id.isin(values)].copy()

In [None]:
# The 20 columns with the highest count of missing values, largest first.
missing_counts = train_df.isna().sum().sort_values(ascending=False)
missing_counts.head(20).to_frame()

In [None]:
# Distinct values observed in MiscFeature.
train_df['MiscFeature'].unique()

In [None]:
# Distinct values observed in Alley.
train_df['Alley'].unique()

In [None]:
# Fill missing Alley values with the string 'No' in both frames.
# The result is assigned back: fillna(inplace=True) on a selected column is deprecated
# chained assignment and leaves the frame unchanged under pandas Copy-on-Write.
train_df['Alley'] = train_df['Alley'].fillna('No')
test_df['Alley'] = test_df['Alley'].fillna('No')

In [None]:
# Box plot of SalePrice for each Alley level.
sns.catplot(data=train_df, x="Alley", y="SalePrice", kind="box")

In [None]:
# Distinct values observed in Fence.
train_df['Fence'].unique()

In [None]:
# Fill missing Fence values with the string 'No' in both frames.
# The result is assigned back: fillna(inplace=True) on a selected column is deprecated
# chained assignment and leaves the frame unchanged under pandas Copy-on-Write.
train_df['Fence'] = train_df['Fence'].fillna('No')
test_df['Fence'] = test_df['Fence'].fillna('No')

In [None]:
# Box plot of SalePrice for each Fence level.
sns.catplot(data=train_df, x="Fence", y="SalePrice", kind="box")

In [None]:
# Distinct values observed in MasVnrType.
train_df['MasVnrType'].unique()

In [None]:
# Fill missing MasVnrType values with the string 'No' in both frames.
# The result is assigned back: fillna(inplace=True) on a selected column is deprecated
# chained assignment and leaves the frame unchanged under pandas Copy-on-Write.
train_df['MasVnrType'] = train_df['MasVnrType'].fillna('No')
test_df['MasVnrType'] = test_df['MasVnrType'].fillna('No')

In [None]:
# Box plot of SalePrice for each MasVnrType level.
sns.catplot(data=train_df, x="MasVnrType", y="SalePrice", kind="box")

In [None]:
# Fill missing MasVnrArea values with 0 in both frames.
# The result is assigned back: fillna(inplace=True) on a selected column is deprecated
# chained assignment and leaves the frame unchanged under pandas Copy-on-Write.
train_df['MasVnrArea'] = train_df['MasVnrArea'].fillna(0)
test_df['MasVnrArea'] = test_df['MasVnrArea'].fillna(0)

In [None]:
# Distinct values observed in FireplaceQu.
train_df['FireplaceQu'].unique()

In [None]:
# Fill missing FireplaceQu values with the string 'No' in both frames.
# The result is assigned back: fillna(inplace=True) on a selected column is deprecated
# chained assignment and leaves the frame unchanged under pandas Copy-on-Write.
train_df['FireplaceQu'] = train_df['FireplaceQu'].fillna('No')
test_df['FireplaceQu'] = test_df['FireplaceQu'].fillna('No')

In [None]:
# Box plot of SalePrice for each FireplaceQu level.
sns.catplot(data=train_df, x="FireplaceQu", y="SalePrice", kind="box")

In [None]:
# Box plot of SalePrice for each Fireplaces count.
sns.catplot(data=train_df, x="Fireplaces", y="SalePrice", kind="box")

In [None]:
# Fill missing LotFrontage values with 0 in both frames.
# The result is assigned back: fillna(inplace=True) on a selected column is deprecated
# chained assignment and leaves the frame unchanged under pandas Copy-on-Write.
train_df['LotFrontage'] = train_df['LotFrontage'].fillna(0)
test_df['LotFrontage'] = test_df['LotFrontage'].fillna(0)

In [None]:
# Correlation between GarageYrBlt and YearBuilt (Series.corr with its default method).
train_df['GarageYrBlt'].corr(train_df['YearBuilt'])

In [None]:
# Distinct values observed in GarageCond.
train_df['GarageCond'].unique()

In [None]:
# Fill missing GarageCond values with the string 'No' in both frames.
# The result is assigned back: fillna(inplace=True) on a selected column is deprecated
# chained assignment and leaves the frame unchanged under pandas Copy-on-Write.
train_df['GarageCond'] = train_df['GarageCond'].fillna('No')
test_df['GarageCond'] = test_df['GarageCond'].fillna('No')

In [None]:
# Box plot of SalePrice for each GarageCond level.
sns.catplot(data=train_df, x="GarageCond", y="SalePrice", kind="box")

In [None]:
# Fill missing GarageType values with the string 'No' in both frames.
# The result is assigned back: fillna(inplace=True) on a selected column is deprecated
# chained assignment and leaves the frame unchanged under pandas Copy-on-Write.
train_df['GarageType'] = train_df['GarageType'].fillna('No')
test_df['GarageType'] = test_df['GarageType'].fillna('No')

In [None]:
# Box plot of SalePrice for each GarageType level.
sns.catplot(data=train_df, x="GarageType", y="SalePrice", kind="box")

In [None]:
# Fill missing GarageFinish values with the string 'No' in both frames.
# The result is assigned back: fillna(inplace=True) on a selected column is deprecated
# chained assignment and leaves the frame unchanged under pandas Copy-on-Write.
train_df['GarageFinish'] = train_df['GarageFinish'].fillna('No')
test_df['GarageFinish'] = test_df['GarageFinish'].fillna('No')

In [None]:
# Box plot of SalePrice for each GarageFinish level.
sns.catplot(data=train_df, x="GarageFinish", y="SalePrice", kind="box")

In [None]:
# Fill missing GarageQual values with the string 'No' in both frames.
# The result is assigned back: fillna(inplace=True) on a selected column is deprecated
# chained assignment and leaves the frame unchanged under pandas Copy-on-Write.
train_df['GarageQual'] = train_df['GarageQual'].fillna('No')
test_df['GarageQual'] = test_df['GarageQual'].fillna('No')

In [None]:
# Box plot of SalePrice for each GarageQual level.
sns.catplot(data=train_df, x="GarageQual", y="SalePrice", kind="box")

In [None]:
# Distinct values observed in BsmtFinType2.
train_df['BsmtFinType2'].unique()

In [None]:
# Fill missing BsmtFinType2 values with the string 'Unf' in both frames.
# The result is assigned back: fillna(inplace=True) on a selected column is deprecated
# chained assignment and leaves the frame unchanged under pandas Copy-on-Write.
train_df['BsmtFinType2'] = train_df['BsmtFinType2'].fillna('Unf')
test_df['BsmtFinType2'] = test_df['BsmtFinType2'].fillna('Unf')

In [None]:
# Box plot of SalePrice for each BsmtFinType2 level.
sns.catplot(data=train_df, x="BsmtFinType2", y="SalePrice", kind="box")

In [None]:
# Distinct values observed in BsmtExposure.
train_df['BsmtExposure'].unique()

In [None]:
# Fill missing BsmtExposure values with the string 'No' in both frames.
# The result is assigned back: fillna(inplace=True) on a selected column is deprecated
# chained assignment and leaves the frame unchanged under pandas Copy-on-Write.
train_df['BsmtExposure'] = train_df['BsmtExposure'].fillna('No')
test_df['BsmtExposure'] = test_df['BsmtExposure'].fillna('No')

In [None]:
# Box plot of SalePrice for each BsmtExposure level.
sns.catplot(data=train_df, x="BsmtExposure", y="SalePrice", kind="box")

In [None]:
# Distinct values observed in BsmtQual.
train_df['BsmtQual'].unique()

In [None]:
# Fill missing BsmtQual values with the string 'No' in both frames.
# The result is assigned back: fillna(inplace=True) on a selected column is deprecated
# chained assignment and leaves the frame unchanged under pandas Copy-on-Write.
train_df['BsmtQual'] = train_df['BsmtQual'].fillna('No')
test_df['BsmtQual'] = test_df['BsmtQual'].fillna('No')

In [None]:
# Box plot of SalePrice for each BsmtQual level.
sns.catplot(data=train_df, x="BsmtQual", y="SalePrice", kind="box")

In [None]:
# Distinct values observed in BsmtCond.
train_df['BsmtCond'].unique()

In [None]:
# Fill missing BsmtCond values with the string 'No' in both frames.
# The result is assigned back: fillna(inplace=True) on a selected column is deprecated
# chained assignment and leaves the frame unchanged under pandas Copy-on-Write.
train_df['BsmtCond'] = train_df['BsmtCond'].fillna('No')
test_df['BsmtCond'] = test_df['BsmtCond'].fillna('No')

In [None]:
# Box plot of SalePrice for each BsmtCond level.
sns.catplot(data=train_df, x="BsmtCond", y="SalePrice", kind="box")

In [None]:
# Distinct values observed in BsmtFinType1.
train_df['BsmtFinType1'].unique()

In [None]:
# Fill missing BsmtFinType1 values with the string 'Unf' in both frames.
# The result is assigned back: fillna(inplace=True) on a selected column is deprecated
# chained assignment and leaves the frame unchanged under pandas Copy-on-Write.
train_df['BsmtFinType1'] = train_df['BsmtFinType1'].fillna('Unf')
test_df['BsmtFinType1'] = test_df['BsmtFinType1'].fillna('Unf')

In [None]:
# Box plot of SalePrice for each BsmtFinType1 level.
sns.catplot(data=train_df, x="BsmtFinType1", y="SalePrice", kind="box")

In [None]:
# Fill missing MasVnrArea values with 0 in both frames (the same fill also appears in an earlier cell).
# The result is assigned back: fillna(inplace=True) on a selected column is deprecated
# chained assignment and leaves the frame unchanged under pandas Copy-on-Write.
train_df['MasVnrArea'] = train_df['MasVnrArea'].fillna(0)
test_df['MasVnrArea'] = test_df['MasVnrArea'].fillna(0)

In [None]:
# Fill missing Electrical values with the string 'SBrkr' in both frames.
# The result is assigned back: fillna(inplace=True) on a selected column is deprecated
# chained assignment and leaves the frame unchanged under pandas Copy-on-Write.
train_df['Electrical'] = train_df['Electrical'].fillna('SBrkr')
test_df['Electrical'] = test_df['Electrical'].fillna('SBrkr')

In [None]:
# Drop the same seven columns from the train and test frames.
dropped_cols = ['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'GarageYrBlt', 'GarageCond', 'BsmtFinType2']
train_df = train_df.drop(dropped_cols, axis=1)
test_df = test_df.drop(dropped_cols, axis=1)

In [None]:
#feature engineering

In [None]:
# houseage = year sold minus year built, added to both frames.
for frame in (train_df, test_df):
    frame['houseage'] = frame['YrSold'] - frame['YearBuilt']

In [None]:
# houseremodelage = year sold minus year of remodel, added to both frames.
for frame in (train_df, test_df):
    frame['houseremodelage'] = frame['YrSold'] - frame['YearRemodAdd']

In [None]:
# totalsf = first floor + second floor + both finished-basement areas, added to both frames.
for frame in (train_df, test_df):
    floors_sf = frame['1stFlrSF'] + frame['2ndFlrSF']
    frame['totalsf'] = floors_sf + frame['BsmtFinSF1'] + frame['BsmtFinSF2']

In [None]:
# totalarea = GrLivArea plus TotalBsmtSF, added to both frames.
for frame in (train_df, test_df):
    frame['totalarea'] = frame['GrLivArea'] + frame['TotalBsmtSF']

In [None]:
# totalbaths = full baths (basement + above) plus half a bath for every half bath, in both frames.
for frame in (train_df, test_df):
    half_baths = frame['BsmtHalfBath'] + frame['HalfBath']
    frame['totalbaths'] = frame['BsmtFullBath'] + frame['FullBath'] + 0.5 * half_baths

In [None]:
# totalporchsf = sum of the four porch areas and the wood deck area, added to both frames.
for frame in (train_df, test_df):
    frame['totalporchsf'] = (
        frame['OpenPorchSF']
        + frame['3SsnPorch']
        + frame['EnclosedPorch']
        + frame['ScreenPorch']
        + frame['WoodDeckSF']
    )

In [None]:
# Drop the columns that were folded into the engineered features above.
# 'Id' is dropped from the training frame only; the test frame keeps it.
engineered_sources = [
    'YrSold', 'YearBuilt', 'YearRemodAdd',
    '1stFlrSF', '2ndFlrSF', 'BsmtFinSF1', 'BsmtFinSF2',
    'GrLivArea', 'TotalBsmtSF',
    'BsmtFullBath', 'FullBath', 'BsmtHalfBath', 'HalfBath',
    'OpenPorchSF', '3SsnPorch', 'EnclosedPorch', 'ScreenPorch', 'WoodDeckSF',
]
train_df = train_df.drop(columns=['Id'] + engineered_sources)
test_df = test_df.drop(columns=engineered_sources)

In [None]:
# Annotated heatmap of the pairwise correlations between the numeric columns.
plt.figure(figsize=(20, 12))
correlation_matrix = train_df.corr(numeric_only=True)
sns.heatmap(data=correlation_matrix, cmap='coolwarm', annot=True, fmt=".2f")

In [None]:
#drop GarageArea or GarageCars as they are highly correlated to each other

In [None]:
# Remove GarageArea from both frames (GarageCars is kept).
train_df = train_df.drop('GarageArea', axis=1)
test_df = test_df.drop('GarageArea', axis=1)

In [None]:
# Histogram of SalePrice.
sns.histplot(data=train_df, x='SalePrice')

In [None]:
# Replace SalePrice with log(1 + SalePrice), stored back in the same column.
# NOTE: the cell overwrites its own input, so running it twice applies the log twice.
train_df['SalePrice'] = train_df['SalePrice'].transform(np.log1p)

In [None]:
# Histogram of SalePrice (log-transformed by the preceding cell).
sns.histplot(data=train_df, x='SalePrice')

In [None]:
# Columns stored with the 'object' dtype.
train_df.select_dtypes(include='object').dtypes

In [None]:
# Columns stored with any dtype other than 'object'.
train_df.select_dtypes(exclude='object').dtypes

In [None]:
# Categorical columns routed to the ordinal-encoding pipeline.
ode_cols = [
    'LotShape', 'LandContour', 'Utilities', 'LandSlope',
    'BsmtQual', 'BsmtFinType1', 'CentralAir', 'Functional',
    'FireplaceQu', 'GarageFinish', 'GarageQual', 'PavedDrive',
    'ExterCond', 'KitchenQual', 'BsmtExposure', 'HeatingQC',
    'ExterQual', 'BsmtCond',
]

In [None]:
# Categorical columns routed to the one-hot-encoding pipeline.
ohe_cols = [
    'Street', 'LotConfig', 'Neighborhood', 'Condition1', 'Condition2',
    'BldgType', 'HouseStyle', 'RoofStyle', 'Exterior1st', 'Exterior2nd',
    'MasVnrType', 'Foundation', 'Electrical', 'SaleType', 'MSZoning',
    'SaleCondition', 'Heating', 'GarageType', 'RoofMatl',
]

In [None]:
# Names of the int64/float64 columns, with the target SalePrice left out.
numeric_frame = train_df.select_dtypes(include=['int64', 'float64'])
num_cols = numeric_frame.columns.drop('SalePrice')

In [None]:
# Numeric preprocessing: fill missing values with the column mean, then
# standardize each feature (remove the mean, scale to unit variance).
mean_imputer = SimpleImputer(strategy='mean')
standardizer = StandardScaler()
num_pipeline = Pipeline([('impute', mean_imputer), ('scaler', standardizer)])

In [None]:
# Ordinal preprocessing: fill missing values with the most frequent value, then
# encode each category as an integer; categories unseen during fit are encoded as -1.
# NOTE(review): no explicit `categories` order is passed to OrdinalEncoder, so the
# integer codes follow whatever order the encoder derives itself rather than a
# quality ranking - confirm this is intended.
mode_imputer = SimpleImputer(strategy='most_frequent')
ordinal_encoder = OrdinalEncoder(unknown_value=-1, handle_unknown='use_encoded_value')
ode_pipeline = Pipeline([('impute', mode_imputer), ('ode', ordinal_encoder)])

In [None]:
# One-hot preprocessing: fill missing values with the most frequent value, then
# expand each category into its own 0/1 column. Unknown categories are ignored
# and the output is a dense array.
mode_imputer_ohe = SimpleImputer(strategy='most_frequent')
one_hot_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
ohe_pipeline = Pipeline([('impute', mode_imputer_ohe), ('ohe', one_hot_encoder)])

In [None]:
# Route each column group to its own preprocessing pipeline.
# remainder='passthrough' keeps any column not listed below unchanged;
# n_jobs=-1 runs the transformers on all available processors.
column_steps = [
    ('num_p', num_pipeline, num_cols),
    ('ode_p', ode_pipeline, ode_cols),
    ('ohe_p', ohe_pipeline, ohe_cols),
]
col_trans = ColumnTransformer(column_steps, remainder='passthrough', n_jobs=-1)

In [None]:
# Wrap the column transformer in a one-step pipeline named 'preprocessing'.
pipeline = Pipeline([('preprocessing', col_trans)])

In [None]:
# y: the target column SalePrice.
y = train_df['SalePrice']
# X: every remaining column, i.e. the features.
X = train_df.drop(columns='SalePrice')

In [None]:
# Fit the preprocessing pipeline on all rows of X and return the transformed features.
X_preprocessed = pipeline.fit_transform(X)

In [None]:
# Split the raw rows first and fit the preprocessing on the training part only, so the
# imputation values, scaling statistics and category lists never see the hold-out rows.
# A clone is fitted here, leaving the `pipeline` object itself untouched.
#test_size=0.2: 20% of the data is used for testing.
#random_state=25: Ensures reproducibility of the split.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=25)
split_preprocessor = clone(pipeline)
X_train = split_preprocessor.fit_transform(X_train_raw)
X_test = split_preprocessor.transform(X_test_raw)

In [None]:
#build models

In [None]:
# Plain least-squares linear regression with default settings.
lr = LinearRegression()

In [None]:
lr.fit(X_train, y_train)
# Fit the linear regression on the training split.

In [None]:
y_pred_lr = lr.predict(X_test)
# Predictions for the hold-out split; y was log1p-transformed above, so these are on that log scale.

In [None]:
# Root mean squared error on the hold-out split. The square root is taken so the
# number is in the same unit as the RMSE values reported for the grid-searched models.
np.sqrt(mean_squared_error(y_test, y_pred_lr))

Random Forest Intuition
1. Pick at random K data points from the Training set (i.e. Create a bootstrapped dataset)
2. Build the Decision Tree associated to these K data points (i.e. at each node, consider only a random subset of the features, split on the best of them, and repeat until the tree is grown)
3. Choose the number of trees you want to build Ntree and repeat STEPS 1 & 2
4. For a new data point, make each one of your Ntree trees predict the value of Y and assign the new data point the average across all of the predicted Y values

In [None]:
RFR = RandomForestRegressor(random_state=13)
# random_state=13 is the seed of the forest's random number generator; the value itself
# is arbitrary, fixing it makes the fitted forest reproducible.

In [1]:
# Hyperparameter values the random-forest grid search tries.
# max_depth: maximum depth of each tree in the forest.
# n_estimators: number of trees in the forest.
# min_samples_split: minimum number of samples required to split an internal node
#   (i.e. to create a new branch). Low values allow deep trees with many splits, which
#   capture complex patterns but raise the risk of overfitting; high values give simpler
#   trees with fewer splits, lowering variance at the cost of higher bias.
param_grid_RFR = dict(
    max_depth=[5, 10, 15],
    n_estimators=[100, 250, 500],
    min_samples_split=[3, 5, 10],
)

In [None]:
# Cross-validated grid search over param_grid_RFR for the random forest.
# cv=5: the data is split into 5 folds; each fold serves once as the validation set
#   while the others are used for training, which estimates how well the model generalizes.
# scoring='neg_mean_squared_error': the metric to optimize (negative mean squared error).
# n_jobs=-1: use all available CPU cores.
rfr_cv = GridSearchCV(
    estimator=RFR,
    param_grid=param_grid_RFR,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
)

In [None]:
rfr_cv.fit(X_train, y_train)
# Run the grid search over param_grid_RFR on the training split (X_train, y_train).

In [None]:
# RMSE of the best random-forest configuration found by the grid search.
# best_score_ holds the best cross-validation score, a negative mean squared error,
# so it is negated before taking the square root.
np.sqrt(-rfr_cv.best_score_)

In [None]:
# Hyperparameter combination that achieved the best cross-validation score.
rfr_cv.best_params_

https://www.youtube.com/watch?v=OtD8wVaFm6E&t=37s&ab_channel=StatQuestwithJoshStarmer

When building XGBoost Trees for Regression, we calculate Similarity Scores and Gain to determine how to split the data...
and we prune the tree by calculating the differences between Gain values and a user defined Tree Complexity Parameter, gamma.
If gain - gamma > 0, do not prune.
If gain - gamma < 0, prune.
Then calculate the Output Values for the remaining leaves...
and lastly, lambda is a Regularization Parameter and when lambda > 0, it results in more pruning, by shrinking the Similarity Scores, and it results in smaller Output Values for the leaves.
Output Value = (Sum of Residuals) / (No of Residuals + lambda)

1. Calculate residuals for train data
residual = data pt - 0.5
0.5 is the default prediction value
All Residuals go to the Root
2. Calculate a Similarity Score for the Root
Similarity Score = (Sum of Residuals)^2 / (No of Residuals + lambda)
3. Build a tree using first 2 samples e.g. root = Dosage < 15 (avg of sample 1 & 2)
4. Residuals of all samples go to the respective leaves
Calculate a Similarity Score for the leaves
When the Residuals are very different, they cancel each other out and the Similarity Score is relatively small
When there is only 1 residual or the residuals are similar, they do not cancel out and the Similarity Score is relatively large
5. Quantify how much better the leaves cluster similar Residuals than the root by calculating the Gain of tree
Gain = Left Similarity + Right Similarity - Root Similarity
6. Compare it to the Gain calculated for other thresholds e.g. Dosage < 22.5 (avg of sample 2 & 3)
7. Bigger Gain = better at splitting the Residuals into clusters of similar values
since dosage < 15 has the largest gain, we will use that threshold for the first branch in the tree
8. Repeat steps 1 to 7 for the leaves of the tree with the biggest Gain 
Note: default maximum depth of a tree = 6
9. After the tree has been built, we prune the tree based on the gain value
Calculate the difference between the Gain of the lowest branch in the tree and gamma.
Starting from the lowest/deepest branch, remove the branch if (Gain - gamma) < 0.
Keep pruning for all leaves and the root
If all leaves and the root are removed, we are left with the original prediction = 0.5
When lambda is a greater value, it is easier to prune leaves because the values for Gain are smaller i.e. prevents overfitting the values
10. Calculate the output values for all leaves 
Output Value = (Sum of Residuals) / (No of Residuals + lambda)
11. New prediction = 0.5 + 0.3 x output value; new residual = observed value - new prediction
0.5 is the default prediction value
0.3 is the default learning rate
12. Repeat steps 1 to 11 using the new Residuals and build more and more trees until the Residuals are super small or we have reached the max no of trees
13. Final Prediction = 0.5 + 0.3 x output value 1 + 0.3 x output value 2 +...

In [None]:
# XGBoost regressor with a fixed seed (random_state=13).
XGB = XGBRegressor(random_state=13)

In [None]:
# Hyperparameter values the XGBoost grid search tries.
# learning_rate: step size shrinkage used to prevent overfitting.
# n_estimators: number of boosting rounds (trees) to build.
# max_depth: maximum depth of a tree.
# min_child_weight: minimum sum of instance weight (hessian) needed in a child.
# gamma: minimum loss reduction required to make a further partition on a leaf node.
# subsample: subsample ratio of the training instances.
# colsample_bytree: subsample ratio of columns when constructing each tree.
param_grid_XGB = dict(
    learning_rate=[0.05, 0.1, 0.2],
    n_estimators=[300],
    max_depth=[3],
    min_child_weight=[1, 2, 3],
    gamma=[0, 0.1, 0.2],
    subsample=[0.8, 0.9, 1.0],
    colsample_bytree=[0.8, 0.9, 1.0],
)

In [None]:
# 3-fold cross-validated grid search over param_grid_XGB, scored by negative MSE on all cores.
xgb_cv = GridSearchCV(
    estimator=XGB,
    param_grid=param_grid_XGB,
    scoring='neg_mean_squared_error',
    cv=3,
    n_jobs=-1,
)

In [None]:
# Run the XGBoost grid search on the training split.
xgb_cv.fit(X_train, y_train)

In [None]:
# RMSE of the best XGBoost configuration: best_score_ is a negative MSE, hence the sign flip.
np.sqrt(-xgb_cv.best_score_)

https://www.youtube.com/watch?v=Q81RR3yKn30&t=0s&ab_channel=StatQuestwithJoshStarmer

The main idea behind Ridge regression is to find a New Line that doesn't fit the Training Data as well (i.e. introduce a small amount of Bias into how the New Line is fit to the data)
In return for that small amount of Bias, we get a significant drop in Variance (i.e. the New Line's predictions change less from one data set to another, so it fits new data better).
useful when there is a small amount of training data

when Least Squares determines the equation of the best fit line, it minimizes 
sum of squared residuals
when Ridge Regression determines the equation of the best fit line, it minimizes 
sum of squared residuals + lambda x gradient^2

try a bunch of lambda values and use 10-fold cross validation to determine which lambda results in the lowest Variance.

when applied to discrete variables, Ridge Regression minimizes the 
sum of squared residuals + lambda x (difference in mean values of the discrete variables)^2

when applied to logistic regression, Ridge Regression optimizes the 
sum of the likelihoods + lambda x gradient^2 (instead of the sum of squared residuals + lambda x gradient^2)

1. start by collecting Weight and Size measurements from a bunch of mice
2. Start with a Linear Regression, aka Least Squares, to model the relationship between Weight and Size

In [None]:
# Ridge regressor. The seed makes the stochastic 'sag' solver, which is part of the
# solver grid searched for this model, give the same result on every run.
ridge = Ridge(random_state=13)

In [None]:
# Hyperparameter values the Ridge grid search tries.
# alpha: regularization strength; a higher value penalizes large coefficients more heavily.
# solver: algorithm used for the optimization problem; 'auto' chooses one automatically
#   based on the data, the others name a specific algorithm.
param_grid_ridge = dict(
    alpha=[0.05, 0.1, 1, 3, 5, 10],
    solver=['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag'],
)

In [None]:
# 5-fold cross-validated grid search over param_grid_ridge, scored by negative MSE on all cores.
ridge_cv = GridSearchCV(
    estimator=ridge,
    param_grid=param_grid_ridge,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
)

In [None]:
# Run the Ridge grid search on the training split.
ridge_cv.fit(X_train, y_train)

In [None]:
# RMSE of the best Ridge configuration: best_score_ is a negative MSE, hence the sign flip.
np.sqrt(-ridge_cv.best_score_)

Adaboost: https://www.youtube.com/watch?v=LsK-xG1cLYA&ab_channel=StatQuestwithJoshStarmer
* In a forest of trees made with Adaboost, the trees are usually just a node and 2 leaves (stumps > Not good at making classifications + weak learners)
* In a forest of stumps made with Adaboost, some stumps get more say in the final classification than others.
* In a forest of stumps made with Adaboost, order is important e.g. the errors that the first stump makes influence how the second stump is made

At the start, all samples get the same weight i.e. samples are all equally important.
Create stumps 
Calculate the Gini index for each stump 
Stump with lowest Gini index will be the first stump in the forest.
The total error for a stump is the sum of the weights associated with the incorrectly classified samples.
Lower total error > Higher amount of say for that stump.
Now, modify the weights so that the next stump will take the errors that the current stump made into account.
If the current stump incorrectly classifies a sample, emphasize the need for the next stump to classify it correctly by increasing that sample's Sample Weight and decreasing all other Sample Weights
Start by increasing the Sample Weight for the incorrectly classified sample.
Decrease the Sample Weights for all of the correctly classified samples.
Normalize the weights so they all add up to 1
Use modified Sample Weights to make the second stump in the forest.
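Make a new set of samples, the same size as the original: repeatedly draw a random number between 0 and 1 and use the Sample Weights as a distribution to decide which sample it selects, until the new table has the same number of rows.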
Sample with larger Sample Weight is more likely to be added multiple times into the new table.
Use new collection of samples and give them all equal Sample Weights.
Repeat...
Add up the amount of say for all stumps that classify the patient as having heart disease.
Add up the amount of say for all stumps that classify the patient as not having heart disease.
If total amount of say (heart disease) > total amount of say (no heart disease), patient is classified as having heart disease.

Bias and Variance: https://www.youtube.com/watch?v=EuBBz3bI-aA&ab_channel=StatQuestwithJoshStarmer
Predict mouse Height against Weight
The inability for a machine learning method (like Linear Regression) to capture the true relationship is called BIAS.
Higher inability to capture the true relationship = Higher bias
Another machine learning method might fit a Squiggly Line to the training set
The Squiggly Line follows the training points closely, so it captures the relationship in the training set better = Lower bias
Calculate Sums of Squares for the Training set
Sums of Squares (Best fit line) > Sums of Squares (squiggly line)
Best fit line has more bias than squiggly line
Calculate Sums of Squares for the Testing set
Calculate the difference in Sums of Squares for the Testing set and Training set
Difference in Sums of Squares (Best fit line) < Difference in Sums of Squares (squiggly line)
Squiggly line has more variance (overfit) than best fit line
i.e. hard to predict how well the squiggly line will perform with future data sets. 

Gradient boost: https://www.youtube.com/watch?v=3CC4N4z3GJc&ab_channel=StatQuestwithJoshStarmer
Gradient Boost starts by making a single leaf instead of a tree or stump.
This leaf represents an initial guess for y of all the samples.
When trying to predict a continuous y value, the first guess is the average value.
Then Gradient Boost builds a tree based on the errors made from the previous tree.
Like Adaboost, this tree is based on the errors made by the previous tree, but unlike Adaboost, this tree is usually larger than a stump.
That said, Gradient Boost still restricts the size of the tree e.g. no more than 8-32 leaves.
Like Adaboost, Gradient Boost scales the trees, but unlike Adaboost, Gradient Boost scales all trees by the same amount.

Given a table of data with cols: Height, Fav color, Gender & Weight:
1. Get avg weight (predicted weight) e.g. 71.2
2. Generate a new col called Residual (Observed weight - predicted weight)
3. Now build a tree with Height, Fav color & Gender to predict the residuals INSTEAD of weights
By restricting the total no of leaves, we get fewer leaves than Residuals i.e. 2 rows of data go to the same leaf
Replace the residuals with their avg
Root: Gender = F
Left: Height < 1.6; Right: Color not Blue
4. Combine the original leaf with the new tree to make a new Prediction of an individual's Weight from the Training Data
5. To predict for a sample, run its measurements down the tree to get the predicted residual (e.g. 16.8)
6. Make a new Prediction by starting with the initial Prediction (71.2), then add the scaled amount from the first tree
e.g. Predicted weight = 71.2 + 16.8 = 88
7. However, if predicted weight = observed weight, model may fit training data too well (i.e. low bias high variance)
8. Gradient Boost deals with this problem by using a Learning Rate (e.g. 0.1) to SCALE the contribution from the new tree.
9. Predicted Weight = 71.2 + 0.1 x 16.8 = 72.9
10. Build another tree by repeating the previous steps but calculate the Residual by using Observed weight - NEW predicted weight instead
e.g. 88 - 72.9
11. Make a new Prediction by starting with the initial Prediction (71.2), then add the scaled amount from the first tree and the scaled amount from the second tree.
e.g. Predicted weight = 71.2 + 0.1 x 16.8 + 0.1 x 15.1 = 74.4
12. Repeat step 10 and 11 (the residuals will get smaller and smaller as more and more trees are built) until we reach the max no of trees specified or adding more trees does not greatly reduce the Residuals
13. Now we can use our initial Prediction (71.2) and the chain of trees and predict weight for some new measurements (testing data)

In [None]:
# Gradient boosting regressor. The seed makes runs repeatable: the grid searched for
# this model uses max_features below 1, which picks the candidate features at random.
GBR = GradientBoostingRegressor(random_state=13)

In [None]:
# Hyperparameter values the gradient-boosting grid search tries.
# max_depth: maximum depth of the individual trees.
# n_estimators: number of boosting stages (trees) to be performed.
# min_samples_leaf: minimum number of samples required to be at a leaf node.
# learning_rate: step size shrinkage to prevent overfitting.
# max_features: proportion of features to consider when looking for the best split.
param_grid_GBR = dict(
    max_depth=[12, 15, 20],
    n_estimators=[200, 300, 1000],
    min_samples_leaf=[10, 25, 50],
    learning_rate=[0.001, 0.01, 0.1],
    max_features=[0.01, 0.1, 0.7],
)

In [None]:
# 5-fold cross-validated grid search over param_grid_GBR, scored by negative MSE on all cores.
GBR_cv = GridSearchCV(
    estimator=GBR,
    param_grid=param_grid_GBR,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
)

In [None]:
# Run the gradient-boosting grid search on the training split.
GBR_cv.fit(X_train, y_train)

In [None]:
# RMSE of the best gradient-boosting configuration: best_score_ is a negative MSE, hence the sign flip.
np.sqrt(-GBR_cv.best_score_)

LGBM is a machine learning algo that is designed to handle large-scale datasets efficiently and provides fast training times. 
It works by creating an ensemble of weak learners, such as decision trees, sequentially. 
It focuses on instances that are difficult to classify (i.e. larger gradients), allowing it to prioritise and improve the model's performance.
It also employs gradient-based one-side sampling, which further enhances its training speed.

LGBM is known for its high accuracy and ability to handle diverse data types. 
It automatically handles missing values and supports categorical features without requiring prior data transformation. 

LightGBM: Designed to be faster and more memory-efficient than XGBoost, especially on large datasets.
LightGBM: Natively supports categorical features, allowing for more efficient processing without the need for one-hot encoding.
LightGBM: Leaf-wise growth can lead to more complex models that might overfit if not properly regularized, but it can also capture complex patterns in the data better.

In [None]:
# LightGBM regressor, seeded like the random-forest and XGBoost estimators so that
# repeated runs of the grid search are comparable.
lgbm_regressor = lgb.LGBMRegressor(random_state=13)

In [None]:
# Hyperparameter values the LightGBM grid search tries.
# boosting_type: boosting algorithm ('gbdt' for Gradient Boosting Decision Tree or
#   'dart' for Dropouts meet Multiple Additive Regression Trees).
# num_leaves: maximum number of leaves in a tree.
# learning_rate: step size shrinkage to prevent overfitting.
# n_estimators: number of boosting stages (trees) to be performed.
param_grid_lgbm = dict(
    boosting_type=['gbdt', 'dart'],
    num_leaves=[20, 30, 40],
    learning_rate=[0.01, 0.05, 0.1],
    n_estimators=[100, 200, 300],
)

In [None]:
# 3-fold grid search over param_grid_lgbm, scored by negated MSE, using all cores.
lgbm_cv = GridSearchCV(
    estimator=lgbm_regressor,
    param_grid=param_grid_lgbm,
    scoring='neg_mean_squared_error',
    cv=3,
    n_jobs=-1,
)

In [None]:
# Run the LightGBM grid search on the training split.
lgbm_cv.fit(X=X_train, y=y_train)

In [None]:
# Cross-validated RMSE of the best LightGBM candidate (root of the negated best score).
np.sqrt(-lgbm_cv.best_score_)

One-Hot, Label, Target and K-Fold Target Encoding
https://www.youtube.com/watch?v=589nCGeWG1w
One-Hot, Label, Target and K-Fold Target Encoding are methods used to convert discrete variables/features (e.g. fav color) into numbers

One hot encoding: create new cols for blue, red and green with 0 and 1 as T/F values

Label encoding: assign 0 for blue, 1 for red and 2 for green
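However, the numbers imply an order that does not really exist: e.g. a tree whose root splits on fav color < 0.5 would group red and green together purely because of the arbitrary labels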

Target encoding: assign the weighted mean value of outcome variable for blue, red and green
weighted mean = (n x option mean + m x overall mean) / (n + m)
where n = no of rows of option mean e.g. blue,
m = weight for overall mean (predetermined)
However, this results in Data Leakage (and therefore an overfitted model), because each row's target value, the very thing we want to predict, is used to compute that same row's encoded value

K-Fold Target Encoding: reduces data leakage
K-Fold refers to splitting the data into K equal sized subsets
e.g. 2-Fold splits data into 2 subsets, A & B
1. use subset B and calculate weighted mean
2. replace blue in subset A with the B's weighted mean
3. use subset A and calculate weighted mean
4. replace blue in subset B with the A's weighted mean

https://www.youtube.com/watch?v=KXOTSkPL2X4

CatBoost avoids Leakage when encoding Categorical variables by treating each row of data as if it were being fed into the algorithm sequentially i.e. Ordered Target Encoding
e.g. CatBoost treats the first row with Blue as if that is all the data it has received so far i.e. ignore all other rows when Target Encoding the first occurrence of Blue
CatBoost encoding = (Option Count + 0.05) / (n + 1)
Option Count = sum of the target values in the previous rows that have the same option (e.g. Blue)
n = no of previous rows that have the same option

1. Each time CatBoost creates a tree, the first thing it does is randomise the rows of the training data
2. Apply Ordered Target Encoding 
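Create a new col called Bin, because the target in the video's example (Height) is continuous and has to be discretised before it can be used for the encoding
Assign 0 to the row with the smallest Height and 1 to every other row
Use Bin as the target variable values for the encoding
Remove the Bin column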
3. Set predicted values to be 0
4. Go down the rows sequentially and calculate residuals
5. Find a suitable threshold for root like in XGB
Update the leaves to be the average of the prev residuals 
Create new col called Output = avg of the prev residuals
CatBoost quantifies how good the predictions for each threshold are by calculating the Cosine Similarity between the Residuals column ($A$) and the Leaf Output column ($B$):
$$\text{Cosine Similarity} = \frac{\sum_i A_i B_i}{\sqrt{\sum_i A_i^2}\,\sqrt{\sum_i B_i^2}}$$
Compare the Cosine Similarity for all the candidate thresholds
The threshold with the highest Cosine Similarity is selected
Note: in practice, when you have a lot of data, CatBoost ignores the first bunch of rows (as their outputs are usually 0) when calculating the Cosine Similarity
6. Repeat step 5 for the leaves
7. Update prediction column
New prediction = prediction + 0.1 x output
0.1 = default learning rate
8. Repeat steps 4 to 7

When CatBoost builds larger trees, it builds Oblivious or Symmetric Decision Trees.
A Symmetric Decision Tree (weak learner) uses the exact same threshold for each node in the same level.
Gradient Boosting: combining a bunch of weak learners to make decisions
Symmetric Decision Trees are faster at making predictions.

In [None]:
# CatBoost regressor optimising RMSE, with per-iteration training logs silenced.
catboost = CatBoostRegressor(
    verbose=False,
    loss_function='RMSE',
)

In [None]:
# Hyperparameter search space for the CatBoost grid search (3 * 4 * 4 = 48 candidates).
param_grid_cat = {
    'iterations': [100, 500, 1000],             # number of boosting stages (trees)
    'depth': [4, 6, 8, 10],                     # depth of each tree
    'learning_rate': [0.01, 0.05, 0.1, 0.5],    # step-size shrinkage applied to each tree
}

In [None]:
# 3-fold grid search over param_grid_cat, scored by negated MSE, using all cores.
cat_cv = GridSearchCV(
    estimator=catboost,
    param_grid=param_grid_cat,
    scoring='neg_mean_squared_error',
    cv=3,
    n_jobs=-1,
)

In [None]:
# Run the CatBoost grid search on the training split.
cat_cv.fit(X=X_train, y=y_train)

In [None]:
# Cross-validated RMSE of the best CatBoost candidate (root of the negated best score).
np.sqrt(-cat_cv.best_score_)

In [None]:
# Weighted-average ensemble of the tuned GBR, XGB and Ridge models;
# their predictions are blended with weights 2 : 3 : 1, in that order.
vr = VotingRegressor(
    estimators=[
        ('gbr', GBR_cv.best_estimator_),
        ('xgb', xgb_cv.best_estimator_),
        ('ridge', ridge_cv.best_estimator_),
    ],
    weights=[2, 3, 1],
)

In [None]:
# Fit the voting ensemble (and each of its member models) on the training split.
vr.fit(X=X_train, y=y_train)

In [None]:
# Voting-ensemble predictions for the hold-out split.
y_pred_vr = vr.predict(X=X_test)

In [None]:
# Hold-out RMSE of the voting ensemble.
# The `squared=False` flag of mean_squared_error is deprecated (scikit-learn 1.4) and
# removed in later releases, where it raises a TypeError. Taking the square root
# explicitly works on every version and matches how the CV scores are reported above.
np.sqrt(mean_squared_error(y_test, y_pred_vr))

In [None]:
# Base learners for the stacking ensemble: a (name, model) tuple for the best
# estimator found by each grid search (GBR, XGB, CatBoost, LightGBM, Random Forest).
estimators = [
    (label, search.best_estimator_)
    for label, search in (
        ('gbr', GBR_cv),
        ('xgb', xgb_cv),
        ('cat', cat_cv),
        ('lgb', lgbm_cv),
        ('rfr', rfr_cv),
    )
]

In [None]:
# Stacked ensemble: the base learners in `estimators` (GBR, XGB, CatBoost, LightGBM,
# Random Forest) produce predictions that are combined by the meta-estimator, which
# here is the VotingRegressor `vr`.
# NOTE(review): scikit-learn fits clones of the estimators it is given, so the earlier
# vr.fit(...) is presumably not reused by the stack - confirm.
stackreg = StackingRegressor(estimators=estimators, final_estimator=vr)

In [None]:
# Fit the stacking ensemble on the training split.
stackreg.fit(X=X_train, y=y_train)

In [None]:
# Stacking-ensemble predictions for the hold-out split.
y_pred_stack = stackreg.predict(X=X_test)

In [None]:
# Hold-out RMSE of the stacking ensemble.
# The `squared=False` flag of mean_squared_error is deprecated (scikit-learn 1.4) and
# removed in later releases, where it raises a TypeError. Taking the square root
# explicitly works on every version and matches how the CV scores are reported above.
np.sqrt(mean_squared_error(y_test, y_pred_stack))

In [None]:
# Run the Kaggle test set (test_df) through `pipeline` so it gets the same feature
# representation the models were trained on. Only transform() is called here, so the
# pipeline is expected to have been fitted already.
# NOTE(review): `pipeline` is defined earlier in the notebook, outside this excerpt -
# presumably the preprocessing / feature-engineering pipeline fitted on the training
# data; confirm.
df_test_preprocess = pipeline.transform(test_df)

In [None]:
# Predict the Kaggle test set with the trained stacking ensemble and map the
# predictions back to the original price scale. Per the original note on this cell,
# the target was transformed with np.log1p(train_df['SalePrice']) earlier in the
# notebook (outside this excerpt - confirm), so the exact inverse is np.expm1;
# np.exp would over-state every price by 1.
y_stacking = np.expm1(stackreg.predict(df_test_preprocess))

# Submission frame: Id from the original test data plus the predicted SalePrice.
# .copy() makes it an independent DataFrame, so adding a column neither raises
# SettingWithCopyWarning nor depends on whether test_df[['Id']] is a view.
df_y_stacking_out = test_df[['Id']].copy()
df_y_stacking_out['SalePrice'] = y_stacking

# Write the submission file in the Id,SalePrice format (no index column).
df_y_stacking_out.to_csv('submission.csv', index=False)