In [8]:
import numpy as np
import pandas as pd 
import sklearn 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error,r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import AdaBoostRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor

In [13]:
df=pd.read_csv("home-data-for-ml-course/train.csv",sep=",",engine="python",usecols=None)
df_test=pd.read_csv("home-data-for-ml-course/test.csv",sep=",",engine="python",usecols=None)
pd.set_option('display.max_rows', 100)

In [14]:
df.isna().sum()

Id                  0
MSSubClass          0
MSZoning            0
LotFrontage       259
LotArea             0
Street              0
Alley            1369
LotShape            0
LandContour         0
Utilities           0
LotConfig           0
LandSlope           0
Neighborhood        0
Condition1          0
Condition2          0
BldgType            0
HouseStyle          0
OverallQual         0
OverallCond         0
YearBuilt           0
YearRemodAdd        0
RoofStyle           0
RoofMatl            0
Exterior1st         0
Exterior2nd         0
MasVnrType        872
MasVnrArea          8
ExterQual           0
ExterCond           0
Foundation          0
BsmtQual           37
BsmtCond           37
BsmtExposure       38
BsmtFinType1       37
BsmtFinSF1          0
BsmtFinType2       38
BsmtFinSF2          0
BsmtUnfSF           0
TotalBsmtSF         0
Heating             0
HeatingQC           0
CentralAir          0
Electrical          1
1stFlrSF            0
2ndFlrSF            0
LowQualFin

In [12]:
df_test.isna().sum() 

Id                 0
MSSubClass         0
MSZoning           4
LotFrontage      227
LotArea            0
                ... 
MiscVal            0
MoSold             0
YrSold             0
SaleType           1
SaleCondition      0
Length: 80, dtype: int64

In [58]:
def columnTypeChecker(df,threshold=30):
    columnTypes=np.zeros(len(df.columns))
    for j, name in enumerate(df.columns):
        if df[name].unique().size<threshold:
            columnTypes[j] = 0  # Categorical
        else:
            columnTypes[j] = 1  # Continuous

    return columnTypes

def DiscreteColumnChanger(df,vectorDiscrete):
    DiscreteIndex=GetDiscreteIndexes(vectorDiscrete)
    df.iloc[:,DiscreteIndex]=df.iloc[:,DiscreteIndex].fillna("None")
    return df

def GetDiscreteIndexes(vectorDiscrete):
    DiscreteIndex=[]
    
    for i in range(len(vectorDiscrete)): 
        if 0==vectorDiscrete[i]:
            DiscreteIndex.append(i) 
    return DiscreteIndex

def GetContinousIndexes(vectorDiscrete):
    DiscreteIndex=[]
    
    for i in range(len(vectorDiscrete)): 
        if 1==vectorDiscrete[i]:
            DiscreteIndex.append(i) 
    return DiscreteIndex


def floatToint(df):
    int_cols = df.select_dtypes(include=['int', 'int64', 'int32', 'int16', 'int8']).columns
    df.loc[:, int_cols] = df.loc[:, int_cols].astype(float)
    return df


In [59]:

#Train data 
specialTemp = ["OverallCond", "OverallQual", "HalfBath", "BedroomAbvGr", "KitchenAbvGr", "YrSold",
               "MoSold", "PoolArea", "3SsnPorch", "GarageCars", "Fireplaces", "TotRmsAbvGrd",
               "LowQualFinSF", "BsmtFullBath", "BsmtHalfBath", "FullBath"]

temp=df[specialTemp]

ToRemove= ["Id","PoolQC"]

df.drop(ToRemove+specialTemp,axis=1,inplace=True) 

DiscreteCont=columnTypeChecker(df)
df=DiscreteColumnChanger(df,DiscreteCont)

df=pd.concat([df,temp],axis=1)
df.fillna(0,inplace=True)
df=floatToint(df)

subcolumnIndexes=GetContinousIndexes(DiscreteCont)
subcolumnsCont= [  df.columns.tolist()[i] for i in subcolumnIndexes if i<len(df.columns.tolist())]  

subcolumnsCont+=specialTemp

subcolumnIndexes=GetDiscreteIndexes(DiscreteCont)
subcolumnsDiscrete= [  df.columns.tolist()[i] for i in subcolumnIndexes if i<len(df.columns.tolist())] 

df[subcolumnsCont]=df[subcolumnsCont].astype(np.float64)



Index(['MSSubClass', 'LotArea', 'YearBuilt', 'YearRemodAdd', 'BsmtFinSF1',
       'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF',
       'GrLivArea', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch',
       'ScreenPorch', 'MiscVal', 'SalePrice', 'OverallCond', 'OverallQual',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'YrSold', 'MoSold',
       'PoolArea', '3SsnPorch', 'GarageCars', 'Fireplaces', 'TotRmsAbvGrd',
       'LowQualFinSF', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath'],
      dtype='object')


In [79]:
#Test data 
dfTest=pd.read_csv("home-data-for-ml-course/test.csv",sep=",",engine="python",usecols=None)

testId=dfTest[["Id"]]

temp=dfTest[specialTemp]


dfTest.drop(ToRemove+specialTemp,axis=1,inplace=True) 

dfTest=pd.concat([dfTest,temp],axis=1)
dfTest.fillna(0,inplace=True)
dfTest=floatToint(dfTest)

subcolumnIndexes=GetContinousIndexes(DiscreteCont)
subcolumnsCont= [  dfTest.columns.tolist()[i] for i in subcolumnIndexes if i<len(dfTest.columns.tolist())]  

subcolumnsCont+=specialTemp

subcolumnIndexes=GetDiscreteIndexes(DiscreteCont)

subcolumnsDiscrete= [  dfTest.columns.tolist()[i] for i in subcolumnIndexes if i<len(dfTest.columns.tolist())] 

dfTest[subcolumnsCont]=dfTest[subcolumnsCont].astype(np.float64)



df["isTrain"] = 1
dfTest["isTrain"] = 0

combined = pd.concat([df.drop("SalePrice", axis=1,inplace=False), dfTest], ignore_index=True) 

combined_encoded=pd.get_dummies(combined,columns=subcolumnsDiscrete)

train_encoded = combined_encoded[combined_encoded["isTrain"]==1].drop("isTrain", axis=1)

test_encoded  = combined_encoded[combined_encoded["isTrain"]==0].drop("isTrain", axis=1)

train_encoded["SalePrice"] = df["SalePrice"]


Index(['MSSubClass', 'LotArea', 'YearBuilt', 'YearRemodAdd', '1stFlrSF',
       '2ndFlrSF', 'GrLivArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch',
       'ScreenPorch', 'MiscVal', 'OverallCond', 'OverallQual', 'HalfBath',
       'BedroomAbvGr', 'KitchenAbvGr', 'YrSold', 'MoSold', 'PoolArea',
       '3SsnPorch', 'Fireplaces', 'TotRmsAbvGrd', 'LowQualFinSF', 'FullBath'],
      dtype='object')


In [9]:
plt.figure(figsize=(20, 14))
sns.heatmap(df[subcolumnsCont].corr(), annot=True, cmap="coolwarm", fmt=".1f", annot_kws={"size": 12})
plt.show()

In [62]:
X=train_encoded.drop(["SalePrice"],axis=1).copy() 
y=train_encoded["SalePrice"].copy() 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [63]:
#Forest
def RegressionForest(X_train, X_test, y_train, y_test):
    rf_regressor = RandomForestRegressor(n_estimators=100,  
                                     max_depth=5,      
                                     random_state=42,
                                     bootstrap=True, 
                                    n_jobs=-1,verbose=0 )

    rf_regressor.fit(X_train, y_train)  
    y_pred = rf_regressor.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    r2=r2_score(y_test,y_pred) 
    print(f"Mean Absolute Error (MAE): {mae:.4f}")
    print(f"Mean Squared Error (MSE): {mse:.4f}")
    print(f"r^2 : {r2:.4f}")
    return rf_regressor
    


In [None]:
BestRfRegressor=RegressionForest(X_train, X_test, y_train, y_test)
# Mean Absolute Error: 17779.7917
# Mean Squared Error: 1010861459.6279
# R² Score: 0.8682

In [14]:
#Grid Search Forest
def GridSearchRegressionForest(X_train, X_test, y_train, y_test): 
    param_grid = {
    'n_estimators': [50, 100, 200],  
    'max_depth': [5, 10, None],       
    'min_samples_split': [2, 5, 10],  
    'min_samples_leaf': [1, 5, 10],   
    'max_features': [1, 'sqrt', 'log2'], 
    'bootstrap': [True, False],      
    'ccp_alpha': [0.0, 0.01, 0.1,10]    
    } 
    
    rf_regressor =RandomForestRegressor(random_state=42) 
    grid_search = GridSearchCV(rf_regressor , param_grid,
                           cv=5, scoring='neg_mean_squared_error',
                           n_jobs=3, verbose=1)
    grid_search.fit(X_train,y_train) 
    
    best_params = grid_search.best_params_
    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print(f"Mean Absolute Error: {mae:.4f}")
    print(f"Mean Squared Error: {mse:.4f}")
    print(f"R² Score: {r2:.4f}")
    return best_model

In [15]:
BestRfRegressor=GridSearchRegressionForest(X_train, X_test, y_train, y_test)
# Mean Absolute Error: 17779.7917
# Mean Squared Error: 1010861459.6279
# R² Score: 0.8682


In [16]:
rf_regressor.get_params()
# {'bootstrap': True,
#  'ccp_alpha': 0.0,
#  'criterion': 'squared_error',
#  'max_depth': 5,
#  'max_features': 1.0,
#  'max_leaf_nodes': None,
#  'max_samples': None,
#  'min_impurity_decrease': 0.0,
#  'min_samples_leaf': 1,
#  'min_samples_split': 2,
#  'min_weight_fraction_leaf': 0.0,
#  'monotonic_cst': None,
#  'n_estimators': 100,
#  'n_jobs': -1,
#  'oob_score': False,
#  'random_state': 42,
#  'verbose': 2,
#  'warm_start': False}

In [18]:
#ElasticNet
scaler=StandardScaler()
df_encoded_standarized=df_encoded.copy(deep=True)
df_encoded_standarized[subcolumnsCont]=scaler.fit_transform(df[subcolumnsCont])

X=df_encoded_standarized.drop(["SalePrice"],axis=1).copy() 
y=df_encoded_standarized["SalePrice"].copy() 

X_train_Standarized, X_test_Standarized, y_train_Standarized, y_test_Standarized = train_test_split(X, y, test_size=0.2, random_state=42)



In [19]:
def ElasticNetSearch(X_train, X_test, y_train, y_test):
    param_grid = {
    'alpha': [0.001, 0.01, 0,0.1, 1, 10],  
    'l1_ratio': [0,0.1, 0.3, 0.5, 0.7, 0.9,1]  
    }
    grid_search = GridSearchCV(ElasticNet(), param_grid, cv=5, scoring='r2', n_jobs=3, verbose=1)
    grid_search.fit(X_train, y_train)
    
    best_model = grid_search.best_estimator_
    
    y_pred = best_model.predict(X_test)
    
    r2 = r2_score(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    print(f"Test Set R² Score: {r2:.4f}")
    print(f"Mean Squared Error (MSE): {mse:.4f}")
    print(f"Mean Absolute Error (MAE): {mae:.4f}")
    return best_model

In [20]:
BestRegression=ElasticNetSearch(X_train_Standarized, X_test_Standarized, y_train_Standarized, y_test_Standarized)
# Test Set R² Score: 0.8781
# Mean Squared Error (MSE): 0.1482
# Mean Absolute Error (MAE): 0.2344

In [22]:
#GridSearch Adaboost
def GridSearchAdaboost(X_train,X_test,y_train,y_test):

    param_grid = {
    'n_estimators': [10, 50, 100, 200], 
    'learning_rate': [0.01, 0.1, 0.5, 1]   
    }
    TheModel=None 
    bestR2=0
    for i in [3,4,5,6,7]:
        base_learner = DecisionTreeRegressor(max_depth=i)
        adaboost=AdaBoostRegressor(estimator=base_learner,random_state=42) 
        grid_search=GridSearchCV(adaboost,param_grid,cv=5,scoring='r2', n_jobs=3,verbose=1)
        grid_search.fit(X_train,y_train)  
        best_model = grid_search.best_estimator_
        y_pred = best_model.predict(X_test)
        
        r2 = r2_score(y_test, y_pred)
        if TheModel is None or bestR2<r2 : 
            TheModel=best_model
            bestR2=r2
            mse = mean_squared_error(y_test, y_pred)
            mae = mean_absolute_error(y_test, y_pred)
     
    print(f"Test Set R² Score: {bestR2:.4f}")
    print(f"Mean Squared Error (MSE): {mse:.4f}")
    print(f"Mean Absolute Error (MAE): {mae:.4f}")
    return TheModel

In [23]:
BestAdaBoost=GridSearchAdaboost(X_train,X_test,y_train,y_test)
# Test Set R² Score: 0.8977
# Mean Squared Error (MSE): 784337851.4621
# Mean Absolute Error (MAE): 18238.4651

In [24]:
# BestAdaBoost.get_params()
# {'estimator__ccp_alpha': 0.0,
#  'estimator__criterion': 'squared_error',
#  'estimator__max_depth': 6,
#  'estimator__max_features': None,
#  'estimator__max_leaf_nodes': None,
#  'estimator__min_impurity_decrease': 0.0,
#  'estimator__min_samples_leaf': 1,
#  'estimator__min_samples_split': 2,
#  'estimator__min_weight_fraction_leaf': 0.0,
#  'estimator__monotonic_cst': None,
#  'estimator__random_state': None,
#  'estimator__splitter': 'best',
#  'estimator': DecisionTreeRegressor(max_depth=6),
#  'learning_rate': 1,
#  'loss': 'linear',
#  'n_estimators': 100,
#  'random_state': 42}

In [25]:

def GridSearchSVR(X_train,X_test,y_train,y_test):
    # param_grid = {
    # 'C': [1, 10, 100],
    # 'epsilon': [0.01, 0.1, 1, 10],
    # 'kernel': ['linear', 'rbf']
    # }
    param_grid = {
    'C': [10, 20,40],
    'epsilon': [0.001,0.01, 0.05],
    'kernel': ['rbf']
    }
    
    grid_search = GridSearchCV(SVR(cache_size=2000), param_grid, cv=5, scoring='r2', n_jobs=4, verbose=2)
    grid_search.fit(X_train, y_train)
    best_svr = grid_search.best_estimator_
    
    y_pred = best_svr.predict(X_test)
    r2_best = r2_score(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    print(f"Test Set R² Score: {r2_best:.4f}")
    print(f"Mean Squared Error (MSE): {mse:.4f}")
    print(f"Mean Absolute Error (MAE): {mae:.4f}")
    return best_svr



In [26]:
 BestSVR=GridSearchSVR(X_train_Standarized, X_test_Standarized, y_train_Standarized, y_test_Standarized)
 BestSVR.get_params()
# Test Set R² Score: 0.8737
# Mean Squared Error (MSE): 0.1536
# Mean Absolute Error (MAE): 0.2182
# {'C': 10,
#  'cache_size': 2000,
#  'coef0': 0.0,
#  'degree': 3,
#  'epsilon': 0.05,
#  'gamma': 'scale',
#  'kernel': 'rbf',
#  'max_iter': -1,
#  'shrinking': True,
#  'tol': 0.001,
#  'verbose': False}

In [29]:
# XGBoost

def GridSearchXGBoost(X_train,X_test,y_train,y_test):
    param_grid = {
    'n_estimators': [50, 100, 200],  
    'learning_rate': [0.01, 0.1, 0.2],  
    'max_depth': [3, 5, 7],  
    'subsample': [0.7, 1.0],  
    'colsample_bytree': [0.7, 1.0],  
    'gamma': [0, 0.1, 0.2],  
    'reg_alpha': [0, 0.1, 0.5, 1, 10]
    }

    # param_grid = {
    # 'n_estimators': [200,250],  
    # 'learning_rate': [0.05, 0.1, 0.15],  
    # 'max_depth': [7,8],  
    # 'subsample': [0.7],  
    # 'colsample_bytree': [0.5,0.7],  
    # 'gamma': [0.1],  
    # 'reg_alpha': [5,10,20]
    # }
    xgb = XGBRegressor(objective='reg:squarederror', random_state=42,device="gpu") 
    grid_search = GridSearchCV(xgb, param_grid, cv=5, scoring='r2', n_jobs=3,verbose=1)
    grid_search.fit(X_train, y_train)

    best_Xgboost = grid_search.best_estimator_
    
    y_pred = best_Xgboost.predict(X_test)
    r2_best = r2_score(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    print(f"Test Set R² Score: {r2_best:.4f}")
    print(f"Mean Squared Error (MSE): {mse:.4f}")
    print(f"Mean Absolute Error (MAE): {mae:.4f}")
    return best_Xgboost
    

In [None]:
#BestXgBoost=GridSearchXGBoost(X_train,X_test,y_train,y_test)
#BestXgBoost.get_params()
# Test Set R² Score: 0.9140
# Mean Squared Error (MSE): 659584023.5164
# Mean Absolute Error (MAE): 15459.1116
# {'objective': 'reg:squarederror',
#  'base_score': None,
#  'booster': None,
#  'callbacks': None,
#  'colsample_bylevel': None,
#  'colsample_bynode': None,
#  'colsample_bytree': 0.7,
#  'device': None,
#  'early_stopping_rounds': None,
#  'enable_categorical': False,
#  'eval_metric': None,
#  'feature_types': None,
#  'gamma': 0.2,
#  'grow_policy': None,
#  'importance_type': None,
#  'interaction_constraints': None,
#  'learning_rate': 0.1,
#  'max_bin': None,
#  'max_cat_threshold': None,
#  'max_cat_to_onehot': None,
#  'max_delta_step': None,
#  'max_depth': 7,
#  'max_leaves': None,
#  'min_child_weight': None,
#  'missing': nan,
#  'monotone_constraints': None,
#  'multi_strategy': None,
#  'n_estimators': 200,
#  'n_jobs': None,
#  'num_parallel_tree': None,
#  'random_state': 42,
#  'reg_alpha': 10,
#  'reg_lambda': None,
#  'sampling_method': None,
#  'scale_pos_weight': None,
#  'subsample': 0.7,
#  'tree_method': None,
#  'validate_parameters': None,
#  'verbosity': None}

In [73]:
FinalRegressor= XGBRegressor(
    objective="reg:squarederror",
    colsample_bytree=0.7,
    gamma=0.2,
    learning_rate=0.1,
    max_depth=7,
    n_estimators=200,
    random_state=42,
    reg_alpha=10, 
    subsample=0.7
)
FinalRegressor.fit(X,y) 

In [75]:
price_preds=FinalRegressor.predict(test_encoded)

In [76]:
price_preds

array([125917.8 , 162920.77, 176239.38, ..., 160357.1 , 110810.31,
       236908.92], shape=(1459,), dtype=float32)

In [86]:
output=pd.DataFrame({'Id':testId.values.ravel() ,
                     'SalePrice':price_preds})
output.to_csv('submission.csv',index=False)