# Data Processing, Analysis and Preparation

In [10]:
# Import libraries
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings("ignore")

# Load the raw train and test tables.
# NOTE(review): both paths are absolute and machine-specific, so this cell only
# runs where this exact directory layout exists.
train_data = pd.read_csv("/Users/neehanthreddym/Desktop/AI/Project/check point 2/data/train.csv")
test_data = pd.read_csv("/Users/neehanthreddym/Desktop/AI/Project/check point 2/data/test.csv")

# Partition the training columns by cardinality: a column with fewer than
# `threshold` distinct (non-null) values counts as categorical, every other
# column counts as numerical.
threshold = 20
distinct_counts = train_data.nunique()
categorical_features = distinct_counts[distinct_counts < threshold].index.tolist()
numerical_features = [
    column for column in train_data.columns if column not in categorical_features
]

# Handling Missing Values and Feature Engineering
# Stack train and test so imputation and encoding are applied to both at once.
# `ignore_index` is not set, so each part keeps its own row labels.
alldata = pd.concat([train_data, test_data], axis=0, sort=False)

# Per-column summary: number of missing values next to the column dtype.
pd.set_option('display.max_rows', 100)
info_count = pd.DataFrame(alldata.isnull().sum(), columns=['Count of NaN'])
dtype = pd.DataFrame(alldata.dtypes, columns=['DataTypes'])
info = pd.concat([info_count, dtype], axis=1)

# Fill missing LotFrontage values by linear interpolation. The result is
# assigned back to the frame: calling `interpolate(..., inplace=True)` on a
# column selection mutates a temporary and is not guaranteed to update `alldata`.
alldata['LotFrontage'] = alldata['LotFrontage'].interpolate(method='linear')

# Fill the remaining columns, driven by the NaN counts collected above:
#   * 0 missing           -> nothing to do
#   * fewer than 400      -> impute with the most frequent value
#   * 400 or more         -> label-encode the column as-is (missing entries are
#                            handed to the encoder unimputed)
# Id and SalePrice are left untouched; LotFrontage was handled above.
skip_columns = ("Id", "SalePrice", "LotFrontage")
for column, nan_count in info['Count of NaN'].items():
    if column in skip_columns or nan_count == 0:
        continue
    if nan_count < 400:
        # value_counts() sorts by frequency, so the first index entry is the mode.
        most_frequent = alldata[column].value_counts().index[0]
        alldata[column] = alldata[column].fillna(most_frequent)
    else:
        lbl_enc = LabelEncoder()
        column_values = list(alldata[column].values)
        lbl_enc.fit(column_values)
        alldata[column] = lbl_enc.transform(column_values)

# One-hot encode the categorical columns, because the regression models used
# later need numeric inputs.
list_ = ["MSZoning", "Street", "LotShape", "LandContour", "Utilities", "LotConfig",
        "LandSlope", "Neighborhood", "Condition1", "Condition2", "BldgType", "HouseStyle",
        "RoofStyle", "RoofMatl", "Exterior1st", "Exterior2nd",
        "MasVnrType", "ExterQual", "ExterCond", "Foundation", "BsmtQual", "BsmtCond", "BsmtExposure",
        "BsmtFinType1", "BsmtFinType2", "Heating", "HeatingQC", "CentralAir", "Electrical", "KitchenQual",
        "Functional", "GarageType", "GarageFinish", "GarageQual", "GarageCond", "PavedDrive", "SaleType",
        "SaleCondition"]

# Encode every listed column in one pass instead of rebuilding the whole frame
# once per feature. get_dummies drops the listed columns and appends their
# indicator columns in the order given, which is the layout the per-feature
# loop produced.
alldata = pd.get_dummies(
    alldata.astype({feature: "category" for feature in list_}),
    columns=list_,
)

pd.set_option('display.max_columns', 500)

# Recover the original train and test parts from the stacked frame. The boundary
# is taken from the training table's length rather than a hard-coded row count,
# so no test row is dropped if the input files change size.
n_train = len(train_data)
train = alldata.iloc[:n_train]
test = alldata.iloc[n_train:]

# SalePrice is the target; it is removed from the test part.
test = test.drop("SalePrice", axis=1)

# Define x (predictors) and y (target). The target is modelled on the log1p
# scale, so predictions must be mapped back with np.expm1.
X = train.drop(["Id", "SalePrice"], axis=1)
y = np.log1p(train['SalePrice'])

# Hold out part of the training rows for validation (seeded for repeatability).
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [11]:
# Report the dimensions of the four arrays produced by the train/validation split.
for caption, split in (
    ('X_train shape:', X_train),
    ('y_train shape:', y_train),
    ('X_test shape:', X_test),
    ('y_test shape:', y_test),
):
    print(caption, split.shape)

X_train shape: (1095, 275)
y_train shape: (1095,)
X_test shape: (365, 275)
y_test shape: (365,)


# Modelling and Evaluation

In [12]:
import preprocess2
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn import metrics
from sklearn.model_selection import cross_val_score, KFold

# Defining the variables from preprocess: the prepared splits are read from the
# preprocess2 module.
test = preprocess2.test
X_train = preprocess2.X_train
X_test = preprocess2.X_test
y_train = preprocess2.y_train
y_test = preprocess2.y_test

# Initialize K-Fold Cross-Validation (shuffled, seeded so folds are repeatable)
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Defining the models as (label, estimator) pairs.
# The two stochastic ensembles are seeded so that the cross-validation scores,
# and therefore the model selected from them, are reproducible between runs.
models = [('LR', LinearRegression()),
          ("Ridge", Ridge()),
          ("Lasso", Lasso()),
          ("ElasticNet", ElasticNet()),
          ('KNN', KNeighborsRegressor()),
          ('RF', RandomForestRegressor(random_state=42)),
          ('SVR', SVR()),
          ('GBR', GradientBoostingRegressor(random_state=42))]


# Score every candidate with K-fold cross-validation and collect
# (name, mean RMSE, RMSE standard deviation) for each.
cv_results = []
for name, regressor in models:
    # scikit-learn reports negated MSE; flip the sign, then take the root per fold.
    fold_mse = -cross_val_score(
        regressor, X_train, y_train, scoring='neg_mean_squared_error', cv=kf
    )
    fold_rmse = np.sqrt(fold_mse)
    mean_rmse, std_rmse = np.mean(fold_rmse), np.std(fold_rmse)
    cv_results.append((name, mean_rmse, std_rmse))
    print(f"Cross-validated RMSE: {round(mean_rmse, 4)} ± {round(std_rmse, 4)} ({name})")

# The winner is the entry with the smallest mean RMSE.
best_model_name, best_mean_rmse, _ = min(cv_results, key=lambda result: result[1])
print(f"\nBest Model: {best_model_name} with RMSE: {round(best_mean_rmse, 4)}")

# Export predictions using the best model: look it up by name and refit it on
# the training split.
best_model = dict(models)[best_model_name]
best_model.fit(X_train, y_train)

# Final predictions on the submission rows. A separate name is used so the
# validation features in X_test are not overwritten.
X_test_final = test.drop(columns=['Id'])
y_pred = best_model.predict(X_test_final)

# Map predictions back to the price scale. The preprocessing cell of this
# notebook builds the target with np.log1p, whose inverse is np.expm1 (np.exp
# would leave every price one unit too high).
# NOTE(review): assumes preprocess2 applies the same log1p transform — confirm.
predictions_df = pd.DataFrame({
    'Id': test['Id'],
    'SalePrice': np.floor(np.expm1(y_pred)).astype(int)
})

predictions_df.to_csv('/Users/neehanthreddym/Desktop/AI/Project/check point 2/Result/REG-03-CKPT2.csv', index=False)

Cross-validated RMSE: 0.1664 ± 0.0326 (LR)
Cross-validated RMSE: 0.1555 ± 0.0304 (Ridge)
Cross-validated RMSE: 0.2022 ± 0.0413 (Lasso)
Cross-validated RMSE: 0.197 ± 0.0394 (ElasticNet)
Cross-validated RMSE: 0.2324 ± 0.0129 (KNN)
Cross-validated RMSE: 0.1499 ± 0.0129 (RF)
Cross-validated RMSE: 0.211 ± 0.0151 (SVR)
Cross-validated RMSE: 0.1339 ± 0.0111 (GBR)

Best Model: GBR with RMSE: 0.1339


## Hyperparameter Tuning

In [None]:
import preprocess2
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV, KFold
from sklearn import metrics

# Bring in the prepared splits exposed by the preprocess2 module.
test = preprocess2.test
X_train, y_train = preprocess2.X_train, preprocess2.y_train
X_test, y_test = preprocess2.X_test, preprocess2.y_test

# Shuffled 5-fold splitter with a fixed seed, shared by every grid candidate.
kf = KFold(shuffle=True, n_splits=5, random_state=42)

# Search space for gradient boosting: 3 x 3 x 3 = 27 combinations.
param_grid = dict(
    n_estimators=[100, 200, 300],
    learning_rate=[0.05, 0.1, 0.2],
    max_depth=[3, 4, 5],
)
gbr = GradientBoostingRegressor(random_state=42)

# Exhaustive search scored by negated MSE, using all available cores.
grid_search = GridSearchCV(
    gbr,
    param_grid,
    cv=kf,
    scoring="neg_mean_squared_error",
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train, y_train)

# Winning configuration and the estimator fitted with it.
best_params = grid_search.best_params_
best_gbr = grid_search.best_estimator_
print(f"Best Parameters: {best_params}")

# Score the tuned model on the held-out validation split.
y_pred = best_gbr.predict(X_test)
mse = metrics.mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
print(f"Optimized Test RMSE: {round(rmse, 4)}")

# Final predictions on the test dataset (the Id column is not a feature).
X_test_final = test.drop(columns=['Id'])
y_pred_final = best_gbr.predict(X_test_final)

# Save predictions on the price scale. The preprocessing cell of this notebook
# builds the target with np.log1p, whose inverse is np.expm1 (np.exp would leave
# every price one unit too high).
# NOTE(review): assumes preprocess2 applies the same log1p transform — confirm.
predictions_df = pd.DataFrame({
    'Id': test['Id'],
    'SalePrice': np.floor(np.expm1(y_pred_final)).astype(int)
})

# This is the same file the modelling cell writes, so that submission is replaced.
predictions_df.to_csv('/Users/neehanthreddym/Desktop/AI/Project/check point 2/Result/REG-03-CKPT2.csv', index=False)

Fitting 5 folds for each of 27 candidates, totalling 135 fits
Best Parameters: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 300}
Optimized Test RMSE: 0.1311
