### Import Libraries

In [177]:
import numpy as np
import pandas as pd
import zipfile

### Data Downloading

In [178]:
zip_path = "/content/home-data-for-ml-course.zip"
extract_to = "data/"
# pull only the two competition files out of the archive into `extract_to`
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(path=extract_to, members=["train.csv", "test.csv"])

In [179]:
# load the extracted training and test sets, in that order
train_data, test_data = map(pd.read_csv, ("data/train.csv", "data/test.csv"))

### Data Exploring

In [180]:
# show the column labels of the training frame
print(train_data.columns)

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive

In [181]:
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line
train_data.info()
# summary statistics of the numeric columns
print(train_data.describe())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

### Data Preprocessing
##### Fill missing values, apply a log transform to numeric columns, and one-hot encode categorical variables

In [182]:
def _prepare_frame(frame):
    """Return a cleaned, log-scaled, one-hot encoded copy of `frame`."""
    # `drop` returns a new DataFrame, so the caller's frame is never modified;
    # errors='ignore' lets a frame that no longer carries 'Id' pass through
    prepared = frame.drop(columns=['Id'], errors='ignore')

    # identify numeric and categorical columns
    num_cols = prepared.select_dtypes(include=['number']).columns
    cat_cols = prepared.select_dtypes(include=['object']).columns

    # replace NaN with 0 for numeric columns, then apply log(1 + x);
    # this includes 'SalePrice' when the frame has it
    prepared[num_cols] = np.log1p(prepared[num_cols].fillna(0))

    # replace NaN with the literal string 'None' for categorical columns
    prepared[cat_cols] = prepared[cat_cols].fillna('None')

    # convert categorical variables into one-hot encoded features (binary representation)
    return pd.get_dummies(prepared, dtype=float)


def preprocess_data(train, test):
    """Clean, log-transform and one-hot encode the train and test frames.

    Each frame is processed independently and the inputs are left untouched,
    so the function can be called repeatedly on the same raw data.

    Steps applied to each frame:
      1. drop the 'Id' column (if present)
      2. fill missing numeric values with 0 and missing categorical
         (object-dtype) values with the string 'None'
      3. apply ``np.log1p`` to every numeric column, including 'SalePrice'
         when present -- predictions must therefore be inverted with
         ``np.expm1``
      4. one-hot encode the categorical columns as float columns

    Parameters
    ----------
    train, test : pandas.DataFrame
        Raw training and test data.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_processed, test_processed)``. Because the two frames are
        encoded separately their dummy columns can differ; the caller is
        responsible for aligning them.
    """
    return _prepare_frame(train), _prepare_frame(test)


In [183]:
# Data Preprocessing
# run the cleaning pipeline (missing values, log transform, one-hot encoding) on both datasets
train_processed, test_processed = preprocess_data(train_data, test_data)

# separate the target (y - SalePrice) from the feature columns (X)
y = train_processed['SalePrice']
X = train_processed.drop(columns=['SalePrice'])

# reindex the test columns to the training feature columns;
# columns the test set lacks are added and filled with 0
X, test_processed = X.align(test_processed, join='left', axis=1, fill_value=0)

# hand plain NumPy arrays to the model: feature matrix, target values, test matrix
X, y, X_test = X.values, y.values, test_processed.values

### Build Model
#### Using Ridge Regression

In [184]:
class RidgeRegression:
    """Linear regression with an L2 penalty, fitted in closed form.

    Attributes:
        alpha: regularization strength; higher values increase regularization.
        theta: fitted coefficients with the intercept first, or None before
            `fit` has been called.
    """

    def __init__(self, alpha=1.0):
        self.alpha = alpha
        self.theta = None

    @staticmethod
    def _add_intercept(X):
        """Prepend a column of ones to X so theta[0] acts as the bias."""
        return np.c_[np.ones((X.shape[0], 1)), X]

    def fit(self, X, y):
        """Estimate theta from (X^T X + alpha * I) theta = X^T y; returns self."""
        design = self._add_intercept(X)

        # identity-shaped penalty whose first diagonal entry is zeroed
        # so that the bias term is not regularized
        penalty = np.eye(design.shape[1])
        penalty[0, 0] = 0

        lhs = design.T @ design + self.alpha * penalty  # X^T X + alpha * I
        rhs = design.T @ y                              # X^T y

        # solve the linear system with least squares; keep only the solution
        self.theta = np.linalg.lstsq(lhs, rhs, rcond=None)[0]
        return self

    def predict(self, X):
        """Return predictions X_b @ theta for the rows of X."""
        design = self._add_intercept(X)
        return design.dot(self.theta)

### Model Training

In [185]:
# train model with L2 regularization (alpha = 1.0)
ridge_model = RidgeRegression(alpha=1.0)
# fit model to the training data
ridge_model.fit(X, y)

# make predictions (still on the log1p scale of the training target)
y_pred = ridge_model.predict(X_test)
# the target was transformed with np.log1p, so its exact inverse is np.expm1;
# np.exp would leave every predicted price too high by 1
y_pred = np.expm1(y_pred)

### Export CSV File

In [186]:
# take the Ids from the test file itself rather than rebuilding them from a
# hard-coded offset, so they stay correct whatever the file's Id range is
test_ids = pd.read_csv("data/test.csv", usecols=['Id'])['Id']

# create a DataFrame for submission with proper format
submission = pd.DataFrame({'Id': test_ids, 'SalePrice': y_pred})
# export csv file
submission.to_csv('submission.csv', index=False)
print("Export submission file successfully!")

Export submission file successfully!


## Kaggle Submission
#### Submission link: https://drive.google.com/file/d/12lfy8j4-1CbZv3ZgMH0SjvX_Okyhbxph/view?usp=drive_link