In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

DATA PREPROCESSING

In [None]:
# Numerical columns must be separated from categorical ones
# Categorical ones can be one hot encoded or ordinal encoded
# First, delete columns with >80% null values if its correlation to target variable is <10%
# Second, null values in numerical columns must be replaced with the median value using SimpleImputer median strategy
# Third, null values in categorical columns must be replaced with the most common value, adding another column to say if the value was missing
# Fourth, non null entries in categorical columns must be one hot encoded if unique values are <= 3
# Otherwise, non-null entries in categorical columns must be ordinal encoded if a ranking exists
# Else, apply frequency encoding and normalize the values
# Numerical values must be normalized from 0 to 1, either by dividing each value by the max value or by using MinMaxScaler
# Put all of this in a sklearn pipeline
# Finally, all the columns must be concatenated

In [None]:
# load the training CSV into a DataFrame and print its column / dtype / non-null summary
train_csv_path = "/kaggle/input/house-prices-advanced-regression-techniques/train.csv"
housing = pd.read_csv(train_csv_path)
housing.info()

In [None]:
# pull the SalePrice column out as the target; every other column stays a predictor
y = housing["SalePrice"]
X = housing.drop(columns=["SalePrice"])

In [None]:
# hold out 20% of the rows for validation; random_state=0 fixes the shuffle
from sklearn.model_selection import train_test_split

split_parts = train_test_split(
    X,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=0,
)
X_train, X_valid, y_train, y_valid = split_parts

In [None]:
# delete the id column, which provides no useful information and may pollute the model
# Reassign instead of using inplace=True: X_train / X_valid come out of train_test_split,
# and mutating such derived frames in place can trigger pandas' SettingWithCopyWarning.
# errors="ignore" makes the cell safe to re-run once the column is already gone
# (the in-place version raised KeyError on a second run).
X_train = X_train.drop(columns="Id", errors="ignore")
X_valid = X_valid.drop(columns="Id", errors="ignore")

In [None]:
# keep only the columns whose non-null count exceeds 30% of the training rows,
# i.e. filter out the columns with >70% of missing values
min_non_null = 0.3 * X_train.shape[0]
non_null_counts = X_train.notnull().sum()
kept_columns = non_null_counts[non_null_counts > min_non_null].index
X_train = X_train[kept_columns]
# the validation frame is reduced to exactly the columns that survived on the training frame
X_valid = X_valid[X_train.columns]

In [None]:
# partition the column names by dtype: int64/float64 are numerical, object is categorical
numeric_dtypes = ["int64", "float64"]
categorical_dtypes = ["object"]
numerical_cols = X_train.select_dtypes(include=numeric_dtypes).columns
categorical_cols = X_train.select_dtypes(include=categorical_dtypes).columns

In [None]:
# bucket the categorical columns by their number of distinct (non-null) training values:
#   <= 3 distinct values  -> low cardinality
#   4 to 9 distinct values -> high cardinality
# columns with 10 or more distinct values land in neither list
unique_counts = X_train[categorical_cols].nunique()
low_cardinality_cols = [
    name for name, n_unique in unique_counts.items() if n_unique <= 3
]
high_cardinality_cols = [
    name for name, n_unique in unique_counts.items() if 4 <= n_unique < 10
]

In [None]:
# the columns kept from here on: all numerical ones followed by the two categorical buckets
# (anything not in one of these three groups is dropped)
final_columns = [*numerical_cols, *low_cardinality_cols, *high_cardinality_cols]

In [None]:
# restrict both splits to the selected columns, in the same order
X_train, X_valid = (frame[final_columns] for frame in (X_train, X_valid))

In [None]:
# pipeline for numerical columns: fill gaps with the median, then rescale with MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler

numerical_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', MinMaxScaler()),
]
numerical_transformer = Pipeline(steps=numerical_steps)

In [None]:
# pipelines for categorical columns
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

# low cardinality: impute the most frequent value, then one-hot encode into a dense array;
# categories not seen during fit are ignored at transform time
onehot_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
low_cardinality_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', onehot_encoder),
]
low_cardinality_transformer = Pipeline(steps=low_cardinality_steps)

# high cardinality: impute the most frequent value, then map each category to an integer;
# categories not seen during fit are encoded as -1
ordinal_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
high_cardinality_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ordinal', ordinal_encoder),
]
high_cardinality_transformer = Pipeline(steps=high_cardinality_steps)

In [None]:
# bundle preprocessing: route each column group to its own transformer
from sklearn.compose import ColumnTransformer

column_routes = [
    ('num', numerical_transformer, numerical_cols),
    ('ohe', low_cardinality_transformer, low_cardinality_cols),
    ('ord', high_cardinality_transformer, high_cardinality_cols),
]
preprocessor = ColumnTransformer(transformers=column_routes)

In [None]:
# prepare the data for pytorch
import torch
from sklearn.preprocessing import StandardScaler

# Fit the column-wise preprocessing on the training split only, then reuse it on validation.
# The outputs get their own names instead of overwriting X_train / X_valid: the
# ColumnTransformer selects columns by name, so re-running this cell after the frames had
# been replaced by plain arrays would fail. Keeping the inputs intact makes the cell re-runnable.
X_train_prepared = preprocessor.fit_transform(X_train)
X_valid_prepared = preprocessor.transform(X_valid)

# standardise every preprocessed feature using statistics from the training split
scaler_X = StandardScaler()
X_train_scaled = scaler_X.fit_transform(X_train_prepared)
X_valid_scaled = scaler_X.transform(X_valid_prepared)

# standardise the target as well; StandardScaler needs a 2-D column, hence reshape(-1, 1).
# ravel() flattens back to 1-D and, unlike squeeze(), stays 1-D even for a single row.
# scaler_Y is kept so predictions can be mapped back to the original target scale.
scaler_Y = StandardScaler()
y_train_scaled = scaler_Y.fit_transform(y_train.values.reshape(-1, 1)).ravel()
y_valid_scaled = scaler_Y.transform(y_valid.values.reshape(-1, 1)).ravel()

In [None]:
# Principal Component Analysis in action
from sklearn.decomposition import PCA

# fraction of the variance the retained components must explain
explained_variance_target = 0.95

# fit on the training features only, then project the validation features onto the same components
pca = PCA(n_components=explained_variance_target)
X_train_pca = pca.fit_transform(X_train_scaled)
X_valid_pca = pca.transform(X_valid_scaled)

In [None]:
# copy the four numpy arrays into float32 torch tensors
X_train_tensor, X_valid_tensor, y_train_tensor, y_valid_tensor = (
    torch.tensor(array, dtype=torch.float32)
    for array in (X_train_pca, X_valid_pca, y_train_scaled, y_valid_scaled)
)

In [None]:
# linear regression model using SGD
from torch.utils.data import TensorDataset, DataLoader
import torch.nn as nn

# seed torch so the weight initialisation and the batch shuffling are reproducible
torch.manual_seed(0)

# define the dataset for training
train_ds = TensorDataset(X_train_tensor, y_train_tensor)

# define the data loader
bs = 8 # batch size
train_dl = DataLoader(train_ds, bs, shuffle=True)

# define model: one linear layer mapping the PCA components to a single output
model = nn.Linear(X_train_pca.shape[1], 1)

# define optimizer
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

# define loss function as Mean Square Error
loss_fn = nn.MSELoss()

# the training loop
num_epochs = 10
for epoch in range(num_epochs):
    # x_batch / y_batch (rather than x / y) so the notebook-level target `y` is not overwritten
    for x_batch, y_batch in train_dl:
        # start every step from clean gradients
        optimizer.zero_grad()

        # produce the predictions; squeeze(-1) drops only the output dimension, so a
        # batch containing a single row still matches the shape of y_batch
        predictions = model(x_batch).squeeze(-1)

        # obtain the loss
        loss = loss_fn(predictions, y_batch)

        # compute the gradients
        loss.backward()

        # clip the gradients so that they dont explode; this has to happen after
        # backward() (when the gradients exist) and before step() (when they are used)
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

        # update the model parameters
        optimizer.step()

    # report the loss over the whole training set once per epoch
    with torch.no_grad():
        predictions = model(X_train_tensor).squeeze(-1)
        epoch_loss = loss_fn(predictions, y_train_tensor)
        print(f'Epoch: {epoch} Loss: {epoch_loss.item()}')

# validation loss (on the standardised target scale); no gradients are needed for evaluation
with torch.no_grad():
    loss = loss_fn(model(X_valid_tensor).squeeze(-1), y_valid_tensor)
print(loss.item())

In [None]:
# get the predictions on the test dataset
housing_test = pd.read_csv("/kaggle/input/house-prices-advanced-regression-techniques/test.csv")

# push the test rows through the transformers that were already fitted:
# column preprocessing -> standard scaling -> PCA projection
X_test = preprocessor.transform(housing_test[final_columns])
X_test_scaled = scaler_X.transform(X_test)
X_test_pca = pca.transform(X_test_scaled)
X_test_tensor = torch.tensor(X_test_pca, dtype=torch.float32)

# set the model to evaluation mode
model.eval()

# inference only, so gradient tracking is switched off
with torch.no_grad():
    predictions = model(X_test_tensor).squeeze()

In [None]:
# submit the predictions
# The model was trained on the target standardised by scaler_Y, so its raw outputs are on
# that standardised scale. Map them back to the original SalePrice scale before writing.
# inverse_transform expects a 2-D column, hence reshape(-1, 1); ravel() flattens the result.
sale_price_predictions = scaler_Y.inverse_transform(
    predictions.numpy().reshape(-1, 1)
).ravel()

submission = pd.DataFrame({
    "Id": housing_test["Id"],
    "SalePrice": sale_price_predictions
})

submission.to_csv("submission.csv", index=False)