In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
!cp /content/drive/MyDrive/2-folder/kaggle/df_utils.py /content/
import df_utils

In [None]:
# 1 - Preparing the Data:
# The first step is to prepare the data for modeling.
# This entails identifying the relevant features, cleaning the data,
# and dividing it into training and validation sets.

In [None]:
# Read the competition CSVs; the *_loaded frames are kept as read so the
# original Id column remains available to later cells.
df_train_loaded = pd.read_csv('/content/drive/MyDrive/2-folder/kaggle/housing-prices-competition/train.csv')
df_test_loaded = pd.read_csv('/content/drive/MyDrive/2-folder/kaggle/housing-prices-competition/test.csv')

# Working copies without the row identifier
df_train = df_train_loaded.drop(columns=['Id'])
df_test = df_test_loaded.drop(columns=['Id'])

# Percentage of missing values per training column. The filtered table
# (>= 20% missing) is only built for inspection; it does not drive the drop below.
missing_counts = df_train.isnull().sum()
percent_missing = missing_counts * 100 / len(df_train)
missing_value_df = pd.DataFrame(
    {'column_name': df_train.columns, 'percent_missing': percent_missing}
)
missing_value_df = missing_value_df[missing_value_df['percent_missing'] >= 20]

# Hard-coded list of columns to discard from both frames
# (presumably the ones reported in missing_value_df -- TODO confirm against the table).
sparse_columns = ['Alley', 'MasVnrType', 'FireplaceQu', 'PoolQC', 'Fence', 'MiscFeature']
df_train = df_train.drop(columns=sparse_columns)
df_test = df_test.drop(columns=sparse_columns)

# Restrict the training frame to int64/float64 columns
df_train = df_train.select_dtypes(include=['int64', 'float64'])

# Correlation of every numeric column with the target, strongest first
correlation = df_train.corr()
sorted_corr = correlation['SalePrice'].sort_values(ascending=False)

# Keep features whose correlation with SalePrice exceeds 0.3 (the target itself excluded)
is_feature = sorted_corr.index != 'SalePrice'
strong_corr = sorted_corr[(sorted_corr > 0.3) & is_feature]
for feature_name, corr_value in strong_corr.items():
    print('index: ', feature_name, 'value: ', corr_value)
columns = list(strong_corr.index)

# Separate the target from the training features
y = df_train['SalePrice'].values
df_train = df_train.drop(columns=['SalePrice'])

# Keep only the feature columns listed in `columns`, in both frames
df_train = df_train[columns]
df_test = df_test[columns]
print(df_train.head(5))

# Fill missing values with the per-column MEAN of the training data.
# The same training statistics are applied to the test frame so both sets are
# imputed consistently, and a test column that is entirely missing still gets a value.
train_means = df_train.mean()
df_train = df_train.fillna(train_means)
df_test = df_test.fillna(train_means)
print(y.shape)
print(df_train.shape)
print(df_test.shape)

# Feature matrix for training (y already holds the full target vector)
X = df_train.values

# Hold out 10% of the labelled rows for validation; fixed seed keeps the split repeatable
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, random_state=42)

index:  OverallQual value:  0.7909816005838053
index:  GrLivArea value:  0.7086244776126515
index:  GarageCars value:  0.6404091972583519
index:  GarageArea value:  0.6234314389183622
index:  TotalBsmtSF value:  0.6135805515591943
index:  1stFlrSF value:  0.6058521846919153
index:  FullBath value:  0.5606637627484453
index:  TotRmsAbvGrd value:  0.5337231555820284
index:  YearBuilt value:  0.5228973328794967
index:  YearRemodAdd value:  0.5071009671113866
index:  GarageYrBlt value:  0.4863616774878596
index:  MasVnrArea value:  0.47749304709571444
index:  Fireplaces value:  0.46692883675152763
index:  BsmtFinSF1 value:  0.3864198062421535
index:  LotFrontage value:  0.35179909657067737
index:  WoodDeckSF value:  0.32441344456812926
index:  2ndFlrSF value:  0.31933380283206736
index:  OpenPorchSF value:  0.31585622711605504
   OverallQual  GrLivArea  GarageCars  GarageArea  TotalBsmtSF  1stFlrSF  \
0            7       1710           2         548          856       856   
1            

In [None]:
# 2 - Model Selection:
# The following step is to choose the base models
# that will be used in the stacking ensemble.
# A broad selection of models is typically chosen to guarantee
# that they produce different types of errors and complement one another.

In [None]:
# 3 - Training the Base Models:
# After selecting the base models, they are trained on the training set.
# To ensure diversity, each model is trained using a different algorithm
# or set of hyperparameters.

In [None]:
# Import the required libraries
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# # Carica i dati
# train = pd.read_csv("train.csv")
# test = pd.read_csv("test.csv")

# # Salva gli ID per la sottomissione
# test_ids = test["Id"]

# # Rimuovi la colonna 'Id' dai dati
# train.drop("Id", axis=1, inplace=True)
# test.drop("Id", axis=1, inplace=True)

# # Separazione delle feature e del target
# X = train.drop("SalePrice", axis=1)
# y = train["SalePrice"]

# # Identifica colonne numeriche e categoriche
# numerical_cols = X.select_dtypes(include=["int64", "float64"]).columns
# categorical_cols = X.select_dtypes(include=["object"]).columns

# # Preprocessing dei dati numerici: sostituzione dei valori mancanti
# num_imputer = SimpleImputer(strategy="median")
# X[numerical_cols] = num_imputer.fit_transform(X[numerical_cols])
# test[numerical_cols] = num_imputer.transform(test[numerical_cols])

# # Preprocessing dei dati categorici: sostituzione dei valori mancanti e codifica one-hot
# cat_imputer = SimpleImputer(strategy="most_frequent")
# X[categorical_cols] = cat_imputer.fit_transform(X[categorical_cols])
# test[categorical_cols] = cat_imputer.transform(test[categorical_cols])

# encoder = OneHotEncoder(handle_unknown="ignore", sparse=False)
# X_encoded = pd.DataFrame(encoder.fit_transform(X[categorical_cols]))
# test_encoded = pd.DataFrame(encoder.transform(test[categorical_cols]))

# # Mantieni gli indici originali per l'unione
# X_encoded.index = X.index
# test_encoded.index = test.index

# # Rimuovi le colonne categoriche originali e aggiungi quelle codificate
# X = X.drop(categorical_cols, axis=1)
# test = test.drop(categorical_cols, axis=1)
# X = pd.concat([X, X_encoded], axis=1)
# test = pd.concat([test, test_encoded], axis=1)

# Split the data into training and validation sets
# X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the XGBoost base model
model_0 = XGBRegressor(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=4,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)

model_0.fit(X_train, y_train)

# Evaluate the model on the validation split
y_pred = model_0.predict(X_val)
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
print(f"Validation RMSE: {rmse:.2f}")

print(type(y_pred))
print(y_pred.shape)
print(y_pred[:5])
# Show the validation targets next to the validation predictions
# (the training targets are not aligned with y_pred).
print(y_val[:5])
# Turn the 1-D prediction vector into a single column so it can be
# concatenated with other models' predictions along axis 1.
y_pred = np.expand_dims(y_pred, axis=1)
print(y_pred[:5])
print(y_pred.shape)

# Validation-set predictions of this base model, kept for the meta-model
model0_pred = y_pred

# Train the model on all the data and make predictions on the test set
# model.fit(X, y)
# predictions = model.predict(test)

# Create the submission file
# submission = pd.DataFrame({"Id": test_ids, "SalePrice": predictions})
# submission.to_csv("submission.csv", index=False)
# print("File di sottomissione creato: submission.csv")


Validation RMSE: 28161.58
<class 'numpy.ndarray'>
(146,)
[142370.94 312801.12 114955.88 162205.28 317470.34]
[250000 187100 133900  67000 137500]
[[142370.94]
 [312801.12]
 [114955.88]
 [162205.28]
 [317470.34]]
(146, 1)


In [None]:
# Wrap the train/validation arrays as float32 tensors.
# Feature matrices are converted as they are; target vectors get an extra
# trailing dimension via unsqueeze(1).
X_train_tensor, X_valid_tensor = (
    torch.tensor(features, dtype=torch.float32) for features in (X_train, X_val)
)
y_train_tensor, y_valid_tensor = (
    torch.tensor(targets, dtype=torch.float32).unsqueeze(1) for targets in (y_train, y_val)
)
# test_tensor = torch.tensor(test.values, dtype=torch.float32)

# Define the model
class HousePriceModel(nn.Module):
    """Fully connected regressor: input_size -> 128 -> 64 -> 1.

    Each hidden Linear layer is followed by ReLU and Dropout(0.2); the final
    Linear layer emits one value per input row.
    """

    def __init__(self, input_size):
        super().__init__()
        # Layers are created in network order and stored under `fc`.
        layers = [
            nn.Linear(input_size, 128),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(64, 1),
        ]
        self.fc = nn.Sequential(*layers)

    def forward(self, x):
        """Run x through the layer stack and return its output."""
        output = self.fc(x)
        return output

# Seed PyTorch so weight initialisation and dropout masks are repeatable
# from run to run.
torch.manual_seed(42)

# Initialize the model
input_size = X_train.shape[1]
model_1 = HousePriceModel(input_size)

# Define loss and optimizer
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model_1.parameters(), lr=0.001)

# Training loop: one optimizer step per epoch on the whole training tensor
epochs = 3000
log_every = 20
for epoch in range(epochs):
    model_1.train()
    optimizer.zero_grad()
    predictions = model_1(X_train_tensor)
    loss = criterion(predictions, y_train_tensor)
    loss.backward()
    optimizer.step()

    # The validation loss is only reported, never used for training,
    # so it is computed just for the epochs that are logged.
    if epoch % log_every == 0:
        model_1.eval()
        with torch.no_grad():
            val_predictions = model_1(X_valid_tensor)
            val_loss = criterion(val_predictions, y_valid_tensor)
        print(f"Epoch {epoch}/{epochs}, Train Loss: {loss.item():.4f}, Validation Loss: {val_loss.item():.4f}")

# Evaluate on validation set (eval() disables dropout; no_grad() skips autograd tracking)
model_1.eval()
with torch.no_grad():
    val_predictions = model_1(X_valid_tensor)
val_predictions = val_predictions.numpy()
rmse = np.sqrt(mean_squared_error(y_val, val_predictions))
print(f"Validation RMSE: {rmse:.2f}")

print(type(val_predictions))
print(val_predictions.shape)
print(val_predictions[:5])
# Show the validation targets next to the validation predictions
# (the training targets are not aligned with val_predictions).
print(y_val[:5])

# Validation-set predictions of this base model, kept for the meta-model
model1_pred = val_predictions

Epoch 0/3000, Train Loss: 38580862976.0000, Validation Loss: 42474012672.0000
Epoch 20/3000, Train Loss: 37700169728.0000, Validation Loss: 41515839488.0000
Epoch 40/3000, Train Loss: 34902269952.0000, Validation Loss: 38484254720.0000
Epoch 60/3000, Train Loss: 28025366528.0000, Validation Loss: 31052630016.0000
Epoch 80/3000, Train Loss: 16556163072.0000, Validation Loss: 18717530112.0000
Epoch 100/3000, Train Loss: 5928761856.0000, Validation Loss: 7334825472.0000
Epoch 120/3000, Train Loss: 3459739904.0000, Validation Loss: 4317097984.0000
Epoch 140/3000, Train Loss: 3485360896.0000, Validation Loss: 4232840192.0000
Epoch 160/3000, Train Loss: 3285896448.0000, Validation Loss: 4248665088.0000
Epoch 180/3000, Train Loss: 3323302144.0000, Validation Loss: 4127312640.0000
Epoch 200/3000, Train Loss: 3461856000.0000, Validation Loss: 4013596928.0000
Epoch 220/3000, Train Loss: 3275376384.0000, Validation Loss: 3914719744.0000
Epoch 240/3000, Train Loss: 3301137152.0000, Validation Loss

In [None]:
# 4 - Predictions on the Validation Set:
# Once the base models have been trained,
# they are used to make predictions on the validation set.

In [None]:
# 5 - Developing a Meta Model:
# The next stage is to develop a meta-model, also known as a meta learner,
# which will take the predictions of the underlying models as input
# and make the final prediction. Any algorithm, such as linear regression,
# logistic regression, or even a neural network, can be used to create this model.

# Peek at each base model's validation predictions
for base_pred in (model0_pred, model1_pred):
    print(base_pred[:5])

# Place the two prediction columns side by side: one row per validation
# sample, one column per base model.
X_meta = np.hstack([model0_pred, model1_pred])
print(X_meta[:5])
print(X_meta.shape)

# Meta-model inputs are the stacked predictions; targets are the validation labels
X_train_meta = torch.tensor(X_meta, dtype=torch.float32)
# X_test_meta = torch.tensor(X_test, dtype=torch.float32)
y_train_meta = torch.tensor(y_val, dtype=torch.float32).unsqueeze(1)
# y_test_meta = torch.tensor(y_test, dtype=torch.float32).unsqueeze(1)

# Define the model
class MetaModel(nn.Module):
    """Meta-learner network: input_size -> 128 -> 64 -> 1.

    Hidden layers use ReLU followed by Dropout(0.2); the last Linear layer
    produces a single output per row.
    """

    def __init__(self, input_size):
        super().__init__()
        modules = []
        in_width = input_size
        # Two hidden stages, each: Linear -> ReLU -> Dropout(0.2)
        for out_width in (128, 64):
            modules.extend([nn.Linear(in_width, out_width), nn.ReLU(), nn.Dropout(0.2)])
            in_width = out_width
        # Output head
        modules.append(nn.Linear(in_width, 1))
        self.fc = nn.Sequential(*modules)

    def forward(self, x):
        """Apply the layer stack to x."""
        return self.fc(x)

# Seed PyTorch so weight initialisation and dropout masks are repeatable
# from run to run.
torch.manual_seed(42)

# Initialize the model
input_size = X_train_meta.shape[1]
model_meta = MetaModel(input_size)

# Define loss and optimizer
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model_meta.parameters(), lr=0.001)

# Training loop: one optimizer step per epoch on the whole meta-feature tensor
epochs = 3000
for epoch in range(epochs):
    model_meta.train()
    optimizer.zero_grad()
    predictions = model_meta(X_train_meta)
    loss = criterion(predictions, y_train_meta)
    loss.backward()
    optimizer.step()

# Score the meta-model on the same rows it was fitted on.
# NOTE(review): this is an in-sample ("Train") RMSE, not a held-out estimate.
model_meta.eval()
with torch.no_grad():
    predictions = model_meta(X_train_meta)
    rmse = np.sqrt(mean_squared_error(y_train_meta.numpy(), predictions.numpy()))
print(f"Train RMSE: {rmse:.2f}")

predictions = predictions.numpy()
print(type(predictions))
print(predictions.shape)
print(predictions[:5])
print(y_val[:5])

[[142370.94]
 [312801.12]
 [114955.88]
 [162205.28]
 [317470.34]]
[[134369.72]
 [307358.06]
 [118869.3 ]
 [172219.05]
 [259012.22]]
[[142370.94 134369.72]
 [312801.12 307358.06]
 [114955.88 118869.3 ]
 [162205.28 172219.05]
 [317470.34 259012.22]]
(146, 2)
Train RMSE: 27023.91
<class 'numpy.ndarray'>
(146, 1)
[[144080.58]
 [315481.28]
 [115893.22]
 [164064.45]
 [326956.28]]
[154500 325000 115000 159000 315500]


In [None]:
# 6 - Training the Meta Model:
# The meta-model is then trained using the predictions given by
# the base models on the validation set. The base models’ predictions
# serve as features for the meta-model.

In [None]:
# 7 - Making Test Set Predictions:
# Finally, the meta-model is used to produce test set predictions.
# The basic models’ predictions on the test set are fed into the meta-model,
# which then makes the final prediction.
# NOTE: X is rebound here to the competition test features.
X = df_test.values

# Base model 0 predictions as a single column
pred_0 = model_0.predict(X)
pred_0 = np.expand_dims(pred_0, axis=1)

# Put both networks in inference mode explicitly rather than relying on an
# eval() call left over from an earlier cell; otherwise dropout would stay
# active and randomly perturb the predictions.
model_1.eval()
model_meta.eval()

# no_grad(): no autograd graph is needed for prediction
with torch.no_grad():
    pred_1 = model_1(torch.tensor(X, dtype=torch.float32)).numpy()

print(pred_0[:5])
print(pred_1[:5])
# Stack the two base predictions column-wise, same layout the meta-model was trained on
X_meta = np.concatenate((pred_0, pred_1), axis=1)
print(X_meta[:5])
print(X_meta.shape)
X_meta = torch.tensor(X_meta, dtype=torch.float32)
with torch.no_grad():
    predictions = model_meta(X_meta).numpy()
# Flatten the (N, 1) output to 1-D; unlike np.squeeze this stays 1-D even for a single row
predictions = predictions.reshape(-1)
print(predictions[:5])

# Pair each prediction with the Id column of the test file as originally loaded
predictions = pd.DataFrame({
        "Id": df_test_loaded["Id"],
        "SalePrice": predictions
    })

def make_submission(predictions, path="/content/drive/MyDrive/2-folder/kaggle/housing-prices-competition/submission.csv"):
    """Write a predictions DataFrame to a CSV file without the index column.

    Args:
        predictions: DataFrame to export (anything exposing ``to_csv``).
        path: Destination file. Defaults to the competition folder on Drive,
            so existing single-argument calls behave as before.
    """
    predictions.to_csv(path, index=False)
    print(f"Submission exported to {path}")

# Export the Id/SalePrice predictions frame to the submission CSV
make_submission(predictions)

[[131184.11]
 [157972.42]
 [182508.44]
 [190153.8 ]
 [192972.12]]
[[151100.02]
 [180509.12]
 [188190.72]
 [191802.22]
 [165247.31]]
[[131184.11 151100.02]
 [157972.42 180509.12]
 [182508.44 188190.72]
 [190153.8  191802.22]
 [192972.12 165247.31]]
(1459, 2)
[135464.66 162673.83 183950.14 191535.25 197646.39]
Submission exported to /content/drive/MyDrive/2-folder/kaggle/housing-prices-competition/submission.csv


In [None]:
# 8 - Model Evaluation: The final stage is to assess
# the stacking ensemble’s performance. This is accomplished
# by comparing the stacking ensemble’s predictions to the actual values
# on the test set using evaluation measures suited to the task: accuracy,
# precision, recall, F1 score, and so on for classification, or error
# measures such as RMSE (the metric printed in this notebook) for regression.