In [40]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
!cp /content/drive/MyDrive/2-folder/kaggle/df_utils.py /content/
import df_utils

In [None]:
# 1 - Preparing the Data:
# The first step is to prepare the data for modeling.
# This entails identifying the relevant features, cleaning the data,
# and dividing it into training and validation sets.

In [54]:
# Load the Kaggle housing-prices competition data from Google Drive.
# The directory is named once so both reads stay in sync if the data moves.
DATA_DIR = '/content/drive/MyDrive/2-folder/kaggle/housing-prices-competition'
df_train = pd.read_csv(f'{DATA_DIR}/train.csv')
df_test = pd.read_csv(f'{DATA_DIR}/test.csv')

num_col = len(df_train.columns)
# there are 81 columns
print(f"num_col = {num_col}")
print(df_train["SalePrice"][:5])

# List the columns in which at least MISSING_THRESHOLD_PCT percent of the
# training rows are null. Nothing is dropped here: this cell only reports the
# candidates, and df_train / df_test are left unmodified.
MISSING_THRESHOLD_PCT = 20
percent_missing = df_train.isnull().sum() * 100 / len(df_train)
missing_value_df = pd.DataFrame({'column_name': df_train.columns,
                                 'percent_missing': percent_missing})
# Keep only the rows (one per column of df_train) at or above the threshold.
missing_value_df = missing_value_df.loc[
    missing_value_df['percent_missing'] >= MISSING_THRESHOLD_PCT
]
print(missing_value_df)

num_col = 81
0    208500
1    181500
2    223500
3    140000
4    250000
Name: SalePrice, dtype: int64
             column_name  percent_missing
Alley              Alley        93.767123
MasVnrType    MasVnrType        59.726027
FireplaceQu  FireplaceQu        47.260274
PoolQC            PoolQC        99.520548
Fence              Fence        80.753425
MiscFeature  MiscFeature        96.301370


In [None]:
# 2 - Model Selection:
# The following step is to choose the base models
# that will be used in the stacking ensemble.
# A broad selection of models is typically chosen to guarantee
# that they produce different types of errors and complement one another.

In [None]:
# 3 - Training the Base Models:
# After selecting the base models, they are trained on the training set.
# To ensure diversity, each model is trained using a different algorithm
# or set of hyperparameters.

In [None]:
# 4 - Predictions on the Validation Set:
# Once the base models have been trained,
# they are used to make predictions on the validation set.

In [None]:
# 5 - Developing a Meta Model:
# The next stage is to develop a meta-model, also known as a meta learner,
# which will take the predictions of the underlying models as input
# and make the final prediction. Any algorithm, such as linear regression,
# logistic regression, or even a neural network, can be used to create this model.

In [None]:
# 6 - Training the Meta Model:
# The meta-model is then trained using the predictions given by
# the base models on the validation set. The base models’ predictions
# serve as features for the meta-model.

In [None]:
# 7 - Making Test Set Predictions:
# Finally, the meta-model is used to produce test set predictions.
# The base models’ predictions on the test set are fed into the meta-model,
# which then makes the final prediction.

In [None]:
# 8 - Model Evaluation: The final stage is to assess
# the stacking ensemble’s performance. This is accomplished
# by comparing the stacking ensemble’s predictions to the actual values
# on held-out data whose targets are known. Because SalePrice is a
# continuous target, use regression measures such as RMSE, MAE, or R^2
# rather than classification measures (accuracy, precision, recall, F1 score).

In [None]:
# import pandas as pd
# from sklearn.model_selection import train_test_split
# from sklearn.ensemble import RandomForestRegressor
# from sklearn.metrics import mean_squared_error
# from sklearn.preprocessing import OneHotEncoder
# from sklearn.impute import SimpleImputer
# from sklearn.compose import ColumnTransformer
# from sklearn.pipeline import Pipeline

# # Load data
# train = pd.read_csv('train.csv')
# test = pd.read_csv('test.csv')

# # Separate features and target
# X = train.drop(['Id', 'SalePrice'], axis=1)
# y = train['SalePrice']

# # Split into training and validation sets
# X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=0)

# # Preprocessing for numerical and categorical data
# numerical_cols = [col for col in X_train.columns if X_train[col].dtype in ['int64', 'float64']]
# categorical_cols = [col for col in X_train.columns if X_train[col].dtype == 'object']

# numerical_transformer = SimpleImputer(strategy='mean')
# categorical_transformer = Pipeline(steps=[
#     ('imputer', SimpleImputer(strategy='most_frequent')),
#     ('onehot', OneHotEncoder(handle_unknown='ignore'))
# ])

# preprocessor = ColumnTransformer(
#     transformers=[
#         ('num', numerical_transformer, numerical_cols),
#         ('cat', categorical_transformer, categorical_cols)
#     ])

# # Define the model
# model = RandomForestRegressor(n_estimators=100, random_state=0)

# # Create and evaluate pipeline
# clf = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])
# clf.fit(X_train, y_train)
# preds = clf.predict(X_valid)
# rmse = mean_squared_error(y_valid, preds, squared=False)

# print(f'Validation RMSE: {rmse}')

# # Prepare test data and make predictions for submission
# test_preds = clf.predict(test.drop(['Id'], axis=1))
# output = pd.DataFrame({'Id': test['Id'], 'SalePrice': test_preds})
# output.to_csv('submission.csv', index=False)