In [None]:
# ----------------------------
# IMPORTAÇÃO E CONFIGURAÇÕES
# ----------------------------
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_score
from scipy.stats import skew
from scipy.special import boxcox1p

from sklearn.linear_model import Lasso, ElasticNet
from sklearn.kernel_ridge import KernelRidge
from sklearn.ensemble import GradientBoostingRegressor
import xgboost as xgb
import lightgbm as lgb

warnings.filterwarnings('ignore')
sns.set(style="darkgrid")
%matplotlib inline

# ----------------------------
# DATA LOADING
# ----------------------------
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

# Keep the raw target plus a log1p-transformed copy. The models are fitted on
# the log target and the final predictions are mapped back with np.expm1.
y_train = train_df['SalePrice']
y_train_log = np.log1p(y_train)
train_ID = train_df['Id']
test_ID = test_df['Id']

# Build the combined feature table from copies instead of dropping in place:
# train_df must keep 'SalePrice' because the exploratory analysis below reads
# it (dropping it in place is what produced "KeyError: 'SalePrice'").
# 'Id' is kept separately in train_ID / test_ID and written to the submission,
# so it is excluded from the model features.
train_features = train_df.drop(columns=['SalePrice', 'Id'])
test_features = test_df.drop(columns=['Id'])

all_data = pd.concat([train_features, test_features], axis=0).reset_index(drop=True)

# ----------------------------
# EDA – SALEPRICE DISTRIBUTION
# ----------------------------
# Work on a frame that is guaranteed to carry the target. Re-attaching y_train
# (taken from train_df, so the indexes match) makes this section independent
# of whether 'SalePrice' is still a column of train_df at this point.
eda_df = train_df.assign(SalePrice=y_train)

sns.histplot(eda_df['SalePrice'], kde=True)
plt.title('Distribuição Original de SalePrice')
plt.show()
print(f"SKewness: {eda_df['SalePrice'].skew()}")
# This line reports kurtosis; it was previously printed under the skewness label.
print(f"Kurtosis: {eda_df['SalePrice'].kurt()}")

# Pairwise correlations between the numeric columns, shown as a heatmap,
# followed by the 15 columns most positively correlated with the target.
correlation_matrix = eda_df.corr(numeric_only=True)
plt.figure(figsize=(12, 10))
sns.heatmap(correlation_matrix, annot=False, cmap='coolwarm')
plt.show()
print(correlation_matrix['SalePrice'].sort_values(ascending=False).head(15))

# Example with OverallQual
plt.figure(figsize=(8, 6))
sns.boxplot(x='OverallQual', y='SalePrice', data=eda_df)
plt.title('SalePrice vs OverallQual')
plt.show()

plt.figure(figsize=(8, 6))
sns.scatterplot(x=eda_df['GrLivArea'], y=eda_df['SalePrice'])
plt.title('GrLivArea vs SalePrice')
plt.show()

def _report_missing(df, title, header=None):
    """Print the columns of *df* that have missing values and plot a null map.

    Args:
        df: DataFrame to inspect.
        title: Title of the heatmap figure.
        header: Optional line printed before the summary table.

    Returns:
        DataFrame indexed by column name with the number ('Total') and the
        fraction ('Percentual') of missing entries for every column of *df*.
    """
    null_mask = df.isnull()
    total = null_mask.sum().sort_values(ascending=False)
    percent = (null_mask.sum() / null_mask.count()).sort_values(ascending=False)
    summary = pd.concat([total, percent], axis=1, keys=['Total', 'Percentual'])

    if header is not None:
        print(header)
    # Only columns that actually contain missing values are printed.
    print(summary[summary['Total'] > 0])

    plt.figure(figsize=(12, 6))
    sns.heatmap(null_mask, cbar=False, yticklabels=False, cmap='viridis')
    plt.title(title)
    plt.show()
    return summary


# The train report used to be computed and plotted twice in a row; it is now
# produced once for each dataset through the helper above.

# Train Dataset
missing_data_train = _report_missing(
    train_df,
    title='Visualização de Dados Faltantes no Treino',
    header="Dados faltantes no conjunto de treino:",
)
# Alias kept for code that still refers to the first (train) summary by this name.
missing_data = missing_data_train

# Test Dataset
missing_data_test = _report_missing(
    test_df,
    title='Visualização de Dados Faltantes no Teste',
    header="\nDados faltantes no conjunto de teste:",
)




# ----------------------------
# MISSING-VALUE HANDLING
# ----------------------------
# Columns whose missing entries are replaced by the literal string "None".
none_fill_cols = [
    'PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu',
    'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond',
    'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
    'MasVnrType',
]
# Columns whose missing entries are replaced by 0.
zero_fill_cols = [
    'GarageYrBlt', 'GarageArea', 'GarageCars',
    'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
    'BsmtFullBath', 'BsmtHalfBath', 'MasVnrArea',
]

constant_fills = {
    **dict.fromkeys(none_fill_cols, "None"),
    **dict.fromkeys(zero_fill_cols, 0),
}
for column, filler in constant_fills.items():
    all_data[column] = all_data[column].fillna(filler)

# LotFrontage: fill each gap with the median LotFrontage of the same Neighborhood.
frontage_by_neighborhood = all_data.groupby("Neighborhood")["LotFrontage"]
all_data["LotFrontage"] = frontage_by_neighborhood.transform(
    lambda group: group.fillna(group.median())
)

# Remaining categorical gaps: use the most frequent value of the column.
mode_fill_cols = ['MSZoning', 'Electrical', 'KitchenQual', 'Exterior1st',
                  'Exterior2nd', 'SaleType', 'Functional']
for column in mode_fill_cols:
    most_frequent = all_data[column].mode()[0]
    all_data[column] = all_data[column].fillna(most_frequent)

# 'Utilities' is removed from the feature table altogether.
all_data = all_data.drop(columns=['Utilities'])

# ----------------------------
# ENCODING AND TRANSFORMATIONS
# ----------------------------
# Numeric-looking columns that are cast to strings so they are handled as
# categories rather than as quantities.
num_to_cat = ['MSSubClass', 'OverallCond', 'YrSold', 'MoSold']
for col in num_to_cat:
    all_data[col] = all_data[col].astype(str)

ord_cols = ['ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'HeatingQC',
            'KitchenQual', 'FireplaceQu', 'GarageQual', 'GarageCond', 'PoolQC']

# Explicit ordinal scale for the quality/condition grades. LabelEncoder assigns
# codes in alphabetical order (Ex=0, Fa=1, Gd=2, None=3, Po=4, TA=5), which
# scrambles the ranking and gives linear models a meaningless numeric axis.
# NOTE(review): the Po < Fa < TA < Gd < Ex ordering follows the dataset's data
# description; "None" is the filler used for absent features — confirm.
quality_scale = {'None': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5}

for col in ord_cols:
    grades = all_data[col].astype(str)
    unexpected = sorted(set(grades.unique()) - set(quality_scale))
    if unexpected:
        # Fail loudly instead of silently producing NaN codes.
        raise ValueError(
            f"Column {col!r} contains grades outside the quality scale: {unexpected}"
        )
    all_data[col] = grades.map(quality_scale).astype(int)

# ----------------------------
# FEATURE ENGINEERING
# ----------------------------
# Total area: basement plus first and second floor.
all_data['TotalSF'] = (
    all_data['TotalBsmtSF']
    .add(all_data['1stFlrSF'])
    .add(all_data['2ndFlrSF'])
)

skew_threshold = 0.75   # absolute skewness above which a column is transformed
boxcox_lambda = 0.15    # lambda passed to boxcox1p

# Every column whose dtype is not "object" is treated as numeric.
is_non_object = (all_data.dtypes != "object").to_numpy()
numeric_feats = all_data.columns[is_non_object]

# Skewness per numeric column (missing values ignored), largest first.
skewed_feats = pd.Series(
    {feat: skew(all_data[feat].dropna()) for feat in numeric_feats}
).sort_values(ascending=False)

exceeds_threshold = skewed_feats.abs() > skew_threshold
skewed = skewed_feats[exceeds_threshold].index

# Apply the Box-Cox transform of 1 + x to each strongly skewed column.
for feat in skewed:
    all_data[feat] = boxcox1p(all_data[feat], boxcox_lambda)

# One-hot encode the remaining categorical columns.
all_data = pd.get_dummies(all_data)

# ----------------------------
# TRAIN / TEST SEPARATION
# ----------------------------
# all_data holds the training rows first, followed by the test rows, so the
# split point is the number of training targets.
n_train = len(y_train)
X_train = all_data.iloc[:n_train]
X_test = all_data.iloc[n_train:]

# ----------------------------
# VALIDATION FUNCTION
# ----------------------------
def rmsle_cv(model, X=None, y=None, cv=5):
    """Return the mean cross-validated RMSE of *model* on the log target.

    Because the target is log1p-transformed, the RMSE measured here is the
    RMSLE of the corresponding price predictions.

    Args:
        model: Unfitted scikit-learn compatible regressor.
        X: Feature matrix. Defaults to the module-level ``X_train``.
        y: Target vector, already log-transformed. Defaults to
            ``np.log1p(y_train)``; the function previously relied on a global
            ``y_train_log`` that was never defined.
        cv: Number of folds (or any ``cv`` value accepted by
            ``cross_val_score``). Defaults to 5.

    Returns:
        The RMSE averaged over the folds, as a float.
    """
    if X is None:
        X = X_train
    if y is None:
        y = np.log1p(y_train)
    # The scorer returns negated RMSE values, so flip the sign back.
    fold_rmse = -cross_val_score(model, X, y,
                                 scoring="neg_root_mean_squared_error", cv=cv)
    return fold_rmse.mean()

# ----------------------------
# MODELS TO COMPARE
# ----------------------------
# Each candidate is built separately and then collected under its display name.
lasso_model = Lasso(alpha=0.0005, random_state=1)

elastic_net_model = ElasticNet(alpha=0.0005, l1_ratio=0.9, random_state=3)

kernel_ridge_model = KernelRidge(alpha=0.6, kernel='polynomial', degree=2, coef0=2.5)

gradient_boosting_model = GradientBoostingRegressor(
    n_estimators=3000,
    learning_rate=0.05,
    max_depth=4,
    max_features='sqrt',
    min_samples_leaf=15,
    min_samples_split=10,
    loss='huber',
    random_state=5,
)

xgboost_model = xgb.XGBRegressor(
    learning_rate=0.05,
    n_estimators=3000,
    max_depth=4,
    subsample=0.7,
    colsample_bytree=0.7,
    random_state=7,
)

lightgbm_model = lgb.LGBMRegressor(
    objective='regression',
    num_leaves=5,
    learning_rate=0.05,
    n_estimators=3000,
    max_bin=55,
    bagging_fraction=0.8,
    bagging_freq=5,
    feature_fraction=0.2319,
    feature_fraction_seed=9,
    bagging_seed=9,
    min_data_in_leaf=6,
    min_sum_hessian_in_leaf=11,
)

models = {
    "Lasso": lasso_model,
    "ElasticNet": elastic_net_model,
    "Kernel Ridge": kernel_ridge_model,
    "Gradient Boosting": gradient_boosting_model,
    "XGBoost": xgboost_model,
    "LightGBM": lightgbm_model,
}

# Report the cross-validated score of every candidate, in insertion order.
print("RMSLE (validação cruzada 5-fold):")
for model_name, estimator in models.items():
    cv_rmsle = rmsle_cv(estimator)
    print(f"{model_name}: {cv_rmsle:.5f}")

# ----------------------------
# FINAL TRAINING WITH THE BEST MODEL (e.g. Lasso)
# ----------------------------
final_model = models["Lasso"]
# Fit on the log1p-transformed target; computed here so this step does not
# depend on a separately defined log-target variable.
final_model.fit(X_train, np.log1p(y_train))
# Undo the log1p transform to get predictions back on the price scale.
preds = np.expm1(final_model.predict(X_test))

# The original assigned an undefined `lasso_pred` to a misspelled name, so the
# submission step failed; use the predictions computed above.
final_predictions = preds

# ----------------------------
# SUBMISSION FILE
# ----------------------------
submission = pd.DataFrame()
submission['Id'] = test_ID
submission['SalePrice'] = final_predictions
submission.to_csv('submission_house_pricess.csv', index=False)


KeyError: 'SalePrice'