In [10]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingRegressor

# ---------- 1. Загрузка данных ----------
train = pd.read_csv("../data/train.csv")
test = pd.read_csv("../data/test.csv")

# целевая переменная в логах (только для train)
train["LogSalePrice"] = np.log1p(train["SalePrice"])

# ---------- 2. Обработка пропусков (ТО ЖЕ САМОЕ, ЧТО В 01_eda) ----------

# колонки, где NaN = "нет объекта"
no_object_columns = [
    "PoolQC","MiscFeature","Alley","Fence","FireplaceQu",
    "GarageType","GarageFinish","GarageQual","GarageCond",
    "BsmtQual","BsmtCond","BsmtExposure","BsmtFinType1","BsmtFinType2",
    "MasVnrType"
]

for col in no_object_columns:
    if col in train.columns:
        train[col] = train[col].fillna("None")
    if col in test.columns:
        test[col] = test[col].fillna("None")

# числовые NaN -> 0
for col in ["GarageYrBlt", "MasVnrArea"]:
    if col in train.columns:
        train[col] = train[col].fillna(0)
    if col in test.columns:
        test[col] = test[col].fillna(0)

# LotFrontage — медиана по Neighborhood (по отдельности для train и test)
if "LotFrontage" in train.columns:
    train["LotFrontage"] = train.groupby("Neighborhood")["LotFrontage"].transform(
        lambda x: x.fillna(x.median())
    )
if "LotFrontage" in test.columns:
    test["LotFrontage"] = test.groupby("Neighborhood")["LotFrontage"].transform(
        lambda x: x.fillna(x.median())
    )

# Electrical — мода
if "Electrical" in train.columns:
    train["Electrical"] = train["Electrical"].fillna(train["Electrical"].mode()[0])
if "Electrical" in test.columns:
    test["Electrical"] = test["Electrical"].fillna(test["Electrical"].mode()[0])

print("NaNs in train:", train.isnull().sum().sum())
print("NaNs in test: ", test.isnull().sum().sum())

# ---------- Доп. зачистка оставшихся NaN ----------

# числовые столбцы
num_cols_train = train.select_dtypes(include=["int64", "float64"]).columns
num_cols_test  = test.select_dtypes(include=["int64", "float64"]).columns

train[num_cols_train] = train[num_cols_train].fillna(0)
test[num_cols_test]   = test[num_cols_test].fillna(0)

# категориальные столбцы
cat_cols_train = train.select_dtypes(include=["object"]).columns
cat_cols_test  = test.select_dtypes(include=["object"]).columns

train[cat_cols_train] = train[cat_cols_train].fillna("None")
test[cat_cols_test]   = test[cat_cols_test].fillna("None")

print("NaNs in train after fix:", train.isnull().sum().sum())
print("NaNs in test after fix: ", test.isnull().sum().sum())


# ---------- 3. Масштабирование числовых признаков ----------

# все числовые признаки, кроме целевых колонок
numeric_features = train.select_dtypes(include=["int64", "float64"]).columns
numeric_features = [c for c in numeric_features if c not in ["SalePrice", "LogSalePrice"]]

scaler = StandardScaler()
train_scaled = train.copy()
test_scaled = test.copy()

train_scaled[numeric_features] = scaler.fit_transform(train[numeric_features])
test_scaled[numeric_features] = scaler.transform(test[numeric_features])

# ---------- 4. One-Hot Encoding категориальных ----------

categorical_features = train.select_dtypes(include=["object"]).columns

train_model = pd.get_dummies(train_scaled, columns=categorical_features, drop_first=True)
test_model  = pd.get_dummies(test_scaled,  columns=categorical_features, drop_first=True)

# ---------- 5. Формирование X и y, выравнивание колонок ----------

# целевая переменная
y = train_model["LogSalePrice"]

# все признаки, КРОМЕ Id, SalePrice, LogSalePrice
feature_cols = [c for c in train_model.columns if c not in ["Id", "SalePrice", "LogSalePrice"]]
X = train_model[feature_cols]

# на test оставляем ровно те же колонки (если каких-то нет — заполняем нулями)
X_test = test_model.reindex(columns=feature_cols, fill_value=0)

print("X shape:     ", X.shape)
print("X_test shape:", X_test.shape)

# ---------- 6. Обучение модели ----------

gbr = GradientBoostingRegressor(
    n_estimators=3000,
    learning_rate=0.03,
    max_depth=4,
    random_state=42
)

gbr.fit(X, y)

# предсказания на тесте (в лог-пространстве)
pred_test_log = gbr.predict(X_test)

# обратное преобразование логарифма
pred_saleprice = np.expm1(pred_test_log)

# ---------- 7. Создание submission.csv ----------

submission = pd.DataFrame({
    "Id": test["Id"],
    "SalePrice": pred_saleprice
})

submission.to_csv("submission.csv", index=False)
submission.head()



NaNs in train: 0
NaNs in test:  22
NaNs in train after fix: 0
NaNs in test after fix:  0
X shape:      (1460, 259)
X_test shape: (1459, 259)


Unnamed: 0,Id,SalePrice
0,1461,120401.278188
1,1462,158173.835123
2,1463,191426.406341
3,1464,192898.228533
4,1465,180452.842323
