In [8]:
import pandas as pd
import numpy as np

df = pd.read_csv("../data/train_clean.csv")

# если в очищенном файле уже есть LogSalePrice, эту строку можно не писать,
# но можно оставить — она перезапишет столбец тем же значением:
df["LogSalePrice"] = np.log1p(df["SalePrice"])

print("NaNs:", df.isnull().sum().sum())


NaNs: 7480


In [9]:
df["LogSalePrice"] = np.log1p(df["SalePrice"])


In [10]:
numeric_features = df.select_dtypes(include=["int64", "float64"]).columns
categorical_features = df.select_dtypes(include=["object"]).columns


In [11]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
df_scaled = df.copy()

df_scaled[numeric_features] = scaler.fit_transform(df[numeric_features])

df_scaled.head()


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice,LogSalePrice
0,-1.730865,0.073375,RL,-0.231877,-0.207142,Pave,,Reg,Lvl,AllPub,...,,,,-0.087688,-1.599111,0.138777,WD,Normal,0.347273,0.560067
1,-1.728492,-0.872563,RL,0.437043,-0.091886,Pave,,Reg,Lvl,AllPub,...,,,,-0.087688,-0.48911,-0.614439,WD,Normal,0.007288,0.212763
2,-1.72612,0.073375,RL,-0.098093,0.07348,Pave,,IR1,Lvl,AllPub,...,,,,-0.087688,0.990891,0.138777,WD,Normal,0.536154,0.734046
3,-1.723747,0.309859,RL,-0.45485,-0.096897,Pave,,IR1,Lvl,AllPub,...,,,,-0.087688,-1.599111,-1.367655,WD,Abnorml,-0.515281,-0.437383
4,-1.721374,0.073375,RL,0.615421,0.375148,Pave,,IR1,Lvl,AllPub,...,,,,-0.087688,2.100892,0.138777,WD,Normal,0.869843,1.014651


In [20]:
df_model = pd.get_dummies(df_scaled, columns=categorical_features, drop_first=True)
df_model.head()
df_model = df_model.drop(columns=["Id", "SalePrice"])



In [21]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import numpy as np

# Target
y = df_model["LogSalePrice"]
X = df_model.drop("LogSalePrice", axis=1)

# Train/valid split
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Model
model = LinearRegression()
model.fit(X_train, y_train)

# Predictions
y_pred = model.predict(X_valid)

# Metrics
rmse = np.sqrt(mean_squared_error(y_valid, y_pred))
rmse


np.float64(0.5427336082338047)

In [22]:
from sklearn.linear_model import Ridge

ridge = Ridge(alpha=1.0)
ridge.fit(X_train, y_train)

pred_ridge = ridge.predict(X_valid)

rmse_ridge = np.sqrt(mean_squared_error(y_valid, pred_ridge))
rmse_ridge


np.float64(0.3491530797009191)

In [23]:
from sklearn.linear_model import Lasso

lasso = Lasso(alpha=0.0005)
lasso.fit(X_train, y_train)

pred_lasso = lasso.predict(X_valid)
rmse_lasso = np.sqrt(mean_squared_error(y_valid, pred_lasso))
rmse_lasso


np.float64(0.3770680081156013)

In [25]:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(
    n_estimators=150,   # было 500
    max_depth=10,       # ограничим глубину деревьев
    n_jobs=-1,          # использовать все ядра
    random_state=42
)

rf.fit(X_train, y_train)
pred_rf = rf.predict(X_valid)

rmse_rf = np.sqrt(mean_squared_error(y_valid, pred_rf))
rmse_rf



np.float64(0.3686282555117122)

In [26]:
from sklearn.ensemble import GradientBoostingRegressor

gbr = GradientBoostingRegressor(
    n_estimators=3000,
    learning_rate=0.03,
    max_depth=4,
    random_state=42
)

gbr.fit(X_train, y_train)
pred_gbr = gbr.predict(X_valid)

rmse_gbr = np.sqrt(mean_squared_error(y_valid, pred_gbr))
rmse_gbr


np.float64(0.3494626880929773)