Sberbank Russian Housing Market

Installing and importing the necessary libraries, and loading the data

In [None]:
!pip install geneticalgorithm
# Import necessary libraries
import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from geneticalgorithm import geneticalgorithm as ga
from sklearn.base import BaseEstimator, RegressorMixin, clone
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.feature_selection import SelectFromModel, SequentialFeatureSelector
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_log_error
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_predict, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.svm import SVR

# Read the train, test and macro CSVs; the 'timestamp' column of each file is
# parsed to datetime at load time.
timestamp_parsing = {'parse_dates': ['timestamp']}
train = pd.read_csv('/content/train.csv', **timestamp_parsing)
test = pd.read_csv('/content/test.csv', **timestamp_parsing)
macro = pd.read_csv('/content/macro.csv', **timestamp_parsing)

Collecting geneticalgorithm
  Downloading geneticalgorithm-1.0.2-py3-none-any.whl.metadata (25 kB)
Collecting func-timeout (from geneticalgorithm)
  Downloading func_timeout-4.3.5.tar.gz (44 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/44.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading geneticalgorithm-1.0.2-py3-none-any.whl (16 kB)
Building wheels for collected packages: func-timeout
  Building wheel for func-timeout (setup.py) ... [?25l[?25hdone
  Created wheel for func-timeout: filename=func_timeout-4.3.5-py3-none-any.whl size=15076 sha256=964264e8c34ed24643dd57e5d3269b3ec9e713cf04b757dd619b4f74d13a19c2
  Stored in directory: /root/.cache/pip/wheels/3f/83/19/b5552bb9630e353f7c5b15be44bf10900afe1abbbfcf536afd
Successfully built func-timeout
Installing collected packages: f

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



Merging data

In [None]:
# Attach a selected subset of macro columns to every train/test row.
macro_cols = [
    "timestamp",
    "balance_trade",
    "balance_trade_growth",
    "eurrub",
    "average_provision_of_build_contract",
    "micex_rgbi_tr",
    "micex_cbi_tr",
    "deposits_rate",
    "mortgage_value",
    "mortgage_rate",
    "income_per_cap",
    "museum_visitis_per_100_cap",
    "apartment_build",
]
macro_subset = macro[macro_cols]

# Left join on 'timestamp': rows without a matching macro record are kept and
# receive NaN in the macro columns.
train = pd.merge(train, macro_subset, on='timestamp', how='left')
test = pd.merge(test, macro_subset, on='timestamp', how='left')


Additional features

In [None]:
# Derive calendar features from 'timestamp' (done before imputation so the new
# columns are part of the numeric feature set).
for frame in (train, test):
    stamps = frame['timestamp'].dt
    frame['year'] = stamps.year
    frame['month'] = stamps.month
    frame['day_of_week'] = stamps.dayofweek

# The raw datetime column is no longer needed once the parts are extracted.
train = train.drop(columns=['timestamp'])
test = test.drop(columns=['timestamp'])

Missing values

In [None]:
# Handle missing values
# Numeric columns to impute: every numeric column of train except the target
# ('price_doc') and the row identifier ('id').
non_feature_columns = {'price_doc', 'id'}
numerical_features = [
    name
    for name in train.select_dtypes(include=np.number).columns
    if name not in non_feature_columns
]

# Missing entries will be replaced by the per-column median.
imputer = SimpleImputer(strategy='median')

Fit and transform

In [None]:
# Learn the fill values from the training frame only, then apply the same
# fitted imputer to both train and test.
imputer.fit(train[numerical_features])
train[numerical_features] = imputer.transform(train[numerical_features])
test[numerical_features] = imputer.transform(test[numerical_features])

In [None]:
# Keep the test ids aside for the submission file, then strip the id column.
test_ids = test['id']
X_test = test.drop(columns=['id'])

# Features: everything in train except the id and the target.
X = train.drop(columns=['id', 'price_doc'])

# The models are trained on log1p(price_doc); predictions therefore have to be
# mapped back with expm1 before they are reported as prices.
y = train['price_doc'].pipe(np.log1p)

Non-numeric features

In [None]:
# Label-encode 'product_type'.
# The encoder is fitted on train and test values together so that both frames
# share one label -> integer mapping; values are cast to str first so that
# missing entries are encoded as the literal label 'nan'.
product_type_values = pd.concat(
    [X['product_type'], X_test['product_type']],
    ignore_index=True,
).astype(str)
encoder = LabelEncoder().fit(product_type_values)

# Apply the shared mapping to each frame in place.
for features in (X, X_test):
    features['product_type'] = encoder.transform(features['product_type'].astype(str))

Conversion

In [None]:
# Label-encode every column of X that is still object-typed.
# One encoder per column, fitted on the train and test values together (cast
# to str so mixed types and NaNs become ordinary labels).
for column in X.select_dtypes(include=['object']).columns:
    train_labels = X[column].astype(str)
    test_labels = X_test[column].astype(str)

    encoder = LabelEncoder().fit(
        pd.concat([train_labels, test_labels], ignore_index=True)
    )

    X[column] = encoder.transform(train_labels)
    X_test[column] = encoder.transform(test_labels)

# Standardise the features: the scaler is fitted on X and reused for X_test.
# The scaled arrays are wrapped back into DataFrames to keep the column names.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_test_scaled = scaler.transform(X_test)
X = pd.DataFrame(X_scaled, columns=X.columns)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns)

In [None]:
Model evaluation: define the metric function

In [None]:
# Metric helper used to score model predictions
def rmsle(y_true, y_pred):
    """Return the root mean squared logarithmic error.

    Both arguments are passed unchanged to
    ``sklearn.metrics.mean_squared_log_error``; the square root of that value
    is returned.
    """
    msle = mean_squared_log_error(y_true, y_pred)
    return np.sqrt(msle)

Train/validation split (hold-out)

In [None]:
# Hold out 20% of the rows as a validation set; the seed is fixed so the split
# is the same on every run.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

Define models

In [None]:
# Candidate regressors, keyed by display name. Every estimator that accepts a
# seed gets random_state=42; SVR is constructed with its defaults.
models = dict([
    ('Random Forest', RandomForestRegressor(random_state=42)),
    ('Gradient Boosting', GradientBoostingRegressor(random_state=42)),
    ('Support Vector Machines', SVR()),
    ('LightGBM', lgb.LGBMRegressor(random_state=42)),
])

Hyperparameter tuning

In [None]:
# Hyperparameter tuning and model evaluation
# Search space for each model, keyed by the same names used in `models`.
# Looking the grid up by name raises KeyError for a model without a grid
# instead of silently reusing the grid of the previous loop iteration.
param_grids = {
    'Random Forest': {'n_estimators': [100, 200], 'max_depth': [10, 20]},
    'Gradient Boosting': {'learning_rate': [0.01, 0.1], 'n_estimators': [100, 200]},
    'Support Vector Machines': {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']},
    'LightGBM': {'learning_rate': [0.01, 0.1], 'n_estimators': [100, 200]},
}

best_models = {}
for name, model in models.items():
    print(f"Tuning {name}...")

    # The target is already log1p(price_doc), so plain RMSE on it equals the
    # RMSLE of the price. A *_log_error scorer would take the log a second
    # time and select hyperparameters for the wrong objective.
    grid = GridSearchCV(
        model,
        param_grids[name],
        cv=3,
        scoring='neg_root_mean_squared_error',
        n_jobs=-1,
    )
    grid.fit(X_train, y_train)

    # best_estimator_ is the model refitted with the winning parameters.
    best_models[name] = grid.best_estimator_
    print(f"Best {name} Model: {grid.best_params_}")

Tuning Random Forest...
Best Random Forest Model: {'max_depth': 10, 'n_estimators': 200}
Tuning Gradient Boosting...
Best Gradient Boosting Model: {'learning_rate': 0.1, 'n_estimators': 200}
Tuning Support Vector Machines...




Best Support Vector Machines Model: {'C': 1, 'kernel': 'rbf'}
Tuning LightGBM...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.018022 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 42245
[LightGBM] [Info] Number of data points in the train set: 24376, number of used features: 304
[LightGBM] [Info] Start training from score 15.612234
Best LightGBM Model: {'learning_rate': 0.1, 'n_estimators': 100}


Evaluate models

In [None]:
# Evaluate models on validation data
# `y_val` is on the log1p(price) scale, and so are the model predictions.
# Both are mapped back to prices with expm1 before RMSLE is computed; passing
# the log-scale `y_val` directly would compare two different scales and
# produce a meaninglessly large score.
y_val_price = np.expm1(y_val)
for name, model in best_models.items():
    y_pred = model.predict(X_val)
    print(f"{name} RMSLE: {rmsle(y_val_price, np.expm1(y_pred))}")

Random Forest RMSLE: 12.799684451183952
Gradient Boosting RMSLE: 12.797154617075973
Support Vector Machines RMSLE: 12.901403773164619
LightGBM RMSLE: 12.803517644238045


Stack models

In [None]:
# Stacking models
class StackingModel(BaseEstimator, RegressorMixin):
    """Two-level stacked regressor.

    The base models each produce one prediction column; the meta model is
    trained on those columns to produce the final prediction.

    Parameters
    ----------
    base_models : list of estimators
        First-level regressors. They are cloned before fitting, so the
        instances passed in are never modified.
    meta_model : estimator
        Second-level regressor fitted on the base models' predictions. Also
        cloned before fitting.
    cv : int or cross-validation splitter, default=5
        Passed to ``cross_val_predict`` to build the out-of-fold predictions
        the meta model is trained on.

    Attributes
    ----------
    base_models_ : list of fitted estimators (set by ``fit``)
    meta_model_ : fitted estimator (set by ``fit``)
    """

    def __init__(self, base_models, meta_model, cv=5):
        self.base_models = base_models
        self.meta_model = meta_model
        self.cv = cv

    def fit(self, X, y):
        """Fit the base models and the meta model; returns ``self``.

        Raises
        ------
        ValueError
            If ``base_models`` is empty.
        """
        if not self.base_models:
            raise ValueError("base_models must contain at least one estimator")

        # Train the meta model on out-of-fold predictions. Predictions made on
        # the rows a base model was fitted on are over-optimistic, which would
        # teach the meta model to over-trust the most overfitted base model.
        # cross_val_predict clones the estimator for every fold.
        meta_features = np.column_stack([
            cross_val_predict(model, X, y, cv=self.cv)
            for model in self.base_models
        ])
        self.meta_model_ = clone(self.meta_model).fit(meta_features, y)

        # The base models used at predict time are fitted on all rows, on
        # fresh clones so the caller's estimators stay untouched.
        self.base_models_ = [clone(model).fit(X, y) for model in self.base_models]
        return self

    def predict(self, X):
        """Return the meta model's prediction for ``X``."""
        meta_features = np.column_stack([model.predict(X) for model in self.base_models_])
        return self.meta_model_.predict(meta_features)

# Stack the tuned models under a Ridge meta-learner and score on the hold-out.
stacked_model = StackingModel(
    base_models=list(best_models.values()),
    meta_model=Ridge()
)
stacked_model.fit(X_train, y_train)
stacked_pred = stacked_model.predict(X_val)

# `y_val` and `stacked_pred` are both on the log1p(price) scale; convert both
# back to prices so RMSLE compares like with like.
print(f"Stacked Model RMSLE: {rmsle(np.expm1(y_val), np.expm1(stacked_pred))}")

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.018325 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 42245
[LightGBM] [Info] Number of data points in the train set: 24376, number of used features: 304
[LightGBM] [Info] Start training from score 15.612234
Stacked Model RMSLE: 12.800290700809185


Kaggle submission

In [None]:
# Build the submission file: predict on the log1p scale, map back to prices
# with expm1, and pair each price with its test id.
log_price_pred = stacked_model.predict(X_test)
stacked_test_pred = np.expm1(log_price_pred)

submission = pd.DataFrame(
    {
        'id': test_ids,
        'price_doc': stacked_test_pred,
    }
)
submission.to_csv('stacked_submission.csv', index=False)