In [73]:
import os

import pandas as pd
import numpy as np

import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib.ticker import ScalarFormatter, MaxNLocator, FuncFormatter

from IPython.display import display
from IPython.display import clear_output
                                                                    
import seaborn as sns

import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, FunctionTransformer, StandardScaler
from sklearn.impute import KNNImputer
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import LinearRegression
from sklearn.datasets import make_regression
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.neighbors import NearestNeighbors

from category_encoders import TargetEncoder

from catboost import CatBoostRegressor
from xgboost import XGBRegressor

from scipy.stats import mode

# Кластеры для районов
# Таргет трайнуть

In [2]:
DATA_PATH = os.path.join("datasets", "Sber_housing_kaggle")
data_files = ["train.csv", "test.csv"]

In [3]:
def color_cells(val):
    """ val: значение признака """

    if val == 'float64':
        color = 'red'
    elif val == 'int64':
        color = 'red'
    else:
        color = 'blue'
    return f'color: {color}'

def valeraInfo(df):
    info = pd.DataFrame()
    info.index = df.columns
    info['Тип данных'] = df.dtypes
    info['Количесвто уникальных'] = df.nunique()
    info['Количество пропусков'] = df.isna().sum()
    info['Количество значений'] = df.count()
    info['%значений'] = round((df.count()/ df.shape[0]) * 100,2)
    info = info.style.applymap(color_cells, subset=['Тип данных'])
    return info

In [4]:
class DataFrameWrapper(BaseEstimator, TransformerMixin):
    def __init__(self, transformer, columns):
        self.transformer = transformer
        self.columns = columns

    def fit(self, X, y=None):
        self.transformer.fit(X, y)
        return self

    def transform(self, X):
        X_transformed = self.transformer.transform(X)
        return pd.DataFrame(X_transformed, columns=self.columns)

In [5]:
load_data = lambda path, files_names: (
    [pd.read_csv(os.path.join(path, file)) for file in files_names]
)

In [63]:
[Data_train, Data_test] = load_data(DATA_PATH, data_files)

In [64]:
# Code block
boolean_category_columns = [
    'culture_objects_top_25',
    'thermal_power_plant_raion',
    'incineration_raion',
    'oil_chemistry_raion',
    'radiation_raion',
    'railroad_terminal_raion',
    'big_market_raion',
    'nuclear_reactor_raion',
    'detention_facility_raion',
    'water_1line',
    'big_road1_1line',
    'railroad_1line',
]
binary_category_column = 'product_type'
multi_category_columns = [
    'sub_area',
    'ecology',
]
date_сolumn = 'timestamp'
sub_area_column = 'sub_area'
ecology_column = 'ecology'
target_column = 'price_doc'

In [74]:
X = Data_train.drop(columns=[target_column])
y = Data_train[target_column]

In [9]:
valeraInfo(X)

  info = info.style.applymap(color_cells, subset=['Тип данных'])


Unnamed: 0,Тип данных,Количесвто уникальных,Количество пропусков,Количество значений,%значений
id,int64,30471,0,30471,100.0
timestamp,object,1161,0,30471,100.0
full_sq,int64,211,0,30471,100.0
life_sq,float64,175,6383,24088,79.05
floor,float64,41,167,30304,99.45
max_floor,float64,49,9572,20899,68.59
material,float64,6,9572,20899,68.59
build_year,float64,119,13605,16866,55.35
num_room,float64,13,9572,20899,68.59
kitch_sq,float64,74,9572,20899,68.59


In [11]:
# X['ecology'] = X['ecology'].replace('no data', np.nan)
print(X['ecology'].unique())

['good' 'excellent' 'poor' 'satisfactory' 'no data']


In [76]:
date_column = 'timestamp'
year_column = 'year'

X = X.copy()
X[date_column] = pd.to_datetime(X[date_column], errors='coerce')
X[year_column] = X[date_column].dt.year
X = X.drop(columns=[date_column])

In [69]:
categorical_columns = X.select_dtypes(include=['object', 'category']).columns.tolist()
numerical_columns = X.select_dtypes(include=['number']).columns.tolist()

print(len(X.columns))
print(len(numerical_columns))
print(len(categorical_columns))

291
276
15


In [70]:
numeric_cols_with_missing_values = [col for col in numerical_columns if X[col].isna().any()]
print(len(numeric_cols_with_missing_values))

51


In [71]:
# Code block
binary_categories_columns = [binary_category_column] + boolean_category_columns
categorical_columns = [ecology_column] + binary_categories_columns + [sub_area_column]
all_columns = categorical_columns + numerical_columns
ecology_index = all_columns.index(ecology_column)

In [37]:
class KNNModeImputer(BaseEstimator, TransformerMixin):
    def __init__(self, n_neighbors=3):
        self.n_neighbors = n_neighbors
    
    def fit(self, X, y=None):
        if isinstance(X, pd.DataFrame):
            X = X.values
        self.knn = NearestNeighbors(n_neighbors=self.n_neighbors)
        self.knn.fit(X[~np.isnan(X).any(axis=1)])
        return self
    
    def transform(self, X):
        if isinstance(X, pd.DataFrame):
            X = X.values
        col_data = X[:, 0].reshape(-1, 1)
        nan_mask = np.isnan(col_data)
        
        # Ensure there are NaNs to be imputed
        if not nan_mask.any():
            return X
        
        # Find nearest neighbors for rows without NaN values
        distances, neighbors = self.knn.kneighbors(col_data[~nan_mask].reshape(-1, 1))
        
        for i, idx in enumerate(np.where(nan_mask)[0]):
            neighbor_vals = col_data[neighbors[i]].flatten()
            mode_result = mode(neighbor_vals, nan_policy='omit')
            count_value = mode_result.count if np.isscalar(mode_result.count) else mode_result.count[0]
            imputed_value = mode_result.mode if np.isscalar(mode_result.mode) else mode_result.mode[0]
            col_data[idx, 0] = imputed_value
        
        # Ensure column assignment back to X
        X[:, 0] = col_data.flatten()
        
        return X


In [16]:
class DataFrameConverter(BaseEstimator, TransformerMixin):
    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return pd.DataFrame(X, columns=self.columns)

In [72]:
columns_for_knn_imputation = [col for col in all_columns if col != ecology_column]

processing_pipeline = Pipeline([
    (
        'encoding',
        ColumnTransformer([
            (
                'ecology_ordinal', 
                OrdinalEncoder(
                    categories=[['poor', 'satisfactory', 'good', 'excellent']], 
                    handle_unknown='use_encoded_value', 
                    unknown_value=np.nan
                ), 
                [ecology_column]
            ),
            (
                'binary_ohe',
                OneHotEncoder(sparse_output=False, drop='if_binary'),
                binary_categories_columns
            ),
            (
                'sub_area_target',
                TargetEncoder(),
                [sub_area_column]
            ),
            (
                'numerical',
                'passthrough',
                numerical_columns
            )
        ])
    ),
    ('scaling', StandardScaler()),
    ('to_dataframe_after_scaling', DataFrameConverter(columns=all_columns)),
    (
        'knn_imputation',
        ColumnTransformer([
            (
                'ecology',
                'passthrough',
                [ecology_column]
            ),
            (
                'knn_imputer',
                KNNImputer(n_neighbors=3),
                columns_for_knn_imputation
            )
        ])
    ),
    ('to_dataframe_after_knn_imputation', DataFrameConverter(columns=all_columns)),
    (
        'ecology_imputation',
        ColumnTransformer(
            transformers=[(
                'knn_mode_imputer',
                KNNModeImputer(n_neighbors=3),
                [ecology_column]
            )],
            remainder='passthrough'
        )
    )
])

In [77]:
# X_processed = processing_pipeline.fit_transform(X, y)
# X_processed_df = pd.DataFrame(X_processed, columns=all_columns)

In [78]:
# numeric_cols_with_missing_values = [col for col in numerical_columns if X_processed_df[col].isna().any()]
# cols_with_missing_values = [col for col in all_columns if X_processed_df[col].isna().any()]
# print("numeric columns with missing values:", len(numeric_cols_with_missing_values))
# print("all columns with missing values:", len(cols_with_missing_values))
# print("ecology column after imputation:", X_processed_df[ecology_column].unique())

In [43]:
# raise ValueError("This is a custom error message")

In [None]:
# # model = LinearRegression()
# # model = CatBoostRegressor(
# #     iterations=10, 
# #     depth=3, 
# #     learning_rate=0.3, 
# #     verbose=0,
# # )
# model = XGBRegressor(
#     n_estimators=10,    # Equivalent to CatBoost's iterations
#     max_depth=3,        # Similar to depth in CatBoost
#     learning_rate=0.3,  # Step size for each boosting step
#     verbosity=0         # Suppress training output
# )

# # Forward Selection
# sfs = SequentialFeatureSelector(model, direction='backward', n_features_to_select=5)
# sfs.fit(X, y)

# # Получаем выбранные признаки
# selected_indices  = sfs.get_support(indices=True)
# selected_features = X.columns[selected_indices].tolist()

# print("Selected features (Forward):", selected_features)

In [79]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = XGBRegressor(
    n_estimators=100,        # Number of trees
    learning_rate=0.1,       # Step size shrinkage used to prevent overfitting
    max_depth=6,             # Maximum tree depth
    subsample=0.8,           # Subsample ratio of the training instances
    colsample_bytree=0.8,    # Subsample ratio of columns for each split
    random_state=42
)

full_pipeline = Pipeline([
    ('preprocessing', processing_pipeline),  # Your preprocessing pipeline
    ('xgb', model)                           # XGBRegressor model as the final step
])

full_pipeline.fit(X_train, y_train)

y_pred = full_pipeline.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print(f"Mean Absolute Error (MAE): {mae}")
print(f"Mean Squared Error (MSE): {mse}")
print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"R^2 Score: {r2}")

# Optional: display results
results = pd.DataFrame({"Actual": y_test, "Predicted": y_pred})
print(results.head())


# feature_importances = model.get_booster().get_score(importance_type='gain')

# importance_df = pd.DataFrame({
#     'Feature': list(feature_importances.keys()),
#     'Importance': list(feature_importances.values())
# }).sort_values(by='Importance', ascending=False)

# pd.set_option('display.max_rows', None)
# print("Ordered Feature Importances:")
# print(importance_df)

Mean Absolute Error (MAE): 1334187.1226004923
Mean Squared Error (MSE): 6126666904276.371
Root Mean Squared Error (RMSE): 2475210.476762809
R^2 Score: 0.713720440864563
         Actual   Predicted
4739   12319849  12994611.0
26793   6800000   5910074.5
6983    4650000   4592092.0
11307   1200000   5340045.0
20974   5650000   5402787.0


In [None]:
# Create the histogram
plt.figure(figsize=(25, 15))  # Adjust the size as needed
plt.hist(X_train["price_doc"], bins=500)

# Get the current axes object
ax = plt.gca()

# Use a logarithmic scale for the x-axis
ax.set_xscale('log')

# Define specific tick positions for custom intervals
# Ticks from 1M to 20M with steps of 1M, and from 20M to 100M with steps of 10M
ticks = np.concatenate([np.arange(1e6, 2.1e7, 1e6), np.arange(2e7, 1.1e8, 1e7)])

# Apply the tick positions to the x-axis
ax.set_xticks(ticks)

# Define a custom formatter to display x-axis labels in millions
ax.set_xticklabels([f'{int(tick / 1_000_000)}M' for tick in ticks])

# Rotate x-axis labels to 90 degrees for better readability
plt.xticks(rotation=90)

# Set the x-axis limits to start at 1 million
ax.set_xlim(left=1e6)

# Set labels and title
plt.xlabel('Price (in millions)')
plt.ylabel('Frequency')
plt.title('Histogram with Focused Logarithmic X-axis')

# Display the plot
plt.show()


In [None]:
corr_matrix = X_train.select_dtypes(include=[np.number]).corr()

In [None]:
corr_matrix

In [None]:
plt.figure(figsize=(10, 10))  # Adjust the figure size as needed
sns.heatmap(corr_matrix, annot=False, cmap='coolwarm', center=0)
plt.title('Correlation Matrix Heatmap')
plt.show()

In [None]:
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    display(corr_matrix)


In [None]:
pd.set_option('display.max_info_columns', 300)
non_numeric_cols = X_train.select_dtypes(exclude=[np.number])
non_numeric_cols.info()