In [119]:
import os

import pandas as pd
import numpy as np

import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib.ticker import ScalarFormatter, MaxNLocator, FuncFormatter

from IPython.display import display
from IPython.display import clear_output
                                                                    
import seaborn as sns

import sklearn
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, mean_squared_log_error
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, FunctionTransformer, StandardScaler
from sklearn.impute import KNNImputer
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import LinearRegression
from sklearn.datasets import make_regression
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.neighbors import NearestNeighbors
from sklearn.decomposition import PCA

from category_encoders import TargetEncoder

from catboost import CatBoostRegressor
from xgboost import XGBRegressor

from scipy.stats import mode

# Кластеры для районов
# Таргет трайнуть

In [2]:
pd.set_option('display.max_rows', None)  # Show all rows
pd.set_option('display.max_columns', None)  # Show all columns if needed

In [3]:
DATA_PATH = os.path.join("datasets", "Sber_housing_kaggle")
data_files = ["train.csv", "test.csv"]

In [4]:
def color_cells(val):
    """ val: значение признака """

    if val == 'float64':
        color = 'red'
    elif val == 'int64':
        color = 'red'
    else:
        color = 'blue'
    return f'color: {color}'

def valeraInfo(df):
    info = pd.DataFrame()
    info.index = df.columns
    info['Тип данных'] = df.dtypes
    info['Количесвто уникальных'] = df.nunique()
    info['Количество пропусков'] = df.isna().sum()
    info['Количество значений'] = df.count()
    info['%значений'] = round((df.count()/ df.shape[0]) * 100,2)
    info = info.style.applymap(color_cells, subset=['Тип данных'])
    return info

In [5]:
class DataFrameWrapper(BaseEstimator, TransformerMixin):
    def __init__(self, transformer, columns):
        self.transformer = transformer
        self.columns = columns

    def fit(self, X, y=None):
        self.transformer.fit(X, y)
        return self

    def transform(self, X):
        X_transformed = self.transformer.transform(X)
        return pd.DataFrame(X_transformed, columns=self.columns)

In [6]:
load_data = lambda path, files_names: (
    [pd.read_csv(os.path.join(path, file)) for file in files_names]
)

In [7]:
[Data_train, Data_test] = load_data(DATA_PATH, data_files)

In [8]:
# Code block
boolean_category_columns = [
    'culture_objects_top_25',
    'thermal_power_plant_raion',
    'incineration_raion',
    'oil_chemistry_raion',
    'radiation_raion',
    'railroad_terminal_raion',
    'big_market_raion',
    'nuclear_reactor_raion',
    'detention_facility_raion',
    'water_1line',
    'big_road1_1line',
    'railroad_1line',
]
binary_category_column = 'product_type'
multi_category_columns = [
    'sub_area',
    'ecology',
]
date_сolumn = 'timestamp'
sub_area_column = 'sub_area'
ecology_column = 'ecology'
target_column = 'price_doc'

In [9]:
X = Data_train.drop(columns=[target_column])
y = Data_train[target_column]

In [10]:
valeraInfo(X)

  info = info.style.applymap(color_cells, subset=['Тип данных'])


Unnamed: 0,Тип данных,Количесвто уникальных,Количество пропусков,Количество значений,%значений
id,int64,30471,0,30471,100.0
timestamp,object,1161,0,30471,100.0
full_sq,int64,211,0,30471,100.0
life_sq,float64,175,6383,24088,79.05
floor,float64,41,167,30304,99.45
max_floor,float64,49,9572,20899,68.59
material,float64,6,9572,20899,68.59
build_year,float64,119,13605,16866,55.35
num_room,float64,13,9572,20899,68.59
kitch_sq,float64,74,9572,20899,68.59


In [11]:
# X['ecology'] = X['ecology'].replace('no data', np.nan)
print(X['ecology'].unique())

['good' 'excellent' 'poor' 'satisfactory' 'no data']


In [12]:
date_column = 'timestamp'
year_column = 'year'

X = X.copy()
X[date_column] = pd.to_datetime(X[date_column], errors='coerce')
X[year_column] = X[date_column].dt.year
X = X.drop(columns=[date_column])

In [13]:
categorical_columns = X.select_dtypes(include=['object', 'category']).columns.tolist()
numerical_columns = X.select_dtypes(include=['number']).columns.tolist()

print(len(X.columns))
print(len(numerical_columns))
print(len(categorical_columns))

291
276
15


In [14]:
numeric_cols_with_missing_values = [col for col in numerical_columns if X[col].isna().any()]
print(len(numeric_cols_with_missing_values))

51


In [15]:
# Code block
binary_categories_columns = [binary_category_column] + boolean_category_columns
categorical_columns = [ecology_column] + binary_categories_columns + [sub_area_column]
all_columns = categorical_columns + numerical_columns
ecology_index = all_columns.index(ecology_column)

In [16]:
class KNNModeImputer(BaseEstimator, TransformerMixin):
    def __init__(self, n_neighbors=3):
        self.n_neighbors = n_neighbors
    
    def fit(self, X, y=None):
        if isinstance(X, pd.DataFrame):
            X = X.values
        self.knn = NearestNeighbors(n_neighbors=self.n_neighbors)
        self.knn.fit(X[~np.isnan(X).any(axis=1)])
        return self
    
    def transform(self, X):
        if isinstance(X, pd.DataFrame):
            X = X.values
        col_data = X[:, 0].reshape(-1, 1)
        nan_mask = np.isnan(col_data)
        
        # Ensure there are NaNs to be imputed
        if not nan_mask.any():
            return X
        
        # Find nearest neighbors for rows without NaN values
        distances, neighbors = self.knn.kneighbors(col_data[~nan_mask].reshape(-1, 1))
        
        for i, idx in enumerate(np.where(nan_mask)[0]):
            neighbor_vals = col_data[neighbors[i]].flatten()
            mode_result = mode(neighbor_vals, nan_policy='omit')
            count_value = mode_result.count if np.isscalar(mode_result.count) else mode_result.count[0]
            imputed_value = mode_result.mode if np.isscalar(mode_result.mode) else mode_result.mode[0]
            col_data[idx, 0] = imputed_value
        
        # Ensure column assignment back to X
        X[:, 0] = col_data.flatten()
        
        return X


In [17]:
class DataFrameConverter(BaseEstimator, TransformerMixin):
    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return pd.DataFrame(X, columns=self.columns)

In [18]:
columns_for_numeric_knn_imputation = [col for col in all_columns if col != ecology_column]

preprocessing_pipeline = Pipeline([
    (
        'encoding',
        ColumnTransformer([
            (
                'ecology_ordinal', 
                OrdinalEncoder(
                    categories=[['poor', 'satisfactory', 'good', 'excellent']], 
                    handle_unknown='use_encoded_value', 
                    unknown_value=np.nan
                ), 
                [ecology_column]
            ),
            (
                'binary_ohe',
                OneHotEncoder(sparse_output=False, drop='if_binary'),
                binary_categories_columns
            ),
            (
                'sub_area_target',
                TargetEncoder(),
                [sub_area_column]
            ),
            (
                'numerical',
                'passthrough',
                numerical_columns
            )
        ])
    ),
    ('scaling', StandardScaler()),
    ('to_dataframe_after_scaling', DataFrameConverter(columns=all_columns)),
    (
        'knn_imputation',
        ColumnTransformer([
            (
                'ecology',
                'passthrough',
                [ecology_column]
            ),
            (
                'knn_imputer',
                KNNImputer(n_neighbors=3),
                columns_for_numeric_knn_imputation
            )
        ])
    ),
    ('to_dataframe_after_knn_imputation', DataFrameConverter(columns=all_columns)),
    (
        'ecology_imputation',
        ColumnTransformer(
            transformers=[(
                'knn_mode_imputer',
                KNNModeImputer(n_neighbors=3),
                [ecology_column]
            )],
            remainder='passthrough'
        )
    )
])

In [19]:
# pca_pipeline = Pipeline([
#     ('preprocessing', preprocessing_pipeline),
#     ('pca', PCA(n_components=0.95))  # Retain 95% of the variance
# ])

# X_pca = pca_pipeline.fit_transform(X, y)

# print("Original number of features:", X.shape[1])
# print("Reduced number of features after PCA:", X_pca.shape[1])

In [20]:
# X_processed = processing_pipeline.fit_transform(X, y)
# X_processed_df = pd.DataFrame(X_processed, columns=all_columns)

In [21]:
# numeric_cols_with_missing_values = [col for col in numerical_columns if X_processed_df[col].isna().any()]
# cols_with_missing_values = [col for col in all_columns if X_processed_df[col].isna().any()]
# print("numeric columns with missing values:", len(numeric_cols_with_missing_values))
# print("all columns with missing values:", len(cols_with_missing_values))
# print("ecology column after imputation:", X_processed_df[ecology_column].unique())

In [22]:
# raise ValueError("This is a custom error message")

In [23]:
X_processed = preprocessing_pipeline.fit_transform(X, y)

In [111]:
reduced_features = ['cafe_count_5000_price_1500', 'cafe_count_2000', 'cafe_count_3000_price_2500', 'cafe_count_1500_price_high', 'cafe_count_3000', 'full_sq', 'cafe_avg_price_5000', 'church_count_5000', 'sub_area', 'cafe_count_5000_price_2500', 'num_room', 'cafe_count_2000_price_1500', 'cafe_count_3000_price_1500', 'cafe_count_2000_price_500', 'cafe_sum_2000_min_price_avg', 'sadovoe_km', 'sport_count_3000', 'mkad_km', 'office_sqm_5000', 'swim_pool_km', 'cafe_sum_3000_min_price_avg', 'cafe_count_5000_price_4000', '16_29_all', 'cafe_count_3000_price_1000', 'big_road1_1line', 'build_count_monolith', 'cafe_count_2000_na_price', 'green_zone_part', 'life_sq', 'nuclear_reactor_km', 'zd_vokzaly_avto_km', 'cafe_count_1000', 'ttk_km', 'market_count_3000', 'ID_railroad_station_walk', 'state', 'green_part_1500', 'healthcare_centers_raion', 'culture_objects_top_25', 'cafe_count_3000_price_high', 'cafe_count_3000_price_4000', 'ID_railroad_station_avto']
reduced_features_ordered = [col for col in all_columns if col in reduced_features]

In [131]:
# feature_columns = reduced_features_ordered
feature_columns = all_columns

In [125]:
# Convert X_processed to a DataFrame if it's in NumPy format
X_processed_df = pd.DataFrame(X_processed, columns=all_columns)  # `all_columns` should be your original list of column names

# Filter the DataFrame to include only the reduced set of features
X_reduced_df = X_processed_df[feature_columns]

# If you need it back as a NumPy array, convert it here
X_reduced = X_reduced_df.to_numpy()

In [126]:
# X_reduced_df.columns

In [127]:
# X_split = X_reduced
X_split = X_reduced
print(X_split.shape[1])

291


In [128]:
X_train, X_test, y_train, y_test = train_test_split(X_split, y, test_size=0.2, random_state=42)

In [129]:
model = XGBRegressor(
    n_estimators=100,        # Number of trees
    learning_rate=0.1,       # Step size shrinkage used to prevent overfitting
    max_depth=6,             # Maximum tree depth
    subsample=0.8,           # Subsample ratio of the training instances
    colsample_bytree=0.8,    # Subsample ratio of columns for each split
    random_state=42
)

In [118]:
# Perform cross-validation
cv_predictions = cross_val_predict(model, X_split, y, cv=5)  # 5-fold cross-validation

# Calculate metrics using cross-validated predictions
mse = mean_squared_error(y, cv_predictions)
mae = mean_absolute_error(y, cv_predictions)
rmse = np.sqrt(mse)
r2 = r2_score(y, cv_predictions)

print(f"Cross-Validated Mean Absolute Error (MAE): {mae}")
print(f"Cross-Validated Mean Squared Error (MSE): {mse}")
print(f"Cross-Validated Root Mean Squared Error (RMSE): {rmse}")
print(f"Cross-Validated R^2 Score: {r2}")

Cross-Validated Mean Absolute Error (MAE): 1519687.0579198254
Cross-Validated Mean Squared Error (MSE): 7824657036427.608
Cross-Validated Root Mean Squared Error (RMSE): 2797258.8433013503
Cross-Validated R^2 Score: 0.6575449705123901


In [130]:
# Assuming X_test is preprocessed and ready for predictions
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Calculate Mean Squared Logarithmic Error
msle = mean_squared_log_error(y_test, y_pred)

print(f"Mean Squared Logarithmic Error (MSLE): {msle}")

Mean Squared Logarithmic Error (MSLE): 0.21404743580672725


In [99]:
model_fi = XGBRegressor(
    n_estimators=100,        # Number of trees
    learning_rate=0.1,       # Step size shrinkage used to prevent overfitting
    max_depth=6,             # Maximum tree depth
    subsample=0.8,           # Subsample ratio of the training instances
    colsample_bytree=0.8,    # Subsample ratio of columns for each split
    random_state=42
)

In [100]:
print(len(X_test))
print(len(y_test))
print(len(X_split))
print(len(y))

6095
6095
30471
30471


In [101]:
model_fi.fit(X_train, y_train)
importances = model_fi.feature_importances_
len(importances)

219

In [107]:
feature_importance_df = pd.DataFrame({
    'Feature': feature_columns,
    'Importance': importances
}).sort_values(by='Importance', ascending=False)

feature_importance_df.index = range(1, len(feature_importance_df) + 1)
feature_importance_df.index.name = 'Rank'

print(feature_importance_df)

                                   Feature  Importance
Rank                                                  
1               cafe_count_5000_price_1500    0.105109
2                          cafe_count_2000    0.034057
3               cafe_count_3000_price_2500    0.031771
4               cafe_count_1500_price_high    0.029350
5                          cafe_count_3000    0.028914
6                                  full_sq    0.025328
7                      cafe_avg_price_5000    0.024954
8                        church_count_5000    0.024463
9                                 sub_area    0.020938
10              cafe_count_5000_price_2500    0.020238
11                                num_room    0.015606
12              cafe_count_2000_price_1500    0.013970
13              cafe_count_3000_price_1500    0.012627
14               cafe_count_2000_price_500    0.011786
15             cafe_sum_2000_min_price_avg    0.011471
16                              sadovoe_km    0.011447
17        

In [None]:
# raise ValueError("This is a custom error message")

In [109]:
importance_threshold = 0.005
features_all = feature_importance_df['Feature'].tolist()
features_to_keep = feature_importance_df[feature_importance_df['Importance'] >= importance_threshold]['Feature'].tolist()
print(len(features_to_keep))
print(len(features_all))

42
219


In [110]:
# Check features that are being kept and dropped based on the threshold
features_below_threshold = feature_importance_df[feature_importance_df['Importance'] < importance_threshold]
features_above_threshold = feature_importance_df[feature_importance_df['Importance'] >= importance_threshold]

print("Number of features below threshold:", len(features_below_threshold))
print("Number of features above threshold:", len(features_above_threshold))

# Display the list of features below the threshold to inspect
# print("Features below the threshold of importance:")
# print(features_below_threshold[['Feature', 'Importance']].sort_values(by='Importance', ascending=True))

features_to_keep = features_above_threshold['Feature'].tolist()
print("Features to keep (for hard-coding):")
print(features_to_keep)



# To verify the exact number of features to drop
features_to_drop = features_below_threshold['Feature'].tolist()
print(f"Number of features to drop: {len(features_to_drop)}")


Number of features below threshold: 177
Number of features above threshold: 42
Features to keep (for hard-coding):
['cafe_count_5000_price_1500', 'cafe_count_2000', 'cafe_count_3000_price_2500', 'cafe_count_1500_price_high', 'cafe_count_3000', 'full_sq', 'cafe_avg_price_5000', 'church_count_5000', 'sub_area', 'cafe_count_5000_price_2500', 'num_room', 'cafe_count_2000_price_1500', 'cafe_count_3000_price_1500', 'cafe_count_2000_price_500', 'cafe_sum_2000_min_price_avg', 'sadovoe_km', 'sport_count_3000', 'mkad_km', 'office_sqm_5000', 'swim_pool_km', 'cafe_sum_3000_min_price_avg', 'cafe_count_5000_price_4000', '16_29_all', 'cafe_count_3000_price_1000', 'big_road1_1line', 'build_count_monolith', 'cafe_count_2000_na_price', 'green_zone_part', 'life_sq', 'nuclear_reactor_km', 'zd_vokzaly_avto_km', 'cafe_count_1000', 'ttk_km', 'market_count_3000', 'ID_railroad_station_walk', 'state', 'green_part_1500', 'healthcare_centers_raion', 'culture_objects_top_25', 'cafe_count_3000_price_high', 'cafe_co

In [None]:
array1 = ['office_sqm_5000', 'cafe_count_3000_price_2500', 'cafe_count_5000_price_2500', 'cafe_count_5000_price_1000', 'num_room', 'sub_area', 'full_sq', 'cafe_count_1500_price_500', 'cafe_count_5000_price_1500', 'cafe_count_3000_price_1500', 'cafe_count_2000', 'cafe_count_1500_price_2500', 'cafe_count_3000', 'big_market_raion', 'cafe_avg_price_5000', 'big_church_count_2000', 'cafe_count_2000_price_2500', 'cafe_avg_price_2000', 'build_count_monolith', 'sadovoe_km', 'life_sq', 'church_count_5000', 'healthcare_centers_raion', 'cafe_count_1500_price_high', 'cafe_count_2000_price_high', 'sport_count_3000', 'cafe_count_2000_price_1500', 'raion_build_count_with_material_info', 'leisure_count_3000', 'children_preschool', 'big_road1_1line', 'cafe_count_3000_price_1000', 'cafe_count_5000', 'cafe_count_2000_price_1000', 'state', 'big_church_count_1500', 'office_count_5000', 'cafe_count_1500_price_1500', 'shopping_centers_km', 'green_part_1500', 'swim_pool_km', 'green_part_5000', 'office_sqm_2000', 'cafe_sum_2000_min_price_avg', 'build_count_brick', 'zd_vokzaly_avto_km', 'children_school', 'cafe_count_3000_price_high', 'cafe_count_500_price_1500', 'cafe_count_5000_na_price', 'indust_part', 'incineration_km', 'market_count_1500', 'railroad_station_walk_km', 'cafe_count_2000_na_price', 'kitch_sq', 'public_transport_station_min_walk', 'mosque_km', 'theater_km', 'office_count_1500', 'railroad_km', 'cafe_count_1000_price_1500', 'church_count_2000', 'ttk_km', 'detention_facility_km', 'ID_railroad_station_walk', 'cafe_count_5000_price_4000', 'hospital_beds_raion', 'big_road1_km', 'leisure_count_5000', 'office_count_500', 'market_count_5000', 'fitness_km', 'radiation_km', 'full_all', 'cafe_count_1000', 'green_zone_part', 'cafe_avg_price_3000', 'cafe_count_1500_price_4000', 'cafe_sum_5000_max_price_avg', 'kremlin_km', 'green_part_500', 'green_part_1000', 'sport_count_500', 'cafe_count_5000_price_high', '0_17_female', 'build_count_panel', 'cafe_count_1000_price_500', 'office_sqm_3000', 'railroad_station_walk_min', 'build_count_1921-1945', 'school_quota', 'power_transmission_line_km', 'metro_km_avto', 'prom_part_3000', 'prom_part_5000', 'green_zone_km', 'ID_big_road1', 'trc_count_3000', 'office_sqm_1500', 'cafe_count_1500', 'public_transport_station_km', 'cemetery_km', 'cafe_count_500_price_high', 'ID_metro', 'cafe_count_500', 'year', 'trc_sqm_3000', 'cafe_count_1000_price_1000', 'trc_sqm_2000', 'culture_objects_top_25_raion', 'mkad_km', 'nuclear_reactor_km', 'office_count_2000', 'industrial_km', 'basketball_km', 'cafe_sum_3000_max_price_avg', 'cafe_sum_1500_min_price_avg', 'additional_education_km', 'preschool_quota', 'cafe_count_5000_price_500', 'cafe_sum_2000_max_price_avg', 'young_all', 'raion_popul', 'big_church_count_500', 'ekder_all', 'build_count_wood', 'young_male', 'workplaces_km', 'sport_count_2000', 'cafe_count_500_price_4000', 'cafe_count_2000_price_4000', 'railroad_station_avto_min', 'thermal_power_plant_km', 'cafe_count_3000_price_4000', 'office_count_1000', 'ts_km', 'catering_km', 'build_count_slag', 'ID_bus_terminal', 'cafe_avg_price_1500', 'green_part_3000', 'oil_chemistry_km', 'office_count_3000', 'stadium_km', 'water_km', 'prom_part_1500', 'build_year', 'build_count_block', 'id', 'exhibition_km', 'university_km', 'office_sqm_500', 'museum_km', 'work_all', 'build_count_before_1920', 'cafe_count_1000_price_2500', 'leisure_count_1000', 'build_count_frame', 'cafe_count_2000_price_500', 'cafe_count_500_na_price', 'cafe_count_500_price_2500', 'cafe_count_500_price_500', 'cafe_count_3000_na_price', 'big_church_km', 'shopping_centers_raion', 'ekder_male', 'max_floor', 'build_count_1971-1995', 'build_count_foam', 'water_treatment_km', 'build_count_1946-1970', 'green_part_2000', 'product_type', 'church_count_1500', 'cafe_count_1000_price_4000', 'trc_count_1500', 'market_count_2000', 'trc_count_1000', 'floor', 'big_market_km', 'cafe_sum_3000_min_price_avg', 'park_km', 'cafe_count_1500_price_1000', 'kindergarten_km', 'preschool_km', 'cafe_sum_1000_max_price_avg', 'cafe_sum_1500_max_price_avg', 'cafe_sum_500_max_price_avg', 'market_shop_km', 'public_healthcare_km', 'prom_part_500', 'hospice_morgue_km', 'big_church_count_1000', 'ID_railroad_terminal', 'metro_min_walk', 'sport_count_1500', 'cafe_count_3000_price_500', 'leisure_count_2000', 'office_sqm_1000', 'material', 'trc_sqm_5000', 'raion_build_count_with_builddate_info', 'area_m', 'trc_count_5000', 'work_male', 'office_km', 'office_raion', 'metro_min_avto', 'big_road2_km', 'prom_part_2000', 'bus_terminal_avto_km', 'trc_sqm_500', 'mosque_count_5000', 'sport_objects_raion', 'church_synagogue_km', 'big_church_count_5000', 'ice_rink_km', 'trc_count_500', 'sport_count_1000', 'church_count_1000', 'railroad_station_avto_km', 'preschool_education_centers_raion', 'female_f']
array2 = ['office_sqm_5000', 'cafe_count_5000_price_1500', 'cafe_count_5000_price_2500', 'cafe_count_3000_price_1500', 'cafe_count_3000_price_2500', 'num_room', 'cafe_count_1500_price_2500', 'sub_area', 'full_sq', 'cafe_count_3000', 'church_count_5000', 'cafe_count_2000', 'cafe_count_3000_price_high', 'big_church_count_2000', 'cafe_avg_price_5000', 'cafe_avg_price_2000', 'life_sq', 'sport_count_3000', 'cafe_count_1500_price_1500', 'sadovoe_km', 'cafe_count_3000_price_1000', 'swim_pool_km', 'build_count_monolith', 'cafe_count_2000_price_2500', 'cafe_count_1500_price_high', 'cafe_count_2000_price_1000', 'indust_part', 'state', 'cafe_count_2000_price_high', 'ttk_km', 'leisure_count_1500', 'healthcare_centers_raion', 'cafe_count_3000_price_4000', 'trc_count_5000', 'zd_vokzaly_avto_km', 'cafe_count_1000_price_1000', 'cafe_count_2000_price_1500', 'office_count_500', 'trc_count_2000', 'leisure_count_3000', 'green_part_5000', 'cafe_sum_1500_min_price_avg', 'preschool_quota', 'railroad_station_walk_min', 'railroad_station_walk_km', 'public_transport_station_min_walk', 'cafe_count_1000_price_high', 'young_all', 'cafe_count_1000', 'green_part_1000', 'culture_objects_top_25', 'office_raion', 'kitch_sq', 'raion_build_count_with_material_info', 'ekder_all', 'cafe_count_5000_na_price', 'cafe_count_5000_price_high', 'green_part_1500', 'industrial_km', 'railroad_km', 'cafe_sum_3000_max_price_avg', 'prom_part_3000', 'trc_count_1500', 'ID_railroad_station_walk', 'ts_km', '7_14_female', 'office_count_1500', 'ekder_male', 'green_zone_part', 'cafe_count_2000_price_500', 'ID_bus_terminal', 'cafe_count_1500_price_4000', 'cafe_count_2000_price_4000', 'children_preschool', 'ekder_female', 'cemetery_km', 'trc_count_3000', 'cafe_count_5000_price_1000', 'cafe_count_5000', 'mosque_km', 'leisure_count_2000', 'incineration_km', 'radiation_km', 'hospital_beds_raion', 'ID_big_road1', 'cafe_count_5000_price_4000', '16_29_all', 'prom_part_5000', 'big_road1_1line', 'shopping_centers_km', 'market_count_3000', 'public_transport_station_km', 'big_market_raion', 'additional_education_km', 'market_count_500', 'fitness_km', 'build_count_before_1920', 'young_male', 'big_road2_km', 'young_female', 'cafe_sum_1000_max_price_avg', 'ID_metro', 'office_count_3000', 'sport_count_1500', 'office_sqm_500', 'bulvar_ring_km', 'mkad_km', 'cafe_sum_2000_max_price_avg', 'school_quota', 'build_count_panel', 'metro_km_avto', 'catering_km', 'museum_km', 'cafe_count_1000_price_1500', 'cafe_sum_2000_min_price_avg', 'public_healthcare_km', 'office_sqm_3000', 'build_year', 'exhibition_km', 'nuclear_reactor_km', 'cafe_count_1000_price_2500', 'trc_sqm_2000', 'power_transmission_line_km', 'big_market_km', 'cafe_sum_1500_max_price_avg', 'year', 'water_km', 'university_km', 'build_count_block', 'max_floor', 'trc_sqm_5000', 'cafe_sum_500_min_price_avg', 'build_count_1971-1995', 'theater_km', 'park_km', 'build_count_1921-1945', 'cafe_count_500_price_1000', 'detention_facility_km', 'kremlin_km', 'sport_objects_raion', 'id', 'cafe_count_1000_price_500', 'oil_chemistry_km', 'water_treatment_km', '0_17_female', 'cafe_count_500_na_price', 'green_zone_km', 'basketball_km', 'prom_part_1500', 'cafe_avg_price_3000', 'cafe_count_500', 'cafe_avg_price_500', 'cafe_count_1500_na_price', 'kindergarten_km', 'green_part_3000', 'build_count_wood', 'big_church_km', 'sport_count_1000', 'trc_sqm_500', 'sport_count_5000', 'build_count_foam', 'ID_railroad_station_avto', 'cafe_sum_3000_min_price_avg', 'office_count_2000', 'build_count_1946-1970', 'trc_count_1000', 'thermal_power_plant_km', '16_29_male', 'cafe_count_2000_na_price', 'raion_popul', 'church_count_1500', 'metro_min_walk', 'trc_sqm_3000', 'stadium_km', 'prom_part_1000', 'cafe_count_500_price_500', 'school_km', 'female_f', 'floor', 'cafe_count_500_price_1500', 'office_km', 'cafe_count_1000_price_4000', 'railroad_station_avto_min', 'cafe_count_3000_na_price', 'leisure_count_1000', 'office_count_1000', 'ID_big_road2', 'metro_min_avto', 'build_count_brick', 'big_church_count_500', 'cafe_sum_5000_max_price_avg', 'green_part_500', 'trc_sqm_1000', 'workplaces_km', 'bus_terminal_avto_km', 'cafe_count_1500', 'big_church_count_5000', 'school_education_centers_raion', '0_6_male', 'product_type', 'cafe_sum_1000_min_price_avg', 'green_part_2000', 'ID_railroad_terminal', 'cafe_count_1500_price_500', 'office_count_5000', 'railroad_station_avto_km', 'cafe_count_500_price_4000', 'sport_count_500', 'big_road1_km', 'office_sqm_1500', 'prom_part_2000', 'market_shop_km', 'cafe_sum_5000_min_price_avg', 'office_sqm_2000', 'culture_objects_top_25_raion', 'cafe_count_1000_na_price', 'church_synagogue_km', 'preschool_km', 'ice_rink_km']

common_elements = list(set(array1) & set(array2))
print("Common elements:", len(common_elements))

In [137]:
count = (X['life_sq'] > X['full_sq']).sum()
print(count)

# Select only numeric columns from X
numeric_X = X.select_dtypes(include=[np.number])

# Count the number of np.inf and -np.inf values
inf_count = np.isinf(numeric_X).sum().sum()
neg_inf_count = np.isneginf(numeric_X).sum().sum()

print("Number of np.inf values:", inf_count)
print("Number of -np.inf values:", neg_inf_count)

37
Number of np.inf values: 0
Number of -np.inf values: 0


In [None]:
# Create the histogram
plt.figure(figsize=(25, 15))  # Adjust the size as needed
plt.hist(X_train["price_doc"], bins=500)

# Get the current axes object
ax = plt.gca()

# Use a logarithmic scale for the x-axis
ax.set_xscale('log')

# Define specific tick positions for custom intervals
# Ticks from 1M to 20M with steps of 1M, and from 20M to 100M with steps of 10M
ticks = np.concatenate([np.arange(1e6, 2.1e7, 1e6), np.arange(2e7, 1.1e8, 1e7)])

# Apply the tick positions to the x-axis
ax.set_xticks(ticks)

# Define a custom formatter to display x-axis labels in millions
ax.set_xticklabels([f'{int(tick / 1_000_000)}M' for tick in ticks])

# Rotate x-axis labels to 90 degrees for better readability
plt.xticks(rotation=90)

# Set the x-axis limits to start at 1 million
ax.set_xlim(left=1e6)

# Set labels and title
plt.xlabel('Price (in millions)')
plt.ylabel('Frequency')
plt.title('Histogram with Focused Logarithmic X-axis')

# Display the plot
plt.show()


In [None]:
corr_matrix = X_train.select_dtypes(include=[np.number]).corr()

In [None]:
corr_matrix

In [None]:
plt.figure(figsize=(10, 10))  # Adjust the figure size as needed
sns.heatmap(corr_matrix, annot=False, cmap='coolwarm', center=0)
plt.title('Correlation Matrix Heatmap')
plt.show()

In [None]:
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    display(corr_matrix)


In [None]:
pd.set_option('display.max_info_columns', 300)
non_numeric_cols = X_train.select_dtypes(exclude=[np.number])
non_numeric_cols.info()