ALESHA try

In [None]:
import psycopg2
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from category_encoders import TargetEncoder
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, mean_absolute_percentage_error
import xgboost as xgb
from google.colab import userdata
from sklearn.decomposition import PCA

# Load data
def load_data():
    """Load every source table from PostgreSQL into pandas DataFrames.

    Connection settings are read from the Colab ``userdata`` store under the
    keys ``db_host``, ``db_port``, ``db_name``, ``db_user`` and
    ``db_password``.

    Returns:
        dict: table name -> DataFrame holding the result of
        ``SELECT * FROM public.<table>``.
    """
    db_host = userdata.get('db_host')
    db_port = userdata.get('db_port')
    db_name = userdata.get('db_name')
    db_user = userdata.get('db_user')
    db_password = userdata.get('db_password')

    # The port may be stored as text; convert it before connecting.
    if db_port and isinstance(db_port, str):
        db_port = int(db_port)

    connection = psycopg2.connect(
        host=db_host,
        port=db_port,
        database=db_name,
        user=db_user,
        password=db_password
    )

    tables = ["addresses", "developers", "offers", "offers_details",
              "realty_details", "realty_inside", "realty_outside"]

    dfs = {}
    try:
        for table in tables:
            # Table names come from the fixed list above, never from user
            # input, so interpolating them into the SQL text is safe here.
            query = f"SELECT * FROM public.{table};"
            # NOTE(review): the run log shows pandas warning on this call,
            # presumably because a raw DBAPI connection is passed instead of
            # an SQLAlchemy connectable - confirm and switch if desired.
            dfs[table] = pd.read_sql(query, connection)
    finally:
        # Close the connection even when one of the queries fails,
        # otherwise it would be leaked.
        connection.close()

    return dfs

# Data cleaning
def clean_data(dfs):
    """Strip service columns from the raw tables and merge them on ``cian_id``.

    * ``addresses``: ``lat`` / ``lng`` are extracted from the ``coordinates``
      dicts (None when the value is not a dict or the key is absent), then
      ``coordinates`` and ``address`` are dropped.
    * ``realty_details``: ``finish_year`` is extracted from the
      ``finish_date`` dicts, then ``finish_date`` is dropped.
    * every table loses ``id``, ``created_at`` and ``updated_at``.

    ``addresses``, ``offers`` and ``offers_details`` are inner-joined; the
    remaining tables are left-joined onto that result.

    The frames in ``dfs`` are not modified, so the function can be called
    repeatedly on the same input.

    Args:
        dfs: dict of table name -> DataFrame, as returned by ``load_data``.

    Returns:
        DataFrame: the merged table.
    """
    service_columns = ['id', 'created_at', 'updated_at']

    coordinates = dfs['addresses']['coordinates']
    # .get() instead of [] so that a dict without the key yields None
    # (same policy as for finish_date below) instead of raising KeyError.
    addresses = dfs['addresses'].assign(
        lat=coordinates.apply(lambda x: x.get('lat') if isinstance(x, dict) else None),
        lng=coordinates.apply(lambda x: x.get('lng') if isinstance(x, dict) else None),
    ).drop(columns=service_columns + ['coordinates', 'address'])

    realty_details = dfs['realty_details'].assign(
        finish_year=dfs['realty_details']['finish_date'].apply(
            lambda x: x.get('year') if isinstance(x, dict) else None
        )
    ).drop(columns=service_columns + ['finish_date'])

    cleaned = {
        'addresses': addresses,
        'realty_details': realty_details,
    }
    for table in ['developers', 'offers', 'offers_details', 'realty_inside', 'realty_outside']:
        cleaned[table] = dfs[table].drop(columns=service_columns)

    main_df = cleaned['addresses'].merge(cleaned['offers'], on='cian_id', how='inner') \
                                  .merge(cleaned['offers_details'], on='cian_id', how='inner')

    tables_to_left_join = ['developers', 'realty_details', 'realty_inside', 'realty_outside']
    for table in tables_to_left_join:
        main_df = main_df.merge(cleaned[table], on='cian_id', how='left')

    return main_df

# Outlier removal (IQR fences with a wide multiplier; this is not a 3-sigma rule)
def remove_outliers_soft(df, column, factor=3):
    """Keep only the rows whose ``column`` value lies inside the IQR fences.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR`` (both
    inclusive), where Q1/Q3 are the 25th/75th percentiles of ``column``.

    Rows where ``column`` is NaN are removed as well, because comparisons
    with NaN evaluate to False.

    Args:
        df: input DataFrame (not modified).
        column: name of the numeric column to filter on.
        factor: fence width in IQR units; the default of 3 reproduces the
            previously hard-coded behaviour.

    Returns:
        DataFrame: the filtered rows, with their original index labels.
    """
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - factor * IQR
    upper_bound = Q3 + factor * IQR
    return df[(df[column] >= lower_bound) & (df[column] <= upper_bound)]

# Missing-value handling
def handle_missing_values(df):
    """Impute missing values in the merged offers table.

    * counters (loggias, separated_wc, balconies, combined_wc,
      passenger_lifts) -> 0
    * total_rate, review_count, ceiling_height -> column mean
    * build_year -> finish_year of the same row
    * living_area -> total_area * mean(living_area / total_area)
    * kitchen_area -> (total_area - living_area) * mean share of the kitchen
      in the non-living area (rows with zero non-living area are excluded
      from that mean)

    ``finish_year`` is dropped afterwards. ``df`` is modified in place and
    also returned.
    """
    df.fillna({
        'loggias': 0, 'separated_wc': 0, 'balconies': 0, 'combined_wc': 0, 'passenger_lifts': 0,
        'total_rate': df['total_rate'].mean(),
        'review_count': df['review_count'].mean(),
        'ceiling_height': df['ceiling_height'].mean(),
        'build_year': df['finish_year']
    }, inplace=True)

    mean_proportion_living_area = (df['living_area'] / df['total_area']).mean()
    # Assign the result back instead of calling fillna(inplace=True) on
    # df['living_area']: the chained in-place form acts on a temporary object
    # and, per the pandas warning, stops working in pandas 3.0.
    df['living_area'] = df['living_area'].fillna(df['total_area'] * mean_proportion_living_area)

    non_living_area = df['total_area'] - df['living_area']
    mask = non_living_area != 0  # avoid dividing by zero below
    mean_proportion_kitchen_area = (df.loc[mask, 'kitchen_area'] / non_living_area[mask]).mean()
    df['kitchen_area'] = df['kitchen_area'].fillna(non_living_area * mean_proportion_kitchen_area)

    df.drop(columns=['finish_year'], inplace=True)

    return df

# Categorical feature encoding
def encode_features(df):
    """Encode the categorical columns of the merged offers table.

    * one-hot (first level dropped): county, flat_type, sale_type, category,
      material_type, travel_type
    * ordinal: repair_type through a fixed mapping; values outside the
      mapping become NaN
    * target encoding against ``price``: district, project_type, metro

    The original one-hot/target columns are dropped from ``df`` in place and
    ``repair_type`` is overwritten in place; the returned frame is a new
    object with the encoded columns appended.
    """
    onehot_columns = ['county', 'flat_type', 'sale_type', 'category', 'material_type', 'travel_type']
    target_columns = ['district', 'project_type', 'metro']
    ordinal_columns = {'repair_type': {'no': 0, 'cosmetic': 1, 'euro': 2, 'design': 3}}

    onehot_encoder = OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore')
    # Reuse df's index: the encoder returns a bare array, and a default
    # RangeIndex would not match df's labels once rows have been filtered
    # out, so the column-wise concat below would pair values with the wrong
    # rows and introduce all-NaN rows.
    df_onehot = pd.DataFrame(
        onehot_encoder.fit_transform(df[onehot_columns]),
        columns=onehot_encoder.get_feature_names_out(onehot_columns),
        index=df.index,
    )

    for col, mapping in ordinal_columns.items():
        df[col] = df[col].map(mapping)

    # NOTE(review): the encoders are fitted on the whole frame; process_data
    # calls this before train_test_split, so the target encoding sees the
    # prices of the future test rows - consider fitting on train only.
    target_encoder = TargetEncoder()
    df_target = target_encoder.fit_transform(df[target_columns], df['price'])

    df.drop(columns=onehot_columns + target_columns, inplace=True)
    df = pd.concat([df, df_onehot, df_target], axis=1)

    return df

# Drop non-numeric columns
def remove_non_numeric(df):
    """Return a copy of ``df`` without its object-dtype columns."""
    return df.drop(columns=df.select_dtypes(include=['object']).columns)

# Data processing
def process_data():
    """Build the train/test feature matrices and targets for the price model.

    Pipeline: load the tables, clean and merge them, filter outliers, impute
    missing values, encode categoricals, drop the remaining object columns,
    drop rows without a price, split 80/20 (random_state=42) and fill the
    NaNs that are still left with training-set column means.

    Returns:
        tuple: (X_train, X_test, y_train, y_test)
    """
    dfs = load_data()
    df = clean_data(dfs)

    # NOTE(review): remove_outliers_soft keeps only rows that pass a range
    # comparison, which NaN never does, so rows with a missing living_area or
    # kitchen_area are dropped here, before handle_missing_values could
    # impute them - confirm this is intended.
    columns_to_clean = ['price', 'total_area', 'living_area', 'kitchen_area', 'rooms_count']
    for column in columns_to_clean:
        df = remove_outliers_soft(df, column)

    df = handle_missing_values(df)
    df = encode_features(df)
    df = remove_non_numeric(df)

    df = df.dropna(subset=['price'])

    X = df.drop(columns=['price'])
    y = df['price']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Impute both parts with statistics of the training part only: filling
    # the test set with its own means would use test-set information and
    # give train and test different fill values.
    train_means = X_train.mean()
    X_train = X_train.fillna(train_means)
    X_test = X_test.fillna(train_means)

    return X_train, X_test, y_train, y_test

def reduce_dimensionality(X_train, X_test, n_components=0.95, scale=True):
    """Project the features onto principal components fitted on the train set.

    PCA centers but does not scale its input, so without standardisation the
    explained-variance threshold is dominated by the columns with the largest
    numeric range. With ``scale=True`` (default) the features are therefore
    standardised first, using statistics of the training set only.

    Args:
        X_train: training feature matrix; the scaler and PCA are fitted on it.
        X_test: test feature matrix; only transformed.
        n_components: forwarded to ``PCA(n_components=...)``.
        scale: standardise features before PCA; pass False to run PCA on the
            raw values.

    Returns:
        tuple: (X_train_pca, X_test_pca) as returned by the PCA transform.
    """
    if scale:
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)
    else:
        X_train_scaled, X_test_scaled = X_train, X_test

    pca = PCA(n_components=n_components)
    X_train_pca = pca.fit_transform(X_train_scaled)
    X_test_pca = pca.transform(X_test_scaled)

    print(f"Исходное количество признаков: {X_train.shape[1]}")
    print(f"После PCA осталось признаков: {X_train_pca.shape[1]}")

    return X_train_pca, X_test_pca

# Run the data preparation
X_train, X_test, y_train, y_test = process_data()

# Dimensionality reduction
X_train, X_test = reduce_dimensionality(X_train, X_test, n_components=0.95)

# Train the XGBoost model
xgb_params = {
    'n_estimators': 500,
    'learning_rate': 0.05,
    'max_depth': 6,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'random_state': 42,
}
model = xgb.XGBRegressor(**xgb_params)

model.fit(X_train, y_train)

# Predictions
train_pred = model.predict(X_train)
test_pred = model.predict(X_test)

# Model evaluation: the same four metrics for each split
for split_name, y_true, y_pred in (('train', y_train, train_pred), ('test', y_test, test_pred)):
    print(
        f"Metrics_{split_name}:\n"
        f"MSE = {mean_squared_error(y_true, y_pred):,.2f}\n"
        f"R2  = {r2_score(y_true, y_pred):,.4f}\n"
        f"MAE = {mean_absolute_error(y_true, y_pred):,.2f}\n"
        f"MAPE = {mean_absolute_percentage_error(y_true, y_pred):,.4f}\n"
    )

  dfs[table] = pd.read_sql(query, connection)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['living_area'].fillna(df['total_area'] * mean_proportion_living_area, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['kitchen_area'].fillna((df['total_area'] - df['living_area']) * mean_proportion_kitchen_area, in

Исходное количество признаков: 62
После PCA осталось признаков: 3
Metrics_train:
MSE = 101,707,035,820,999.14
R2  = 0.6166
MAE = 6,596,101.66
MAPE = 0.3423

Metrics_test:
MSE = 118,098,647,630,129.42
R2  = 0.5590
MAE = 7,030,139.52
MAPE = 0.3521



Без PCA (BestTry)

In [None]:
import psycopg2
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from category_encoders import TargetEncoder
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, mean_absolute_percentage_error
import xgboost as xgb
from google.colab import userdata

# Load data
def load_data():
    """Read all source tables from PostgreSQL and return them as DataFrames.

    The connection parameters come from the Colab ``userdata`` store
    (``db_host``, ``db_port``, ``db_name``, ``db_user``, ``db_password``).

    Returns:
        dict: table name -> DataFrame with every row of ``public.<table>``.
    """
    from contextlib import closing

    db_host = userdata.get('db_host')
    db_port = userdata.get('db_port')
    db_name = userdata.get('db_name')
    db_user = userdata.get('db_user')
    db_password = userdata.get('db_password')

    # The port may be stored as text; convert it before connecting.
    if db_port and isinstance(db_port, str):
        db_port = int(db_port)

    tables = ["addresses", "developers", "offers", "offers_details",
              "realty_details", "realty_inside", "realty_outside"]

    dfs = {}
    # closing() guarantees connection.close() even if a query raises, so a
    # failed read no longer leaks the connection.
    with closing(psycopg2.connect(
        host=db_host,
        port=db_port,
        database=db_name,
        user=db_user,
        password=db_password
    )) as connection:
        for table in tables:
            # Only names from the fixed list above are interpolated.
            query = f"SELECT * FROM public.{table};"
            dfs[table] = pd.read_sql(query, connection)

    return dfs

# Data cleaning
def clean_data(dfs):
    """Remove service columns from the raw tables and join them on ``cian_id``.

    ``lat`` / ``lng`` are taken from the ``coordinates`` dicts of
    ``addresses`` and ``finish_year`` from the ``finish_date`` dicts of
    ``realty_details``; a value that is not a dict, or a dict without the
    key, yields None. ``id``, ``created_at`` and ``updated_at`` are dropped
    everywhere, together with ``coordinates`` / ``address`` and
    ``finish_date``.

    ``addresses``, ``offers`` and ``offers_details`` are inner-joined, the
    other tables are left-joined. The function works on copies, so the
    frames in ``dfs`` stay untouched and the call can be repeated.

    Args:
        dfs: dict of table name -> DataFrame, as returned by ``load_data``.

    Returns:
        DataFrame: the merged table.
    """
    service_columns = ['id', 'created_at', 'updated_at']
    extra_columns = {
        'addresses': ['coordinates', 'address'],
        'realty_details': ['finish_date'],
    }

    # Work on copies so the caller's frames keep their original columns.
    frames = {name: frame.copy() for name, frame in dfs.items()}

    coordinates = frames['addresses']['coordinates']
    # .get() keeps partial dicts from raising KeyError (same policy as for
    # finish_date below).
    frames['addresses']['lat'] = coordinates.apply(lambda x: x.get('lat') if isinstance(x, dict) else None)
    frames['addresses']['lng'] = coordinates.apply(lambda x: x.get('lng') if isinstance(x, dict) else None)
    frames['realty_details']['finish_year'] = frames['realty_details']['finish_date'].apply(
        lambda x: x.get('year') if isinstance(x, dict) else None
    )

    for name in ['addresses', 'developers', 'offers', 'offers_details',
                 'realty_details', 'realty_inside', 'realty_outside']:
        frames[name] = frames[name].drop(columns=service_columns + extra_columns.get(name, []))

    main_df = frames['addresses']
    for name in ['offers', 'offers_details']:
        main_df = main_df.merge(frames[name], on='cian_id', how='inner')

    tables_to_left_join = ['developers', 'realty_details', 'realty_inside', 'realty_outside']
    for table in tables_to_left_join:
        main_df = main_df.merge(frames[table], on='cian_id', how='left')

    return main_df

# Outlier removal
def remove_outliers_soft(df, column):
    """Return the rows of ``df`` whose ``column`` lies within 3 IQRs of the quartiles.

    The accepted range is [Q1 - 3*IQR, Q3 + 3*IQR], bounds included. Rows
    with NaN in ``column`` never satisfy the range check and are dropped.
    """
    values = df[column]
    first_quartile = values.quantile(0.25)
    third_quartile = values.quantile(0.75)
    spread = third_quartile - first_quartile
    in_range = values.between(first_quartile - 3 * spread, third_quartile + 3 * spread)
    return df[in_range]

# Missing-value handling
def handle_missing_values(df):
    """Fill the gaps in the merged offers table.

    * loggias, separated_wc, balconies, combined_wc, passenger_lifts -> 0
    * total_rate, review_count, ceiling_height -> mean of the column
    * build_year -> finish_year of the same row
    * living_area -> total_area times the mean living/total ratio
    * kitchen_area -> non-living area times the mean kitchen share of the
      non-living area (rows whose non-living area is zero are left out of
      that mean)

    The helper column ``finish_year`` is removed at the end. ``df`` is
    modified in place and returned.
    """
    df.fillna({
        'loggias': 0, 'separated_wc': 0, 'balconies': 0, 'combined_wc': 0, 'passenger_lifts': 0,
        'total_rate': df['total_rate'].mean(),
        'review_count': df['review_count'].mean(),
        'ceiling_height': df['ceiling_height'].mean(),
        'build_year': df['finish_year']
    }, inplace=True)

    mean_proportion_living_area = (df['living_area'] / df['total_area']).mean()
    estimated_living_area = df['total_area'] * mean_proportion_living_area
    # Plain assignment replaces df['living_area'].fillna(..., inplace=True):
    # the chained in-place call acts on a temporary object and, per the
    # pandas warning, stops working in pandas 3.0.
    df['living_area'] = df['living_area'].fillna(estimated_living_area)

    remaining_area = df['total_area'] - df['living_area']
    mask = remaining_area != 0  # rows where the ratio below is defined
    mean_proportion_kitchen_area = (df.loc[mask, 'kitchen_area'] / remaining_area[mask]).mean()
    estimated_kitchen_area = remaining_area * mean_proportion_kitchen_area
    df['kitchen_area'] = df['kitchen_area'].fillna(estimated_kitchen_area)

    df.drop(columns=['finish_year'], inplace=True)

    return df

# Categorical feature encoding
def encode_features(df):
    """Turn the categorical columns into numeric features.

    * county, flat_type, sale_type, category, material_type, travel_type:
      one-hot encoded with the first level dropped
    * repair_type: mapped to 0..3 through a fixed dictionary (values that
      are not in the dictionary become NaN)
    * district, project_type, metro: target-encoded against ``price``

    ``df`` loses the one-hot/target source columns in place and has
    ``repair_type`` overwritten in place; the result is a new frame with the
    encoded columns appended.
    """
    onehot_columns = ['county', 'flat_type', 'sale_type', 'category', 'material_type', 'travel_type']
    target_columns = ['district', 'project_type', 'metro']
    ordinal_columns = {'repair_type': {'no': 0, 'cosmetic': 1, 'euro': 2, 'design': 3}}

    onehot_encoder = OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore')
    onehot_values = onehot_encoder.fit_transform(df[onehot_columns])
    # The encoder output is a bare array. It must carry df's index labels:
    # with a default RangeIndex the axis=1 concat below aligns on labels that
    # no longer match after row filtering, which scrambles the one-hot
    # values and adds all-NaN rows.
    df_onehot = pd.DataFrame(
        onehot_values,
        columns=onehot_encoder.get_feature_names_out(onehot_columns),
        index=df.index,
    )

    for col, mapping in ordinal_columns.items():
        df[col] = df[col].map(mapping)

    # NOTE(review): fitted on the whole frame, i.e. before process_data
    # performs the train/test split, so test-row prices leak into the
    # target encoding - consider fitting on the training part only.
    target_encoder = TargetEncoder()
    df_target = target_encoder.fit_transform(df[target_columns], df['price'])

    df.drop(columns=onehot_columns + target_columns, inplace=True)
    df = pd.concat([df, df_onehot, df_target], axis=1)

    return df

# Drop non-numeric columns
def remove_non_numeric(df):
    """Drop every object-dtype column and return the resulting new frame."""
    object_columns = list(df.select_dtypes(include=['object']))
    return df.drop(columns=object_columns)

# Data processing
def process_data():
    """Prepare the train/test split used by the price model.

    Loads and merges the tables, filters outliers, imputes missing values,
    encodes categoricals, removes the remaining object columns and rows
    without a price, splits 80/20 with ``random_state=42`` and fills the
    NaNs that are still left with training-set column means.

    Returns:
        tuple: (X_train, X_test, y_train, y_test)
    """
    dfs = load_data()
    df = clean_data(dfs)

    # NOTE(review): the outlier filter keeps only rows that pass a range
    # comparison, which NaN never does, so rows with a missing living_area
    # or kitchen_area are removed here and never reach the imputation in
    # handle_missing_values - confirm this is intended.
    columns_to_clean = ['price', 'total_area', 'living_area', 'kitchen_area', 'rooms_count']
    for column in columns_to_clean:
        df = remove_outliers_soft(df, column)

    df = handle_missing_values(df)
    df = encode_features(df)
    df = remove_non_numeric(df)

    df = df.dropna(subset=['price'])

    X = df.drop(columns=['price'])
    y = df['price']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # The fill values are learned on the training part and applied to both
    # parts; using the test set's own means would leak test information and
    # make the two parts inconsistent.
    fill_values = X_train.mean()
    X_train = X_train.fillna(fill_values)
    X_test = X_test.fillna(fill_values)

    return X_train, X_test, y_train, y_test

# Run the data preparation
X_train, X_test, y_train, y_test = process_data()

# Train the XGBoost model
model = xgb.XGBRegressor(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
)
model.fit(X_train, y_train)

# Predictions
train_pred = model.predict(X_train)
test_pred = model.predict(X_test)

# Model evaluation: (label, metric function, number format) per report line
metric_rows = [
    ('MSE', mean_squared_error, ',.2f'),
    ('R2 ', r2_score, ',.4f'),
    ('MAE', mean_absolute_error, ',.2f'),
    ('MAPE', mean_absolute_percentage_error, ',.4f'),
]
for split_name, y_true, y_pred in [('train', y_train, train_pred), ('test', y_test, test_pred)]:
    report_lines = [f'Metrics_{split_name}:']
    for label, metric, number_format in metric_rows:
        report_lines.append(f'{label} = {format(metric(y_true, y_pred), number_format)}')
    print('\n'.join(report_lines) + '\n')


  dfs[table] = pd.read_sql(query, connection)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['living_area'].fillna(df['total_area'] * mean_proportion_living_area, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['kitchen_area'].fillna((df['total_area'] - df['living_area']) * mean_proportion_kitchen_area, in

Metrics_train:
MSE = 10,049,130,785,573.83
R2  = 0.9621
MAE = 1,985,901.11
MAPE = 0.1115

Metrics_test:
MSE = 18,647,666,726,402.01
R2  = 0.9303
MAE = 2,493,486.56
MAPE = 0.1262



ГОШАН,
напишешь, что сделали 2 pipeline: с PCA качество модели упало, поэтому мы использовали XGBoost без PCA. Напиши про метрики.

In [None]:
import psycopg2
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from category_encoders import TargetEncoder
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, mean_absolute_percentage_error
from sklearn.tree import DecisionTreeRegressor

# Load data
def load_data():
    """Fetch all source tables from PostgreSQL as pandas DataFrames.

    Connection settings are read from the Colab ``userdata`` store
    (``db_host``, ``db_port``, ``db_name``, ``db_user``, ``db_password``).

    Returns:
        dict: table name -> DataFrame with the contents of
        ``public.<table>``.
    """
    # This cell does not import userdata at the top, so bring it into scope
    # here; otherwise the function raises NameError on a fresh kernel.
    from google.colab import userdata

    db_host = userdata.get('db_host')
    db_port = userdata.get('db_port')
    db_name = userdata.get('db_name')
    db_user = userdata.get('db_user')
    db_password = userdata.get('db_password')

    # The port may be stored as text; convert it before connecting.
    if db_port and isinstance(db_port, str):
        db_port = int(db_port)

    connection = psycopg2.connect(
        host=db_host,
        port=db_port,
        database=db_name,
        user=db_user,
        password=db_password
    )

    tables = ["addresses", "developers", "offers", "offers_details",
              "realty_details", "realty_inside", "realty_outside"]

    dfs = {}
    try:
        for table in tables:
            # Only names from the fixed list above are interpolated.
            query = f"SELECT * FROM public.{table};"
            dfs[table] = pd.read_sql(query, connection)
    finally:
        # Release the connection even when a query fails.
        connection.close()

    return dfs

# Data cleaning
def clean_data(dfs):
    """Drop service columns from the raw tables and merge them on ``cian_id``.

    ``addresses`` gains ``lat`` / ``lng`` (from the ``coordinates`` dicts)
    and ``realty_details`` gains ``finish_year`` (from the ``finish_date``
    dicts); non-dict values and missing keys give None. ``id``,
    ``created_at``, ``updated_at`` are dropped from every table, plus
    ``coordinates`` / ``address`` and ``finish_date``.

    ``addresses``, ``offers`` and ``offers_details`` are inner-joined; the
    remaining tables are left-joined onto the result. The input frames are
    left unmodified, so calling the function twice on the same ``dfs`` works.

    Args:
        dfs: dict of table name -> DataFrame, as returned by ``load_data``.

    Returns:
        DataFrame: the merged table.
    """
    def extract(series, key):
        # .get() so that a dict lacking the key yields None, not KeyError.
        return series.apply(lambda x: x.get(key) if isinstance(x, dict) else None)

    service_columns = ['id', 'created_at', 'updated_at']

    addresses = dfs['addresses'].copy()
    addresses['lat'] = extract(addresses['coordinates'], 'lat')
    addresses['lng'] = extract(addresses['coordinates'], 'lng')
    addresses = addresses.drop(columns=['id', 'coordinates', 'address', 'created_at', 'updated_at'])

    realty_details = dfs['realty_details'].copy()
    realty_details['finish_year'] = extract(realty_details['finish_date'], 'year')
    realty_details = realty_details.drop(columns=['id', 'finish_date', 'created_at', 'updated_at'])

    offers = dfs['offers'].drop(columns=service_columns)
    offers_details = dfs['offers_details'].drop(columns=service_columns)

    main_df = addresses.merge(offers, on='cian_id', how='inner') \
                       .merge(offers_details, on='cian_id', how='inner')

    tables_to_left_join = [
        dfs['developers'].drop(columns=service_columns),
        realty_details,
        dfs['realty_inside'].drop(columns=service_columns),
        dfs['realty_outside'].drop(columns=service_columns),
    ]
    for table in tables_to_left_join:
        main_df = main_df.merge(table, on='cian_id', how='left')

    return main_df

# Outlier removal
def remove_outliers_soft(df, column):
    """Filter ``df`` to rows whose ``column`` is inside the 3*IQR fences.

    Fences are Q1 - 3*IQR and Q3 + 3*IQR, inclusive. A NaN in ``column``
    fails both comparisons, so such rows are removed too.
    """
    series = df[column]
    q1, q3 = series.quantile(0.25), series.quantile(0.75)
    margin = 3 * (q3 - q1)
    keep = (series >= q1 - margin) & (series <= q3 + margin)
    return df[keep]

# Missing-value handling
def handle_missing_values(df):
    """Impute the missing values of the merged offers table.

    * loggias, separated_wc, balconies, combined_wc, passenger_lifts -> 0
    * total_rate, review_count, ceiling_height -> column mean
    * build_year -> finish_year of the same row
    * living_area -> total_area * mean(living_area / total_area)
    * kitchen_area -> (total_area - living_area) * mean kitchen share of
      that difference, the mean being taken over rows where the difference
      is non-zero

    ``finish_year`` is dropped afterwards. ``df`` is modified in place and
    returned.
    """
    df.fillna({
        'loggias': 0, 'separated_wc': 0, 'balconies': 0, 'combined_wc': 0, 'passenger_lifts': 0,
        'total_rate': df['total_rate'].mean(),
        'review_count': df['review_count'].mean(),
        'ceiling_height': df['ceiling_height'].mean(),
        'build_year': df['finish_year']
    }, inplace=True)

    # The two area columns are filled via assignment rather than
    # df[col].fillna(..., inplace=True): the chained in-place call acts on a
    # temporary object and, per the pandas warning, stops working in
    # pandas 3.0.
    mean_proportion_living_area = (df['living_area'] / df['total_area']).mean()
    df['living_area'] = df['living_area'].fillna(df['total_area'] * mean_proportion_living_area)

    area_left = df['total_area'] - df['living_area']
    mask = area_left != 0  # skip rows that would divide by zero
    mean_proportion_kitchen_area = (df.loc[mask, 'kitchen_area'] / area_left.loc[mask]).mean()
    df['kitchen_area'] = df['kitchen_area'].fillna(area_left * mean_proportion_kitchen_area)

    df.drop(columns=['finish_year'], inplace=True)

    return df

# Categorical feature encoding
def encode_features(df):
    """Encode categorical columns as numbers.

    * one-hot with the first level dropped: county, flat_type, sale_type,
      category, material_type, travel_type
    * fixed ordinal mapping for repair_type (unmapped values become NaN)
    * target encoding against ``price``: district, project_type, metro

    The one-hot/target source columns are dropped from ``df`` in place and
    ``repair_type`` is overwritten in place; a new frame with the encoded
    columns appended is returned.
    """
    onehot_columns = ['county', 'flat_type', 'sale_type', 'category', 'material_type', 'travel_type']
    target_columns = ['district', 'project_type', 'metro']
    ordinal_columns = {'repair_type': {'no': 0, 'cosmetic': 1, 'euro': 2, 'design': 3}}

    onehot_encoder = OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore')
    # index=df.index keeps the encoded rows aligned with df. Without it the
    # new frame is labelled 0..n-1 while df keeps the labels that survived
    # filtering, and the label-based concat below mismatches rows and adds
    # NaN-filled ones.
    df_onehot = pd.DataFrame(
        onehot_encoder.fit_transform(df[onehot_columns]),
        index=df.index,
        columns=onehot_encoder.get_feature_names_out(onehot_columns),
    )

    for col, mapping in ordinal_columns.items():
        df[col] = df[col].map(mapping)

    # NOTE(review): process_data calls this before train_test_split, so the
    # target encoder is fitted on prices of rows that later form the test
    # set - consider fitting on the training part only.
    target_encoder = TargetEncoder()
    df_target = target_encoder.fit_transform(df[target_columns], df['price'])

    df.drop(columns=onehot_columns + target_columns, inplace=True)
    df = pd.concat([df, df_onehot, df_target], axis=1)

    return df

# Drop non-numeric columns
def remove_non_numeric(df):
    """Return ``df`` without the columns whose dtype is object (input unchanged)."""
    to_remove = df.select_dtypes(include=['object']).columns
    numeric_only = df.drop(columns=to_remove)
    return numeric_only

# Data processing
def process_data():
    """Produce the train/test matrices and targets for the price model.

    Loads and merges the tables, removes outliers, imputes missing values,
    encodes categoricals, drops leftover object columns and rows without a
    price, performs an 80/20 split (``random_state=42``) and fills the NaNs
    that are still left with column means of the training part.

    Returns:
        tuple: (X_train, X_test, y_train, y_test)
    """
    dfs = load_data()
    df = clean_data(dfs)

    # NOTE(review): rows with NaN in living_area / kitchen_area fail the
    # range comparison inside remove_outliers_soft and are dropped here, so
    # the imputation of those columns in handle_missing_values has nothing
    # left to fill - confirm this is intended.
    columns_to_clean = ['price', 'total_area', 'living_area', 'kitchen_area', 'rooms_count']
    for column in columns_to_clean:
        df = remove_outliers_soft(df, column)

    df = handle_missing_values(df)
    df = encode_features(df)
    df = remove_non_numeric(df)

    df = df.dropna(subset=['price'])

    X = df.drop(columns=['price'])
    y = df['price']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Use the training means for both parts, so the test set is imputed
    # with the values the model was trained with and no test statistics
    # are involved.
    column_means = X_train.mean()
    X_train = X_train.fillna(column_means)
    X_test = X_test.fillna(column_means)

    return X_train, X_test, y_train, y_test

# Run the data preparation
X_train, X_test, y_train, y_test = process_data()

# Train the Decision Tree model
tree_params = dict(
    max_depth=10,
    min_samples_split=10,
    min_samples_leaf=5,
    random_state=42,
)
model = DecisionTreeRegressor(**tree_params)
model.fit(X_train, y_train)

# Predictions
train_pred = model.predict(X_train)
test_pred = model.predict(X_test)


# Model evaluation
def _print_metrics(split_name, y_true, y_pred):
    """Print MSE, R2, MAE and MAPE for one split in the report format."""
    print(f'''Metrics_{split_name}:
MSE = {mean_squared_error(y_true, y_pred):,.2f}
R2  = {r2_score(y_true, y_pred):,.4f}
MAE = {mean_absolute_error(y_true, y_pred):,.2f}
MAPE = {mean_absolute_percentage_error(y_true, y_pred):,.4f}
''')


_print_metrics('train', y_train, train_pred)
_print_metrics('test', y_test, test_pred)

  dfs[table] = pd.read_sql(query, connection)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['living_area'].fillna(df['total_area'] * mean_proportion_living_area, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['kitchen_area'].fillna((df['total_area'] - df['living_area']) * mean_proportion_kitchen_area, in

Metrics_train:
MSE = 24,853,671,609,465.50
R2  = 0.9065
MAE = 2,997,974.26
MAPE = 0.1575

Metrics_test:
MSE = 38,735,299,387,382.66
R2  = 0.8545
MAE = 3,710,371.25
MAPE = 0.1892

