# **Подготовка данных**

In [2]:
import psycopg2
import pandas as pd
from google.colab import userdata

db_host = userdata.get('db_host')
db_port = userdata.get('db_port')
db_name = userdata.get('db_name')
db_user = userdata.get('db_user')
db_password = userdata.get('db_password')

connection = psycopg2.connect(
    host=db_host,
    port=db_port,
    database=db_name,
    user=db_user,
    password=db_password
)

tables = ["addresses", "developers", "offers", "offers_details", "realty_details", "realty_inside", "realty_outside"]

dfs = {}
for table in tables:
    query = f"SELECT * FROM public.{table};"
    dfs[table] = pd.read_sql(query, connection)

connection.close()

  dfs[table] = pd.read_sql(query, connection)


In [3]:
import random
random.seed(42)

In [4]:
# Addresses
dfs['addresses']['lat'] = dfs['addresses']['coordinates'].apply(lambda x: x['lat'] if isinstance(x, dict) else None)
dfs['addresses']['lng'] = dfs['addresses']['coordinates'].apply(lambda x: x['lng'] if isinstance(x, dict) else None)
dfs['addresses'].drop(columns=['id', 'coordinates', 'address', 'created_at', 'updated_at'], inplace=True)

In [5]:
# Developers
dfs['developers'].drop(columns=['id', 'created_at', 'updated_at'], inplace=True)

In [6]:
# offers
dfs['offers'] = dfs['offers'].dropna(subset=['photos_count'])
dfs['offers']['publication_at'] = pd.to_datetime(pd.to_datetime(dfs['offers']['publication_at'], unit = 's', utc = True).dt.date)
dfs['offers'].drop(columns=['id', 'created_at', 'updated_at'], inplace=True)

In [7]:
# offers_details
dfs['offers_details'] = dfs['offers_details'][dfs['offers_details']['agent_name'] != "Росимущество"].reset_index(drop=True)
dfs['offers_details'].drop(columns=['id', 'created_at', 'updated_at'], inplace=True)

In [8]:
# realty_details
dfs['realty_details']['finish_year'] = dfs['realty_details']['finish_date'].apply(lambda x: x.get('year') if isinstance(x, dict) else None)
# dfs['realty_details']['finish_quarter'] = dfs['realty_details']['finish_date'].apply(lambda x: x.get('quarter') if isinstance(x, dict) else None)
dfs['realty_details']['realty_type'] = dfs['realty_details']['realty_type'].replace('none', None)
dfs['realty_details']['heat_type'] = dfs['realty_details']['heat_type'].replace('none', None)
dfs['realty_details'].loc[dfs['realty_details']['finish_year'] <= 0, 'finish_year'] = None
# dfs['realty_details'].loc[dfs['realty_details']['finish_quarter'] <= 0, 'finish_quarter'] = None

dfs['realty_details'].drop(columns=['id', 'finish_date', 'created_at', 'updated_at'], inplace=True)

In [9]:
# realty_inside
dfs['realty_inside'].drop(columns=['id', 'created_at', 'updated_at'], inplace=True)

In [10]:
# realty_outside
dfs['realty_outside']['material_type'] = dfs['realty_outside']['material_type'].replace('none', None)
dfs['realty_outside']['material_type'] = dfs['realty_outside']['parking_type'].replace('none', None)

dfs['realty_outside'].drop(columns=['id', 'created_at', 'updated_at'], inplace=True)

In [11]:
main_df = dfs['addresses'].merge(dfs['offers'], on='cian_id', how='inner').merge(dfs['offers_details'], on='cian_id', how='inner')

tables_to_left_join = ['developers', 'realty_details', 'realty_inside', 'realty_outside']
for table in tables_to_left_join:
    main_df = main_df.merge(dfs[table], on='cian_id', how='left')

In [12]:
main_df.shape

(65762, 60)

In [13]:
numeric_df = main_df.select_dtypes(include='number')
missing_values = numeric_df.isnull().sum().sort_values(ascending=False)
missing_values

entrances          31107
cargo_lifts        30017
loggias            28934
foundation_year    28535
separated_wc       28270
review_count       28165
total_rate         28165
finish_year        28056
balconies          27668
buildings_count    27473
build_year         23121
lifts_count        18105
ceiling_height     17660
combined_wc        16479
passenger_lifts    16091
living_area        16036
kitchen_area       12273
rooms_count         9875
travel_time          821
views_count          446
floors_count           0
price                  0
photos_count           0
floor_number           0
lat                    0
cian_id                0
lng                    0
total_area             0
dtype: int64

In [14]:
import numpy as np
main_df = main_df[main_df['photos_count'] >= 0].reset_index(drop=True)

main_df['loggias'] = main_df['loggias'].replace({np.nan: 0})
main_df['separated_wc'] = main_df['separated_wc'].replace({np.nan: 0})
# main_df['finish_quarter'] = main_df['finish_quarter'].replace({np.nan: 0})
main_df['balconies'] = main_df['balconies'].replace({np.nan: 0})
main_df['combined_wc'] = main_df['combined_wc'].replace({np.nan: 0})
main_df['passenger_lifts'] = main_df['passenger_lifts'].replace({np.nan: 0})

main_df['total_rate'] = main_df['total_rate'].fillna(main_df['total_rate'].mean())
main_df['review_count'] = main_df['review_count'].fillna(main_df['review_count'].mean())
main_df['ceiling_height'] = main_df['ceiling_height'].fillna(main_df['ceiling_height'].mean())

mean_proportion_ceiling_height = (main_df['ceiling_height'] / main_df['total_area']).mean()
main_df.loc[main_df['ceiling_height'] <= 0, 'ceiling_height'] = main_df['total_area'] * mean_proportion_ceiling_height

mean_proportion_living_area = (main_df['living_area'] / main_df['total_area']).mean()
main_df['living_area'] = main_df['living_area'].fillna(main_df['total_area'] * mean_proportion_living_area)

mask = (main_df['total_area'] - main_df['living_area']) != 0
mean_proportion_kitchen_area = (main_df.loc[mask, 'kitchen_area'] / (main_df.loc[mask, 'total_area'] - main_df.loc[mask, 'living_area'])).mean()
main_df['kitchen_area'] = main_df['kitchen_area'].fillna((main_df['total_area'] - main_df['living_area']).replace(0, np.nan) * mean_proportion_kitchen_area)


mean_proportion_rooms_count = (main_df['rooms_count'] / main_df['living_area']).mean()
main_df['rooms_count'] = main_df['rooms_count'].fillna(main_df['living_area']*mean_proportion_rooms_count).astype(int)
main_df['build_year'] = main_df.apply(lambda row: row['finish_year'] if pd.isna(row['build_year']) else row['build_year'], axis=1)

main_df = main_df.dropna(subset=['travel_time', 'views_count', 'kitchen_area', 'build_year']).copy()
main_df.drop(columns=['entrances', 'cargo_lifts', 'foundation_year', 'buildings_count', 'lifts_count', 'finish_year'], inplace=True)


In [15]:
main_df.shape

(63332, 54)

In [16]:
numeric_df = main_df.select_dtypes(include='number')
missing_values = numeric_df.isnull().sum().sort_values(ascending=False)
missing_values

cian_id            0
travel_time        0
lat                0
lng                0
price              0
views_count        0
photos_count       0
floor_number       0
floors_count       0
review_count       0
total_rate         0
total_area         0
living_area        0
kitchen_area       0
ceiling_height     0
balconies          0
loggias            0
rooms_count        0
separated_wc       0
combined_wc        0
build_year         0
passenger_lifts    0
dtype: int64

In [17]:
category_df = main_df.select_dtypes(include=['object'])
missing_values2 = category_df.isnull().sum().sort_values(ascending=False)
missing_values2

is_penthouse           55538
garbage_chute          45747
project_type           34449
heat_type              30678
is_reliable            26265
name                   25761
is_emergency           24552
gas_type               24552
renovation_programm    24552
repair_type            22959
parking_type           22940
material_type          22940
windows_view           19674
street                 14466
agent_name             12762
is_apartment           11270
is_mortgage_allowed    10518
district                9559
house                   5790
sale_type                808
price_changes            165
description                0
flat_type                  0
deal_type                  0
travel_type                0
category                   0
metro                      0
county                     0
realty_type                0
dtype: int64

In [18]:
main_df['is_penthouse'] = main_df['is_penthouse'].astype(bool).fillna(False)
main_df['garbage_chute'] = main_df['garbage_chute'].astype(bool).fillna(False)
main_df['is_reliable'] = main_df['is_reliable'].astype(bool).fillna(False)
main_df['is_emergency'] = main_df['is_emergency'].astype(bool).fillna(False)
main_df['is_apartment'] = main_df['is_apartment'].astype(bool).fillna(False)
main_df['is_mortgage_allowed'] = main_df['is_mortgage_allowed'].astype(bool).fillna(False)
main_df['renovation_programm'] = main_df['renovation_programm'].astype(bool).fillna(False)

main_df['photos_count'] = main_df['photos_count'].astype(int)
main_df['price'] = main_df['price'].astype(int)
main_df['travel_time'] = main_df['travel_time'].astype(int)
main_df['views_count'] = main_df['views_count'].astype(int)
main_df['balconies'] = main_df['balconies'].astype(int)
main_df['loggias'] = main_df['loggias'].astype(int)
main_df['separated_wc'] = main_df['separated_wc'].astype(int)
main_df['combined_wc'] = main_df['combined_wc'].astype(int)
main_df['passenger_lifts'] = main_df['passenger_lifts'].astype(int)
main_df['review_count'] = main_df['review_count'].astype(int)
main_df['build_year'] = main_df['build_year'].astype(int)

In [19]:
main_df['project_type'] = main_df['project_type'].replace({np.nan: 'Индивидуальный проект'})

In [20]:
import numpy as np

# Центр Москвы
center_lat = 55.753600
center_lng = 37.621184

# Радиус Земли в километрах
earth_radius_km = 6371

def haversine(lat1, lng1, lat2, lng2):
    # Преобразуем градусы в радианы
    lat1, lng1, lat2, lng2 = map(np.radians, [lat1, lng1, lat2, lng2])

    dlat = lat2 - lat1
    dlng = lng2 - lng1

    # Формула Хаверсина
    a = np.sin(dlat / 2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlng / 2)**2
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
    return earth_radius_km * c

# Добавление нового столбца в DataFrame
main_df['distance_from_center'] = haversine(main_df['lat'], main_df['lng'], center_lat, center_lng)


In [21]:
main_df[['lat', 'lng', 'distance_from_center']]

Unnamed: 0,lat,lng,distance_from_center
0,55.979979,37.188178,36.926101
1,55.712710,37.644477,4.774909
2,55.893965,37.380651,21.664143
3,55.868526,37.681218,13.318349
4,55.693186,37.476913,11.258577
...,...,...,...
65757,55.601533,37.742555,18.542465
65758,55.660856,37.763701,13.640801
65759,55.858978,37.475630,14.833499
65760,55.792316,37.694001,6.266953


In [22]:
main_df = main_df[main_df['is_duplicate'] != True]
main_df = main_df.drop(columns=['is_reliable','heat_type', 'name', 'gas_type', 'parking_type', 'windows_view','street', 'agent_name', 'house', 'is_duplicate', 'cian_id', 'lat', 'lng', 'price_changes', 'description'], axis=1)
main_df = main_df.dropna(subset=['district', 'county', 'sale_type'])

In [23]:
mask = main_df['project_type'] != 'Индивидуальный проект'

main_df.loc[mask, 'material_type'] = main_df.loc[mask].groupby('project_type')['material_type'].transform(lambda x: x.ffill().bfill())

In [24]:
main_df.loc[:, 'material_type'] = main_df['material_type'].fillna(main_df['material_type'].mode()[0])

In [25]:
main_df['material_type'].unique()

array(['ground', 'underground', 'multilevel', 'open', 'roof'],
      dtype=object)

In [26]:
main_df.loc[:, 'repair_type'] = main_df['repair_type'].fillna(main_df['repair_type'].mode()[0])

In [27]:
main_df.columns

Index(['county', 'district', 'metro', 'travel_type', 'travel_time', 'price',
       'category', 'views_count', 'photos_count', 'floor_number',
       'floors_count', 'publication_at', 'deal_type', 'flat_type', 'sale_type',
       'review_count', 'total_rate', 'realty_type', 'project_type',
       'is_apartment', 'is_penthouse', 'is_mortgage_allowed', 'is_premium',
       'is_emergency', 'renovation_programm', 'repair_type', 'total_area',
       'living_area', 'kitchen_area', 'ceiling_height', 'balconies', 'loggias',
       'rooms_count', 'separated_wc', 'combined_wc', 'build_year',
       'material_type', 'garbage_chute', 'passenger_lifts',
       'distance_from_center'],
      dtype='object')

In [28]:
# Extract features from the 'publication_at' datetime column
main_df['publication_at'] = pd.to_datetime(main_df['publication_at'])
main_df['year'] = main_df['publication_at'].dt.year
main_df['month'] = main_df['publication_at'].dt.month
main_df['day_of_week'] = main_df['publication_at'].dt.dayofweek
main_df['day_of_month'] = main_df['publication_at'].dt.day

main_df = main_df.drop(columns=['publication_at'])

In [29]:
main_df = main_df.applymap(lambda x: str(x) if isinstance(x, list) else x)

# Удаляем столбцы с единственным значением
main_df = main_df.loc[:, main_df.nunique() > 1]
main_df

  main_df = main_df.applymap(lambda x: str(x) if isinstance(x, list) else x)


Unnamed: 0,county,district,metro,travel_type,travel_time,price,category,views_count,photos_count,floor_number,...,combined_wc,build_year,material_type,garbage_chute,passenger_lifts,distance_from_center,year,month,day_of_week,day_of_month
0,ЗелАО,Старое Крюково,Зеленоград — Крюково,walk,13,14000000,flatSale,228,21,9,...,0,2022,ground,False,1,36.926101,2024,11,4,1
1,ЮАО,Даниловский,Тульская,walk,25,70000000,flatSale,1183,5,8,...,2,2016,underground,False,3,4.774909,2016,9,0,12
2,СЗАО,Куркино,Сходненская,transport,12,16900000,flatSale,3585,16,1,...,0,1920,ground,False,0,21.664143,2016,4,0,25
3,СВАО,Лосиноостровский,Бабушкинская,walk,14,66000000,flatSale,8408,36,6,...,2,2008,underground,True,1,13.318349,2016,11,0,7
4,ЗАО,Раменки,Мичуринский проспект,walk,12,58507560,newBuildingFlatSale,165,49,24,...,2,2025,underground,False,1,11.258577,2024,8,4,30
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
65757,ЮАО,Орехово-Борисово Южное,Зябликово,walk,16,3000000,flatShareSale,283,9,1,...,0,1986,ground,True,2,18.542465,2024,12,1,3
65758,ЮВАО,Марьино,Братиславская,walk,10,3500000,flatShareSale,413,8,14,...,0,1999,ground,True,1,13.640801,2024,12,1,3
65759,САО,Левобережный,Речной вокзал,walk,7,14786265,newBuildingFlatSale,14,12,21,...,0,2025,ground,False,0,14.833499,2024,12,1,3
65760,ВАО,Сокольники,Сокольники,walk,14,15500000,flatSale,26,12,2,...,1,2013,ground,False,2,6.266953,2024,12,0,9


In [30]:
main_df.reset_index(drop=True)

Unnamed: 0,county,district,metro,travel_type,travel_time,price,category,views_count,photos_count,floor_number,...,combined_wc,build_year,material_type,garbage_chute,passenger_lifts,distance_from_center,year,month,day_of_week,day_of_month
0,ЗелАО,Старое Крюково,Зеленоград — Крюково,walk,13,14000000,flatSale,228,21,9,...,0,2022,ground,False,1,36.926101,2024,11,4,1
1,ЮАО,Даниловский,Тульская,walk,25,70000000,flatSale,1183,5,8,...,2,2016,underground,False,3,4.774909,2016,9,0,12
2,СЗАО,Куркино,Сходненская,transport,12,16900000,flatSale,3585,16,1,...,0,1920,ground,False,0,21.664143,2016,4,0,25
3,СВАО,Лосиноостровский,Бабушкинская,walk,14,66000000,flatSale,8408,36,6,...,2,2008,underground,True,1,13.318349,2016,11,0,7
4,ЗАО,Раменки,Мичуринский проспект,walk,12,58507560,newBuildingFlatSale,165,49,24,...,2,2025,underground,False,1,11.258577,2024,8,4,30
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52855,ЮАО,Орехово-Борисово Южное,Зябликово,walk,16,3000000,flatShareSale,283,9,1,...,0,1986,ground,True,2,18.542465,2024,12,1,3
52856,ЮВАО,Марьино,Братиславская,walk,10,3500000,flatShareSale,413,8,14,...,0,1999,ground,True,1,13.640801,2024,12,1,3
52857,САО,Левобережный,Речной вокзал,walk,7,14786265,newBuildingFlatSale,14,12,21,...,0,2025,ground,False,0,14.833499,2024,12,1,3
52858,ВАО,Сокольники,Сокольники,walk,14,15500000,flatSale,26,12,2,...,1,2013,ground,False,2,6.266953,2024,12,0,9


In [31]:
for i in main_df.describe():
  print(main_df.describe()[i])

count    52860.000000
mean        11.140806
std          5.897872
min          1.000000
25%          6.000000
50%         10.000000
75%         15.000000
max         52.000000
Name: travel_time, dtype: float64
count    5.286000e+04
mean     5.646295e+07
std      1.368693e+08
min      3.000000e+05
25%      1.269918e+07
50%      2.022210e+07
75%      4.450000e+07
max      6.450000e+09
Name: price, dtype: float64
count     52860.000000
mean        725.783447
std        2211.158622
min           0.000000
25%          16.000000
50%          62.000000
75%         453.000000
max      164396.000000
Name: views_count, dtype: float64
count    52860.000000
mean        17.650870
std          9.708003
min          1.000000
25%         11.000000
50%         16.000000
75%         23.000000
max         65.000000
Name: photos_count, dtype: float64
count    52860.000000
mean        10.063602
std          9.657488
min         -1.000000
25%          3.000000
50%          7.000000
75%         14.000000
max

In [32]:
main_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 52860 entries, 0 to 65761
Data columns (total 41 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   county                52860 non-null  object 
 1   district              52860 non-null  object 
 2   metro                 52860 non-null  object 
 3   travel_type           52860 non-null  object 
 4   travel_time           52860 non-null  int64  
 5   price                 52860 non-null  int64  
 6   category              52860 non-null  object 
 7   views_count           52860 non-null  int64  
 8   photos_count          52860 non-null  int64  
 9   floor_number          52860 non-null  int64  
 10  floors_count          52860 non-null  int64  
 11  flat_type             52860 non-null  object 
 12  sale_type             52860 non-null  object 
 13  review_count          52860 non-null  int64  
 14  total_rate            52860 non-null  float64
 15  project_type          52

In [33]:
X = main_df.drop(columns=['price'])  # Все столбцы, кроме 'price', будут признаками
y = main_df['price']  # Целевая переменная

In [34]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [35]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, TargetEncoder, StandardScaler

# Onehot Encoding
onehot_columns = ['county', 'flat_type', 'sale_type', 'category', 'material_type', 'travel_type']
onehot_encoder = OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore')
X_train_encoded = pd.DataFrame(onehot_encoder.fit_transform(X_train[onehot_columns]), columns=onehot_encoder.get_feature_names_out(onehot_columns))
X_test_encoded = pd.DataFrame(onehot_encoder.transform(X_test[onehot_columns]), columns=onehot_encoder.get_feature_names_out(onehot_columns))
X_train = pd.concat([X_train.drop(columns=onehot_columns).reset_index(drop=True), X_train_encoded], axis=1)
X_test = pd.concat([X_test.drop(columns=onehot_columns).reset_index(drop=True), X_test_encoded], axis=1)

# Ordinal Encoding для столбцов с упорядоченными категориями
ordinal_columns = {'repair_type': {'no': 0, 'cosmetic': 1, 'euro': 2, 'design': 3}}
for col, mapping in ordinal_columns.items():
    X_train[col] = X_train[col].map(mapping)
    X_test[col] = X_test[col].map(mapping)

# Target Encoding для столбцов с большим количеством уникальных категорий
target_columns = ['district', 'project_type', 'metro']
target_encoder = TargetEncoder(target_type='continuous')

# Применяем Target Encoding для столбцов с небольшим количеством уникальных категорий
X_train[target_columns]= pd.DataFrame(target_encoder.fit_transform(X_train[target_columns], y_train), columns=target_columns)
X_test[target_columns]= pd.DataFrame(target_encoder.transform(X_test[target_columns]), columns=target_columns)

# Scaler
scaler = StandardScaler()
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)
X_test = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)

In [36]:
X_train.shape, X_test.shape

((42288, 59), (10572, 59))

In [37]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42288 entries, 0 to 42287
Data columns (total 59 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   district                      42288 non-null  float64
 1   metro                         42288 non-null  float64
 2   travel_time                   42288 non-null  float64
 3   views_count                   42288 non-null  float64
 4   photos_count                  42288 non-null  float64
 5   floor_number                  42288 non-null  float64
 6   floors_count                  42288 non-null  float64
 7   review_count                  42288 non-null  float64
 8   total_rate                    42288 non-null  float64
 9   project_type                  42288 non-null  float64
 10  is_apartment                  42288 non-null  float64
 11  is_penthouse                  42288 non-null  float64
 12  is_mortgage_allowed           42288 non-null  float64
 13  i

# **Обучение моделей**

**Модель LinearRegression**

---



In [38]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, mean_absolute_percentage_error

model = LinearRegression()
model.fit(X_train, y_train)
train_pred = model.predict(X_train)
test_pred = model.predict(X_test)

print(f'''Metrics_train:
mse={mean_squared_error(y_train, train_pred)}
R2={r2_score(y_train, train_pred)}
mae={mean_absolute_error(y_train, train_pred)}
mape={mean_absolute_percentage_error(y_train, train_pred)}
''')
print(f'''Metrics_test:
mse={mean_squared_error(y_test, test_pred)}
R2={r2_score(y_test, test_pred)}
mae={mean_absolute_error(y_test, test_pred)}
mape={mean_absolute_percentage_error(y_test, test_pred)}
''')

Metrics_train:
mse=7169185497655377.0
R2=0.6239960106762912
mae=29743897.525535725
mape=0.9563515991434347

Metrics_test:
mse=4941360904622132.0
R2=0.7159659220659786
mae=29305584.20174403
mape=0.9733905846881912



In [39]:
coeff = pd.DataFrame(model.coef_, X_train.columns, columns=['coeff'])
coeff['coeff'].abs().sort_values(ascending=False).astype(int).head(10)

total_area      119893124
metro            33930033
rooms_count      21991368
living_area      21981007
build_year       12813772
floors_count      9179740
project_type      8771410
is_penthouse      8118572
county_ЗАО        7988191
views_count       7339111
Name: coeff, dtype: int64

In [40]:
model.intercept_

np.float64(56443638.45634619)

In [41]:
coeff['coeff'].abs().sort_values(ascending=True).astype(int).head(20)

sale_type_investment           29267
material_type_underground      37076
material_type_roof             81432
material_type_multilevel      219378
garbage_chute                 227309
county_ТАО (Троицкий)         244893
category_flatShareSale        252775
passenger_lifts               362867
sale_type_dzhsk               450098
county_ЮВАО                   619891
review_count                  634552
ceiling_height                669808
county_СВАО                   675773
sale_type_pdkp                711999
day_of_month                  772001
kitchen_area                  856812
sale_type_dupt                971481
flat_type_studio             1056281
travel_time                  1090174
sale_type_fz214              1133913
Name: coeff, dtype: int64

**Модель RidgeCV**

---



In [42]:
from sklearn.linear_model import RidgeCV
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, mean_absolute_percentage_error

model_2 = RidgeCV(alphas=[0.001, 0.01, 0.5, 1, 10, 100, 1000, 10000, 25000, 50000, 100000], scoring='neg_mean_absolute_percentage_error')
model_2.fit(X_train, y_train)
train_pred_2 = model_2.predict(X_train)
test_pred_2 = model_2.predict(X_test)

print(f"Лучшее значение alpha: {model_2.alpha_}")

print(f'''Metrics_train:
mse={mean_squared_error(y_train, train_pred_2)}
R2={r2_score(y_train, train_pred_2)}
mae={mean_absolute_error(y_train, train_pred_2)}
mape={mean_absolute_percentage_error(y_train, train_pred_2)}
''')
print(f'''Metrics_test:
mse={mean_squared_error(y_test, test_pred_2)}
R2={r2_score(y_test, test_pred_2)}
mae={mean_absolute_error(y_test, test_pred_2)}
mape={mean_absolute_percentage_error(y_test, test_pred_2)}
''')


Лучшее значение alpha: 100000.0
Metrics_train:
mse=9787851038122750.0
R2=0.48665423171935496
mae=28286856.143591378
mape=0.7021009974148534

Metrics_test:
mse=7851178455694176.0
R2=0.5487068691395093
mae=28536082.255149562
mape=0.7182743836906713



**Модель Ridge**

---

In [43]:
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, mean_absolute_percentage_error

model_3 = Ridge()

model_3.fit(X_train, y_train)
train_pred_3 = model_3.predict(X_train)
test_pred_3= model_3.predict(X_test)

print(f'''Metrics_train:
mse={mean_squared_error(y_train, train_pred_3)}
R2={r2_score(y_train, train_pred_3)}
mae={mean_absolute_error(y_train, train_pred_3)}
mape={mean_absolute_percentage_error(y_train, train_pred_3)}
''')
print(f'''Metrics_test:
mse={mean_squared_error(y_test, test_pred_3)}
R2={r2_score(y_test, test_pred_3)}
mae={mean_absolute_error(y_test, test_pred_3)}
mape={mean_absolute_percentage_error(y_test, test_pred_3)}
''')


Metrics_train:
mse=7169185691477735.0
R2=0.6239960005108436
mae=29742850.303624265
mape=0.9563154219284983

Metrics_test:
mse=4941423461371238.0
R2=0.7159623262451322
mae=29304768.90583379
mape=0.9733470829771005



In [44]:
coeff = pd.DataFrame(model_3.coef_, X_train.columns, columns=['coeff'])
coeff['coeff'].abs().sort_values(ascending=False).astype(int).head(10)

total_area      119834260
metro            33927883
rooms_count      21987663
living_area      21935903
build_year       12813807
floors_count      9180053
project_type      8771134
is_penthouse      8119985
county_ЗАО        7987157
views_count       7339171
Name: coeff, dtype: int64

**Модель LassoCV**

---

In [45]:
from sklearn.linear_model import LassoCV
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, mean_absolute_percentage_error, make_scorer

model_4 = LassoCV(alphas=[0.01, 1, 10, 100, 1000, 10000, 25000, 50000, 100000], max_iter=10000, cv=5)
model_4.fit(X_train, y_train)

print(f"Лучшее значение alpha: {model_4.alpha_}")

train_pred_4 = model_4.predict(X_train)
test_pred_4 = model_4.predict(X_test)

print(f'''Metrics_train:
mse={mean_squared_error(y_train, train_pred_4)}
R2={r2_score(y_train, train_pred_4)}
mae={mean_absolute_error(y_train, train_pred_4)}
mape={mean_absolute_percentage_error(y_train, train_pred_4)}
''')
print(f'''Metrics_test:
mse={mean_squared_error(y_test, test_pred_4)}
R2={r2_score(y_test, test_pred_4)}
mae={mean_absolute_error(y_test, test_pred_4)}
mape={mean_absolute_percentage_error(y_test, test_pred_4)}
''')


Лучшее значение alpha: 25000.0
Metrics_train:
mse=7169352891728903.0
R2=0.6239872313192143
mae=29688414.293079846
mape=0.9527486702310279

Metrics_test:
mse=4941037695336390.0
R2=0.71598450044008
mae=29250196.863590397
mape=0.9692648223905219



In [46]:
import numpy as np

def smape(y_true, y_pred):
    numerator = np.abs(y_true - y_pred)
    denominator = (np.abs(y_true) + np.abs(y_pred)) / 2
    return np.mean(numerator / denominator) * 100

In [47]:
smape(y_test, test_pred_4)


np.float64(81.69734229556974)

In [48]:
coeff = pd.DataFrame(model_4.coef_, X_train.columns, columns=['coeff'])
coeff['coeff'].abs().sort_values(ascending=False).astype(int).head(10)

total_area      119086026
metro            33890491
rooms_count      21951028
living_area      21328756
build_year       12766199
floors_count      9084511
project_type      8757383
is_penthouse      8121751
county_ЗАО        7587872
views_count       7285479
Name: coeff, dtype: int64

**Модель SGDRegressor**

---

In [49]:
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, mean_absolute_percentage_error

model_5 = SGDRegressor(max_iter=10000)
model_5.fit(X_train, y_train)

train_pred_5 = model_5.predict(X_train)
test_pred_5 = model_5.predict(X_test)

print(f'''Metrics_train:
mse={mean_squared_error(y_train, train_pred_5):.6f}
R2={r2_score(y_train, train_pred_5):.6f}
mae={mean_absolute_error(y_train, train_pred_5):.6f}
mape={mean_absolute_percentage_error(y_train, train_pred_5):.6f}
''')

print(f'''Metrics_test:
mse={mean_squared_error(y_test, test_pred_5):.6f}
R2={r2_score(y_test, test_pred_5):.6f}
mae={mean_absolute_error(y_test, test_pred_5):.6f}
mape={mean_absolute_percentage_error(y_test, test_pred_5):.6f}
''')

Metrics_train:
mse=3334058911899098546176.000000
R2=-174861.186496
mae=1685335725.029888
mape=86.052533

Metrics_test:
mse=2578242404935098957824.000000
R2=-148198.801292
mae=1527200015.885394
mape=105.966851



In [50]:
coeff = pd.DataFrame(model_5.coef_, X_train.columns, columns=['coeff'])
coeff['coeff'].abs().sort_values(ascending=False).astype(int).head(10)

sale_type_dzhsk         43289256307
sale_type_investment    35884585183
is_emergency             9391690906
material_type_roof       9007137832
county_ЮЗАО               686066206
review_count              531111386
project_type              393628397
sale_type_fz214           377992073
total_area                355345494
distance_from_center      355341027
Name: coeff, dtype: int64

**Модель ElasticNetCV**

---

In [51]:
from sklearn.linear_model import ElasticNetCV

elastic_model = ElasticNetCV(
    alphas=[0.001, 0.01, 0.5, 1, 10],
    l1_ratio=[0.2, 0.3, 0.5, 0.7, 0.8],
    max_iter=10000,
    cv=10,
)
elastic_model.fit(X_train, y_train)
train_pred_6 = elastic_model.predict(X_train)
test_pred_6 = elastic_model.predict(X_test)

print(f"Best alpha: {elastic_model.alpha_}")
print(f"Best l1_ratio: {elastic_model.l1_ratio_}\n")

print(f'''Metrics_train:
mse={mean_squared_error(y_train, train_pred_6):.6f}
R2={r2_score(y_train, train_pred_6):.6f}
mae={mean_absolute_error(y_train, train_pred_6):.6f}
mape={mean_absolute_percentage_error(y_train, train_pred_6):.6f}
''')

print(f'''Metrics_test:
mse={mean_squared_error(y_test, test_pred_6):.6f}
R2={r2_score(y_test, test_pred_6):.6f}
mae={mean_absolute_error(y_test, test_pred_6):.6f}
mape={mean_absolute_percentage_error(y_test, test_pred_6):.6f}
''')

Best alpha: 0.001
Best l1_ratio: 0.2

Metrics_train:
mse=7169397572875873.000000
R2=0.623985
mae=29709751.622034
mape=0.955157

Metrics_test:
mse=4943608660803409.000000
R2=0.715837
mae=29279794.094482
mape=0.971967



**Модель LinearRegression + PCA**

---

In [52]:
from sklearn.decomposition import PCA
pca = PCA(n_components=0.95)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

model_7 = LinearRegression()
model_7.fit(X_train_pca, y_train)

train_pred_7 = model_7.predict(X_train_pca)
test_pred_7 = model_7.predict(X_test_pca)

print(f'''Metrics_train:
mse={mean_squared_error(y_train, train_pred_7):.6f}
R2={r2_score(y_train, train_pred_7):.6f}
mae={mean_absolute_error(y_train, train_pred_7):.6f}
mape={mean_absolute_percentage_error(y_train, train_pred_7):.6f}
''')

print(f'''Metrics_test:
mse={mean_squared_error(y_test, test_pred_7):.6f}
R2={r2_score(y_test, test_pred_7):.6f}
mae={mean_absolute_error(y_test, test_pred_7):.6f}
mape={mean_absolute_percentage_error(y_test, test_pred_7):.6f}
''')

Metrics_train:
mse=8259686679467934.000000
R2=0.566802
mae=35737207.781084
mape=1.248634

Metrics_test:
mse=6041037511445076.000000
R2=0.652755
mae=35642602.542544
mape=1.275289



**Модель Ridge + GridSearchCV + PolynomialFeatures**

---

In [53]:
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, mean_absolute_percentage_error
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline

pipeline = Pipeline([
    ('poly', PolynomialFeatures(degree=2, include_bias=False)),
    # ('scaler', StandardScaler()), он не нужен. данные уже скейлили
    ('ridge', Ridge(solver='svd', max_iter=10000))
])

# param_grid = {'ridge__alpha': [0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000, 25000, 35000, 100000],
#               'poly__degree': [1, 2]}
# уже запускали, лучшие снизу
param_grid = {'ridge__alpha': [25000],
              'poly__degree': [2]}

plm_model = GridSearchCV(pipeline, param_grid, cv=10, n_jobs=3)
plm_model.fit(X_train, y_train)

polynom_model = plm_model.best_estimator_
y_train_polynom = polynom_model.predict(X_train)
y_test_polynom = polynom_model.predict(X_test)

print(f'''Metrics_train:
mse={mean_squared_error(y_train, y_train_polynom)}
R2={r2_score(y_train, y_train_polynom)}
mae={mean_absolute_error(y_train, y_train_polynom)}
mape={mean_absolute_percentage_error(y_train, y_train_polynom)}
''')
print(f'''Metrics_test:
mse={mean_squared_error(y_test, y_test_polynom)}
R2={r2_score(y_test, y_test_polynom)}
mae={mean_absolute_error(y_test, y_test_polynom)}
mape={mean_absolute_percentage_error(y_test, y_test_polynom)}
''')

Metrics_train:
mse=3813135770733826.5
R2=0.8000115547159784
mae=18809634.27373614
mape=0.5328827825726944

Metrics_test:
mse=3055796186050328.5
R2=0.8243499576711294
mae=19570447.273024276
mape=0.568998758266876



In [54]:
poly_features = polynom_model.named_steps['poly'].get_feature_names_out(X_train.columns)
coeff = pd.DataFrame(polynom_model.named_steps['ridge'].coef_, poly_features, columns=['coeff'])
coeff['coeff'].abs().sort_values(ascending=False).head(10).astype(int)

metro total_area                   9876292
total_area                         5437754
metro kitchen_area                 5338676
total_area county_ЦАО              4900424
district total_area                4830489
living_area                        4486848
total_area^2                       4436844
views_count is_penthouse           3988824
total_area distance_from_center    3982291
distance_from_center               3853488
Name: coeff, dtype: int64

In [55]:
coeff['coeff'].abs().sort_values(ascending=True).head(10).astype(int)

sale_type_dzhsk material_type_roof          0
is_emergency county_ТАО (Троицкий)          0
sale_type_investment material_type_roof     0
sale_type_dzhsk sale_type_investment        0
is_emergency material_type_roof             0
is_emergency sale_type_dzhsk                0
is_emergency sale_type_investment           0
is_emergency sale_type_pdkp                 0
sale_type_pdkp material_type_roof           1
county_ТАО (Троицкий) material_type_roof    1
Name: coeff, dtype: int64

In [56]:
import pickle
with open('model.pickle', 'wb') as file:
    pickle.dump({
        'model': polynom_model,
        'onehot_encoder': onehot_encoder,
        'target_encoder': target_encoder,
        'scaler': scaler
    }, file)