# Курс Специализация Data Science Дипломный проект. Бриф учебного кейса 
## «Модель прогнозирования стоимости жилья для агентства недвижимости»

В данном ноутбуке мы сделаем следующее:
* Обучим модель при помощи алгоритма Random Forest т.к. он работает быстро и достаточно хорошо
* Сохраним модель и названия колонок в отдельных файлах
* Напишем функцию для предсказания стоимости квартиры по входным параметрам
* Сохраним наиболее привлекательные варианты в json-файл

In [14]:
import random
import numpy as np 
import pandas as pd 
import sys

from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import ElasticNetCV
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_validate
from sklearn.compose import TransformedTargetRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import SGDRegressor
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.model_selection import KFold
from sklearn import metrics
from tqdm.notebook import tqdm
from category_encoders import TargetEncoder, CatBoostEncoder
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.model_selection import KFold
from tqdm.notebook import tqdm
import warnings
warnings.filterwarnings("ignore")

In [249]:
# зафиксируем RANDOM_SEED, чтобы эксперименты были воспроизводимы
RANDOM_SEED = 42

TEST_SIZE = 0.2

In [250]:
def preproc_data(df_input):
    '''includes several functions to pre-process the predictor data.'''
    
    df_output = df_input.copy()
    # переведем признак зип кода в категориальный
    df_output['zipcode'] = df_output['zipcode'].astype(str)
    # переведем признак год в категориальный
    df_output['Year built'] = df_output['Year built'].astype(str)
    # Нормализация данных
    scaler = MinMaxScaler()
    for column in ['baths', 'sqft', 'target', 'school_rating _mean', 'school_dist_min']:
        df_output[column] = scaler.fit_transform(df_output[[column]])[:,0]

        
    # ################### Categorical Features ############################################################## 
 
    ohe_status = OneHotEncoder(sparse=False)
    ohe_state = OneHotEncoder(sparse=False)
    ohe_Type = OneHotEncoder(sparse=False)

    status_ohe = ohe_status.fit_transform(df_output['status'].values.reshape(-1,1))
    state_ohe = ohe_state.fit_transform(df_output['state'].values.reshape(-1,1))
    Type_ohe = ohe_Type.fit_transform(df_output['Type'].values.reshape(-1,1))

    le = LabelEncoder()
    state_label = le.fit_transform(df_output['state'])

    year_le = LabelEncoder()
    year_ord = year_le.fit_transform(df_output['Year built'])

    city_le = LabelEncoder()
    city_label = city_le.fit_transform(df_output['city'])

    zip_le = LabelEncoder()
    zip_label = zip_le.fit_transform(df_output['zipcode'])

    # Adding encoded categorical features to the output dataframe
    df_output = df_output.join(pd.DataFrame(status_ohe, columns=['status_' + str(cat) for cat in ohe_status.categories_[0]]))
    df_output = df_output.join(pd.DataFrame(state_ohe, columns=['state_' + str(cat) for cat in ohe_state.categories_[0]]))
    df_output = df_output.join(pd.DataFrame(Type_ohe, columns=['Type_' + str(cat) for cat in ohe_Type.categories_[0]]))
    df_output['state_label'] = state_label
    df_output['year_ord'] = year_ord
    df_output['city_label'] = city_label
    df_output['zip_label'] = zip_label

    # Dropping original categorical columns
    df_output.drop(['status', 'state', 'Type', 'city', 'zipcode','Year built'], axis=1, inplace=True)
    
    return df_output

In [251]:
df = pd.read_csv('data/data_model.csv')
df.head()

Unnamed: 0,status,baths,city,sqft,zipcode,state,target,pool_encoded,Type,Year built,Heating_encoded,Cooling_encoded,Parking_encoded,fireplace_encoded,school_rating _mean,school_dist_min
0,Active,4.0,Southern Pines,2900,28387,NC,418000,False,single_family_home,2019,True,False,False,True,5.2,2.7
1,For Sale,3.0,Spokane Valley,1947,99216,WA,310000,False,single_family_home,2019,False,False,False,False,4.0,1.01
2,Active,2.0,Mason,3588,50401,IA,244900,False,single_family_home,1970,True,True,False,False,3.8,5.6
3,Other,3.0,Houston,1930,77080,TX,311995,False,single_family_home,2019,True,True,True,False,3.0,0.6
4,For Sale,2.0,Flushing,1300,11354,NY,669000,False,condo,1965,False,False,True,False,2.8,0.3


In [258]:
# Запускаем и проверяем, что получилось
df_encoded = preproc_data(df)
df_encoded.head(10)

Unnamed: 0,baths,sqft,target,pool_encoded,Heating_encoded,Cooling_encoded,Parking_encoded,fireplace_encoded,school_rating _mean,school_dist_min,...,Type_modern,Type_multi_family_home,Type_other,Type_ranch,Type_single_family_home,Type_townhouse,state_label,year_ord,city_label,zip_label
0,0.103448,0.016379,0.556742,False,True,False,False,True,0.62,0.110778,...,0.0,0.0,0.0,0.0,1.0,0.0,18,201,1272,755
1,0.068966,0.010994,0.41255,False,False,False,False,False,0.5,0.06018,...,0.0,0.0,0.0,0.0,1.0,0.0,32,201,1281,3954
2,0.034483,0.020266,0.325634,False,True,True,False,False,0.48,0.197605,...,0.0,0.0,0.0,0.0,1.0,0.0,7,152,802,1955
3,0.068966,0.010898,0.415214,False,True,True,True,False,0.4,0.047904,...,0.0,0.0,0.0,0.0,1.0,0.0,28,201,608,2514
4,0.034483,0.007339,0.891856,False,False,False,True,False,0.38,0.038922,...,0.0,0.0,0.0,0.0,0.0,0.0,21,147,450,133
5,0.068966,0.016034,0.699599,True,True,True,True,False,0.83,0.057485,...,0.0,0.0,0.0,0.0,1.0,0.0,5,178,1022,1037
6,0.034483,0.010277,0.666088,False,True,False,True,False,0.83,0.062874,...,0.0,0.0,0.0,0.0,1.0,0.0,24,201,422,3760
7,0.068966,0.013859,0.224032,False,True,True,True,False,0.63,0.047904,...,0.0,0.0,0.0,0.0,1.0,0.0,28,164,608,2518
8,0.034483,0.012441,0.445928,False,False,False,False,False,0.45,0.04491,...,0.0,0.0,1.0,0.0,0.0,0.0,5,190,459,1121
9,0.103448,0.017396,0.485981,False,True,True,True,True,0.8,0.083234,...,0.0,0.0,0.0,0.0,1.0,0.0,28,198,1375,2543


# Обучение модели

In [253]:
y = df_encoded.target.values
X = df_encoded.drop(['target'], axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TEST_SIZE, shuffle=True, random_state=RANDOM_SEED)

## Model 3: RandomForestRegressor

In [254]:
# Создаем экземпляр модели RandomForestRegressor
rf_regressor = RandomForestRegressor(random_state=RANDOM_SEED)

# Обучаем модель на обучающих данных
rf_regressor.fit(X_train, y_train)

# Предсказания на обучающих и тестовых данных
y_train_pred = rf_regressor.predict(X_train)
y_test_pred = rf_regressor.predict(X_test)

# Вычисляем метрики
mse_train = mean_squared_error(y_train, y_train_pred)
mse_test = mean_squared_error(y_test, y_test_pred)
mae_train = mean_absolute_error(y_train, y_train_pred)
mae_test = mean_absolute_error(y_test, y_test_pred)
r2_train = r2_score(y_train, y_train_pred)
r2_test = r2_score(y_test, y_test_pred)

# Выводим метрики
print(f"Train MSE: {mse_train:.2f}")
print(f"Test MSE: {mse_test:.2f}")
print(f"Train MAE: {mae_train:.2f}")
print(f"Test MAE: {mae_test:.2f}")
print(f"Train R2: {r2_train:.2f}")
print(f"Test R2: {r2_test:.2f}")

Train MSE: 0.00
Test MSE: 0.01
Train MAE: 0.02
Test MAE: 0.06
Train R2: 0.97
Test R2: 0.82


## Построение прогноза

In [260]:
# создадим тестовый набор 
data = [
    ('For Sale', 3.0, 'Bloomington', 1595, 61704, 'IL', 189900, False, 'single_family_home', 'no date', True, True, True, False, 6.3, 2.60),
    ('Pending', 2.0, 'Palm Coast', 1587, 32164, 'FL', 229000, False, 'single_family_home', 1999, True, True, False, False, 5.0, 0.60),
    ('Foreclosure', 2.0, 'Burien', 1790, 98166, 'WA', 493999, False, 'single_family_home', 1954, True, True, True, True, 2.3, 0.30),
    ('For Sale', 2.0, 'Mint Hill', 1731, 28227, 'NC', 331990, False, 'single_family_home', 'no date', False, False, False, False, 3.0, 1.20),
    ('For Sale', 2.0, 'Lauderdale Lakes', 960, 33313, 'FL', 74900, False, 'condo', 1977, True, True, False, False, 3.0, 0.28),
    ('Other', 4.0, 'Jacksonville', 2492, 32218, 'FL', 285990, False, 'single_family_home', 2019, True, True, True, False, 4.7, 2.70)
]

columns = ['status', 'baths', 'city', 'sqft', 'zipcode', 'state', 'target', 'pool_encoded', 'Type', 'Year built', 'Heating_encoded', 'Cooling_encoded', 'Parking_encoded', 'fireplace_encoded', 'school_rating _mean', 'school_dist_min']

df_test = pd.DataFrame(data, columns=columns)

df_test.head()

Unnamed: 0,status,baths,city,sqft,zipcode,state,target,pool_encoded,Type,Year built,Heating_encoded,Cooling_encoded,Parking_encoded,fireplace_encoded,school_rating _mean,school_dist_min
0,For Sale,3.0,Bloomington,1595,61704,IL,189900,False,single_family_home,no date,True,True,True,False,6.3,2.6
1,Pending,2.0,Palm Coast,1587,32164,FL,229000,False,single_family_home,1999,True,True,False,False,5.0,0.6
2,Foreclosure,2.0,Burien,1790,98166,WA,493999,False,single_family_home,1954,True,True,True,True,2.3,0.3
3,For Sale,2.0,Mint Hill,1731,28227,NC,331990,False,single_family_home,no date,False,False,False,False,3.0,1.2
4,For Sale,2.0,Lauderdale Lakes,960,33313,FL,74900,False,condo,1977,True,True,False,False,3.0,0.28


In [261]:
# Обработка новых данных функцией preproc_data
preprocessed_df_test = preproc_data(df_test)
# Удаление колонки target, поскольку она будет предсказана моделью
X_test = preprocessed_df_test.drop('target', axis=1)
columns_order = ['baths', 'sqft', 'pool_encoded', 'Heating_encoded',
                'Cooling_encoded', 'Parking_encoded', 'fireplace_encoded',
                'school_rating _mean', 'school_dist_min', 'status_Active',
                'status_Auction', 'status_Back on Market', 'status_Coming Soon',
                'status_Contingent', 'status_For Rent', 'status_For Sale',
                'status_Foreclosure', 'status_Other', 'status_Pending',
                'status_Price Change', 'status_Under Contract', 'state_AZ', 'state_CA',
                'state_CO', 'state_DC', 'state_DE', 'state_FL', 'state_GA', 'state_IA',
                'state_IL', 'state_IN', 'state_KY', 'state_MA', 'state_MD', 'state_ME',
                'state_MI', 'state_MO', 'state_MS', 'state_MT', 'state_NC', 'state_NJ',
                'state_NV', 'state_NY', 'state_OH', 'state_OK', 'state_OR', 'state_PA',
                'state_SC', 'state_TN', 'state_TX', 'state_UT', 'state_VA', 'state_VT',
                'state_WA', 'state_WI', 'Type_apartment', 'Type_condo',
                'Type_historical', 'Type_land', 'Type_miscellaneous',
                'Type_mobile_home', 'Type_modern', 'Type_multi_family_home',
                'Type_other', 'Type_ranch', 'Type_single_family_home', 'Type_townhouse',
                'state_label', 'year_ord', 'city_label', 'zip_label']

for column in columns_order:
    if column not in X_test.columns:
        X_test[column] = 0

# Переупорядочить столбцы в соответствии с заданным списком
X_test = X_test[columns_order]

# Выведите обновленный датафрейм
X_test.head()

Unnamed: 0,baths,sqft,pool_encoded,Heating_encoded,Cooling_encoded,Parking_encoded,fireplace_encoded,school_rating _mean,school_dist_min,status_Active,...,Type_modern,Type_multi_family_home,Type_other,Type_ranch,Type_single_family_home,Type_townhouse,state_label,year_ord,city_label,zip_label
0,0.5,0.414491,False,True,True,True,False,1.0,0.958678,0,...,0,0,0,0,1.0,0,1,4,0,4
1,0.0,0.409269,False,True,True,False,False,0.675,0.132231,0,...,0,0,0,0,1.0,0,0,2,5,1
2,0.0,0.541775,False,True,True,True,True,0.0,0.008264,0,...,0,0,0,0,1.0,0,3,0,1,5
3,0.0,0.503264,False,False,False,False,False,0.175,0.380165,0,...,0,0,0,0,1.0,0,2,4,4,0
4,0.0,0.0,False,True,True,False,False,0.175,0.0,0,...,0,0,0,0,0.0,0,0,1,3,3


In [257]:
# Получаем предсказание целевой переменной для тестовых данных
target = rf_regressor.predict(X_test)

# Выводим предсказание
print(target)

[0.5061431  0.58886502 0.50358716 0.58997033 0.42735565 0.4920421 ]
