# Курс «Специализация Data Science». Дипломный проект. Бриф учебного кейса
## «Модель прогнозирования стоимости жилья для агентства недвижимости»

В данном ноутбуке мы сделаем следующее:
* Обучим модель при помощи алгоритма Random Forest, так как он работает быстро и даёт достаточно хорошее качество
* Сохраним модель и названия колонок в отдельных файлах
* Напишем функцию для предсказания стоимости квартиры по входным параметрам
* Сохраним наиболее привлекательные варианты в json-файл

In [1]:
import random
import numpy as np 
import pandas as pd 
import sys

from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import ElasticNetCV
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_validate
from sklearn.compose import TransformedTargetRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import SGDRegressor
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.model_selection import KFold
from sklearn import metrics
from tqdm.notebook import tqdm
from category_encoders import TargetEncoder, CatBoostEncoder
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.model_selection import KFold
from tqdm.notebook import tqdm
from sklearn.pipeline import Pipeline
import warnings
warnings.filterwarnings("ignore")

In [249]:
# Fix RANDOM_SEED so that experiments are reproducible
RANDOM_SEED = 42

# Share of the data held out for testing (passed as test_size to train_test_split)
TEST_SIZE = 0.2

In [2]:
def preproc_data(df_input):
    """Turn a raw listings frame into the numeric feature frame used by the model.

    The input is copied, so the caller's frame is left untouched.

    Steps, in order:
      * ``zipcode`` and ``Year built`` are cast to ``str`` so that they are
        handled as categories.
      * ``baths``, ``sqft``, ``target``, ``school_rating _mean`` and
        ``school_dist_min`` are replaced by ``log(|x| + 1e-6)``.
      * ``status``, ``state`` and ``Type`` are one-hot encoded into
        ``status_*``, ``state_*`` and ``Type_*`` columns.
      * ``state``, ``Year built``, ``city`` and ``zipcode`` are label-encoded
        into ``state_label``, ``year_ord``, ``city_label`` and ``zip_label``.
      * The six original categorical columns are dropped.

    Every encoder is fitted on the frame passed in, so the dummy columns and
    integer codes produced by one call are only comparable with those of
    another call when both frames contain the same set of categories.

    Args:
        df_input: DataFrame holding all of the columns named above
            (``target`` included).

    Returns:
        A new DataFrame with the same rows and the same index as ``df_input``.
    """
    df_output = df_input.copy()

    # Zip code and construction year are treated as categories, not quantities.
    df_output['zipcode'] = df_output['zipcode'].astype(str)
    df_output['Year built'] = df_output['Year built'].astype(str)

    # Log-transform the numeric columns; abs() plus a small offset keeps the
    # argument of the logarithm positive even for zero or negative values.
    log_offset = 1e-6
    for column in ['baths', 'sqft', 'target', 'school_rating _mean', 'school_dist_min']:
        df_output[column] = np.log(df_output[column].abs() + log_offset)

    # One-hot encoding. The encoder is left at its default (sparse) output and
    # densified with toarray(): the keyword that requests dense output was
    # renamed between scikit-learn releases, while this form works on all of
    # them. Columns are assigned positionally, so the result stays correct
    # when the frame does not carry a default 0..n-1 index.
    for column in ['status', 'state', 'Type']:
        encoder = OneHotEncoder()
        dense = encoder.fit_transform(df_output[column].values.reshape(-1, 1)).toarray()
        for position, category in enumerate(encoder.categories_[0]):
            df_output[column + '_' + str(category)] = dense[:, position]

    # Integer label codes for the high-cardinality categorical columns.
    label_sources = [
        ('state_label', 'state'),
        ('year_ord', 'Year built'),
        ('city_label', 'city'),
        ('zip_label', 'zipcode'),
    ]
    for new_column, source_column in label_sources:
        df_output[new_column] = LabelEncoder().fit_transform(df_output[source_column])

    # The original categorical columns are no longer needed once encoded.
    return df_output.drop(
        columns=['status', 'state', 'Type', 'city', 'zipcode', 'Year built']
    )

In [3]:
# Load the modelling dataset from CSV and preview its first three rows
df = pd.read_csv('data/data_model.csv')
df.head(3)

Unnamed: 0,status,baths,city,sqft,zipcode,state,target,pool_encoded,Type,Year built,Heating_encoded,Cooling_encoded,Parking_encoded,fireplace_encoded,school_rating _mean,school_dist_min
0,Active,4.0,Southern Pines,2900,28387,NC,418000,False,single_family_home,2019,True,False,False,True,5.2,2.7
1,For Sale,3.0,Spokane Valley,1947,99216,WA,310000,False,single_family_home,2019,False,False,False,False,4.0,1.01
2,Active,2.0,Mason,3588,50401,IA,244900,False,single_family_home,1970,True,True,False,False,3.8,5.6


In [4]:
# Run the preprocessing and check the result
df_encoded = preproc_data(df)
df_encoded.head(3)

Unnamed: 0,baths,sqft,target,pool_encoded,Heating_encoded,Cooling_encoded,Parking_encoded,fireplace_encoded,school_rating _mean,school_dist_min,...,Type_modern,Type_multi_family_home,Type_other,Type_ranch,Type_single_family_home,Type_townhouse,state_label,year_ord,city_label,zip_label
0,1.386295,7.972466,12.943237,False,True,False,False,True,1.648659,0.993252,...,0.0,0.0,0.0,0.0,1.0,0.0,18,201,1272,755
1,1.098613,7.574045,12.644328,False,False,False,False,False,1.386295,0.009951,...,0.0,0.0,0.0,0.0,1.0,0.0,32,201,1281,3954
2,0.693148,8.18535,12.408605,False,True,True,False,False,1.335001,1.722767,...,0.0,0.0,0.0,0.0,1.0,0.0,7,152,802,1955


# Обучение модели

In [280]:
# Separate the target column from the features, then hold out a test split
X = df_encoded.drop(columns=['target'])
y = df_encoded['target'].values
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_SIZE,
    shuffle=True,
    random_state=RANDOM_SEED,
)

## Model 3: RandomForestRegressor

In [281]:
# Create the RandomForestRegressor and fit it on the training split
# (fit() returns the estimator itself, so the calls can be chained)
rf_regressor = RandomForestRegressor(random_state=RANDOM_SEED).fit(X_train, y_train)

# Predictions for the training and the test split
y_train_pred, y_test_pred = rf_regressor.predict(X_train), rf_regressor.predict(X_test)

# Metrics: each pair is (train, test)
mse_train, mse_test = (
    mean_squared_error(y_train, y_train_pred),
    mean_squared_error(y_test, y_test_pred),
)
mae_train, mae_test = (
    mean_absolute_error(y_train, y_train_pred),
    mean_absolute_error(y_test, y_test_pred),
)
r2_train, r2_test = (
    r2_score(y_train, y_train_pred),
    r2_score(y_test, y_test_pred),
)

# Report the metrics
print(f"Train MSE: {mse_train:.2f}")
print(f"Test MSE: {mse_test:.2f}")
print(f"Train MAE: {mae_train:.2f}")
print(f"Test MAE: {mae_test:.2f}")
print(f"Train R2: {r2_train:.2f}")
print(f"Test R2: {r2_test:.2f}")

Train MSE: 0.01
Test MSE: 0.07
Train MAE: 0.06
Test MAE: 0.16
Train R2: 0.98
Test R2: 0.84


## Построение прогноза

In [294]:
# Build a small sample of listings to predict on; `target` is filled with 0
# as a placeholder because the column is required by the preprocessing step
data = [
    ('Active', 3.0, 'Washington', 801, 20004, 'DC', 0, False, 'other', 1991, False, False, False, False, 6.0, 0.2),
    ('Active', 2.0, 'Dallas', 832, 75219, 'TX', 0, False, 'condo', 1998, False, True, False, False, 2.6, 0.6),
    ('Active', 4.0, 'Dallas', 2102, 75219, 'TX', 0, False, 'condo', 2019, False, True, False, False, 2.0, 0.1),
    ('Active', 2.0, 'Dallas', 837, 75219, 'TX', 0, False, 'condo', 1998, False, True, False, False, 2.6, 0.6),
    ('Active', 2.0, 'Dallas', 2785, 75219, 'TX', 0, False, 'condo', 2015, False, True, False, True, 2.6, 0.7)
]

# Column names, in the same order as the tuple fields above
columns = ['status', 'baths', 'city', 'sqft', 'zipcode', 'state', 'target', 'pool_encoded', 'Type', 'Year built', 'Heating_encoded', 'Cooling_encoded', 'Parking_encoded', 'fireplace_encoded', 'school_rating _mean', 'school_dist_min']

df_test = pd.DataFrame(data, columns=columns)

df_test.head()

Unnamed: 0,status,baths,city,sqft,zipcode,state,target,pool_encoded,Type,Year built,Heating_encoded,Cooling_encoded,Parking_encoded,fireplace_encoded,school_rating _mean,school_dist_min
0,Active,3.0,Washington,801,20004,DC,0,False,other,1991,False,False,False,False,6.0,0.2
1,Active,2.0,Dallas,832,75219,TX,0,False,condo,1998,False,True,False,False,2.6,0.6
2,Active,4.0,Dallas,2102,75219,TX,0,False,condo,2019,False,True,False,False,2.0,0.1
3,Active,2.0,Dallas,837,75219,TX,0,False,condo,1998,False,True,False,False,2.6,0.6
4,Active,2.0,Dallas,2785,75219,TX,0,False,condo,2015,False,True,False,True,2.6,0.7


In [297]:
# Encode a frame of raw listings, pass it to the model and return the
# predicted prices.
def process_and_predict(df_test, preproc_data, rf_regressor):
    """Predict prices for raw (un-encoded) listings.

    Args:
        df_test: raw listings frame accepted by ``preproc_data``.
        preproc_data: callable that maps the raw frame to an encoded frame
            containing a ``target`` column (which is dropped before
            prediction).
        rf_regressor: fitted model exposing ``predict``. Its output is
            exponentiated, i.e. it is treated as the natural log of the price.

    Returns:
        The model predictions after ``exp`` and rounding to whole numbers.

    Raises:
        KeyError: if the encoded frame has no ``target`` column.
    """
    # Column layout used when the model does not record its own feature names.
    fallback_columns = ['baths', 'sqft', 'pool_encoded', 'Heating_encoded',
                        'Cooling_encoded', 'Parking_encoded', 'fireplace_encoded',
                        'school_rating _mean', 'school_dist_min', 'status_Active',
                        'status_Auction', 'status_Back on Market', 'status_Coming Soon',
                        'status_Contingent', 'status_For Rent', 'status_For Sale',
                        'status_Foreclosure', 'status_Other', 'status_Pending',
                        'status_Price Change', 'status_Under Contract', 'state_AZ', 'state_CA',
                        'state_CO', 'state_DC', 'state_DE', 'state_FL', 'state_GA', 'state_IA',
                        'state_IL', 'state_IN', 'state_KY', 'state_MA', 'state_MD', 'state_ME',
                        'state_MI', 'state_MO', 'state_MS', 'state_MT', 'state_NC', 'state_NJ',
                        'state_NV', 'state_NY', 'state_OH', 'state_OK', 'state_OR', 'state_PA',
                        'state_SC', 'state_TN', 'state_TX', 'state_UT', 'state_VA', 'state_VT',
                        'state_WA', 'state_WI', 'Type_apartment', 'Type_condo',
                        'Type_historical', 'Type_land', 'Type_miscellaneous',
                        'Type_mobile_home', 'Type_modern', 'Type_multi_family_home',
                        'Type_other', 'Type_ranch', 'Type_single_family_home', 'Type_townhouse',
                        'state_label', 'year_ord', 'city_label', 'zip_label']

    # Prefer the feature names stored on the fitted model, so the layout
    # cannot drift from what the model was trained on; an explicit None check
    # is used because the attribute is an array when present.
    feature_columns = getattr(rf_regressor, 'feature_names_in_', None)
    if feature_columns is None:
        feature_columns = fallback_columns
    feature_columns = list(feature_columns)

    # NOTE(review): if `preproc_data` fits its encoders on `df_test` itself,
    # the integer label codes (state/year/city/zip) need not match the codes
    # seen during training - confirm before relying on these predictions.
    encoded = preproc_data(df_test).drop('target', axis=1)

    # Put the columns in model order: categories absent from this frame become
    # all-zero columns, columns unknown to the model are discarded.
    X_test = encoded.reindex(columns=feature_columns, fill_value=0)

    target_log = rf_regressor.predict(X_test)
    return np.round(np.exp(target_log))

In [298]:
# Call the function: (dataset, encoding function, model)
rounded_target = process_and_predict(df_test, preproc_data, rf_regressor)
print(rounded_target)

[206810. 185256. 351908. 185256. 314019.]


# Подготовка модели к продакшену

In [310]:
# Save the fitted model to disk
# (pickle files should only ever be loaded from a trusted source)
import pickle
with open('model.pickle','wb') as f:
    pickle.dump(rf_regressor,f)

In [311]:
# Save the column names
import json

# NOTE(review): the names are lower-cased here, while X itself keeps
# mixed-case names - confirm that consumers of columns.json expect this.
columns = {'data_columns': [name.lower() for name in X.columns]}

with open("columns.json", "w") as f:
    json.dump(columns, f)

In [314]:
# Persist the sample listings as JSON keyed by row index.
# DataFrame.to_json already returns serialized JSON text, so it is written
# as-is: passing it through json.dumps again would store a single quoted,
# escaped string in the file instead of a JSON object.
cheap_flats = df_test.to_json(orient="index")
with open("cheap_flats.json", "w") as f:
    f.write(cheap_flats)