# Курс «Специализация Data Science». Дипломный проект. Бриф учебного кейса
## «Модель прогнозирования стоимости жилья для агентства недвижимости»

В данном ноутбуке мы сделаем следующее:
* Подгрузим обученную модель CatBoostRegressor
* Сохраним названия колонок в отдельном файле (columns.json)
* Напишем функцию для предсказания стоимости недвижимости по входным параметрам
* Сохраним наиболее привлекательные варианты в json-файл

In [4]:
import random
import numpy as np 
import pandas as pd 
import sys

from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import ElasticNetCV
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_validate
from sklearn.compose import TransformedTargetRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import SGDRegressor
from catboost import CatBoostRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.model_selection import KFold
from sklearn import metrics
from tqdm.notebook import tqdm
from category_encoders import TargetEncoder, CatBoostEncoder
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.model_selection import KFold
from tqdm.notebook import tqdm
from sklearn.pipeline import Pipeline
import pickle
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Notebook-wide constants: a fixed seed keeps experiments reproducible,
# and TEST_SIZE is the hold-out fraction.
TEST_SIZE = 0.2
RANDOM_SEED = 42

In [29]:
def log_data(df_input):
    """Return a copy of ``df_input`` with the predictor columns pre-processed.

    The ``zipcode`` and ``Year built`` columns are cast to ``str`` so that
    they are treated as categorical values, and each of the numeric columns
    ``baths``, ``sqft``, ``school_rating _mean`` and ``school_dist_min`` is
    replaced by ``log(|x| + 1e-6)``.

    Args:
        df_input: DataFrame containing at least the six columns named above.

    Returns:
        A new DataFrame with the transformed columns; ``df_input`` itself is
        left unmodified because the work is done on a copy.

    Raises:
        KeyError: if one of the required columns is missing.
    """
    # Small offset so that log() stays finite when a value is exactly zero.
    constant = 1e-6
    log_columns = ['baths', 'sqft', 'school_rating _mean', 'school_dist_min']

    df_output = df_input.copy()

    # Zip code and construction year are used as categorical features.
    df_output['zipcode'] = df_output['zipcode'].astype(str)
    df_output['Year built'] = df_output['Year built'].astype(str)

    for column in log_columns:
        # Take the absolute value first (vectorised, no per-row lambda) so
        # that negative inputs do not produce NaN in the logarithm.
        df_output[column] = np.log(df_output[column].abs() + constant)
    return df_output

In [22]:
# Build a small hand-written sample of listings to run through the model.
columns = [
    'status', 'baths', 'city', 'sqft', 'zipcode', 'state',
    'pool_encoded', 'Type', 'Year built',
    'Heating_encoded', 'Cooling_encoded', 'Parking_encoded',
    'fireplace_encoded', 'school_rating _mean', 'school_dist_min',
]

# One tuple per listing, values in the same order as `columns`.
data = [
    ('Active', 3.0, 'Washington', 801, 20004, 'DC',
     False, 'other', 1991, False, False, False, False, 6.0, 0.2),
    ('Active', 2.0, 'Dallas', 832, 75219, 'TX',
     False, 'condo', 1998, False, True, False, False, 2.6, 0.6),
    ('Active', 4.0, 'Dallas', 2102, 75219, 'TX',
     False, 'condo', 2019, False, True, False, False, 2.0, 0.1),
    ('Active', 2.0, 'Dallas', 837, 75219, 'TX',
     False, 'condo', 1998, False, True, False, False, 2.6, 0.6),
    ('Active', 2.0, 'Dallas', 2785, 75219, 'TX',
     False, 'condo', 2015, False, True, False, True, 2.6, 0.7),
]

df_test = pd.DataFrame(data, columns=columns)

# Notebook display of the first rows.
df_test.head()

Unnamed: 0,status,baths,city,sqft,zipcode,state,pool_encoded,Type,Year built,Heating_encoded,Cooling_encoded,Parking_encoded,fireplace_encoded,school_rating _mean,school_dist_min
0,Active,3.0,Washington,801,20004,DC,False,other,1991,False,False,False,False,6.0,0.2
1,Active,2.0,Dallas,832,75219,TX,False,condo,1998,False,True,False,False,2.6,0.6
2,Active,4.0,Dallas,2102,75219,TX,False,condo,2019,False,True,False,False,2.0,0.1
3,Active,2.0,Dallas,837,75219,TX,False,condo,1998,False,True,False,False,2.6,0.6
4,Active,2.0,Dallas,2785,75219,TX,False,condo,2015,False,True,False,True,2.6,0.7


In [36]:
# Apply the same pre-processing (string casts + log transform) to the sample.
X_test = df_test.pipe(log_data)
# Notebook display of the first five transformed rows.
X_test.head()

Unnamed: 0,status,baths,city,sqft,zipcode,state,pool_encoded,Type,Year built,Heating_encoded,Cooling_encoded,Parking_encoded,fireplace_encoded,school_rating _mean,school_dist_min
0,Active,1.098613,Washington,6.685861,20004,DC,False,other,1991,False,False,False,False,1.79176,-1.609433
1,Active,0.693148,Dallas,6.723832,75219,TX,False,condo,1998,False,True,False,False,0.955512,-0.510824
2,Active,1.386295,Dallas,7.650645,75219,TX,False,condo,2019,False,True,False,False,0.693148,-2.302575
3,Active,0.693148,Dallas,6.729824,75219,TX,False,condo,1998,False,True,False,False,0.955512,-0.510824
4,Active,0.693148,Dallas,7.932003,75219,TX,False,condo,2015,False,True,False,True,0.955512,-0.356674


In [20]:
# Load the previously saved model from its pickle file.
# NOTE: unpickling executes arbitrary code, so only load a file that comes
# from a trusted source.
with open("best_cb_model.pkl", "rb") as model_file:
    loaded_model = pickle.load(model_file)

In [37]:
# Check that the loaded model reproduces its predictions on the sample.
y_test_pred_loaded = loaded_model.predict(X_test)
# exp() is applied to the raw predictions — presumably the model was trained
# on a log-transformed price; confirm against the training notebook.
target = np.exp(y_test_pred_loaded)
rounded_target = np.round(target, decimals=0)
print(rounded_target)

[506389. 224225. 551250. 224081. 626743.]


In [27]:
# Persist the lower-cased column names of X_test to columns.json.
import json

columns = {'data_columns': [name.lower() for name in X_test.columns]}
with open("columns.json", "w") as columns_file:
    json.dump(columns, columns_file)

In [28]:
# Save the test dataset to cheap_flats.json, keyed by row index.
# DataFrame.to_json already returns a JSON-encoded string, so it is written
# as-is. Passing it through json.dumps a second time would store one escaped
# string literal in the file instead of a JSON object.
cheap_flats = df_test.to_json(orient="index")
with open("cheap_flats.json", "w") as f:
    f.write(cheap_flats)