<a href="https://colab.research.google.com/github/mirsaidl/Machine-Learning/blob/main/Tashkent%20House%20Price%20Prediction/tashkent_housep_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [18]:
# Tashkent house price analysis
import numpy as np
import pandas as pd
import sklearn
# `plt` conventionally names the pyplot interface. The original
# `import matplotlib as plt` bound the top-level package instead, which has
# no plotting functions such as plt.plot() or plt.subplots().
import matplotlib.pyplot as plt
import seaborn as sns

DATA_URL = 'https://raw.githubusercontent.com/anvarnarz/praktikum_datasets/main/housing_data_08-02-2021.csv'

# Keep the raw download untouched so the cleaning below always starts from
# the same input, no matter how often it is re-run.
df_raw = pd.read_csv(DATA_URL)

# data preprocessing
# Rows whose price is the literal text 'Договорная' have no numeric price and
# are removed; the 'location' column is dropped.
negotiable_price = df_raw['price'] == 'Договорная'
df = df_raw.loc[~negotiable_price].drop(columns='location').copy()

# The textual size entry is replaced by 100.
# NOTE(review): presumably "1 sot" of land is taken as 100 m² - confirm.
df.loc[df['size'] == 'Площадьземли:1сот', 'size'] = 100

# Store both columns as real numbers. Without the explicit conversion 'size'
# stays an object column and a malformed value would only surface later,
# inside the preprocessing pipeline.
df['size'] = pd.to_numeric(df['size'])
df['price'] = df['price'].astype(int)

In [43]:
# Train/test split
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows; the fixed seed keeps the split reproducible.
train_set, test_set = train_test_split(df, random_state=30, test_size=0.2)

# Separate the target from the predictors of the training portion.
y = train_set['price'].copy()
x_train = train_set.drop(columns='price')

# Training features with the 'district' column removed.
x_num = x_train.drop(columns='district')
x_num

Unnamed: 0,rooms,size,level,max_levels
3351,2,63,1,9
849,3,82,3,7
6512,3,62,5,5
6156,3,100,1,9
3830,3,75,6,9
...,...,...,...,...
504,2,60,1,5
3938,4,130,1,5
7180,4,80,2,5
4580,3,70,9,9


In [44]:
from sklearn.base import BaseEstimator, TransformerMixin

# Default column positions of rooms, size and max_levels in the array that
# CombinedAttributesAdder receives.
rooms_ix, size_ix, max_levels_ix = 0, 1, 3


class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Append engineered columns computed from rooms, size and max_levels.

    Two ratios are always appended, in this order: size / max_levels and
    size / rooms. When ``max_rooms_levels`` is true a third column,
    rooms * max_levels, is appended after them.

    Parameters
    ----------
    max_rooms_levels : bool, default=True
        Whether to append the rooms * max_levels product column.
    rooms_ix, size_ix, max_levels_ix : int
        Positions of the source columns in the input array. They default to
        the module-level constants. Keeping them on the instance means the
        transformer no longer reads globals at transform time, so a pickled
        instance carries its own column layout.
    """

    def __init__(self, max_rooms_levels=True, rooms_ix=rooms_ix,
                 size_ix=size_ix, max_levels_ix=max_levels_ix):
        self.max_rooms_levels = max_rooms_levels
        self.rooms_ix = rooms_ix
        self.size_ix = size_ix
        self.max_levels_ix = max_levels_ix

    def fit(self, X, y=None):
        """Nothing is learned from the data; return ``self`` unchanged."""
        return self

    def transform(self, X):
        """Return ``X`` with the engineered columns appended on the right.

        ``X`` is converted with ``np.asarray`` first, so array-likes that do
        not support ``X[:, i]`` indexing are accepted as well.
        """
        X = np.asarray(X)
        rooms = X[:, self.rooms_ix]
        size = X[:, self.size_ix]
        max_levels = X[:, self.max_levels_ix]

        # NOTE(review): a zero in rooms or max_levels is not guarded against,
        # exactly as in the original division - confirm the data excludes it.
        size_per_level = size / max_levels
        size_per_room = size / rooms

        new_columns = [size_per_level, size_per_room]
        if self.max_rooms_levels:
            new_columns.append(rooms * max_levels)
        # Same column order as before: X, size_per_level, size_per_room
        # and, when enabled, the rooms * max_levels product.
        return np.column_stack([X, *new_columns])

In [45]:
# Pipeline
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, StandardScaler

# Numeric preprocessing, applied in order: median imputation, engineered
# feature columns, then standard scaling.
num_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('attribs_adder', CombinedAttributesAdder(max_rooms_levels=True)),
    ('std_scaler', StandardScaler()),
])

# Fit on the numeric training columns and preview the transformed array.
num_pipeline.fit_transform(x_num)

array([[-0.58249813, -0.03491202, -1.20164792, ..., -0.04553003,
        -0.02558313,  0.14905332],
       [ 0.33511697, -0.02355325, -0.30629653, ..., -0.03361269,
        -0.03210901,  0.4253533 ],
       [ 0.33511697, -0.03550985,  0.58905486, ..., -0.03187926,
        -0.04255042, -0.12724666],
       ...,
       [ 1.25273206, -0.02474891, -0.75397223, ..., -0.02277874,
        -0.04359456,  0.33325331],
       [ 0.33511697, -0.03072721,  2.37975763, ..., -0.04356387,
        -0.03837386,  0.97795326],
       [ 0.33511697, -0.01889018, -1.20164792, ..., -0.03484957,
        -0.02803686,  0.70165328]])

In [46]:
from sklearn.compose import ColumnTransformer

# Column groups: every column of x_num goes through num_pipeline, while
# 'district' is handed to an OrdinalEncoder.
cat_attribs = ['district']
num_attribs = x_num.columns.tolist()

# NOTE(review): OrdinalEncoder turns each district into an integer code, which
# a linear model reads as an ordered quantity - confirm this is intended.
full_pipeline = ColumnTransformer(transformers=[
    ('num', num_pipeline, num_attribs),
    ('cat', OrdinalEncoder(), cat_attribs),
])

In [47]:
# Fit the complete preprocessing pipeline on the training features and keep
# the transformed array; the models below are trained on it.
x_prepared = full_pipeline.fit_transform(x_train)
x_prepared

array([[-0.58249813, -0.03491202, -1.20164792, ..., -0.02558313,
         0.14905332,  1.        ],
       [ 0.33511697, -0.02355325, -0.30629653, ..., -0.03210901,
         0.4253533 ,  4.        ],
       [ 0.33511697, -0.03550985,  0.58905486, ..., -0.04255042,
        -0.12724666,  2.        ],
       ...,
       [ 1.25273206, -0.02474891, -0.75397223, ..., -0.04359456,
         0.33325331,  5.        ],
       [ 0.33511697, -0.03072721,  2.37975763, ..., -0.03837386,
         0.97795326, 11.        ],
       [ 0.33511697, -0.01889018, -1.20164792, ..., -0.02803686,
         0.70165328, 11.        ]])

In [48]:
# Linear regression
from sklearn.linear_model import LinearRegression

# Fit a linear regression on the preprocessed training features (x_prepared)
# against the training prices (y).
LR_model = LinearRegression()
LR_model.fit(x_prepared, y)

In [50]:
# Split the held-out rows into target and predictors.
y_test = test_set['price'].copy()
x_test = test_set.drop(columns='price')

# transform() only: the pipeline reuses what it learned from the training
# features instead of being refitted on the test rows.
x_test_prepared = full_pipeline.transform(x_test)
y_predicted = LR_model.predict(x_test_prepared)

In [52]:
from sklearn.metrics import mean_squared_error

# Test-set RMSE of the linear regression: the square root of the mean squared
# error between the true and the predicted prices.
lin_mse = mean_squared_error(y_true=y_test, y_pred=y_predicted)
lin_rmse = np.sqrt(lin_mse)
print(lin_rmse)

44974.39807107103


In [53]:
# Model with RandomForest
from sklearn.ensemble import RandomForestRegressor

# A random forest is stochastic, so an unseeded fit gives a different model
# (and different scores) on every run. Reuse the seed of the train/test split
# to keep the results reproducible.
RF_model = RandomForestRegressor(random_state=30)
RF_model.fit(x_prepared, y)

In [54]:
# Evaluate the random forest on the held-out test set. The results get their
# own rf_* names: the original stored them in y_predicted / lin_mse /
# lin_rmse, overwriting the linear-regression values under misleading names.
rf_predicted = RF_model.predict(x_test_prepared)
rf_mse = mean_squared_error(y_test, rf_predicted)
# RMSE
rf_rmse = np.sqrt(rf_mse)
print(rf_rmse)

196468.044054134


In [55]:
# Spot check on five rows. They are drawn from the *training* features, so
# the model has already seen them; this shows the prediction workflow and is
# not an out-of-sample estimate.
# The seed makes the same five rows come back on every run.
test_data = x_train.sample(5, random_state=30)
test_label = y.loc[test_data.index]
test_data_full = full_pipeline.transform(test_data)
test_data_full

array([[-1.50011323e+00,  1.45742574e+01, -7.53972225e-01,
        -3.94665656e-01,  1.23235872e+01,  3.82972520e+01,
        -1.04824659e+00,  2.00000000e+00],
       [ 1.25273206e+00,  2.90557807e-02, -7.53972225e-01,
         3.71695357e-01, -1.83311381e-03, -8.35481046e-03,
         1.07005325e+00,  1.00000000e+00],
       [-5.82498130e-01, -3.84989986e-02,  2.37975763e+00,
         1.13805637e+00, -4.72153140e-02, -3.02817652e-02,
         1.49053321e-01,  8.00000000e+00],
       [ 1.25273206e+00, -1.63792921e-02,  1.41379163e-01,
        -3.94665656e-01, -1.57005660e-02, -3.81128205e-02,
         3.33253307e-01,  6.00000000e+00],
       [-5.82498130e-01, -3.67055089e-02, -7.53972225e-01,
        -1.14851491e-02, -3.79462705e-02, -2.79324486e-02,
        -4.03546637e-01,  1.10000000e+01]])

In [57]:
# Predict the sampled rows with the random forest and show the predictions
# next to the actual prices; the table is indexed by test_label's row labels.
predicted_data = RF_model.predict(test_data_full)
pd.DataFrame({'Prediction': predicted_data, 'Real_value': test_label})

Unnamed: 0,Prediction,Real_value
2268,101235.0,24500
4382,226590.0,259000
4811,38415.876667,37000
332,69443.611,73000
6634,34542.956667,27000


In [60]:
# Cross validation
from sklearn.model_selection import cross_val_score


def display_scores(scores):
    """Print the per-fold scores with their mean and standard deviation.

    ``scores`` may be any array-like of numbers; it is converted with
    ``np.asarray`` so plain lists work as well as NumPy arrays.
    """
    scores = np.asarray(scores)
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Std.dev:", scores.std())


# The full dataset gets its own names. The original rebound `y` and
# `x_prepared` here, so re-running an earlier fit cell afterwards would
# silently train on the held-out rows as well.
x_all = df.drop("price", axis=1)
y_all = df["price"].copy()

# NOTE(review): full_pipeline was fitted on the training split only and is
# just applied here, so its preprocessing is not re-learned inside each fold.
x_all_prepared = full_pipeline.transform(x_all)

scores = cross_val_score(LR_model, x_all_prepared, y_all,
                         scoring="neg_mean_squared_error", cv=10)
# The scorer returns negated MSE values; flip the sign before the square root.
LR_rmse_scores = np.sqrt(-scores)
display_scores(LR_rmse_scores)

Scores: [  71317.884573     36007.09431166  143371.06897468   40402.44559507
   46923.06901934   41732.0070715    50720.95371657 1900718.56449044
  164396.0989488   649125.18745536]
Mean: 314471.43741564127
Std.dev: 557430.1858287377


In [61]:
import joblib

# Persist the fitted random forest to 'RF_model.jbl'; the path is relative,
# so the file lands in the current working directory.
filename = 'RF_model.jbl'
joblib.dump(RF_model, filename)

['RF_model.jbl']

In [62]:
# Persist the fitted preprocessing pipeline to 'pipeline.jbl'.
# NOTE(review): the pipeline contains the notebook-defined
# CombinedAttributesAdder; loading the file elsewhere presumably needs that
# class to be defined/importable first - confirm before deploying.
filename = 'pipeline.jbl'
joblib.dump(full_pipeline, filename)

['pipeline.jbl']

In [63]:
# joblib is also imported by the random-forest dump cell above.
import joblib

# Persist the fitted linear regression to 'LR_model.jbl'.
filename = 'LR_model.jbl'
joblib.dump(LR_model, filename)

['LR_model.jbl']