In [32]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import style
from matplotlib.gridspec import GridSpec
import seaborn as sns
from scipy import stats

from sklearn. impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import make_column_selector,make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score, train_test_split
import sklearn.metrics as metrics
data = pd.read_csv("rawData1.csv")

In [33]:
data1 = data.copy()
data1["price"] = data1["price"].replace("Thỏa thuận", "0")
data1['number_bedroom'] = data1['number_bedroom'].replace(np.nan, np.nanmedian(data1['number_bedroom']))
data1['number_wc'] = data1['number_wc'].replace(np.nan, np.nanmedian(data1['number_wc']))
data1['number_floor'] = data1['number_floor'].replace(np.nan, np.nanmedian(data1['number_floor']))
data1['entrance'] = data1['entrance'].replace(np.nan, np.nanmedian(data1['entrance']))
data1['facade'] = data1['facade'].replace(np.nan, np.nanmedian(data1['facade']))
data1['number_parking'] = data1['number_parking'].replace(np.nan, np.nanmedian(data1['number_parking']))
# print(data1.isna().sum())

X = data1[["area", "number_bedroom", "number_wc", "id_estate", "address"]]
y = data1["price"].astype(float)


In [34]:
imp_median = SimpleImputer(strategy='median', add_indicator=True)
scaler = StandardScaler()
imp_constant = SimpleImputer(strategy='constant')
ohe = OneHotEncoder(handle_unknown='ignore')
num_cols = make_column_selector(dtype_include='number')
cat_cols = make_column_selector(dtype_exclude='number')
preprocessor = make_column_transformer(
    (make_pipeline(imp_median, scaler), num_cols),
    (make_pipeline(imp_constant, ohe), cat_cols)
)

In [35]:
def pipeline_model(X, y, data):
    pipe = make_pipeline(preprocessor, LinearRegression())

    print(cross_val_score(pipe, X, y).mean())

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                        random_state=32)

    pipe.fit(X_train, y_train)

    predict = pipe.predict(X_test)
    print(f'MSE             : {metrics.mean_squared_error(y_test, predict)}\n'
          f'\nRMSE            : '
          f'{np.sqrt(metrics.mean_squared_error(y_test, predict))}\n'
          f'MAE             : {metrics.mean_absolute_error(y_test, predict)}\n'
          #f'Mean abs % error: '
          #f'{metrics.mean_absolute_percentage_error(y_test, predict)}\n'
          f'\n'
          f'Score (train)   : {pipe.score(X_train, y_train)}\n'
          f'Score (test)    : {pipe.score(X_test, y_test)}\n')
    comparison_of_results(X_test, y_test, pipe)
    visualize_model_results(data, pipe)

def comparison_of_results(X_test, y_test, model, times=5):
    for i in range(times):
        rnd = np.random.randint(0, y_test.shape[0] - 1)
        real = y_test.iloc[rnd]
        pred = int(model.predict(X_test.iloc[rnd].to_frame().T)[0])
        print(f'Real Value      ----->>>>> {real} $\n'
              f'Predicted Value ----->>>>> {pred} $')
        print()


def visualize_model_results(data, model):
    fig = plt.figure(figsize=(17, 10))
    data = data.sort_values(by=['price'])
    X = data.drop('price', axis=1)
    y = data.price.astype(int)
    
    plt.scatter(range(X.shape[0]), y, color='red', label='Real')
    plt.scatter(range(X.shape[0]), model.predict(X), marker='.', label='Predict')

    plt.legend(loc=2, prop={'size': 25})

In [36]:
pipeline_model(X, y, data1)

-949.9100311343238
MSE             : 160313747562.18756

RMSE            : 400391.99238020176
MAE             : 58646.97481020339

Score (train)   : 0.5067056808464342
Score (test)    : -0.06882970599598992

Real Value      ----->>>>> 3800.0 $
Predicted Value ----->>>>> -14131 $

Real Value      ----->>>>> 2600.0 $
Predicted Value ----->>>>> 7273 $

Real Value      ----->>>>> 2250.0 $
Predicted Value ----->>>>> 2153 $

Real Value      ----->>>>> 1800.0 $
Predicted Value ----->>>>> 6670 $

Real Value      ----->>>>> 1890.0 $
Predicted Value ----->>>>> 37786 $



ValueError: invalid literal for int() with base 10: '1000.0'

<Figure size 1224x720 with 0 Axes>