In [205]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectKBest, f_regression

from sklearn.pipeline import Pipeline

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

from sklearn.compose import ColumnTransformer

from sklearn.linear_model import LinearRegression

import joblib

In [179]:
# Load the raw housing dataset; the path is relative, so the CSV must sit in
# the notebook's working directory.
data = pd.read_csv('housing_prices_dataset.csv')

In [115]:
# Preview the raw frame to inspect the available columns and spot missing values.
data

Unnamed: 0,Size,Bedrooms,Bathrooms,Neighborhood,YearBuilt,HasGarage,LuxuryRating,ProximityToCityCenter,Condition,Price
0,10000.000000,,,Commercial,1973,,Medium,30.000000,5.488515,1.488980e+06
1,1930.867849,4.0,3.0,Industrial,1928,0.0,Medium,1.802602,7.003753,6.383428e+05
2,2323.844269,5.0,1.0,Commercial,2009,0.0,Medium,7.573310,3.090808,5.858642e+05
3,2761.514928,4.0,2.0,Industrial,1962,1.0,Low,2.761676,5.355583,7.148188e+05
4,1882.923313,3.0,2.0,Suburb,2001,0.0,Medium,5.537443,3.721960,5.637058e+05
...,...,...,...,...,...,...,...,...,...,...
4995,1975.517485,2.0,,Industrial,1948,0.0,Medium,4.502521,4.562272,5.572435e+05
4996,2355.705290,4.0,2.0,Commercial,1936,0.0,Low,3.156012,4.543997,6.891917e+05
4997,3556.455101,2.0,1.0,Industrial,1930,0.0,Low,5.276355,6.338340,7.605303e+05
4998,2404.018095,2.0,1.0,Suburb,1930,1.0,Low,6.526272,6.879909,5.751748e+05


In [105]:
# Train: Neighborhood is one-hot encoded as (commercial, downtown, industrial, rural, suburb)


# Prediction

# Sample

# Columns: square (size), built (year), neighborhood

# 122, 2022, Commercial

# 122, 2022, 1, 0, 0, 0, 0

# 123, 2022, Suburb

# 123, 2022, 0, 0, 0, 0, 1


In [117]:
def preprocess_data(data, current_year=2024):
    """Return a model-ready copy of the raw housing frame.

    The input frame is left untouched, so the function can safely be called
    more than once on the same object.

    Steps:
      * replace 'YearBuilt' with 'HouseAge' (``current_year - YearBuilt``);
      * map 'LuxuryRating' from 'Low'/'Medium'/'High' to 1/2/3 (any other
        value becomes NaN);
      * one-hot encode the remaining text columns as integer 0/1 columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw frame containing at least 'YearBuilt' and 'LuxuryRating'.
    current_year : int, default 2024
        Reference year used to compute the house age.

    Returns
    -------
    pandas.DataFrame
        New frame with the transformations applied.

    Raises
    ------
    KeyError
        If 'YearBuilt' or 'LuxuryRating' is missing from ``data``.
    """
    luxury_levels = {'Low': 1, 'Medium': 2, 'High': 3}
    # drop() and assign() both return new frames, so the caller's frame is
    # never modified. 'HouseAge' is appended as the last column and
    # 'LuxuryRating' is replaced in its existing position.
    prepared = data.drop(columns='YearBuilt').assign(
        HouseAge=current_year - data['YearBuilt'],
        LuxuryRating=data['LuxuryRating'].map(luxury_levels),
    )
    return pd.get_dummies(prepared, dtype='int')
    

In [119]:
# Display `data` once more before preprocessing is applied; the frame still
# has its raw columns (e.g. 'YearBuilt') at this point.
data

Unnamed: 0,Size,Bedrooms,Bathrooms,Neighborhood,YearBuilt,HasGarage,LuxuryRating,ProximityToCityCenter,Condition,Price
0,10000.000000,,,Commercial,1973,,Medium,30.000000,5.488515,1.488980e+06
1,1930.867849,4.0,3.0,Industrial,1928,0.0,Medium,1.802602,7.003753,6.383428e+05
2,2323.844269,5.0,1.0,Commercial,2009,0.0,Medium,7.573310,3.090808,5.858642e+05
3,2761.514928,4.0,2.0,Industrial,1962,1.0,Low,2.761676,5.355583,7.148188e+05
4,1882.923313,3.0,2.0,Suburb,2001,0.0,Medium,5.537443,3.721960,5.637058e+05
...,...,...,...,...,...,...,...,...,...,...
4995,1975.517485,2.0,,Industrial,1948,0.0,Medium,4.502521,4.562272,5.572435e+05
4996,2355.705290,4.0,2.0,Commercial,1936,0.0,Low,3.156012,4.543997,6.891917e+05
4997,3556.455101,2.0,1.0,Industrial,1930,0.0,Low,5.276355,6.338340,7.605303e+05
4998,2404.018095,2.0,1.0,Suburb,1930,1.0,Low,6.526272,6.879909,5.751748e+05


In [121]:
# Overwrite `data` with its preprocessed version.
# NOTE(review): not idempotent. After this cell runs, `data` no longer has a
# 'YearBuilt' column, so re-running it without reloading the CSV raises KeyError.
data = preprocess_data(data)

In [129]:
# Working copy containing only rows with no missing values. It feeds the
# feature-selection cells below; `data` itself keeps its incomplete rows.
data_temporary_just_time_being = data.dropna()

In [131]:
# Target is the sale price; every other column is a candidate feature.
y = data_temporary_just_time_being['Price']
X = data_temporary_just_time_being.drop(columns=['Price'])

In [133]:
# Defensive clean-up: drop any feature rows that still contain missing values
# and re-align the target to the surviving rows, so X and y cannot go out of
# step if rows are actually removed here.
X = X.dropna()
y = y.loc[X.index]

In [135]:
# Score each candidate column against the target with f_regression and keep
# the five best-scoring ones.
select_k_best = SelectKBest(score_func=f_regression, k=5)
X_select = select_k_best.fit_transform(X, y)

# Boolean mask over X's columns -> names of the retained features.
support_mask = select_k_best.get_support()
selected_feature_names = X.columns[support_mask]
selected_feature_names

Index(['Size', 'Bedrooms', 'Bathrooms', 'LuxuryRating',
       'ProximityToCityCenter'],
      dtype='object')

In [137]:
# Same selection, viewed as a plain array of column names instead of an Index.
selected_feature_names.values

array(['Size', 'Bedrooms', 'Bathrooms', 'LuxuryRating',
       'ProximityToCityCenter'], dtype=object)

In [139]:
# Keep the selected column names; `features` is reused below for the
# ColumnTransformer, for slicing the training frame, and for building
# prediction inputs.
features = selected_feature_names.values

In [151]:
# Numeric preprocessing: fill gaps with the column median, then standardise.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaling', StandardScaler()),
]
transformer = Pipeline(steps=numeric_steps)

In [157]:
# Apply the numeric transformer to the selected feature columns only.
preprocessor = ColumnTransformer(
    transformers=[('num', transformer, features)],
)

In [171]:
# End-to-end estimator: column preprocessing followed by a linear regression.
pipeline_steps = [
    ('preprocess', preprocessor),
    ('model', LinearRegression()),
]
pipeline = Pipeline(steps=pipeline_steps)

In [175]:
# Display the assembled pipeline to check its structure before fitting.
pipeline

In [181]:
# Start again from the raw CSV so rows with missing values are kept (the
# pipeline's imputer deals with them), then slice out the selected feature
# columns and the target.
data = preprocess_data(pd.read_csv('housing_prices_dataset.csv'))
X, y = data[features], data['Price']

In [183]:
# Inspect the feature matrix; missing values are still present here and are
# handled later by the imputer inside the pipeline.
X

Unnamed: 0,Size,Bedrooms,Bathrooms,LuxuryRating,ProximityToCityCenter
0,10000.000000,,,2,30.000000
1,1930.867849,4.0,3.0,2,1.802602
2,2323.844269,5.0,1.0,2,7.573310
3,2761.514928,4.0,2.0,1,2.761676
4,1882.923313,3.0,2.0,2,5.537443
...,...,...,...,...,...
4995,1975.517485,2.0,,2,4.502521
4996,2355.705290,4.0,2.0,1,3.156012
4997,3556.455101,2.0,1.0,1,5.276355
4998,2404.018095,2.0,1.0,1,6.526272


In [185]:
# Inspect the target vector (house prices).
y

0       1.488980e+06
1       6.383428e+05
2       5.858642e+05
3       7.148188e+05
4       5.637058e+05
            ...     
4995    5.572435e+05
4996    6.891917e+05
4997    7.605303e+05
4998    5.751748e+05
4999    4.319291e+05
Name: Price, Length: 5000, dtype: float64

In [187]:
# Fit the imputer, scaler and linear model in a single call.
# NOTE(review): the model is trained on every row; the cells shown here have
# no train/test split or metric, so generalization is not measured.
pipeline.fit(X,y)

In [189]:
# Display X again after fitting; the output matches the pre-fit display.
X

Unnamed: 0,Size,Bedrooms,Bathrooms,LuxuryRating,ProximityToCityCenter
0,10000.000000,,,2,30.000000
1,1930.867849,4.0,3.0,2,1.802602
2,2323.844269,5.0,1.0,2,7.573310
3,2761.514928,4.0,2.0,1,2.761676
4,1882.923313,3.0,2.0,2,5.537443
...,...,...,...,...,...
4995,1975.517485,2.0,,2,4.502521
4996,2355.705290,4.0,2.0,1,3.156012
4997,3556.455101,2.0,1.0,1,5.276355
4998,2404.018095,2.0,1.0,1,6.526272


In [197]:
# One-row frame for a sanity-check prediction. Values must follow the order of
# `features` (per the selection output above: Size, Bedrooms, Bathrooms,
# LuxuryRating, ProximityToCityCenter).
predict_data_X = pd.DataFrame([[1700, 3, 1, 2, 30]], columns=features)

In [199]:
# Predict the price for the sample row; preprocessing runs inside the pipeline.
pipeline.predict(predict_data_X)

array([254025.48097306])

In [203]:
# Same sample row with Bedrooms missing, to check that the pipeline's imputer
# fills the gap at prediction time. Uses np.nan: the np.NaN alias was removed
# in NumPy 2.0 and raises AttributeError there.
predict_data_X = pd.DataFrame([[1700, np.nan, 1, 2, 30]], columns=features)
pipeline.predict(predict_data_X)

array([254025.48097306])

In [207]:
# Persist the fitted pipeline (preprocessing + model) as a single artifact.
# NOTE(review): the filename spells "pieline"; left unchanged because other
# code may load the file by this exact name. Confirm before renaming.
joblib.dump(pipeline, 'houseprice_pieline_lr.pkl')

['houseprice_pieline_lr.pkl']

In [None]:
# Why wrap everything in a single Pipeline:
# 1. Combines preprocessing and modeling in one object
# 2. Handles any number of columns
# 3. Prediction is very easy and straightforward