In [75]:
import pandas as pd
import sklearn.metrics as metrics
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder

In [88]:
# Read the housing CSV, parsing the columns at positions 3 and 4 as dates,
# then discard every row that has a missing value in any column.
realEstateData = (
    pd.read_csv('WakeCountyHousing.csv', parse_dates=[3, 4])
    .dropna()
)

In [51]:
# Print the frame summary: index range, column names, non-null counts and dtypes.
realEstateData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 308292 entries, 0 to 308291
Data columns (total 15 columns):
 #   Column              Non-Null Count   Dtype         
---  ------              --------------   -----         
 0   Real_Estate_Id      308292 non-null  int64         
 1   Deeded_Acreage      308292 non-null  float64       
 2   Total_Sale_Price    308292 non-null  int64         
 3   Total_Sale_Date     308292 non-null  datetime64[ns]
 4   Month_Year_of_Sale  308292 non-null  datetime64[ns]
 5   Year_of_Sale        308292 non-null  int64         
 6   Year_Built          308292 non-null  int64         
 7   Year_Remodeled      308292 non-null  int64         
 8   Heated_Area         308292 non-null  int64         
 9   Num_Stories         308292 non-null  object        
 10  Design_Style        308292 non-null  object        
 11  Bath                308275 non-null  object        
 12  Utilities           306324 non-null  object        
 13  Physical_City       308183 no

In [89]:
class CustomTransformer(BaseEstimator, TransformerMixin):
    """Encode the categorical housing columns as numeric features.

    * ``Num_Stories`` is ordinal-encoded using the fixed order
      ``'Other' < 'One Story' < 'Two Story'``.
    * ``Physical_City`` is one-hot encoded into ``City_<name>`` columns.

    The encoder and the set of city columns are learned in :meth:`fit`, so
    :meth:`transform` always emits the same columns in the same order no
    matter which cities appear in the frame being transformed. This keeps the
    training and prediction feature matrices aligned for the downstream
    estimator.
    """

    # Category order handed to OrdinalEncoder; position in the list is the code.
    _STORY_ORDER = ['Other', 'One Story', 'Two Story']

    def __init__(self):
        # Kept from the original implementation so construction output is unchanged.
        print("Init\n")

    def _new_story_encoder(self):
        """Return an unfitted OrdinalEncoder for the ``Num_Stories`` column."""
        return OrdinalEncoder(categories=[self._STORY_ORDER])

    def fit(self, X, y = None):
        """Learn the story encoder and the one-hot city columns from ``X``.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain the ``Num_Stories`` and ``Physical_City`` columns.
        y : ignored
            Present for scikit-learn API compatibility.

        Returns
        -------
        self
        """
        self.ordinal_encoder_ = self._new_story_encoder().fit(X[['Num_Stories']])
        # Build the dummy columns with the same call used in transform() so the
        # learned names and ordering match what get_dummies produces there.
        city_dummies = pd.get_dummies(
            X[['Physical_City']], prefix=['City'], columns=['Physical_City']
        )
        self.city_columns_ = list(city_dummies.columns)
        return self

    def transform(self, X, y = None):
        """Return a copy of ``X`` with both categorical columns encoded.

        When the transformer has been fitted, the ``City_*`` columns are aligned
        to the ones learned in :meth:`fit`: cities not present in ``X`` become
        all-zero columns, and cities not seen during fitting are dropped.
        When called without a prior :meth:`fit`, the encoding is derived from
        ``X`` itself (the original behaviour).

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain the ``Num_Stories`` and ``Physical_City`` columns.
        y : ignored
            Present for scikit-learn API compatibility.

        Returns
        -------
        pandas.DataFrame
            The input columns other than ``Physical_City`` (in their original
            order, with ``Num_Stories`` replaced by its ordinal code), followed
            by the one-hot ``City_*`` columns.
        """
        x_ = X.copy()
        is_fitted = hasattr(self, 'city_columns_')

        #Ordinal Encoding of Num_Stories
        if is_fitted:
            encoder = self.ordinal_encoder_
        else:
            encoder = self._new_story_encoder().fit(x_[['Num_Stories']])
        x_['Num_Stories'] = encoder.transform(x_[['Num_Stories']])

        #OneHotEncoding of Physical_City
        x_ = pd.get_dummies(x_, prefix=['City'], columns=['Physical_City'])
        if is_fitted:
            # Keep X's own non-city columns, then force the learned city layout.
            passthrough = [col for col in X.columns if col != 'Physical_City']
            x_ = x_.reindex(columns=passthrough + self.city_columns_, fill_value=0)
        return x_

In [90]:
# Two-stage pipeline: encode the categorical columns, then fit a linear regression.
fullPipeline = Pipeline([
    ("CustomTransformer", CustomTransformer()),
    ("LinearRegression", LinearRegression()),
])

Init



In [91]:
#Splitting Data
train_set, test_set = train_test_split(realEstateData, test_size=0.2, random_state=42)

keepVar = ['Total_Sale_Price','Deeded_Acreage','Heated_Area','Num_Stories','Physical_City']
train_set = train_set[keepVar]
test_set = test_set[keepVar]

train_x = train_set.drop('Total_Sale_Price',axis=1)
train_y = train_set['Total_Sale_Price']
test_x = test_set.drop('Total_Sale_Price',axis=1)
test_y = test_set['Total_Sale_Price']

In [97]:
#Using Full Pipeline
fullPipeline.fit(train_x,train_y)
result_y = fullPipeline.predict(test_x)

In [99]:
#Very Bad Model
import sklearn.metrics as metrics
linRegMAE = metrics.mean_absolute_error(test_y, result_y)
linRegMSE = metrics.mean_squared_error(test_y, result_y)
print("MAE: ", linRegMAE)
print("MSE: ", linRegMSE)

MAE:  79191.21670887928
MSE:  15526359125.280104
