In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
import helper

In [2]:
# Load the raw Ames housing CSV; its first column is used as the row index.
# low_memory=False has pandas parse the file in one pass instead of chunks,
# which avoids mixed-dtype inference within a column.
housing = pd.read_csv(
    'Ames_Housing_Price_Data.csv',
    index_col=0,
    low_memory=False,
)

# Hand the raw frame to the project helper, which returns the train/test pair.
# NOTE(review): presumably num_to_cat_list names numeric-coded columns to be
# treated as categorical and remove_PID drops the parcel id -- confirm in helper.
processing_options = dict(
    num_to_cat_list=['MSSubClass', 'MoSold'],
    remove_PID=True,
)
train, test = helper.data_processing_wrapper(housing, **processing_options)

In [3]:
# Names of every object-dtype column in the training frame; these are the
# columns routed to the one-hot encoder later on.
object_columns = train.select_dtypes(include=["object"]).columns
categorical = list(object_columns)

In [4]:
# Add the natural log of SalePrice as a new column on both splits (in place);
# this log-price is what the model is fitted and scored against.
for frame in (train, test):
    frame['LogSalePrice'] = np.log(frame['SalePrice'])

In [5]:
# Columns excluded from the feature matrix: both forms of the target plus two
# further columns.
# NOTE(review): presumably TotalBsmtSF / GrLivArea are dropped because they
# duplicate information held in other columns -- confirm.
non_feature_cols = ['SalePrice', 'LogSalePrice', 'TotalBsmtSF', 'GrLivArea']

# Features are everything else; the target is the log sale price.
X_train, y_train = train.drop(columns=non_feature_cols), train['LogSalePrice']
X_test, y_test = test.drop(columns=non_feature_cols), test['LogSalePrice']

In [6]:
# Step 1: one-hot encode the categorical columns and pass every other column
# through unchanged. handle_unknown='ignore' means a category not seen during
# fit is encoded as all zeros instead of raising at predict/score time.
encoder = OneHotEncoder(handle_unknown='ignore')
column_step = ColumnTransformer(
    [("Cat", encoder, categorical)],
    remainder='passthrough',
)

# Step 2: scale to unit variance without centering (with_mean=False), since the
# transformer output may be sparse and sparse data cannot be mean-centered.
# Step 3: ordinary least squares on the scaled design matrix.
pipe = Pipeline(steps=[
    ('transformer', column_step),
    ('scaler', StandardScaler(with_mean=False)),
    ('ols', LinearRegression()),
])

pipe.fit(X_train, y_train)

# Pipeline.score delegates to LinearRegression.score, i.e. R^2 of the
# predictions against the supplied target.
train_r2 = pipe.score(X_train, y_train)
print(f'The train score is {train_r2}')
test_r2 = pipe.score(X_test, y_test)
print(f'The test score is {test_r2}')

The train score is 0.9581621720589614
The test score is 0.9066004895249129
