# Prediction of housing prices using linear regression

In [1]:
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder
from sklearn.linear_model import LinearRegression

## Reading training set of data

In [2]:
training_set_path = '../dataset/train.csv'
training_set = pd.read_csv(training_set_path)
training_set.describe()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
count,1460.0,1460.0,1201.0,1460.0,1460.0,1460.0,1460.0,1460.0,1452.0,1460.0,...,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0
mean,730.5,56.89726,70.049958,10516.828082,6.099315,5.575342,1971.267808,1984.865753,103.685262,443.639726,...,94.244521,46.660274,21.95411,3.409589,15.060959,2.758904,43.489041,6.321918,2007.815753,180921.19589
std,421.610009,42.300571,24.284752,9981.264932,1.382997,1.112799,30.202904,20.645407,181.066207,456.098091,...,125.338794,66.256028,61.119149,29.317331,55.757415,40.177307,496.123024,2.703626,1.328095,79442.502883
min,1.0,20.0,21.0,1300.0,1.0,1.0,1872.0,1950.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2006.0,34900.0
25%,365.75,20.0,59.0,7553.5,5.0,5.0,1954.0,1967.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,2007.0,129975.0
50%,730.5,50.0,69.0,9478.5,6.0,5.0,1973.0,1994.0,0.0,383.5,...,0.0,25.0,0.0,0.0,0.0,0.0,0.0,6.0,2008.0,163000.0
75%,1095.25,70.0,80.0,11601.5,7.0,6.0,2000.0,2004.0,166.0,712.25,...,168.0,68.0,0.0,0.0,0.0,0.0,0.0,8.0,2009.0,214000.0
max,1460.0,190.0,313.0,215245.0,10.0,9.0,2010.0,2010.0,1600.0,5644.0,...,857.0,547.0,552.0,508.0,480.0,738.0,15500.0,12.0,2010.0,755000.0


## Data Wrangling

### Cleaning

In [3]:
# Dropping problematic column(s)
training_set.drop(labels=['Alley'], axis=1, inplace=True)

### Encoding Categorical Fetures

In [4]:
# !!! Nominal features are treated as Orinal !!!
encoder = OrdinalEncoder()

# Finding categorical columns and features
categorical_columns = [column for column in training_set if isinstance(training_set[column][0], str)]
categorical_features = training_set[categorical_columns]

# Encoding categorical features into numerical value
encoder.fit(categorical_features) 
training_set.loc[:, categorical_columns] = encoder.transform(categorical_features)
training_set

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,3.0,65.0,8450,1.0,3.0,3.0,0.0,4.0,...,0,,,,0,2,2008,8.0,4.0,208500
1,2,20,3.0,80.0,9600,1.0,3.0,3.0,0.0,2.0,...,0,,,,0,5,2007,8.0,4.0,181500
2,3,60,3.0,68.0,11250,1.0,0.0,3.0,0.0,4.0,...,0,,,,0,9,2008,8.0,4.0,223500
3,4,70,3.0,60.0,9550,1.0,0.0,3.0,0.0,0.0,...,0,,,,0,2,2006,8.0,0.0,140000
4,5,60,3.0,84.0,14260,1.0,0.0,3.0,0.0,2.0,...,0,,,,0,12,2008,8.0,4.0,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,3.0,62.0,7917,1.0,3.0,3.0,0.0,4.0,...,0,,,,0,8,2007,8.0,4.0,175000
1456,1457,20,3.0,85.0,13175,1.0,3.0,3.0,0.0,4.0,...,0,,MnPrv,,0,2,2010,8.0,4.0,210000
1457,1458,70,3.0,66.0,9042,1.0,3.0,3.0,0.0,4.0,...,0,,GdPrv,Shed,2500,5,2010,8.0,4.0,266500
1458,1459,20,3.0,68.0,9717,1.0,3.0,3.0,0.0,4.0,...,0,,,,0,4,2010,8.0,4.0,142125


## Building a Linear Regression Model

### Selecting Features

In [5]:
# Cell above shown columns[0]=='id' and columns[-1]=='SalePrice'
feature_columns = training_set.columns[1:-1]  # from [1] to [-2]
x = training_set[feature_columns]
x

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,60,3.0,65.0,8450,1.0,3.0,3.0,0.0,4.0,0.0,...,0,0,,,,0,2,2008,8.0,4.0
1,20,3.0,80.0,9600,1.0,3.0,3.0,0.0,2.0,0.0,...,0,0,,,,0,5,2007,8.0,4.0
2,60,3.0,68.0,11250,1.0,0.0,3.0,0.0,4.0,0.0,...,0,0,,,,0,9,2008,8.0,4.0
3,70,3.0,60.0,9550,1.0,0.0,3.0,0.0,0.0,0.0,...,0,0,,,,0,2,2006,8.0,0.0
4,60,3.0,84.0,14260,1.0,0.0,3.0,0.0,2.0,0.0,...,0,0,,,,0,12,2008,8.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,60,3.0,62.0,7917,1.0,3.0,3.0,0.0,4.0,0.0,...,0,0,,,,0,8,2007,8.0,4.0
1456,20,3.0,85.0,13175,1.0,3.0,3.0,0.0,4.0,0.0,...,0,0,,MnPrv,,0,2,2010,8.0,4.0
1457,70,3.0,66.0,9042,1.0,3.0,3.0,0.0,4.0,0.0,...,0,0,,GdPrv,Shed,2500,5,2010,8.0,4.0
1458,20,3.0,68.0,9717,1.0,3.0,3.0,0.0,4.0,0.0,...,0,0,,,,0,4,2010,8.0,4.0


### Selecting Prediction Target

In [6]:
y = training_set.SalePrice

### Fitting a linear model

In [7]:
model = LinearRegression().fit(x, y)

ValueError: could not convert string to float: 'TA'