In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

df = pd.read_csv("train.csv")
df.count()

Id               1460
MSSubClass       1460
MSZoning         1460
LotFrontage      1201
LotArea          1460
                 ... 
MoSold           1460
YrSold           1460
SaleType         1460
SaleCondition    1460
SalePrice        1460
Length: 81, dtype: int64

In [2]:
garage = df[['GarageType','GarageArea','SalePrice']]
garage.head()
garage.GarageType.unique()

array(['Attchd', 'Detchd', 'BuiltIn', 'CarPort', nan, 'Basment', '2Types'],
      dtype=object)

In [3]:
dummies = pd.get_dummies(garage.GarageType)
dummies.head()

Unnamed: 0,2Types,Attchd,Basment,BuiltIn,CarPort,Detchd
0,0,1,0,0,0,0
1,0,1,0,0,0,0
2,0,1,0,0,0,0
3,0,0,0,0,0,1
4,0,1,0,0,0,0


In [4]:
merged = pd.concat([garage,dummies], axis = 'columns')
merged.head()

Unnamed: 0,GarageType,GarageArea,SalePrice,2Types,Attchd,Basment,BuiltIn,CarPort,Detchd
0,Attchd,548,208500,0,1,0,0,0,0
1,Attchd,460,181500,0,1,0,0,0,0
2,Attchd,608,223500,0,1,0,0,0,0
3,Detchd,642,140000,0,0,0,0,0,1
4,Attchd,836,250000,0,1,0,0,0,0


In [5]:
final = merged.drop(['GarageType'], axis = 'columns')
final.head()

Unnamed: 0,GarageArea,SalePrice,2Types,Attchd,Basment,BuiltIn,CarPort,Detchd
0,548,208500,0,1,0,0,0,0
1,460,181500,0,1,0,0,0,0
2,608,223500,0,1,0,0,0,0
3,642,140000,0,0,0,0,0,1
4,836,250000,0,1,0,0,0,0


In [6]:
final = final.drop(['Detchd'], axis = 'columns')
final.head()

Unnamed: 0,GarageArea,SalePrice,2Types,Attchd,Basment,BuiltIn,CarPort
0,548,208500,0,1,0,0,0
1,460,181500,0,1,0,0,0
2,608,223500,0,1,0,0,0
3,642,140000,0,0,0,0,0
4,836,250000,0,1,0,0,0


In [7]:
X = final.drop('SalePrice', axis = 'columns')
X.head()

Unnamed: 0,GarageArea,2Types,Attchd,Basment,BuiltIn,CarPort
0,548,0,1,0,0,0
1,460,0,1,0,0,0
2,608,0,1,0,0,0
3,642,0,0,0,0,0
4,836,0,1,0,0,0


In [8]:
y = final.SalePrice

In [9]:
from sklearn.linear_model import LinearRegression
model = LinearRegression()

In [10]:
model.fit(X,y)

LinearRegression()

In [11]:
model.predict(X)

array([207495.28466946, 190195.60249096, 219290.52251843, ...,
       149305.44461452, 146946.39704472, 154023.53975411])

In [12]:
model.score(X,y)

0.47491140849568014

In [13]:
model.predict([[550,0,0,0,0,0]]) # 550 sqr for 2Types

array([167496.81271733])

In [14]:
model.predict([[460,0,0,0,0,1]]) # 460 sqr for CarPort

array([111665.8676893])

In [15]:
model.predict([[600,0,0,0,1,0]]) # 600 sqr for BuiltIn

array([258763.90666226])

In [16]:
from sklearn.preprocessing import LabelEncoder

In [17]:
le = LabelEncoder()

In [27]:
dfle = garage
dfle.GarageType.dtypes
# dfle.GarageType = le.fit_transform(dfle.GarageType.astype('int64'))
# dfle
# .astype('str')

dtype('int64')

In [19]:
X = dfle[['GarageType','GarageArea']].values
X

array([[  1, 548],
       [  1, 460],
       [  1, 608],
       ...,
       [  1, 252],
       [  1, 240],
       [  1, 276]])

In [20]:
dfle.GarageType.unique()

array([1, 5, 3, 4, 6, 2, 0])

In [21]:
y = dfle.SalePrice.values
y

array([208500, 181500, 223500, ..., 266500, 142125, 147500])

In [22]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
ct = ColumnTransformer([('GarageType', OneHotEncoder(), [0])], remainder = 'passthrough')

In [23]:
X = ct.fit_transform(X)
X

#.toarray()

<1460x8 sparse matrix of type '<class 'numpy.float64'>'
	with 2839 stored elements in Compressed Sparse Row format>

In [24]:
X = X[:,1:]
X

<1460x7 sparse matrix of type '<class 'numpy.float64'>'
	with 2833 stored elements in Compressed Sparse Row format>

In [25]:
model.fit(X,y)

LinearRegression()

In [26]:
model.predict([[0,0,0,0,0,1,550]]) # 550 sqr for 2Types

array([230777.90539046])