## Fire up GraphLab

In [73]:
import graphlab

## Load the training dataset

In [74]:
# Read train.csv into an SFrame; column types are inferred by the reader
# (the inferred type list is echoed in this cell's output).
housing_data = graphlab.SFrame("train.csv")

------------------------------------------------------
Inferred types from first 100 line(s) of file as 
column_type_hints=[long,long,str,long,long,str,str,str,str,str,str,str,str,str,str,str,str,long,long,long,long,str,str,str,str,str,long,str,str,str,str,str,str,str,long,str,long,long,long,str,str,str,str,long,long,long,long,long,long,long,long,long,long,str,long,str,long,str,str,long,str,long,long,str,str,str,long,long,long,long,long,long,str,str,str,long,long,long,str,str,long]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------


## Preprocessing Data

In [75]:
## Remove 'Id' column from dataset
## (guarded so that re-running this cell after 'Id' is already gone does not fail)
if 'Id' in housing_data.column_names():
    housing_data = housing_data.remove_column('Id')

## find the numerical and categorical features in data set.
## A column counts as numerical when its dtype is int or float. Comparing the
## dtype itself (rather than searching its string form for "int") keeps float
## columns from being misclassified as categorical and later label-encoded.
numerical_fields = []
categorical_fields = []
feature_list = housing_data.column_names()
for feature in feature_list:
    if housing_data[feature].dtype() in (int, float):
        numerical_fields.append(feature)
    else:
        categorical_fields.append(feature)

## Single-argument parenthesised prints behave the same under Python 2 and 3.
print(numerical_fields)
print("\n No of Numerical features: " + str(len(numerical_fields)))
print("================================================")
print(categorical_fields)
print("\n No of Categorical features: " + str(len(categorical_fields)))

['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold', 'YrSold', 'SalePrice']

 No of Numerical features: 37
['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu', 'GarageTyp

In [76]:
from  sklearn.preprocessing import LabelEncoder

In [77]:
def pre_process_data():
    """Fill missing values and label-encode the categorical columns.

    Operates in place on the notebook-level ``housing_data``, driven by the
    ``categorical_fields`` and ``numerical_fields`` column lists:

    1. missing entries in categorical columns become the string 'default';
    2. missing entries in numerical columns become 0;
    3. every categorical column is replaced by the result of
       ``LabelEncoder.fit_transform`` on that column.

    Returns nothing; a progress message is printed before and after.
    """
    print (" Preprocessing the Data...")

    # (columns, placeholder) pairs, processed in this order.
    fill_plan = [(categorical_fields, 'default'), (numerical_fields, 0)]
    for columns, placeholder in fill_plan:
        for name in columns:
            housing_data[name] = housing_data[name].fillna(placeholder)

    # One encoder object is reused; fit_transform refits it for each column.
    label_encoder = LabelEncoder()
    for name in categorical_fields:
        housing_data[name] = label_encoder.fit_transform(housing_data[name])

    print ("\n Finished Preprocessing data")

pre_process_data()

 Preprocessing the Data...

 Finished Preprocessing data


## Check whether any missing values remain in the data set

In [82]:
# Report, for every column, the percentage of rows whose value is missing.
for column_name in housing_data.column_names():
    column = housing_data[column_name]
    missing_ratio = float(column.num_missing()) / len(column)
    percent_missing_values = missing_ratio * 100.0
    print ("{0}: \t{1}".format(column_name, percent_missing_values))

MSSubClass: 	0.0
MSZoning: 	0.0
LotFrontage: 	0.0
LotArea: 	0.0
Street: 	0.0
Alley: 	0.0
LotShape: 	0.0
LandContour: 	0.0
Utilities: 	0.0
LotConfig: 	0.0
LandSlope: 	0.0
Neighborhood: 	0.0
Condition1: 	0.0
Condition2: 	0.0
BldgType: 	0.0
HouseStyle: 	0.0
OverallQual: 	0.0
OverallCond: 	0.0
YearBuilt: 	0.0
YearRemodAdd: 	0.0
RoofStyle: 	0.0
RoofMatl: 	0.0
Exterior1st: 	0.0
Exterior2nd: 	0.0
MasVnrType: 	0.0
MasVnrArea: 	0.0
ExterQual: 	0.0
ExterCond: 	0.0
Foundation: 	0.0
BsmtQual: 	0.0
BsmtCond: 	0.0
BsmtExposure: 	0.0
BsmtFinType1: 	0.0
BsmtFinSF1: 	0.0
BsmtFinType2: 	0.0
BsmtFinSF2: 	0.0
BsmtUnfSF: 	0.0
TotalBsmtSF: 	0.0
Heating: 	0.0
HeatingQC: 	0.0
CentralAir: 	0.0
Electrical: 	0.0
1stFlrSF: 	0.0
2ndFlrSF: 	0.0
LowQualFinSF: 	0.0
GrLivArea: 	0.0
BsmtFullBath: 	0.0
BsmtHalfBath: 	0.0
FullBath: 	0.0
HalfBath: 	0.0
BedroomAbvGr: 	0.0
KitchenAbvGr: 	0.0
KitchenQual: 	0.0
TotRmsAbvGrd: 	0.0
Functional: 	0.0
Fireplaces: 	0.0
FireplaceQu: 	0.0
GarageType: 	0.0
GarageYrBlt: 	0.0
GarageFini

## Create a linear regression model with multiple features

In [85]:
# Predictors: every numerical column followed by every categorical column.
feature_list = list(numerical_fields)
feature_list.extend(categorical_fields)
# The target must not appear among its own predictors.
feature_list.remove('SalePrice')

# Fit on the whole of housing_data; no validation set is passed.
multivariate_model = graphlab.linear_regression.create(
    housing_data,
    target='SalePrice',
    features=feature_list,
    validation_set=None
)

## Now that we have fitted the model we can extract the regression weights (coefficients) as an SFrame as follows:

In [91]:
# Fetch the model's "coefficients" field: one row per fitted term (intercept
# included), with name / index / value / stderr columns as shown in the output.
coefficients_summary = multivariate_model.get("coefficients")
# Print up to 80 rows and 4 columns so the table is not cut off at the default size.
coefficients_summary.print_rows(num_rows=80, num_columns=4) 

+---------------+-------+----------------+----------------+
|      name     | index |     value      |     stderr     |
+---------------+-------+----------------+----------------+
|  (intercept)  |  None | 4637.78349508  | 1302759.86858  |
|   MSSubClass  |  None | -105.519062454 | 44.7124348882  |
|  LotFrontage  |  None | -39.0240467333 | 27.4721745908  |
|    LotArea    |  None | 0.399225248866 | 0.104344977313 |
|  OverallQual  |  None | 10995.3131216  | 1168.52140742  |
|  OverallCond  |  None | 5218.30446198  | 1027.90822274  |
|   YearBuilt   |  None | 164.741556935  | 73.7658766599  |
|  YearRemodAdd |  None | 3.66414277928  | 65.9520344899  |
|   MasVnrArea  |  None | 32.7518244822  |  6.0290215815  |
|   BsmtFinSF1  |  None | 5.91269480169  |      nan       |
|   BsmtFinSF2  |  None | 9.41383284707  |      nan       |
|   BsmtUnfSF   |  None | -2.30667013513 |      nan       |
|  TotalBsmtSF  |  None |  1.118952686   |      nan       |
|    1stFlrSF   |  None | 28.0415796231 

## Calculate prediction values on the housing data

In [92]:
# In-sample predictions: the model is applied to the same housing_data it was fitted on.
predicted_values = multivariate_model.predict(housing_data)

## Compute the residual sum of squares (RSS)


In [95]:
# Residual for each row: predicted price minus observed sale price.
observed_prices = housing_data['SalePrice']
residuals = predicted_values - observed_prices

# RSS is the sum, over all rows, of the squared residual.
squared_residuals = residuals * residuals
RSS = squared_residuals.sum()

# Two decimal places, no scientific notation.
print("%.2f" % RSS)

1340824941036.59
