With 79 explanatory variables describing (almost) every aspect of residential homes in Ames, Iowa, this competition challenges you to predict the final price of each home.

** File descriptions **

- train.csv - the training set
- test.csv - the test set
- data_description.txt - full description of each column, originally prepared by Dean De Cock but lightly edited to match the column names used here
- sample_submission.csv - a benchmark submission from a linear regression on year and month of sale, lot square footage, and number of bedrooms

In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing 
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ShuffleSplit
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

In [3]:
print(pd.get_option("display.max_columns"))
pd.options.display.max_columns = 999

# Load the housing dataset
data = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
# data.describe()

999


In [None]:
# test.describe()

### Benchmark Linear Regression
A benchmark submission from a linear regression on year and month of sale, lot square footage, and number of bedrooms.

In [4]:
data.head(3)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7,5,2003,2003,Gable,CompShg,VinylSd,VinylSd,BrkFace,196.0,Gd,TA,PConc,Gd,TA,No,GLQ,706,Unf,0,150,856,GasA,Ex,Y,SBrkr,856,854,0,1710,1,0,2,1,3,1,Gd,8,Typ,0,,Attchd,2003.0,RFn,2,548,TA,TA,Y,0,61,0,0,0,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,Gtl,Veenker,Feedr,Norm,1Fam,1Story,6,8,1976,1976,Gable,CompShg,MetalSd,MetalSd,,0.0,TA,TA,CBlock,Gd,TA,Gd,ALQ,978,Unf,0,284,1262,GasA,Ex,Y,SBrkr,1262,0,0,1262,0,1,2,0,3,1,TA,6,Typ,1,TA,Attchd,1976.0,RFn,2,460,TA,TA,Y,298,0,0,0,0,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7,5,2001,2002,Gable,CompShg,VinylSd,VinylSd,BrkFace,162.0,Gd,TA,PConc,Gd,TA,Mn,GLQ,486,Unf,0,434,920,GasA,Ex,Y,SBrkr,920,866,0,1786,1,0,2,1,3,1,Gd,6,Typ,1,TA,Attchd,2001.0,RFn,2,608,TA,TA,Y,0,42,0,0,0,0,,,,0,9,2008,WD,Normal,223500


In [5]:
lr = LinearRegression()
X = data[['YrSold', 'MoSold', 'LotArea', 'BedroomAbvGr']]
y = data[['SalePrice']]
X_test = test[['YrSold', 'MoSold', 'LotArea', 'BedroomAbvGr']]
lr.fit(X,y)

pred = pd.DataFrame(data=lr.predict(X_test))
# pred.describe()
submission = test[['Id']]
submission = submission.join(pred)
submission.columns = ['Id', 'SalePrice']
print(submission.describe())

submission.to_csv("submission-lr.csv", sep=',', index=False)
# Score: 0.40890

                Id      SalePrice
count  1459.000000    1459.000000
mean   2190.000000  179183.918243
std     421.321334   16518.303051
min    1461.000000  135751.318893
25%    1825.500000  168703.011202
50%    2190.000000  179208.665698
75%    2554.500000  186789.409363
max    2919.000000  281643.976117


### Logistic Regression Model

In [6]:
log = LogisticRegression()
X = data[['YrSold', 'MoSold', 'LotArea', 'BedroomAbvGr']]
y = data[['SalePrice']]
X_test = test[['YrSold', 'MoSold', 'LotArea', 'BedroomAbvGr']]
log.fit(X,y)

pred = pd.DataFrame(data=log.predict(X_test))
submission = test[['Id']]
submission = submission.join(pred)
submission.columns = ['Id', 'SalePrice']
print(submission.describe())

submission.to_csv("logistic.csv", sep=',', index=False)
# Score: 0.45531 (worse than benchmark)

  y = column_or_1d(y, warn=True)


                Id      SalePrice
count  1459.000000    1459.000000
mean   2190.000000  142395.202879
std     421.321334   32875.305616
min    1461.000000   60000.000000
25%    1825.500000  135000.000000
50%    2190.000000  140000.000000
75%    2554.500000  140000.000000
max    2919.000000  755000.000000


### Data Exploration and Feature Normalization

In [None]:
# data.head(5)

### Categorical Features
MSSubClass, MSZoning, Street, Alley, LotShape, LandContour, Utilities, LotConfig, LandSlope, Neighborhood, Condition1, Condition2, BldgType, HouseStyle, RoofStyle, RoofMatl, Exterior1st, Exterior2nd, MasVnrType, ExterQual, ExterCond, Foundation, BsmtQual, BsmtExposure, BsmtFinType1, BsmtFinType2, Heating, HeatingQC, CentralAir, Electrical, KitchenQual, Functional, FireplaceQu, GarageType, GarageFinish, GarageQual, GarageCond, PavedDrive, PoolQC, Fence, MiscFeature, SaleType, SaleCondition

### Normalize Features

In [None]:
# labels = data['SalePrice']
# df_norm = preprocessing.normalize(data.drop(['Id', 'SalePrice'], axis=1), axis=0)
# df_norm = pd.DataFrame(df_norm, columns=['MSSubClass','MSZoning','LotFrontage','LotArea','Street','Alley','LotShape','LandContour','Utilities','LotConfig','LandSlope','Neighborhood','Condition1','Condition2','BldgType','HouseStyle','OverallQual','OverallCond','YearBuilt','YearRemodAdd','RoofStyle','RoofMatl','Exterior1st','Exterior2nd','MasVnrType','MasVnrArea','ExterQual','ExterCond','Foundation','BsmtQual','BsmtCond','BsmtExposure','BsmtFinType1','BsmtFinSF1','BsmtFinType2','BsmtFinSF2','BsmtUnfSF','TotalBsmtSF','Heating','HeatingQC','CentralAir','Electrical','1stFlrSF','2ndFlrSF','LowQualFinSF','GrLivArea','BsmtFullBath','BsmtHalfBath','FullBath','HalfBath','BedroomAbvGr','KitchenAbvGr','KitchenQual','TotRmsAbvGrd','Functional','Fireplaces','FireplaceQu','GarageType','GarageYrBlt','GarageFinish','GarageCars','GarageArea','GarageQual','GarageCond','PavedDrive','WoodDeckSF','OpenPorchSF','EnclosedPorch','3SsnPorch','ScreenPorch','PoolArea','PoolQC','Fence','MiscFeature','MiscVal','MoSold','YrSold','SaleType','SaleCondition'])
# df_norm.describe()

### Feature Correlation

In [None]:
# df_norm.corr()

### Correlations with SalePrice variable

In [7]:
data = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# Fill up NaN's in train and test datasets
for df in [data, test]:
    df['MSZoning'] = df['MSZoning'].fillna(value="NoZoneClass")
    df['Alley'] = df['Alley'].fillna(value="NoAccess")
    df['Utilities'] = df['Utilities'].fillna(value="NoUtilities")
    df['Exterior1st'] = df['Exterior1st'].fillna(value="NoExterior1st")
    df['Exterior2nd'] = df['Exterior2nd'].fillna(value="NoExterior2nd")
    df['MasVnrType'] = df['MasVnrType'].fillna(value="0")
    df['BsmtQual'] = df['BsmtQual'].fillna(value="NoBasement")
    df['BsmtCond'] = df['BsmtCond'].fillna(value="NoBasement")
    df['BsmtExposure'] = df['BsmtExposure'].fillna(value="NoBasement")
    df['BsmtFinType1'] = df['BsmtFinType1'].fillna(value="NoBasement")
    df['BsmtFinSF1'] = df['BsmtFinSF1'].fillna(value=0.0)
    df['BsmtFinType2'] = df['BsmtFinType2'].fillna(value="NoBasement")
    df['BsmtFinSF2'] = df['BsmtFinSF2'].fillna(value=0.0)
    df['BsmtUnfSF'] = df['BsmtUnfSF'].fillna(value=0.0)
    df['TotalBsmtSF'] = df['TotalBsmtSF'].fillna(value=0.0)
    df['Electrical'] = df['Electrical'].fillna(value="0")
    df['BsmtFullBath'] = df['BsmtFullBath'].fillna(value=0.0)
    df['BsmtHalfBath'] = df['BsmtHalfBath'].fillna(value=0.0)
    df['KitchenQual'] = df['KitchenQual'].fillna(value="NoKitchenQual")
    df['Functional'] = df['Functional'].fillna(value="NoHomeFunctionality")
    df['FireplaceQu'] = df['FireplaceQu'].fillna(value="NoFireplace")
    df['GarageType'] = df['GarageType'].fillna(value="NoGarage")
    df['GarageFinish'] = df['GarageFinish'].fillna(value="NoGarage")
    df['GarageCars'] = df['GarageCars'].fillna(value=0.0)
    df['GarageArea'] = df['GarageArea'].fillna(value=0.0)
    df['GarageQual'] = df['GarageQual'].fillna(value="NoGarage")
    df['GarageCond'] = df['GarageCond'].fillna(value="NoGarage")
    df['PoolQC'] = df['PoolQC'].fillna(value="NoPool")
    df['Fence'] = df['Fence'].fillna(value="NoFence")
    df['MiscFeature'] = df['MiscFeature'].fillna(value="NoMiscFeatures")
    df['LotFrontage'] = df['LotFrontage'].fillna(value=0.0)
    df['MasVnrArea'] = df['MasVnrArea'].fillna(value=0.0)
    df['GarageYrBlt'] = df['GarageYrBlt'].fillna(value=0.0)
    df['SaleType'] = df['SaleType'].fillna(value="NoSaleType")

    for feature in ['MSZoning','Street', 'Alley','LotShape','LandContour','Utilities','LotConfig','LandSlope','Neighborhood','Condition1','Condition2','BldgType','HouseStyle','RoofStyle','RoofMatl','Exterior1st','Exterior2nd','MasVnrType','ExterQual','ExterCond','Foundation','BsmtQual','BsmtCond','BsmtExposure','BsmtFinType1','BsmtFinType2','Heating','HeatingQC','CentralAir','Electrical','KitchenQual','Functional','FireplaceQu','GarageType','GarageFinish','GarageQual','GarageCond','PavedDrive','PoolQC','Fence','MiscFeature','SaleType','SaleCondition']:
        le = preprocessing.LabelEncoder()
        le.fit(df[feature])
        df[feature] = le.transform(df[feature])

In [8]:
cor = data.corr()['SalePrice']
abs_cor = abs(cor)

# 10 most highly correlated features with "SalePrice" target variable
print(abs_cor.sort_values().tail(11))

FullBath       0.560664
KitchenQual    0.589189
BsmtQual       0.593734
1stFlrSF       0.605852
TotalBsmtSF    0.613581
GarageArea     0.623431
ExterQual      0.636884
GarageCars     0.640409
GrLivArea      0.708624
OverallQual    0.790982
SalePrice      1.000000
Name: SalePrice, dtype: float64


### Regression Model on 3 features with the highest correlation with SalePrice

In [9]:
log = LogisticRegression()
X = data[['OverallQual', 'GrLivArea', 'GarageCars']]
y = data[['SalePrice']]
X_test = test[['OverallQual', 'GrLivArea', 'GarageCars']]
log.fit(X,y)

pred = pd.DataFrame(data=log.predict(X_test))
submission = test[['Id']]
submission = submission.join(pred)
submission.columns = ['Id', 'SalePrice']
print(submission.describe())

submission.to_csv("logistic-3.csv", sep=',', index=False)
# Score: 0.40495 (logistic-5 is better)

                Id      SalePrice
count  1459.000000    1459.000000
mean   2190.000000  146982.864976
std     421.321334   22758.363777
min    1461.000000   84500.000000
25%    1825.500000  140000.000000
50%    2190.000000  140000.000000
75%    2554.500000  140000.000000
max    2919.000000  256000.000000


### Linear Regression Model with features mostly correlated with SalePrice
 ** Somehow predicted negative price!!! **

In [10]:
lr = LinearRegression()
X = data[['OverallQual', 'GrLivArea', 'GarageCars']]
y = data[['SalePrice']]
X_test = test[['OverallQual', 'GrLivArea', 'GarageCars']]
lr.fit(X,y)

pred = pd.DataFrame(data=lr.predict(X_test))
submission = test[['Id']]
submission = submission.join(pred)
submission.columns = ['Id', 'SalePrice']
print(submission.describe())
submission.to_csv("linear-3.csv", sep=',', index=False)
# Score: not valid!

                Id      SalePrice
count  1459.000000    1459.000000
mean   2190.000000  178827.793489
std     421.321334   68259.964647
min    1461.000000  -14268.988673
25%    1825.500000  127642.299964
50%    2190.000000  173284.341180
75%    2554.500000  221520.313313
max    2919.000000  494297.752935


### Logistic regression with more variables

In [11]:
log = LogisticRegression()
X = data[['OverallQual', 'GrLivArea', 'GarageCars','ExterQual','GarageArea']]
y = data[['SalePrice']]
X_test = test[['OverallQual', 'GrLivArea', 'GarageCars','ExterQual','GarageArea']]
log.fit(X,y)

pred = pd.DataFrame(data=log.predict(X_test))
submission = test[['Id']]
submission = submission.join(pred)
submission.columns = ['Id', 'SalePrice']
print(submission.describe())
print("\nTrain dataset statistics:\n{}".format(data['SalePrice'].describe()))
submission.to_csv("logistic-5.csv", sep=',', index=False)
# Score: 0.32159 (better than benchmark)

                Id      SalePrice
count  1459.000000    1459.000000
mean   2190.000000  157694.729952
std     421.321334   44171.202786
min    1461.000000   35311.000000
25%    1825.500000  140000.000000
50%    2190.000000  140000.000000
75%    2554.500000  180000.000000
max    2919.000000  385000.000000

Train dataset statistics:
count      1460.000000
mean     180921.195890
std       79442.502883
min       34900.000000
25%      129975.000000
50%      163000.000000
75%      214000.000000
max      755000.000000
Name: SalePrice, dtype: float64


In [12]:
log = LogisticRegression()
X = data[['OverallQual', 'GrLivArea', 'GarageCars','ExterQual','GarageArea','TotalBsmtSF','1stFlrSF','BsmtQual','KitchenQual','FullBath']]
y = data[['SalePrice']]
X_test = test[['OverallQual', 'GrLivArea', 'GarageCars','ExterQual','GarageArea','TotalBsmtSF','1stFlrSF','BsmtQual','KitchenQual','FullBath']]
log.fit(X,y)

pred = pd.DataFrame(data=log.predict(X_test))
submission = test[['Id']]
submission = submission.join(pred)
submission.columns = ['Id', 'SalePrice']
print(submission.describe())
print("\nTrain dataset statistics:\n{}".format(data['SalePrice'].describe()))
submission.to_csv("logistic-10.csv", sep=',', index=False)
# Score: 0.29927 (better but not as good as decision tree)

                Id     SalePrice
count  1459.000000    1459.00000
mean   2190.000000  170238.43523
std     421.321334   61250.37737
min    1461.000000   34900.00000
25%    1825.500000  135000.00000
50%    2190.000000  160000.00000
75%    2554.500000  185000.00000
max    2919.000000  611657.00000

Train dataset statistics:
count      1460.000000
mean     180921.195890
std       79442.502883
min       34900.000000
25%      129975.000000
50%      163000.000000
75%      214000.000000
max      755000.000000
Name: SalePrice, dtype: float64


In [None]:
print(test['SaleCondition'].isnull().sum())
# test['MSZoning']
# print(data['SalePrice'].describe())

### Feature Engineering
Use PCA to develop new features from combinations of raw features (like in market segmentation problem). Use these new features as input to logistic regression, decision tree or neural network models.

In [None]:
# Apply PCA by fitting the good data with the same number of dimensions as features
pca = PCA(n_components = 6).fit(data)

print(pca.explained_variance_ratio_[0] + pca.explained_variance_ratio_[1])
print(pca.explained_variance_ratio_[0] + pca.explained_variance_ratio_[1] + pca.explained_variance_ratio_[2] + pca.explained_variance_ratio_[3])

In [None]:
#  Transform the data using the PCA fit above
reduced_data = pca.transform(data)
pca_test = PCA(n_components = 6).fit(test)
reduced_test = pca_test.transform(test)
# Create a DataFrame for the reduced data
reduced_data = pd.DataFrame(reduced_data, columns = ['Dimension 1', 'Dimension 2','Dimension 3','Dimension 4','Dimension 5','Dimension 6'])
y = data[['SalePrice']]
log = LogisticRegression()
log.fit(reduced_data,y)
pred = pd.DataFrame(data=log.predict(reduced_test))
submission = test[['Id']]
submission = submission.join(pred)
submission.columns = ['Id', 'SalePrice']
print(submission.describe())
print("\nTrain dataset statistics:\n{}".format(data['SalePrice'].describe()))
submission.to_csv("logistic-pca.csv", sep=',', index=False)
# Score: 0.46361 (not good)

In [13]:
from sklearn.metrics import r2_score as r2
def performance_metric(y_true, y_predict):
    """ Calculates and returns the performance score between 
        true and predicted values based on the metric chosen. """
    
    # Calculates the performance score between 'y_true' and 'y_predict'
    score = r2(y_true, y_predict)
    return score

In [14]:
# grid search cv
def fit_model(X, y):
    """ Performs grid search over the 'max_depth' parameter for a 
        decision tree regressor trained on the input data [X, y]. """
    
    # Create cross-validation sets from the training data
    cv_sets = ShuffleSplit(n_splits=10, test_size = 0.20, random_state = 0)
    # Creates a decision tree regressor object
    regressor = DecisionTreeRegressor()
    # Creates a dictionary for the parameter 'max_depth' with a range from 1 to 10
    params = {'max_depth': range(1, 11)}
    # Transforms 'performance_metric' into a scoring function using 'make_scorer' 
    scoring_fnc = make_scorer(performance_metric)
    # Creates the grid search object
    grid = GridSearchCV(estimator=regressor, param_grid=params, scoring=scoring_fnc, cv=cv_sets)
    # Fit the grid search object to the data to compute the optimal model
    grid = grid.fit(X, y)
    # Return the optimal model after fitting the data
    return grid.best_estimator_

In [None]:
X = data[['OverallQual', 'GrLivArea', 'GarageCars','ExterQual','GarageArea','TotalBsmtSF','1stFlrSF','BsmtQual','KitchenQual','FullBath']]
y = data[['SalePrice']]
X_test = test[['OverallQual', 'GrLivArea', 'GarageCars','ExterQual','GarageArea','TotalBsmtSF','1stFlrSF','BsmtQual','KitchenQual','FullBath']]
reg = fit_model(X, y)

pred = pd.DataFrame(data=reg.predict(X_test))
submission = test[['Id']]
submission = submission.join(pred)
submission.columns = ['Id', 'SalePrice']
print(submission.describe())
print("\nTrain dataset statistics:\n{}".format(data['SalePrice'].describe()))
submission.to_csv("regressor.csv", sep=',', index=False)
# Score: 0.19717 (best so far!) 

Foundation      0.382479
BsmtFinSF1      0.386420
HeatingQC       0.400178
GarageType      0.415283
GarageFinish    0.425684
Fireplaces      0.466929
MasVnrArea      0.472614
YearRemodAdd    0.507101
YearBuilt       0.522897
TotRmsAbvGrd    0.533723
FullBath        0.560664
KitchenQual     0.589189
BsmtQual        0.593734
1stFlrSF        0.605852
TotalBsmtSF     0.613581
GarageArea      0.623431
ExterQual       0.636884
GarageCars      0.640409
GrLivArea       0.708624
OverallQual     0.790982

In [15]:
X = data[['OverallQual', 'GrLivArea', 'GarageCars','ExterQual','GarageArea','TotalBsmtSF','1stFlrSF','BsmtQual','KitchenQual',
          'FullBath','TotRmsAbvGrd','YearBuilt','YearRemodAdd','MasVnrArea']]
y = data[['SalePrice']]
X_test = test[['OverallQual', 'GrLivArea', 'GarageCars','ExterQual','GarageArea','TotalBsmtSF','1stFlrSF','BsmtQual','KitchenQual','FullBath','TotRmsAbvGrd','YearBuilt','YearRemodAdd','MasVnrArea']]
reg = fit_model(X, y)

pred = pd.DataFrame(data=reg.predict(X_test))
submission = test[['Id']]
submission = submission.join(pred)
submission.columns = ['Id', 'SalePrice']
print(submission.describe())
print("\nTrain dataset statistics:\n{}".format(data['SalePrice'].describe()))
submission.to_csv("regressor-14.csv", sep=',', index=False)
# Score: 0.19717 (best so far!) 

                Id      SalePrice
count  1459.000000    1459.000000
mean   2190.000000  179276.812133
std     421.321334   73579.916881
min    1461.000000   48652.750000
25%    1825.500000  127820.205882
50%    2190.000000  161169.031250
75%    2554.500000  201156.913793
max    2919.000000  547285.400000

Train dataset statistics:
count      1460.000000
mean     180921.195890
std       79442.502883
min       34900.000000
25%      129975.000000
50%      163000.000000
75%      214000.000000
max      755000.000000
Name: SalePrice, dtype: float64


### PCA on 10 dimensions

In [None]:
X = data[['OverallQual', 'GrLivArea', 'GarageCars','ExterQual','GarageArea','TotalBsmtSF','1stFlrSF','BsmtQual','KitchenQual','FullBath']]
y = data[['SalePrice']]
X_test = test[['OverallQual', 'GrLivArea', 'GarageCars','ExterQual','GarageArea','TotalBsmtSF','1stFlrSF','BsmtQual','KitchenQual','FullBath']]
pca = PCA(n_components = 4).fit(X)

print(pca.explained_variance_ratio_[0] + pca.explained_variance_ratio_[1])
print(pca.explained_variance_ratio_[0] + pca.explained_variance_ratio_[1] + pca.explained_variance_ratio_[2] + + pca.explained_variance_ratio_[3] )

#  Transform the data using the PCA fit above
reduced_data = pca.transform(X)
# pca_test = PCA(n_components = 4).fit(X_test)
reduced_test = pca.transform(X_test)

# Create a DataFrame for the reduced data
reduced_data = pd.DataFrame(reduced_data, columns = ['Dimension 1', 'Dimension 2','Dimension 3','Dimension 4'])

# log = LogisticRegression()
# log.fit(reduced_data,y)

reg = fit_model(reduced_data, y)

pred = pd.DataFrame(data=reg.predict(reduced_test))
submission = test[['Id']]
submission = submission.join(pred)
submission.columns = ['Id', 'SalePrice']
print(submission.describe())
print("\nTrain dataset statistics:\n{}".format(data['SalePrice'].describe()))
submission.to_csv("reg-pca10.csv", sep=',', index=False)
# Score: 0.23471

In [None]:
# loop over classifiers
X = data[['OverallQual', 'GrLivArea', 'GarageCars','ExterQual','GarageArea','TotalBsmtSF','1stFlrSF','BsmtQual','KitchenQual','FullBath']]
y = data[['SalePrice']]
X_test = test[['OverallQual', 'GrLivArea', 'GarageCars','ExterQual','GarageArea','TotalBsmtSF','1stFlrSF','BsmtQual','KitchenQual','FullBath']]

print("\nTrain dataset statistics:\n{}".format(data['SalePrice'].describe()))

names = ["NearestNeighbors.csv", "LinearSVM.csv", "RBF-SVM.csv", 
         "DecisionTree.csv", "RandomForest.csv", "NeuralNet.csv", 
         "AdaBoost.csv", "NaiveBayes.csv", "QDA.csv", "GaussianProcess.csv"]

classifiers = [
    KNeighborsClassifier(3),
    SVC(kernel="linear", C=0.025),
    SVC(gamma=2, C=1),
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    MLPClassifier(alpha=1),
    AdaBoostClassifier(),
    QuadraticDiscriminantAnalysis(),
    GaussianProcessClassifier(1.0 * RBF(1.0), warm_start=True)]

# iterate over classifiers
for name, clf in zip(names, classifiers):
    clf.fit(X , y)
    pred = pd.DataFrame(data=clf.predict(X_test))
    submission = test[['Id']]
    submission = submission.join(pred)
    submission.columns = ['Id', 'SalePrice']
    print(submission.describe())
    submission.to_csv(name, sep=',', index=False)
# Scores: 
# Nearest Neighbors - 0.32036
# Linear SVM - 0.26192
# RBF-SVM - 0.44673
# DecisionTree - 0.28187
# RandomForest - 0.24847
# NeuralNet - 0.48200
# AdaBoost - 0.32510
# NaiveBayes - 0.24422
# QDA
# GaussianProcess



In [18]:
from sklearn.linear_model import BayesianRidge
X = data[['OverallQual', 'GrLivArea', 'GarageCars','ExterQual','GarageArea','TotalBsmtSF','1stFlrSF','BsmtQual','KitchenQual',
          'FullBath','TotRmsAbvGrd','YearBuilt','YearRemodAdd','MasVnrArea']]
y = data[['SalePrice']]
X_test = test[['OverallQual', 'GrLivArea', 'GarageCars','ExterQual','GarageArea','TotalBsmtSF','1stFlrSF','BsmtQual','KitchenQual','FullBath','TotRmsAbvGrd','YearBuilt','YearRemodAdd','MasVnrArea']]

clf = BayesianRidge(compute_score=True)
clf.fit(X, y)
pred = pd.DataFrame(data=reg.predict(X_test))
submission = test[['Id']]
submission = submission.join(pred)
submission.columns = ['Id', 'SalePrice']
print(submission.describe())
print("\nTrain dataset statistics:\n{}".format(data['SalePrice'].describe()))
submission.to_csv("bayesian-ridge.csv", sep=',', index=False)
# Score: 0.20269

                Id      SalePrice
count  1459.000000    1459.000000
mean   2190.000000  179276.812133
std     421.321334   73579.916881
min    1461.000000   48652.750000
25%    1825.500000  127820.205882
50%    2190.000000  161169.031250
75%    2554.500000  201156.913793
max    2919.000000  547285.400000

Train dataset statistics:
count      1460.000000
mean     180921.195890
std       79442.502883
min       34900.000000
25%      129975.000000
50%      163000.000000
75%      214000.000000
max      755000.000000
Name: SalePrice, dtype: float64


In [27]:
# grid search cv
def fit_model(X, y):
    """ Performs grid search over the 'max_depth' parameter for a 
        decision tree regressor trained on the input data [X, y]. """
    
    # Create cross-validation sets from the training data
    cv_sets = ShuffleSplit(n_splits=10, test_size = 0.20, random_state = 0)
    # Creates a decision tree regressor object
    regressor = BayesianRidge(compute_score=True)
    # Creates a dictionary for the parameter 
    params = {'alpha_1': range(200) }
    # Transforms 'performance_metric' into a scoring function using 'make_scorer' 
    scoring_fnc = make_scorer(performance_metric)
    # Creates the grid search object
    grid = GridSearchCV(estimator=regressor, param_grid=params, scoring=scoring_fnc, cv=cv_sets)
    # Fit the grid search object to the data to compute the optimal model
    grid = grid.fit(X, y)
    # Return the optimal model after fitting the data
    print(grid.best_estimator_)
    return grid.best_estimator_

In [28]:
reg = fit_model(X, y)

pred = pd.DataFrame(data=reg.predict(X_test))
submission = test[['Id']]
submission = submission.join(pred)
submission.columns = ['Id', 'SalePrice']
print(submission.describe())
print("\nTrain dataset statistics:\n{}".format(data['SalePrice'].describe()))
submission.to_csv("bayesian-grid.csv", sep=',', index=False)
# Score: 0.23404

BayesianRidge(alpha_1=199, alpha_2=1e-06, compute_score=True, copy_X=True,
       fit_intercept=True, lambda_1=1e-06, lambda_2=1e-06, n_iter=300,
       normalize=False, tol=0.001, verbose=False)
                Id      SalePrice
count  1459.000000    1459.000000
mean   2190.000000  173433.654005
std     421.321334   74749.538592
min    1461.000000    4775.342571
25%    1825.500000  117934.829467
50%    2190.000000  157247.378564
75%    2554.500000  217363.053192
max    2919.000000  636577.240892

Train dataset statistics:
count      1460.000000
mean     180921.195890
std       79442.502883
min       34900.000000
25%      129975.000000
50%      163000.000000
75%      214000.000000
max      755000.000000
Name: SalePrice, dtype: float64


In [35]:
from sklearn.tree import DecisionTreeRegressor
# grid search cv
def fit_model(X, y):
    """ Performs grid search over the 'max_depth' parameter for a 
        decision tree regressor trained on the input data [X, y]. """
    # Create cross-validation sets from the training data
    cv_sets = ShuffleSplit(n_splits=20, test_size = 0.20, random_state = 0)
    # Creates a decision tree regressor object
    regressor = DecisionTreeRegressor()
    # Creates a dictionary for the parameter 
    params = {'max_depth': range(1,12), 'min_samples_split': range(2,20) }
    # Transforms 'performance_metric' into a scoring function using 'make_scorer' 
    scoring_fnc = make_scorer(performance_metric)
    # Creates the grid search object
    grid = GridSearchCV(estimator=regressor, param_grid=params, scoring=scoring_fnc, cv=cv_sets)
    # Fit the grid search object to the data to compute the optimal model
    grid = grid.fit(X, y)
    # Return the optimal model after fitting the data
    print(grid.best_estimator_)
    return grid.best_estimator_

In [36]:
X = data[['OverallQual', 'GrLivArea', 'GarageCars','ExterQual','GarageArea','TotalBsmtSF','1stFlrSF','BsmtQual','KitchenQual',
          'FullBath','TotRmsAbvGrd','YearBuilt','YearRemodAdd','MasVnrArea']]
y = data[['SalePrice']]
X_test = test[['OverallQual', 'GrLivArea', 'GarageCars','ExterQual','GarageArea','TotalBsmtSF','1stFlrSF','BsmtQual','KitchenQual','FullBath','TotRmsAbvGrd','YearBuilt','YearRemodAdd','MasVnrArea']]
reg = fit_model(X, y)
pred = pd.DataFrame(data=reg.predict(X_test))
submission = test[['Id']]
submission = submission.join(pred)
submission.columns = ['Id', 'SalePrice']
print(submission.describe())
print("\nTrain dataset statistics:\n{}".format(data['SalePrice'].describe()))
submission.to_csv("decision-tree-grid.csv", sep=',', index=False)
# Score: 0.19998

DecisionTreeRegressor(criterion='mse', max_depth=8, max_features=None,
           max_leaf_nodes=None, min_impurity_split=1e-07,
           min_samples_leaf=1, min_samples_split=15,
           min_weight_fraction_leaf=0.0, presort=False, random_state=None,
           splitter='best')
                Id      SalePrice
count  1459.000000    1459.000000
mean   2190.000000  179543.504134
std     421.321334   75219.955448
min    1461.000000   48652.750000
25%    1825.500000  126314.757576
50%    2190.000000  164643.750000
75%    2554.500000  206700.000000
max    2919.000000  597000.000000

Train dataset statistics:
count      1460.000000
mean     180921.195890
std       79442.502883
min       34900.000000
25%      129975.000000
50%      163000.000000
75%      214000.000000
max      755000.000000
Name: SalePrice, dtype: float64
