In [51]:
# Imports
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, make_scorer
from scipy.stats import skew
from IPython.display import display
import matplotlib.pyplot as plt
import seaborn as sns

# Definitions
pd.set_option('display.float_format', lambda x: '%.3f' % x)
%matplotlib inline
#njobs = 4

In [52]:
# Get data
train = pd.read_csv("train.csv")
print("train : " + str(train.shape))

train : (1460, 81)


In [53]:
train.head()

Unnamed: 0,TrainId,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [54]:
# Check for duplicates   
if len(train.TrainId) == len(set(train.TrainId)):
    print "no duplicates"
else:
    print "duplicates"
train = train.drop("TrainId", axis = 1)

no duplicates


In [55]:
train = train.ix[~((train.GrLivArea>4000) & (train.SalePrice<300000))]

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate_ix
  """Entry point for launching an IPython kernel.


In [56]:
train_labels = train.SalePrice
train_features = train.drop(['SalePrice'],axis=1)

In [57]:
train_features.Alley = train_features.Alley.fillna("None")
train_features.BedroomAbvGr = train_features.BedroomAbvGr.fillna(0)
train_features.BsmtQual = train_features.BsmtQual.fillna("No")
train_features.BsmtCond = train_features.BsmtCond.fillna("No")
train_features.BsmtExposure = train_features.BsmtExposure.fillna("No")
train_features.BsmtFinType1 = train_features.BsmtFinType1.fillna("No")
train_features.BsmtFinType2 = train_features.BsmtFinType2.fillna("No")
train_features.BsmtFullBath = train_features.BsmtFullBath.fillna(0)
train_features.BsmtHalfBath = train_features.BsmtHalfBath.fillna(0)
train_features.BsmtUnfSF = train_features.BsmtUnfSF.fillna(0)
train_features.CentralAir = train_features.CentralAir.fillna("N")
train_features.Condition1 = train_features.Condition1.fillna("Norm")
train_features.Condition2 = train_features.Condition2.fillna("Norm")
train_features.EnclosedPorch = train_features.EnclosedPorch.fillna(0)
train_features.ExterCond = train_features.ExterCond.fillna("TA")
train_features.ExterQual = train_features.ExterQual.fillna("TA")
train_features.Fence = train_features.Fence.fillna("NA")
train_features.FireplaceQu = train_features.FireplaceQu.fillna("No")
train_features.Fireplaces = train_features.Fireplaces.fillna(0)
train_features.Functional = train_features.Functional.fillna("Typ")
train_features.GarageType = train_features.GarageType.fillna("NA")
train_features.GarageFinish = train_features.GarageFinish.fillna("NA")
train_features.GarageQual = train_features.GarageQual.fillna("No")
train_features.GarageCond = train_features.GarageCond.fillna("No")
train_features.GarageArea = train_features.GarageArea.fillna(0)
train_features.GarageCars = train_features.GarageCars.fillna(0)
train_features.HalfBath = train_features.HalfBath.fillna(0)
train_features.HeatingQC = train_features.HeatingQC.fillna("TA")
train_features.KitchenAbvGr = train_features.KitchenAbvGr.fillna(0)
train_features.KitchenQual = train_features.KitchenQual.fillna("TA")
train_features.LotShape = train_features.LotShape.fillna("Reg")
train_features.MasVnrType = train_features.MasVnrType.fillna("None")
train_features.MasVnrArea = train_features.MasVnrArea.fillna(0)
train_features.MiscFeature = train_features.MiscFeature.fillna("NA")
train_features.MiscVal = train_features.MiscVal.fillna(0)
train_features.OpenPorchSF = train_features.OpenPorchSF.fillna(0)
train_features.PavedDrive = train_features.PavedDrive.fillna("N")
train_features.PoolQC = train_features.PoolQC.fillna("No")
train_features.PoolArea = train_features.PoolArea.fillna(0)
train_features.SaleCondition = train_features.SaleCondition.fillna("Normal")
train_features.ScreenPorch = train_features.ScreenPorch.fillna(0)
train_features.TotRmsAbvGrd = train_features.TotRmsAbvGrd.fillna(0)
train_features.Utilities = train_features.Utilities.fillna("AllPub")
train_features.WoodDeckSF = train_features.WoodDeckSF.fillna(0)

In [58]:
train_features.Alley = train_features.Alley.replace(["None","Grvl","Pave"],[0,1,2])
train_features.BsmtQual = train_features.BsmtQual.replace(["No","Po","Fa","TA","Gd","Ex"],[0,1,2,3,4,5])
train_features.BsmtCond = train_features.BsmtCond.replace(["No","Po","Fa","TA","Gd","Ex"],[0,1,2,3,4,5])
train_features.BsmtExposure = train_features.BsmtExposure.replace(["No","Mn","Av","Gd"],[0,1,2,3])
train_features.BsmtFinType1 = train_features.BsmtFinType1.replace(["No" ,"Unf","LwQ","Rec","BLQ","ALQ","GLQ"],[0,1,2,3,4,5,6])
train_features.BsmtFinType2 = train_features.BsmtFinType2.replace(["No" ,"Unf","LwQ","Rec","BLQ","ALQ","GLQ"],[0,1,2,3,4,5,6])
train_features.CentralAir = train_features.CentralAir.replace(["N","Y"],[0,1])
train_features.Condition1 = train_features.Condition1.replace(["Artery","Feedr","Norm","RRNn","RRAn","PosN","PosA","RRNe","RRAe"],[0,1,2,3,4,5,6,7,8])
train_features.Condition2 = train_features.Condition2.replace(["Artery","Feedr","Norm","RRNn","RRAn","PosN","PosA","RRNe","RRAe"],[0,1,2,3,4,5,6,7,8])
train_features.ExterCond = train_features.ExterCond.replace(["Po","Fa","TA","Gd","Ex"],[1,2,3,4,5])
train_features.ExterQual = train_features.ExterQual.replace(["Po","Fa","TA","Gd","Ex"],[1,2,3,4,5])
train_features.Fence = train_features.Fence.replace(["GdPrv","MnPrv","GdWo","MnWw","NA"],[0,1,2,3,4])
train_features.FireplaceQu = train_features.FireplaceQu.replace(["No","Po","Fa","TA","Gd","Ex"],[0,1,2,3,4,5])
train_features.Functional = train_features.Functional.replace(["Sal","Sev","Maj2","Maj1","Mod","Min2","Min1","Typ"],[1,2,3,4,5,6,7,8])
train_features.GarageType = train_features.GarageType.replace(["2Types","Attchd","Basment","BuiltIn","CarPort","Detchd","NA"],[0,1,2,3,4,5,6])
train_features.GarageFinish = train_features.GarageFinish.replace(["Fin","RFn","Unf","NA"],[0,1,2,3])
train_features.GarageQual = train_features.GarageQual.replace(["No","Po","Fa","TA","Gd","Ex"],[0,1,2,3,4,5])
train_features.GarageCond = train_features.GarageCond.replace(["No","Po","Fa","TA","Gd","Ex"],[0,1,2,3,4,5])
train_features.HeatingQC = train_features.HeatingQC.replace(["Po","Fa","TA","Gd","Ex"],[1,2,3,4,5])
train_features.KitchenQual = train_features.KitchenQual.replace(["Po","Fa","TA","Gd","Ex"],[1,2,3,4,5])
train_features.LotShape = train_features.LotShape.replace(["IR3","IR2","IR1","Reg"],[1,2,3,4])
train_features.MasVnrType = train_features.MasVnrType.replace(["BrkCmn","BrkFace","CBlock","None","Stone"],[0,1,2,3,4])
train_features.MiscFeature = train_features.MiscFeature.replace(["Elev","Gar2","Othr","Shed","TenC","NA"],[0,1,2,3,4,5])
train_features.PavedDrive = train_features.PavedDrive.replace(["N","P","Y"],[0,1,2])
train_features.PoolQC = train_features.PoolQC.replace(["No","Fa","TA","Gd","Ex"],[0,1,2,3,4])
train_features.SaleCondition = train_features.SaleCondition.replace(["Abnorml","Alloca","AdjLand","Family","Normal","Partial"],[0,1,2,3,4,5])
train_features.Utilities = train_features.Utilities.replace(["ELO","NoSeWa","NoSewr","AllPub"],[1,2,3,4])

In [59]:
train_features = train_features[['GrLivArea','Alley','BedroomAbvGr','BsmtQual','BsmtCond','BsmtExposure','BsmtFinType1','BsmtFinType2','BsmtFullBath','BsmtHalfBath','BsmtUnfSF','CentralAir','Condition1','Condition2','EnclosedPorch','ExterCond','ExterQual','Fence','FireplaceQu','Fireplaces','Functional','GarageType','GarageFinish','GarageQual','GarageCond','GarageArea','GarageCars','HalfBath','HeatingQC','KitchenAbvGr','KitchenQual','LotShape','MasVnrType','MasVnrArea','MiscFeature','MiscVal','OpenPorchSF','PavedDrive','PoolQC','PoolArea','SaleCondition','ScreenPorch','TotRmsAbvGrd','Utilities','WoodDeckSF']]
train_features.columns.size

45

In [60]:
# Get data
test = pd.read_csv("test.csv")
print("test : " + str(test.shape))

test : (1459, 80)


In [61]:
# Check for duplicates   
if len(test.Id) == len(set(test.Id)):
    print "no duplicates"
else:
    print "duplicates"
test_ids = test["Id"]
test = test.drop("Id", axis = 1)

no duplicates


In [62]:
test.Alley = test.Alley.fillna("None")
test.BedroomAbvGr = test.BedroomAbvGr.fillna(0)
test.BsmtQual = test.BsmtQual.fillna("No")
test.BsmtCond = test.BsmtCond.fillna("No")
test.BsmtExposure = test.BsmtExposure.fillna("No")
test.BsmtFinType1 = test.BsmtFinType1.fillna("No")
test.BsmtFinType2 = test.BsmtFinType2.fillna("No")
test.BsmtFullBath = test.BsmtFullBath.fillna(0)
test.BsmtHalfBath = test.BsmtHalfBath.fillna(0)
test.BsmtUnfSF = test.BsmtUnfSF.fillna(0)
test.CentralAir = test.CentralAir.fillna("N")
test.Condition1 = test.Condition1.fillna("Norm")
test.Condition2 = test.Condition2.fillna("Norm")
test.EnclosedPorch = test.EnclosedPorch.fillna(0)
test.ExterCond = test.ExterCond.fillna("TA")
test.ExterQual = test.ExterQual.fillna("TA")
test.Fence = test.Fence.fillna("NA")
test.FireplaceQu = test.FireplaceQu.fillna("No")
test.Fireplaces = test.Fireplaces.fillna(0)
test.Functional = test.Functional.fillna("Typ")
test.GarageType = test.GarageType.fillna("NA")
test.GarageFinish = test.GarageFinish.fillna("NA")
test.GarageQual = test.GarageQual.fillna("No")
test.GarageCond = test.GarageCond.fillna("No")
test.GarageArea = test.GarageArea.fillna(0)
test.GarageCars = test.GarageCars.fillna(0)
test.HalfBath = test.HalfBath.fillna(0)
test.HeatingQC = test.HeatingQC.fillna("TA")
test.KitchenAbvGr = test.KitchenAbvGr.fillna(0)
test.KitchenQual = test.KitchenQual.fillna("TA")
test.LotShape = test.LotShape.fillna("Reg")
test.MasVnrType = test.MasVnrType.fillna("None")
test.MasVnrArea = test.MasVnrArea.fillna(0)
test.MiscFeature = test.MiscFeature.fillna("NA")
test.MiscVal = test.MiscVal.fillna(0)
test.OpenPorchSF = test.OpenPorchSF.fillna(0)
test.PavedDrive = test.PavedDrive.fillna("N")
test.PoolQC = test.PoolQC.fillna("No")
test.PoolArea = test.PoolArea.fillna(0)
test.SaleCondition = test.SaleCondition.fillna("Normal")
test.ScreenPorch = test.ScreenPorch.fillna(0)
test.TotRmsAbvGrd = test.TotRmsAbvGrd.fillna(0)
test.Utilities = test.Utilities.fillna("AllPub")
test.WoodDeckSF = test.WoodDeckSF.fillna(0)

In [63]:
test.Alley = test.Alley.replace(["None","Grvl","Pave"],[0,1,2])
test.BsmtQual = test.BsmtQual.replace(["No","Po","Fa","TA","Gd","Ex"],[0,1,2,3,4,5])
test.BsmtCond = test.BsmtCond.replace(["No","Po","Fa","TA","Gd","Ex"],[0,1,2,3,4,5])
test.BsmtExposure = test.BsmtExposure.replace(["No","Mn","Av","Gd"],[0,1,2,3])
test.BsmtFinType1 = test.BsmtFinType1.replace(["No" ,"Unf","LwQ","Rec","BLQ","ALQ","GLQ"],[0,1,2,3,4,5,6])
test.BsmtFinType2 = test.BsmtFinType2.replace(["No" ,"Unf","LwQ","Rec","BLQ","ALQ","GLQ"],[0,1,2,3,4,5,6])
test.CentralAir = test.CentralAir.replace(["N","Y"],[0,1])
test.Condition1 = test.Condition1.replace(["Artery","Feedr","Norm","RRNn","RRAn","PosN","PosA","RRNe","RRAe"],[0,1,2,3,4,5,6,7,8])
test.Condition2 = test.Condition2.replace(["Artery","Feedr","Norm","RRNn","RRAn","PosN","PosA","RRNe","RRAe"],[0,1,2,3,4,5,6,7,8])
test.ExterCond = test.ExterCond.replace(["Po","Fa","TA","Gd","Ex"],[1,2,3,4,5])
test.ExterQual = test.ExterQual.replace(["Po","Fa","TA","Gd","Ex"],[1,2,3,4,5])
test.Fence = test.Fence.replace(["GdPrv","MnPrv","GdWo","MnWw","NA"],[0,1,2,3,4])
test.FireplaceQu = test.FireplaceQu.replace(["No","Po","Fa","TA","Gd","Ex"],[0,1,2,3,4,5])
test.Functional = test.Functional.replace(["Sal","Sev","Maj2","Maj1","Mod","Min2","Min1","Typ"],[1,2,3,4,5,6,7,8])
test.GarageType = test.GarageType.replace(["2Types","Attchd","Basment","BuiltIn","CarPort","Detchd","NA"],[0,1,2,3,4,5,6])
test.GarageFinish = test.GarageFinish.replace(["Fin","RFn","Unf","NA"],[0,1,2,3])
test.GarageQual = test.GarageQual.replace(["No","Po","Fa","TA","Gd","Ex"],[0,1,2,3,4,5])
test.GarageCond = test.GarageCond.replace(["No","Po","Fa","TA","Gd","Ex"],[0,1,2,3,4,5])
test.HeatingQC = test.HeatingQC.replace(["Po","Fa","TA","Gd","Ex"],[1,2,3,4,5])
test.KitchenQual = test.KitchenQual.replace(["Po","Fa","TA","Gd","Ex"],[1,2,3,4,5])
test.LotShape = test.LotShape.replace(["IR3","IR2","IR1","Reg"],[1,2,3,4])
test.MasVnrType = test.MasVnrType.replace(["BrkCmn","BrkFace","CBlock","None","Stone"],[0,1,2,3,4])
test.MiscFeature = test.MiscFeature.replace(["Elev","Gar2","Othr","Shed","TenC","NA"],[0,1,2,3,4,5])
test.PavedDrive = test.PavedDrive.replace(["N","P","Y"],[0,1,2])
test.PoolQC = test.PoolQC.replace(["No","Fa","TA","Gd","Ex"],[0,1,2,3,4])
test.SaleCondition = test.SaleCondition.replace(["Abnorml","Alloca","AdjLand","Family","Normal","Partial"],[0,1,2,3,4,5])
test.Utilities = test.Utilities.replace(["ELO","NoSeWa","NoSewr","AllPub"],[1,2,3,4])

In [64]:
test_features = test[['GrLivArea','Alley','BedroomAbvGr','BsmtQual','BsmtCond','BsmtExposure','BsmtFinType1','BsmtFinType2','BsmtFullBath','BsmtHalfBath','BsmtUnfSF','CentralAir','Condition1','Condition2','EnclosedPorch','ExterCond','ExterQual','Fence','FireplaceQu','Fireplaces','Functional','GarageType','GarageFinish','GarageQual','GarageCond','GarageArea','GarageCars','HalfBath','HeatingQC','KitchenAbvGr','KitchenQual','LotShape','MasVnrType','MasVnrArea','MiscFeature','MiscVal','OpenPorchSF','PavedDrive','PoolQC','PoolArea','SaleCondition','ScreenPorch','TotRmsAbvGrd','Utilities','WoodDeckSF']]
test_features.columns.size

45

In [65]:
print(train_features.shape)
print(train_labels.shape)
print(test_features.shape)

(1458, 45)
(1458,)
(1459, 45)


In [66]:
from sklearn.ensemble import RandomForestRegressor
regr = RandomForestRegressor(max_depth=3, random_state=0)
regr.fit(train_features, train_labels)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=3,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=1, oob_score=False, random_state=0,
           verbose=0, warm_start=False)

In [67]:
test_labels = regr.predict(test_features)
test_features["SalePrice"] = test_labels
test_features["Id"] = test_ids

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [71]:
test_results = test_features[['Id','SalePrice']]
file_name = "submission.csv"
test_results.to_csv(file_name, encoding='utf-8', index=False)
print(test_results)

        Id  SalePrice
0     1461 134901.519
1     1462 137498.091
2     1463 184304.117
3     1464 185159.916
4     1465 173207.906
5     1466 176097.286
6     1467 142646.398
7     1468 163504.600
8     1469 149161.144
9     1470 138953.493
10    1471 173207.906
11    1472 116424.110
12    1473 119473.013
13    1474 159110.174
14    1475 138594.424
15    1476 367080.910
16    1477 283706.178
17    1478 312903.291
18    1479 288075.616
19    1480 394433.100
20    1481 295587.149
21    1482 173207.906
22    1483 173207.906
23    1484 173207.906
24    1485 173207.906
25    1486 223630.159
26    1487 311837.005
27    1488 283706.178
28    1489 191266.814
29    1490 191869.745
...    ...        ...
1429  2890 116424.110
1430  2891 171605.228
1431  2892 107830.058
1432  2893 119738.796
1433  2894 107830.058
1434  2895 229491.733
1435  2896 223630.159
1436  2897 184304.117
1437  2898 161425.692
1438  2899 209764.960
1439  2900 138953.493
1440  2901 185000.894
1441  2902 173207.906
1442  2903