In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy.stats import norm
from sklearn.preprocessing import StandardScaler
from scipy import stats
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import *
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression

In [2]:
df_train = pd.read_csv('train.csv')

In [3]:
#missing data
total = df_train.isnull().sum().sort_values(ascending=False)
percent = (df_train.isnull().sum()/df_train.isnull().count()).sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_data.head(20)

Unnamed: 0,Total,Percent
PoolQC,1453,0.995205
MiscFeature,1406,0.963014
Alley,1369,0.937671
Fence,1179,0.807534
FireplaceQu,690,0.472603
LotFrontage,259,0.177397
GarageCond,81,0.055479
GarageType,81,0.055479
GarageYrBlt,81,0.055479
GarageFinish,81,0.055479


In [4]:
#dealing with missing data
df_train = df_train.drop((missing_data[missing_data['Total'] > 1]).index,1)
df_train = df_train.drop(df_train.loc[df_train['Electrical'].isnull()].index)
df_train.isnull().sum().max() #just checking that there's no missing data missing..

0

In [5]:
y = df_train['SalePrice']
X = df_train.iloc[:,0:62]

In [6]:
from sklearn.preprocessing import LabelEncoder
for column in X.columns:
    if X[column].dtype == type(object):
        le = LabelEncoder()
        X[column] = le.fit_transform(X[column])

In [7]:
# Find most important features relative to target
print("Find most important features relative to target")
corr = df_train.corr()
corr.sort_values(["SalePrice"], ascending = False, inplace = True)
print(corr.SalePrice)

Find most important features relative to target
SalePrice        1.000000
OverallQual      0.791069
GrLivArea        0.708618
GarageCars       0.640473
GarageArea       0.623423
TotalBsmtSF      0.613905
1stFlrSF         0.605968
FullBath         0.560881
TotRmsAbvGrd     0.533779
YearBuilt        0.523273
YearRemodAdd     0.507430
Fireplaces       0.466968
BsmtFinSF1       0.386436
WoodDeckSF       0.324422
2ndFlrSF         0.319464
OpenPorchSF      0.315831
HalfBath         0.284400
LotArea          0.263837
BsmtFullBath     0.227082
BsmtUnfSF        0.214446
BedroomAbvGr     0.168235
ScreenPorch      0.111419
PoolArea         0.092397
MoSold           0.046380
3SsnPorch        0.044571
BsmtFinSF2      -0.011412
BsmtHalfBath    -0.016873
MiscVal         -0.021200
Id              -0.021756
LowQualFinSF    -0.025620
YrSold          -0.028907
OverallCond     -0.077924
MSSubClass      -0.084230
EnclosedPorch   -0.128627
KitchenAbvGr    -0.135935
Name: SalePrice, dtype: float64


In [8]:
clf = RandomForestRegressor(n_estimators=1000)
clf = clf.fit(X, y)
#clf.feature_importances_

In [9]:
model = SelectFromModel(clf, prefit=True)
X_new = model.transform(X)

X_new.shape

(1459, 8)

In [10]:
selected_feat= X.columns[(model.get_support())]
selected_feat

Index(['OverallQual', 'BsmtFinSF1', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF',
       'GrLivArea', 'GarageCars', 'GarageArea'],
      dtype='object')

In [11]:
X_new = X_new[:,[0,1,3,4,5,7]]

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X_new, y, test_size=0.33, random_state=46)
X_train.shape

(977, 6)

In [13]:
# Define error measure for official scoring : RMSE
scorer = make_scorer(mean_squared_error, greater_is_better = False)

def rmse_cv_train(model,X1,y1):
    rmse= np.sqrt(-cross_val_score(model, X1, y1, scoring = scorer, cv = 10))
    return(rmse)

def rmse_cv_test(model,X1,y1):
    rmse= np.sqrt(-cross_val_score(model, X1, y1, scoring = scorer, cv = 10))
    return(rmse)

In [14]:
# Define error measure for official scoring : RMSE
scorerR2 = make_scorer(r2_score)

def r2score_cv_train(model,X1,y1):
    rmse= cross_val_score(model, X1, y1, scoring = scorerR2, cv = 10)
    return(rmse)

def r2score_cv_test(model,X1,y1):
    rmse= cross_val_score(model, X1, y1, scoring = scorerR2, cv = 10)
    return(rmse)

In [15]:
poly = PolynomialFeatures(degree=1)

In [16]:
X_transformed = poly.fit_transform(X_train)

In [17]:
X_transformed.shape

(977, 7)

In [18]:
regpoly = LinearRegression()

In [19]:
regpoly.fit(X_transformed, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [20]:
X_test_transformed = poly.fit_transform(X_test)

In [21]:
# Look at predictions on training and validation set
print("RMSE on Training set :", rmse_cv_train(regpoly, X_transformed, y_train).mean())
print("RMSE on Test set :", rmse_cv_test(regpoly, X_test_transformed, y_test).mean())

RMSE on Training set : 39714.3687250022
RMSE on Test set : 34142.04783064988


In [22]:
# Look at predictions on training and validation set
print("R^2 score on Training set :", r2score_cv_train(regpoly, X_transformed, y_train).mean())
print("R^2 score on Test set :", r2score_cv_test(regpoly, X_test_transformed, y_test).mean())

R^2 score on Training set : 0.7225947581954459
R^2 score on Test set : 0.7741703183540738
