In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt

from scipy.stats import skew

import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

# Scoring stuff
from sklearn.metrics import roc_curve, auc, mean_squared_error, r2_score

# Import model selection tools
from sklearn.model_selection import cross_val_score, KFold
from sklearn.model_selection import GridSearchCV

# Import models
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import Lasso, Ridge, LinearRegression, ElasticNet
import xgboost as xgb # XGBRegressor

from sklearn.preprocessing import LabelEncoder







In [2]:
# Read both splits from the working directory.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# Set the row identifiers and the target (kept as a one-column DataFrame)
# aside before they are removed from the feature frames.
train_IDs, test_IDs = train.Id, test.Id
trainY = train[['SalePrice']]

# Neither the identifier nor the target stays in the feature frames.
train.drop(['Id', 'SalePrice'], axis = 1, inplace = True)
test.drop('Id', axis = 1, inplace = True)

In [3]:
# sns.distplot(trainY.SalePrice)  # SalePrice was moved out of `train` into `trainY` in the previous cell

In [4]:
def impute_missing_train(df):
    """Fill missing values in the columns listed below, modifying ``df`` in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the categorical columns named in ``imputedColumns``
        plus ``GarageYrBlt``, ``MasVnrArea``, ``Electrical``, ``LotFrontage``
        and ``Neighborhood``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    # Missing entries in these categorical columns become the explicit
    # category 'None'.
    imputedColumns = ['PoolQC', 'MiscFeature', 'Alley', 'Fence',
                      'FireplaceQu', 'GarageType', 'GarageFinish',
                      'GarageQual', 'GarageCond', "BsmtFinType2",
                      "BsmtExposure", "BsmtFinType1", "BsmtCond",
                      "BsmtQual", "MasVnrType"]

    for col in imputedColumns:
        df[col] = df[col].fillna('None')

    # Using 0, since if there is no garage, there is no year built.
    # If no masonry veneer, there will be no area.
    for col in ('GarageYrBlt', 'MasVnrArea'):
        df[col] = df[col].fillna(0)

    # Impute the most common value for 'Electrical'. mode() is empty when the
    # column has no observed value at all; in that case there is nothing to
    # impute from, so the column is left untouched instead of raising.
    electrical_mode = df['Electrical'].mode()
    if not electrical_mode.empty:
        df['Electrical'] = df['Electrical'].fillna(electrical_mode.iloc[0])

    # LotFrontage: median of the row's neighbourhood first. A neighbourhood
    # with no observed LotFrontage has a NaN median, so those rows fall back
    # to the overall median (computed before any filling).
    overall_median = df['LotFrontage'].median()
    by_neighborhood = df.groupby("Neighborhood")["LotFrontage"].transform(
        lambda x: x.fillna(x.median()))
    df['LotFrontage'] = by_neighborhood.fillna(overall_median)

    return df

### Imputation of values that are missing in the test dataset:
def impute_missing_test(df):
    """Fill missing values in the columns listed below, modifying ``df`` in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns named in ``zero_cols`` and ``modeCols``
        plus ``LotFrontage``, ``Neighborhood``, ``MSSubClass`` and
        ``Functional``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    # Imputing zero for columns that use numbers as measures
    # (such as basement in square feet [BsmtFinSF]).
    zero_cols = ['GarageYrBlt', 'GarageArea', 'GarageCars',
                 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF',
                 'TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath']

    for col in zero_cols:
        df[col] = df[col].fillna(0)

    # LotFrontage: median of the row's neighbourhood first. A neighbourhood
    # with no observed LotFrontage has a NaN median, so those rows fall back
    # to the overall median (computed before any filling).
    overall_median = df['LotFrontage'].median()
    by_neighborhood = df.groupby("Neighborhood")["LotFrontage"].transform(
        lambda x: x.fillna(x.median()))
    df['LotFrontage'] = by_neighborhood.fillna(overall_median)

    # There are only a few missing values for these columns, so imputing with
    # the mode. mode() is empty for a column with no observed value; such a
    # column is left untouched instead of raising.
    modeCols = ['MSZoning', 'Exterior1st', 'Exterior2nd', 'SaleType', 'Utilities', 'KitchenQual']
    for col in modeCols:
        most_common = df[col].mode()
        if not most_common.empty:
            df[col] = df[col].fillna(most_common.iloc[0])

    # NA probably means no subclass.
    # NOTE(review): if MSSubClass is numeric and actually contains NaN, filling
    # with the string "None" turns the column into mixed object dtype - confirm
    # this is intended before relying on it.
    df['MSSubClass'] = df['MSSubClass'].fillna("None")
    # Functional - data dictionary says assume Typical
    df['Functional'] = df['Functional'].fillna("Typ")

    return df

In [5]:
# The frame is filled in place; the return value is not needed here.
impute_missing_train(train)
# Missing-value count for the first five columns.
train.isna().sum().head(5)

MSSubClass     0
MSZoning       0
LotFrontage    0
LotArea        0
Street         0
dtype: int64

In [6]:
# Both imputers modify the frame in place and return it, so they can be
# chained: the training rules run first, then the test-specific ones.
impute_missing_test(impute_missing_train(test))
# Missing-value count for the first five columns.
test.isna().sum().head(5)

MSSubClass     0
MSZoning       0
LotFrontage    0
LotArea        0
Street         0
dtype: int64

## Applying mathematical transformations to dataset
[Source](https://www.kaggle.com/mymkyt/simple-lasso-public-score-0-12102)

In [7]:
def _skewed_numeric_columns(df, threshold=0.75):
    """Return the names of the numeric columns of ``df`` whose skewness exceeds ``threshold``."""
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    # NaNs are dropped per column before measuring: scipy's skew() yields nan
    # for input that contains NaN, and nan never passes the threshold test,
    # which would silently exclude the column.
    skewness = pd.Series({col: skew(df[col].dropna()) for col in numeric_cols},
                         dtype=float)
    return skewness[skewness > threshold].index.tolist()


def transform(df, skewed_feats=None, threshold=0.75):
    """Apply ``log1p`` to the skewed numeric columns of ``df``, in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to transform. The selected columns are overwritten.
    skewed_feats : iterable of str, optional
        Columns to transform. When omitted, the columns are chosen from
        ``df`` itself: every numeric column whose skewness is greater than
        ``threshold``. Because that choice depends on the data in ``df``,
        two frames (e.g. train and test) can end up with different columns
        transformed; pass the same list to both calls to keep them aligned.
    threshold : float, default 0.75
        Skewness cut-off used when ``skewed_feats`` is omitted.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    if skewed_feats is None:
        skewed_feats = _skewed_numeric_columns(df, threshold)
    else:
        skewed_feats = list(skewed_feats)

    # Nothing to assign when no column qualifies.
    if skewed_feats:
        df[skewed_feats] = np.log1p(df[skewed_feats])

    return df


# LABEL ENCODER FUNCTION
def label_encoder(df, encoders=None):
    """Replace every object-dtype column of ``df`` by integer codes, in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode. Its object-dtype columns are overwritten.
    encoders : dict, optional
        Mapping of column name to a fitted ``LabelEncoder``. When given, a
        column that already has an entry is encoded with that encoder, and
        a column without one gets a newly fitted encoder that is stored in
        the dict. Passing the same dict for the training and the test frame
        therefore gives both the same category-to-code mapping. When omitted,
        each call fits its own encoders, so the codes are only meaningful
        within that one frame.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.

    Raises
    ------
    ValueError
        Raised by ``LabelEncoder.transform`` when a reused encoder meets a
        value it was not fitted on.
    """
    # Only the categorical (object-dtype) columns are encoded.
    categorical_cols = list(df.select_dtypes(include=['object']).columns)

    for col in categorical_cols:
        values = list(df[col].values)
        if encoders is not None and col in encoders:
            encoder = encoders[col]
        else:
            encoder = LabelEncoder()
            encoder.fit(values)
            if encoders is not None:
                encoders[col] = encoder
        df[col] = encoder.transform(values)

    # shape
    print('Shape all_data: {}'.format(df.shape))

    return df

#### Training dataset transforms

In [8]:
# Log-transform the skewed numeric columns, then integer-encode the
# categorical ones.
w = label_encoder(transform(train))

Shape all_data: (1460, 79)


#### Test dataset transforms

In [9]:
# Log-transform the skewed numeric columns, then integer-encode the
# categorical ones.
w2 = label_encoder(transform(test))

Shape all_data: (1459, 79)


In [10]:
# Side-by-side shape check of the two processed frames.
train_dims, test_dims = w.shape, w2.shape
print("Train shape:", train_dims)
print("Test shape:", test_dims)

Train shape: (1460, 79)
Test shape: (1459, 79)


#### RMSE function

In [11]:
def RMSE(df):
    """Cross-validate a fixed set of regressors on ``df`` and report their RMSE.

    Each model is scored with 5-fold cross-validation against
    ``log1p(trainY.SalePrice)``; ``trainY`` is read from the notebook's
    global scope. For every model one line is printed with the mean and
    the standard deviation of its per-fold RMSE.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature matrix handed to the estimators, row-aligned with ``trainY``.

    Returns
    -------
    list of numpy.ndarray
        One array of per-fold RMSE values per model, in the order
        LinearRegression, RandomForestRegressor, GradientBoostingRegressor,
        Lasso, Ridge, ElasticNet, XGBRegressor.
    """
    scoring = 'neg_mean_squared_error'
    num_folds = 5
    seed = 10

    models = [
        ('LinearRegression', LinearRegression()),
        ('RandomForestRegressor', RandomForestRegressor(random_state = seed)),
        ('GradientBoostingRegressor', GradientBoostingRegressor(random_state = seed)),
        ('Lasso', Lasso(random_state = seed)),
        ('Ridge', Ridge(random_state = seed)),
        ('ElasticNet', ElasticNet(random_state = seed)),
        ('XGBRegressor', xgb.XGBRegressor(seed = seed)),
    ]

    y = np.log1p(trainY.SalePrice)

    # No random_state here: without shuffle=True the folds are contiguous and
    # already deterministic, and scikit-learn >= 0.24 raises ValueError when
    # random_state is combined with shuffle=False. One splitter serves every
    # model, so all of them are scored on the same folds.
    kfold = KFold(n_splits = num_folds)

    results = []
    for name, model in models:
        # cross_val_score fits a clone of the estimator on each fold, so no
        # separate fit on the full data is needed beforehand.
        # The scorer returns negated MSE; flip the sign before the root.
        scores = np.sqrt(-cross_val_score(model, df, y, cv = kfold, scoring = scoring))
        results.append(scores)

        print("{}: RMSE: {} ({})".format(name, scores.mean(), scores.std()))
        print()

    return results

In [12]:
# Score every candidate model on the processed training frame.
RMSE(df = w)

LinearRegression: RMSE: 0.1356038110109544 (0.016447017765653837)

RandomForestRegressor: RMSE: 0.15043582730432753 (0.008376803401368899)

GradientBoostingRegressor: RMSE: 0.12763688213426305 (0.008384282964745533)

Lasso: RMSE: 0.2682893719781311 (0.010577842474056123)

Ridge: RMSE: 0.13516496946197698 (0.016762635632164576)

ElasticNet: RMSE: 0.26454219580925276 (0.010858647898008571)

XGBRegressor: RMSE: 0.13081765316459154 (0.009804400274497986)



[array([ 0.11696099,  0.14654425,  0.13453396,  0.11942718,  0.16055268]),
 array([ 0.14152407,  0.16451594,  0.1495733 ,  0.1426628 ,  0.15390302]),
 array([ 0.11578586,  0.14020592,  0.13283115,  0.12260762,  0.12675387]),
 array([ 0.24792768,  0.27478734,  0.27184539,  0.26915009,  0.27773636]),
 array([ 0.11573729,  0.14623121,  0.13380478,  0.11933849,  0.16071307]),
 array([ 0.24509993,  0.2716857 ,  0.26434127,  0.26437495,  0.27720913]),
 array([ 0.11610876,  0.14179439,  0.13809939,  0.12254411,  0.13554162])]

#### Submitting Output

In [13]:
# lasso = Lasso(alpha=0.0004)
seed = 10

# Target on the log1p scale, the same scale used for model scoring.
y = np.log1p(trainY.SalePrice)

# Fit the gradient-boosting model on the processed training frame.
model = GradientBoostingRegressor(random_state = seed)
model.fit(w, y)

# Predictions come back on the log1p scale; expm1 reverses that.
preds = np.expm1(model.predict(w2))

# One row per test identifier, written without the DataFrame index.
solution = pd.DataFrame({"id": test_IDs, "SalePrice":preds})
solution.to_csv("GBR_v3.csv", index = False)