## Introduction

In [25]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

In [2]:
data = pd.read_csv('AmesHousing.tsv', sep='\t')

In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2930 entries, 0 to 2929
Data columns (total 82 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Order            2930 non-null   int64  
 1   PID              2930 non-null   int64  
 2   MS SubClass      2930 non-null   int64  
 3   MS Zoning        2930 non-null   object 
 4   Lot Frontage     2440 non-null   float64
 5   Lot Area         2930 non-null   int64  
 6   Street           2930 non-null   object 
 7   Alley            198 non-null    object 
 8   Lot Shape        2930 non-null   object 
 9   Land Contour     2930 non-null   object 
 10  Utilities        2930 non-null   object 
 11  Lot Config       2930 non-null   object 
 12  Land Slope       2930 non-null   object 
 13  Neighborhood     2930 non-null   object 
 14  Condition 1      2930 non-null   object 
 15  Condition 2      2930 non-null   object 
 16  Bldg Type        2930 non-null   object 
 17  House Style   

In [4]:
def transform_features(df):
    return df

def select_features(df):
    return df[['Gr Liv Area', 'SalePrice']]

In [5]:
def train_and_test(data):
    train = data[:1460]
    
    test = data[1460:]
    
    x_train = train.select_dtypes(exclude=['object']).drop('SalePrice',axis=1)
    y_train = train['SalePrice']
    
    x_test = test.select_dtypes(exclude=['object']).drop('SalePrice',axis=1)
    y_test = test['SalePrice']
    
    lr = LinearRegression()
    lr.fit(x_train, y_train)
    
    preds = lr.predict(x_test)
    return mean_squared_error(y_test, preds)**0.5

In [6]:
data_transform = transform_features(data)
data_selected = select_features(data_transform)

rmse = train_and_test(data_selected)
print(rmse)

57088.25161263909


## Feature Engineering

* All Columns: 
    - drop the columns above 5% missing values
* Text Columns:
    - drop the columns with any missing values
* Numerical columns:
    - fill the missing value with the mode

All Columns: Drop all columns above 5% missing values

In [7]:
cols_remove = data.columns[data.isnull().sum() > data.shape[0] * 0.05]
cols_remove

Index(['Lot Frontage', 'Alley', 'Fireplace Qu', 'Garage Type', 'Garage Yr Blt',
       'Garage Finish', 'Garage Qual', 'Garage Cond', 'Pool QC', 'Fence',
       'Misc Feature'],
      dtype='object')

In [8]:
data_cleaned = data.drop(cols_remove, axis=1)

Text Columns: Drop any columns with missing values    

In [9]:
text_col_df = data_cleaned.select_dtypes(include='object')
text_cols_remove = text_col_df.columns[text_col_df.isnull().sum() > 0]

In [10]:
data_cleaned = data_cleaned.drop(text_cols_remove, axis=1)

Numerical columns: fill the missing value with the mode

In [11]:
for col in data_cleaned.columns[data_cleaned.isnull().sum() > 0]:
    mode = data_cleaned[col].mode()[0]
    data_cleaned[col] = data_cleaned[col].fillna(mode).isnull().sum()

In [12]:
data_cleaned.isnull().sum().value_counts()

0    64
dtype: int64

In [13]:
def transform_features(data):
    cols_remove = data.columns[data.isnull().sum() > data.shape[0] * 0.05]
    data_cleaned = data.drop(cols_remove, axis=1)
    
    text_col_df = data_cleaned.select_dtypes(include='object')
    text_cols_remove = text_col_df.columns[text_col_df.isnull().sum() > 0]
    data_cleaned = data_cleaned.drop(text_cols_remove, axis=1)
    
    num_missing = data_cleaned.select_dtypes(include=['int', 'float']).isnull().sum()
    fixable_numeric_cols = num_missing[(num_missing < len(df)/20) & (num_missing > 0)].sort_values()
    replacement_values_dict = data_cleaned[fixable_numeric_cols.index].mode().to_dict(orient='records')[0]
    data_cleaned = data_cleaned.fillna(replacement_values_dict)
    
    years_sold = data_cleaned['Yr Sold'] - data_cleaned['Year Built']
    years_since_remod = data_cleaned['Yr Sold'] - data_cleaned['Year Remod/Add']
    data_cleaned['Years Before Sale'] = years_sold
    data_cleaned['Years Since Remod'] = years_since_remod
    data_cleaned = data_cleaned.drop([1702, 2180, 2181], axis=0)

    data_cleaned = data_cleaned.drop(["PID", "Order", "Mo Sold", "Sale Condition", "Sale Type", "Year Built", "Year Remod/Add"], axis=1)
    return data_cleaned

In [14]:
df = pd.read_csv("AmesHousing.tsv", delimiter="\t")
transform_df = transform_features(df)
filtered_df = select_features(transform_df)
rmse = train_and_test(filtered_df)

rmse

55275.36731241307

## Feature Selection

In [15]:
numerical_df = transform_df.select_dtypes(include=['int64', 'float64'])
numerical_df.head(5)

Unnamed: 0,MS SubClass,Lot Area,Overall Qual,Overall Cond,Mas Vnr Area,BsmtFin SF 1,BsmtFin SF 2,Bsmt Unf SF,Total Bsmt SF,1st Flr SF,...,Open Porch SF,Enclosed Porch,3Ssn Porch,Screen Porch,Pool Area,Misc Val,Yr Sold,SalePrice,Years Before Sale,Years Since Remod
0,20,31770,6,5,112.0,639.0,0.0,441.0,1080.0,1656,...,62,0,0,0,0,0,2010,215000,50,50
1,20,11622,5,6,0.0,468.0,144.0,270.0,882.0,896,...,0,0,0,120,0,0,2010,105000,49,49
2,20,14267,6,6,108.0,923.0,0.0,406.0,1329.0,1329,...,36,0,0,0,0,12500,2010,172000,52,52
3,20,11160,7,5,0.0,1065.0,0.0,1045.0,2110.0,2110,...,0,0,0,0,0,0,2010,244000,42,42
4,60,13830,5,5,0.0,791.0,0.0,137.0,928.0,928,...,34,0,0,0,0,0,2010,189900,13,12


In [16]:
abs_corr_coeffs = numerical_df.corr()['SalePrice'].abs().sort_values()
abs_corr_coeffs

BsmtFin SF 2         0.006127
Misc Val             0.019273
Yr Sold              0.030358
3Ssn Porch           0.032268
Bsmt Half Bath       0.035875
Low Qual Fin SF      0.037629
Pool Area            0.068438
MS SubClass          0.085128
Overall Cond         0.101540
Screen Porch         0.112280
Kitchen AbvGr        0.119760
Enclosed Porch       0.128685
Bedroom AbvGr        0.143916
Bsmt Unf SF          0.182751
Lot Area             0.267520
2nd Flr SF           0.269601
Bsmt Full Bath       0.276258
Half Bath            0.284871
Open Porch SF        0.316262
Wood Deck SF         0.328183
BsmtFin SF 1         0.439284
Fireplaces           0.474831
TotRms AbvGrd        0.498574
Mas Vnr Area         0.506983
Years Since Remod    0.534985
Full Bath            0.546118
Years Before Sale    0.558979
1st Flr SF           0.635185
Garage Area          0.641425
Total Bsmt SF        0.644012
Garage Cars          0.648361
Gr Liv Area          0.717596
Overall Qual         0.801206
SalePrice 

In [17]:
## Let's only keep columns with a correlation coefficient of larger than 0.4 (arbitrary, worth experimenting later!)
abs_corr_coeffs[abs_corr_coeffs > 0.4]

BsmtFin SF 1         0.439284
Fireplaces           0.474831
TotRms AbvGrd        0.498574
Mas Vnr Area         0.506983
Years Since Remod    0.534985
Full Bath            0.546118
Years Before Sale    0.558979
1st Flr SF           0.635185
Garage Area          0.641425
Total Bsmt SF        0.644012
Garage Cars          0.648361
Gr Liv Area          0.717596
Overall Qual         0.801206
SalePrice            1.000000
Name: SalePrice, dtype: float64

In [18]:
## Drop columns with less than 0.4 correlation with SalePrice
transform_df = transform_df.drop(abs_corr_coeffs[abs_corr_coeffs < 0.4].index, axis=1)

Which categorical columns should we keep?

In [19]:
## Create a list of column names from documentation that are *meant* to be categorical
nominal_features = ["PID", "MS SubClass", "MS Zoning", "Street", "Alley", "Land Contour", "Lot Config", "Neighborhood", 
                    "Condition 1", "Condition 2", "Bldg Type", "House Style", "Roof Style", "Roof Matl", "Exterior 1st", 
                    "Exterior 2nd", "Mas Vnr Type", "Foundation", "Heating", "Central Air", "Garage Type", 
                    "Misc Feature", "Sale Type", "Sale Condition"]

* Which columns are currently numerical but need to be encoded as categorical instead (because the numbers don't have any semantic meaning)?
* If a categorical column has hundreds of unique values (or categories), should we keep it? When we dummy code this column, hundreds of columns will need to be added back to the data frame.

In [20]:
## Which categorical columns have we still carried with us? We'll test tehse 
transform_cat_cols = []
for col in nominal_features:
    if col in transform_df.columns:
        transform_cat_cols.append(col)

## How many unique values in each categorical column?
uniqueness_counts = transform_df[transform_cat_cols].apply(lambda col: len(col.value_counts())).sort_values()
## Aribtrary cutoff of 10 unique values (worth experimenting)
drop_nonuniq_cols = uniqueness_counts[uniqueness_counts > 10].index
transform_df = transform_df.drop(drop_nonuniq_cols, axis=1)

In [21]:
## Select just the remaining text columns and convert to categorical
text_cols = transform_df.select_dtypes(include=['object'])
for col in text_cols:
    transform_df[col] = transform_df[col].astype('category')
    
## Create dummy columns and add back to the dataframe!
transform_df = pd.concat([
    transform_df, 
    pd.get_dummies(transform_df.select_dtypes(include=['category']))
], axis=1).drop(text_cols,axis=1)

In [22]:
def select_features(df, coeff_threshold=0.4, uniq_threshold=10):
    numerical_df = df.select_dtypes(include=['int64', 'float64'])
    abs_corr_coeffs = numerical_df.corr()['SalePrice'].abs().sort_values()
    df = df.drop(abs_corr_coeffs[abs_corr_coeffs < coeff_threshold].index, axis=1)
    
    nominal_features = ["PID", "MS SubClass", "MS Zoning", "Street", "Alley", "Land Contour", "Lot Config", "Neighborhood", 
                    "Condition 1", "Condition 2", "Bldg Type", "House Style", "Roof Style", "Roof Matl", "Exterior 1st", 
                    "Exterior 2nd", "Mas Vnr Type", "Foundation", "Heating", "Central Air", "Garage Type", 
                    "Misc Feature", "Sale Type", "Sale Condition"]
    
    transform_cat_cols = []
    for col in nominal_features:
        if col in df.columns:
            transform_cat_cols.append(col)

    uniqueness_counts = df[transform_cat_cols].apply(lambda col: len(col.value_counts())).sort_values()
    drop_nonuniq_cols = uniqueness_counts[uniqueness_counts > 10].index
    df = df.drop(drop_nonuniq_cols, axis=1)
    
    text_cols = df.select_dtypes(include=['object'])
    for col in text_cols:
        df[col] = df[col].astype('category')
    df = pd.concat([df, pd.get_dummies(df.select_dtypes(include=['category']))], axis=1).drop(text_cols,axis=1)
    
    return df

In [23]:
df = pd.read_csv("AmesHousing.tsv", delimiter="\t")
transform_df = transform_features(df)
filtered_df = select_features(transform_df)
rmse = train_and_test(filtered_df)

rmse

33367.28718340309

In [24]:
filtered_df.shape

(2927, 130)

## Train And Test

In [40]:
def train_and_test(data, k=0):
    if k == 0:
        train = data[:1460]

        test = data[1460:]

        x_train = train.drop('SalePrice',axis=1)
        y_train = train['SalePrice']

        x_test = test.drop('SalePrice',axis=1)
        y_test = test['SalePrice']

        lr = LinearRegression()
        lr.fit(x_train, y_train)

        preds = lr.predict(x_test)
        return mean_squared_error(y_test, preds)**0.5
    if k == 1:
        fold_one = data[:1460]
        fold_two = data[1460:]
        
        x_fold_one = fold_one.drop('SalePrice', axis=1)
        y_fold_one = fold_one['SalePrice']
        
        x_fold_two = fold_two.drop('SalePrice', axis=1)
        y_fold_two = fold_two['SalePrice']
        
        lr_one = LinearRegression()
        lr_one.fit(x_fold_one, y_fold_one)
        preds_two = lr_one.predict(x_fold_two)
        
        lr_two = LinearRegression()
        lr_two.fit(x_fold_two, y_fold_two)
        preds_one = lr_one.predict(x_fold_one)
        
        rmse_one = mean_squared_error(y_fold_one, preds_one)**0.5
        rmse_two = mean_squared_error(y_fold_two, preds_two)**0.5
        return (rmse_one+rmse_two)/2
    if k > 1:
        kf = KFold(n_splits=k, shuffle=True, random_state=1)
        
        x_data = data.drop('SalePrice', axis=1)
        y_data = data['SalePrice']
        
        lr = LinearRegression()
        mses = cross_val_score(lr, x_data, y_data, scoring='neg_root_mean_squared_error', cv=kf)
        print(np.abs(mses))
        return np.mean(np.abs(mses))
        

In [41]:
df = pd.read_csv("AmesHousing.tsv", delimiter="\t")
transform_df = transform_features(df)
filtered_df = select_features(transform_df)
rmse = train_and_test(filtered_df)

rmse

33367.28718340309

In [42]:
df = pd.read_csv("AmesHousing.tsv", delimiter="\t")
transform_df = transform_features(df)
filtered_df = select_features(transform_df)
rmse = train_and_test(filtered_df, k=1)

rmse

28272.76491817604

In [43]:
df = pd.read_csv("AmesHousing.tsv", delimiter="\t")
transform_df = transform_features(df)
filtered_df = select_features(transform_df)
rmse = train_and_test(filtered_df, k=4)

rmse

[36756.52485284 25652.0636658  25571.38314607 28465.76822678]


29111.434972870695