In [1]:
import numpy as np
import pandas as pd

from sklearn import linear_model
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split


In [9]:
# Load the dataset
df = pd.read_csv('ames_housing.csv')
df.head(2)

Unnamed: 0,Order,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,Sale Condition,SalePrice
0,1,526301100,20,RL,141.0,31770,Pave,,IR1,Lvl,...,0,,,,0,5,2010,WD,Normal,215000
1,2,526350040,20,RH,80.0,11622,Pave,,Reg,Lvl,...,0,,MnPrv,,0,6,2010,WD,Normal,105000


In [10]:
feature_columns = ['Lot Area', 'Gr Liv Area', 'Garage Area', 'Bldg Type']
selected = df.loc[:, feature_columns + ['SalePrice']]
selected.head(2)

Unnamed: 0,Lot Area,Gr Liv Area,Garage Area,Bldg Type,SalePrice
0,31770,1656,528.0,1Fam,215000
1,11622,896,730.0,1Fam,105000


In [11]:
# Categorized feature needs to be handle
cat_feature = ['Bldg Type']

In [19]:
def prepare_data(target_df):
    df = target_df
    for col in df.columns:
        if col in cat_feature:
            # One hot encoding
            dummies = pd.get_dummies(df[col])
            # drop original column
            df = pd.concat([df.drop([col], axis=1), dummies], axis=1)
    
    # Fill missing value with 0
    df = df.fillna(0)
    
    return df

In [23]:
def train_and_evaluate(df_prc):
    # Remove target feature and convert other to numpy format
    faetures = df_prc.drop('SalePrice', axis=1).to_numpy()
    target = df_prc.drop('SalePrice', axis=1).to_numpy()

    # Split data into train and test
    X_train, X_test, y_train, y_test = train_test_split(faetures, target, test_size=0.33, random_state=105)

    # Train the model
    model = linear_model.LinearRegression()
    model.fit(X_train, y_train)

    # Evaluate the model
    y_pred = model.predict(X_test)
    err = mean_squared_error(y_test, y_pred)
    print(f'MSE: {err}\n')
    

### Here is the place we want to try model with different feature combination

In [24]:
# at first step to train model with all columns, add None to target dataframe
col_to_drop = feature_columns + [None]

for to_drop in col_to_drop:
    if to_drop:
        dropped = selected.drop(to_drop, axis=1)
    else:
        dropped = selected
        
    print(f'Dropping {to_drop}')
    prepared = prepare_data(dropped)
    train_and_evaluate(prepared)

Dropping Lot Area
MSE: 3.2679353132775813e-25

Dropping Gr Liv Area
MSE: 9.55447781095485e-25

Dropping Garage Area
MSE: 3.54298716808135e-24

Dropping Bldg Type
MSE: 6.791457176884717e-23

Dropping None
MSE: 3.4843232332456673e-25

