In [87]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn
import sklearn.metrics
import sklearn.model_selection
from sklearn.ensemble import RandomForestRegressor

In [88]:
# Read the train and test CSV files into DataFrames
train_data, test_data = (
    pd.read_csv(csv_path) for csv_path in ('data/train.csv', 'data/test.csv')
)
# Preview the first rows of the test frame as the cell output
test_data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal


In [89]:
# Inspect the dtype of every training column.
# Models can only be fit on numeric data (ints, floats, etc.), so collect the
# names of all columns whose dtype name mentions neither 'int' nor 'float'.
non_numeric_features = [
    column
    for column, column_dtype in train_data.dtypes.items()
    if not ('int' in str(column_dtype) or 'float' in str(column_dtype))
]

In [90]:
# The most basic approach is to build a model
# without any preprocessing.
# Approach 1: ignore non-numeric columns and keep features that we probably don't need (such as Id).
# This approach serves as our baseline as we optimize.

In [103]:
# First, we need some utility functions
def preprocess_data(data):
    """Return a preprocessed version of ``data``.

    The only step applied at the moment is imputing every missing value
    with 0; scaling/normalisation could be added here later. The result
    comes from ``fillna`` and the input object is not modified.
    """
    imputed = data.fillna(value=0)
    return imputed

def _select_rows(data, positions):
    """Return the rows of ``data`` located at the integer ``positions``."""
    # pandas objects must be indexed with .iloc: plain [] on a DataFrame selects
    # columns, and on a Series it looks values up by label rather than position.
    if hasattr(data, 'iloc'):
        return data.iloc[positions]
    return np.asarray(data)[positions]

def kfold_cv(reg, X, y, k: int = 5, metric = sklearn.metrics.mean_squared_error, shuffle: bool = True, random_state = None):
    """Perform K-fold cross validation and return the mean metric over the folds.

    Parameters
    ----------
    reg : estimator exposing ``fit(X, y)`` and ``predict(X)``. It is re-fitted
        in place on every fold, so afterwards it holds the last fold's fit.
    X, y : features and target; pandas objects or array-likes with one row
        per sample.
    k : number of folds.
    metric : callable invoked as ``metric(y_true, y_pred)``.
    shuffle : whether to shuffle the samples before splitting into folds.
    random_state : seed for the shuffling; ignored when ``shuffle`` is False.

    Returns
    -------
    The mean of the per-fold metric values.
    """
    splitter = sklearn.model_selection.KFold(
        n_splits=k,
        shuffle=shuffle,
        # KFold rejects a random_state when shuffling is disabled
        random_state=random_state if shuffle else None,
    )
    metric_scores = []
    # KFold itself is not iterable; the index pairs come from split()
    for train_index, validate_index in splitter.split(X):
        X_train, X_validate = _select_rows(X, train_index), _select_rows(X, validate_index)
        y_train, y_validate = _select_rows(y, train_index), _select_rows(y, validate_index)
        reg.fit(X_train, y_train)
        # sklearn metrics take (y_true, y_pred); the order matters for asymmetric metrics
        metric_scores.append(metric(y_validate, reg.predict(X_validate)))
    return np.array(metric_scores).mean()

def train_test_split(X, y, test_size: float =.1, random_state: int = 1, shuffle: bool = True):
    """Thin wrapper around sklearn's train_test_split with a default test_size of 10%.

    All arguments are forwarded unchanged and the result of the sklearn call
    is returned as is.
    """
    split_options = dict(test_size=test_size, random_state=random_state, shuffle=shuffle)
    return sklearn.model_selection.train_test_split(X, y, **split_options)

def generate_submission(reg, X_test, name: str):
    """Predict on ``X_test`` and write a two-column submission CSV.

    Parameters
    ----------
    reg : fitted estimator exposing ``predict``.
    X_test : feature frame passed to ``reg.predict``; must contain an 'Id' column.
    name : path of the CSV file to write. Missing parent directories are
        created first so that writing does not fail on a fresh checkout.

    The file has the columns 'Id' and 'SalePrice' and no index column.
    """
    y_pred = reg.predict(X_test)
    result = pd.DataFrame({'Id': X_test['Id']})
    result['SalePrice'] = y_pred

    # to_csv does not create directories; only attempt it for real paths
    # (not for file-like buffers, which to_csv also accepts).
    if isinstance(name, (str, os.PathLike)):
        parent_dir = os.path.dirname(os.fspath(name))
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

    result.to_csv(name, index=False)

def get_X_y(data: pd.DataFrame, ignore_columns: list = None, only_X: bool = False):
    """Split a csv-shaped frame into features and the 'SalePrice' target.

    Parameters
    ----------
    data : input frame; it is never modified, copies are returned.
    ignore_columns : column names to leave out of the features. Names that are
        not present in ``data`` are skipped silently. ``None`` means no
        columns are ignored.
    only_X : set to True for data without a target (e.g. the test set).

    Returns
    -------
    ``(X, y)`` when ``only_X`` is False, where ``y`` is the 'SalePrice' column
    and ``X`` holds the remaining, non-ignored columns in their original order.
    ``(X, None)`` when ``only_X`` is True.

    Raises
    ------
    KeyError
        If ``only_X`` is False and ``data`` has no 'SalePrice' column.
    """
    target_column = 'SalePrice'
    # None instead of a mutable [] default; an empty set means "ignore nothing"
    ignored = set(ignore_columns) if ignore_columns is not None else set()

    if only_X:
        kept = [col_name for col_name in data if col_name not in ignored]
        return data[kept].copy(), None

    # Pull the target out first so that it survives even when the caller
    # listed it among the columns to ignore.
    target = data[target_column].copy()
    kept = [
        col_name
        for col_name in data
        if col_name not in ignored and col_name != target_column
    ]
    return data[kept].copy(), target

In [104]:
# Approach 1: baseline on the numeric columns only, missing values filled with 0
traindata1 = preprocess_data(train_data.copy())
testdata1 = preprocess_data(test_data.copy())
# get_X_y returns (features, target): train_1 holds the features, test_1 the SalePrice target
train_1, test_1 = get_X_y(traindata1, non_numeric_features)
final_test, _ = get_X_y(testdata1, non_numeric_features, only_X=True)

# The model must see the same features, in the same order, at fit and predict time
assert list(final_test.columns) == list(train_1.columns), 'train/test feature columns differ'

# Fixed seed so that the score and the submission are reproducible across runs
reg = sklearn.ensemble.RandomForestRegressor(random_state=1)
reg.fit(train_1, test_1)

# Scored on the data the model was fitted on, so this is an optimistic estimate
score = reg.score(train_1, test_1)
print(f'R^2 score on training data: {score} (better when this value is near 1)')

generate_submission(reg, final_test, 'submit/baseline.csv')

R^2 score on training data: 0.9815119433085853 (better when this value is near 1)
