In [1]:
# Import pandas
import pandas as pd
# Import matplotlib
import matplotlib.pyplot as plt
# Import autocleaner for cleaning data
from datacleaner import autoclean

In [2]:
# Columns that dummify() expands into indicator columns via pd.get_dummies.
CAT_VARS = [
    'MSSubClass', 'MSZoning', 'Street', 'Alley',
    'LotShape', 'LandContour', 'Utilities', 'LotConfig',
    'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
    'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl',
    'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual',
    'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond',
    'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating',
    'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual',
    'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish',
    'GarageQual', 'GarageCond', 'PavedDrive', 'PoolQC',
    'Fence', 'MiscFeature', 'MoSold', 'SaleType',
    'SaleCondition',
]

# Columns that dummify() copies through unchanged.
CONT_VARS = [
    'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond',
    'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1',
    'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF',
    '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath',
    'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr',
    'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt',
    'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF',
    'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea',
    'MiscVal', 'YrSold',
]

# Prediction target. Kept as a one-element list so it can be concatenated
# with CONT_VARS when selecting columns.
TARGET_VAR = ['SalePrice']

In [3]:
def dummify(data, cat_vars=None, cont_vars=None, target_vars=None):
    """Return a new frame with the categorical columns one-hot encoded.

    The result holds the continuous columns (plus the target columns when
    all of them are present in ``data``) followed by one block of indicator
    columns per categorical column, each named ``<column>_<value>``.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame; it is not modified.
    cat_vars, cont_vars, target_vars : list of str, optional
        Column names to encode, to copy through, and to treat as the
        target. They default to the module-level CAT_VARS, CONT_VARS and
        TARGET_VAR.

    Returns
    -------
    pandas.DataFrame
        Frame sharing the index of ``data``.

    Raises
    ------
    KeyError
        If a continuous or categorical column is missing from ``data``.
    """
    if cat_vars is None:
        cat_vars = CAT_VARS
    if cont_vars is None:
        cont_vars = CONT_VARS
    if target_vars is None:
        target_vars = TARGET_VAR

    base_cols = list(cont_vars)
    # The test set has no target column. Check for it explicitly rather than
    # using a bare ``except``, which would also hide unrelated errors.
    if all(col in data.columns for col in target_vars):
        base_cols = base_cols + list(target_vars)

    pieces = [data[base_cols]]
    for cat_var in cat_vars:
        pieces.append(pd.get_dummies(data[cat_var], prefix=cat_var))

    # Concatenate once instead of joining inside the loop.
    return pd.concat(pieces, axis=1)

In [4]:
# Read in the data.
train_df = pd.read_csv('../data/train.csv')
test_df = pd.read_csv('../data/test.csv')

# Use 'Id' as the row index so it is not carried along as a data column.
train_df = train_df.set_index('Id')
test_df = test_df.set_index('Id')

# Clean data: replace missing values and convert categorical to numerical.
# NOTE(review): autoclean runs on each frame independently, so any encoding
# it derives from the data may differ between train and test -- confirm
# against the datacleaner documentation.
train_df = autoclean(train_df)
test_df = autoclean(test_df)

# Dummify: convert categorical to dummy
train_df = dummify(train_df)
test_df = dummify(test_df)

# Normalize.
# ...

# Call form of print works on both Python 2 and Python 3.
print(test_df.shape)

# Create .csv with cleaned data. The keyword is `sep`; the previous `sept`
# is not a to_csv argument. index=False means the 'Id' index is not written.
train_df.to_csv('../data/clean_train.csv', sep=',', index=False)
test_df.to_csv('../data/clean_test.csv', sep=',', index=False)

(1459, 296)


In [5]:
# Store the variable we'll be predicting on.
target = TARGET_VAR[0]

# Feature columns: every column of the dataframe except the target.
columns = train_df.columns.tolist()
columns.remove(target)

# Import a convenience function to split the sets. It lives in
# sklearn.model_selection in current scikit-learn; fall back to the old
# sklearn.cross_validation module on installs that predate it.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# Generate the training set.  Set random_state to be able to replicate results.
train = train_df.sample(frac=0.8, random_state=1)
# Select anything not in the training set and put it in the validation set.
validation = train_df.loc[~train_df.index.isin(train.index)]

# Print the shapes of both sets (call form works on Python 2 and 3).
print("train data:")
print(train.shape)
print("test data:")
print(validation.shape)

train data:
(1168, 314)
test data:
(292, 314)


# Linear Regression
Test the model on the validation set.

In [6]:
# Import the linear regression model and the error metric.
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Initialize the model class and fit it to the training data.
model = LinearRegression()
model.fit(train[columns], train[target])

# Generate our predictions for the validation set.
predictions_val = model.predict(validation[columns])

# Mean squared error on the validation set. Arguments follow the documented
# (y_true, y_pred) order; the value is the same either way for this metric.
mean_squared_error(validation[target], predictions_val)

497510554.17358983

Apply model to test set.

In [7]:
import numpy as np

# Prepare test set: compare its columns with the training feature columns.
train_cols = set(columns)
test_cols = set(test_df.columns.tolist())
missing_train = test_cols - train_cols  # in test only
missing_test = train_cols - test_cols   # in train only

# Align the test frame with the training features in one step:
# (1) columns absent from the training features are dropped,
# (2) columns absent from the test frame are created and filled with 0,
# and the result follows the column order of `columns`.
test_df = test_df.reindex(columns=columns, fill_value=0)

# Generate our predictions for the test set.
# (From here on `predictions_val` holds the test-set predictions.)
predictions_val = model.predict(test_df[columns])

idx = test_df.index.values

# Build the frame column by column so 'Id' keeps its own dtype; stacking ids
# and predictions into one array would turn the ids into floats.
submission = pd.DataFrame({'Id': idx, 'SalePrice': predictions_val},
                          index=idx,
                          columns=['Id', 'SalePrice'])

# The keyword is `sep`; the previous `sept` is not a to_csv argument.
submission.to_csv('../data/submission.csv', sep=',', index=False)

# Random Forest

In [8]:
import numpy as np

# Import the random forest model and the error metric.
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Initialize the model with some parameters; random_state fixes the seed.
model = RandomForestRegressor(n_estimators=100, min_samples_leaf=10, random_state=1)
# Fit the model to the data.
model.fit(train[columns], train[target])
# Make predictions on the validation set.
predictions = model.predict(validation[columns])

# Mean squared error on the validation set. Arguments follow the documented
# (y_true, y_pred) order; the value is the same either way for this metric.
mean_squared_error(validation[target], predictions)

633687977.68869162