In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import LinearRegression
from scipy.stats import boxcox
%matplotlib inline

## Load Data and Merge

In [2]:
## Load data
properties = pd.read_csv('data/properties_2016.csv')
train = pd.read_csv('data/train_2016_v2.csv')
all_data = properties.merge(train, how='right', on='parcelid')

  interactivity=interactivity, compiler=compiler, result=result)


### Adjusting Data Types

We'll convert all categorical values so that the missing value is a feature in itself.

In [3]:
# Deal with Na values and adjust type
def categorize(s, na_value = 0):
    return s.astype('category').cat.add_categories(na_value).fillna(na_value)

In [4]:
# Adjust categorical variables
def preprocess(df, fill_value):
    
    # Null Proportion Feature
    null_prop = df.apply(lambda x: x.count(), axis=1)/df.shape[1]
    
    df['airconditioningtypeid'] = categorize(df['airconditioningtypeid'])
    df['architecturalstyletypeid'] = categorize(df['architecturalstyletypeid'])
    df['buildingclasstypeid'] = categorize(df['buildingclasstypeid'])
    df['buildingqualitytypeid'] = categorize(df['buildingqualitytypeid'])
    df['hasdeck'] = df['decktypeid'].notnull()
    df.drop('decktypeid', axis=1, inplace=True)

    df['fireplaceflag'] = df['fireplaceflag'] == True
    df['hashottuborspa'] = df['hashottuborspa'] == True
    df['heatingorsystemtypeid'] = categorize(df['heatingorsystemtypeid'])
    df.drop(['pooltypeid10','pooltypeid2','pooltypeid7'], axis=1, inplace=True)

    df['propertycountylandusecode'] = categorize(df['propertycountylandusecode'],na_value='0000')
    df['propertylandusetypeid'] = categorize(df['propertylandusetypeid'])
    df['propertyzoningdesc'] = categorize(df['propertyzoningdesc'],'0000')
    df.drop(['rawcensustractandblock','censustractandblock'],axis=1,inplace=True)

    df['regionidcounty'] = categorize(df['regionidcounty'])
    df['regionidcity'] = categorize(df['regionidcity'])
    df['regionidzip'] = categorize(df['regionidzip'])
    df['regionidneighborhood'] = categorize(df['regionidneighborhood'])
    df['storytypeid'] = categorize(df['storytypeid'])
    df['typeconstructiontypeid'] = categorize(df['typeconstructiontypeid'])
    
    object_data = df.dtypes == 'category'
    
    numeric_columns = ['bathroomcnt', 'bedroomcnt', 'calculatedbathnbr', 
                       'calculatedfinishedsquarefeet', 'finishedsquarefeet12',
                       'fullbathcnt', 'latitude', 'longitude', 'lotsizesquarefeet', 
                       'roomcnt','yearbuilt', 'structuretaxvaluedollarcnt', 'taxvaluedollarcnt',
                       'assessmentyear', 'landtaxvaluedollarcnt', 'taxamount']
    train_numeric = df.loc[:, numeric_columns]
    train_numeric.fillna(fill_value)
    train_numeric['calculatedfinishedsquarefeet'] = np.log(train_numeric['calculatedfinishedsquarefeet'])
    train_numeric['finishedsquarefeet12'] = np.log(train_numeric['finishedsquarefeet12'])[0]
    train_numeric['lotsizesquarefeet'] = np.log(train_numeric['lotsizesquarefeet'])
    train_numeric['structuretaxvaluedollarcnt'] = np.log(train_numeric['structuretaxvaluedollarcnt'])[0]
    train_numeric['taxvaluedollarcnt'] = np.log(train_numeric['taxvaluedollarcnt'])[0]
    train_numeric['landtaxvaluedollarcnt'] = np.log(train_numeric['landtaxvaluedollarcnt'])[0]
    train_numeric['taxamount'] = np.log(train_numeric['taxamount'])[0]
    train_numeric['null_prop'] = null_prop
    
    category_train = df.loc[:, object_data]
    category_train = pd.get_dummies(category_train.drop(['regionidneighborhood','regionidcity','regionidcounty','propertycountylandusecode','propertyzoningdesc','buildingqualitytypeid','regionidzip'],axis=1))
    category_train['buildingqualitytypeid'] = df['buildingqualitytypeid']
    
    X_train = pd.DataFrame(train_numeric).join(category_train)
    
    return X_train

In [5]:
# Outlier Removal
q1 = all_data['logerror'].quantile(0.25)
q3 = all_data['logerror'].quantile(0.75)
low_outlier = q1 - 3*(q3-q1)
high_outlier = q3 + 3*(q3-q1)
outlier_index = all_data[(all_data['logerror'] < low_outlier) | (all_data['logerror'] > high_outlier)].index
all_data = all_data.drop(outlier_index, axis=0)

fill_value = all_data.drop(['logerror','transactiondate'], axis=1).mean()
X_train = preprocess(all_data.drop(['logerror','transactiondate'], axis=1), fill_value)
X_train['month'] = pd.to_datetime(all_data['transactiondate']).dt.month
y_train = all_data['logerror']

In [6]:
X_test = preprocess(properties.drop('parcelid',axis=1), fill_value)

### Write Modified Versions

In [7]:
X_test.to_csv('data/modified_testV1.csv', index=False)
X_train.to_csv('data/modified_trainV1.csv', index=False)
y_train.to_csv('data/labels.csv', index=False)
properties['parcelid'].to_csv('data/parcelid.csv', index=False)

### Implementing Some Regressor Models
#### Random Forest

In [10]:
rf_search.best_score_

-0.044055953384106089

{'min_samples_leaf': 3, 'min_samples_split': 3, 'max_depth': 13, 'min_impurity_split': 0.001}


Randomized Search has provided us with these optimal parameters:
* Max Depth: 13
* Min impurity Spit: .001,
* Min Samples Leaf: 3
* Min Samples Split: 3

With 150 Estimators

In [7]:
# rf_optimal = rf_search.best_estimator_
rf_optimal = RandomForestRegressor(verbose=True, n_estimators=150, max_depth=13, min_impurity_split=0.001, min_samples_leaf=3, min_samples_split=3)
rf_optimal.fit(X_train, y_train)

[Parallel(n_jobs=1)]: Done 150 out of 150 | elapsed:  2.4min finished


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=13,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=0.001, min_samples_leaf=3,
           min_samples_split=3, min_weight_fraction_leaf=0.0,
           n_estimators=150, n_jobs=1, oob_score=False, random_state=None,
           verbose=True, warm_start=False)

In [9]:
def get_predictions(classifier, test_set, train_set, n_splits=25):
    splitted = np.array_split(test_set.drop(set(test_set.columns) - set(train_set.columns),axis=1), n_splits)
    oct_preds = []
    nov_preds = []
    dec_preds = []

    cols = splitted[0].shape[1]

    count = 0
    for split in splitted:
        count += 1
        print("Predicting On {}".format(count))
        split = np.append(split, np.full([len(split),1], 10), axis=1)
        oct_pred = rf_optimal.predict(split)

        split[:, cols] = np.full(len(split), 11)
        nov_pred = rf_optimal.predict(split)

        split[:, cols] = np.full(len(split), 12)
        dec_pred = rf_optimal.predict(split)

        oct_preds.append(oct_pred)
        nov_preds.append(nov_pred)
        dec_preds.append(dec_pred)

    rf10_predictions = np.hstack(oct_preds)
    rf11_predictions = np.hstack(nov_preds)
    rf12_predictions = np.hstack(dec_preds)
    
    return rf10_predictions, rf11_predictions, rf12_predictions

rf10_predictions, rf11_predictions, rf12_predictions = get_predictions(rf_optimal, X_test, X_train)

Predicting On 1


[Parallel(n_jobs=1)]: Done 150 out of 150 | elapsed:    1.1s finished
[Parallel(n_jobs=1)]: Done 150 out of 150 | elapsed:    1.0s finished
[Parallel(n_jobs=1)]: Done 150 out of 150 | elapsed:    1.0s finished


Predicting On 2


[Parallel(n_jobs=1)]: Done 150 out of 150 | elapsed:    1.0s finished
[Parallel(n_jobs=1)]: Done 150 out of 150 | elapsed:    1.0s finished
[Parallel(n_jobs=1)]: Done 150 out of 150 | elapsed:    1.0s finished


Predicting On 3


[Parallel(n_jobs=1)]: Done 150 out of 150 | elapsed:    1.0s finished
[Parallel(n_jobs=1)]: Done 150 out of 150 | elapsed:    1.0s finished
[Parallel(n_jobs=1)]: Done 150 out of 150 | elapsed:    1.0s finished


Predicting On 4


[Parallel(n_jobs=1)]: Done 150 out of 150 | elapsed:    1.0s finished
[Parallel(n_jobs=1)]: Done 150 out of 150 | elapsed:    1.0s finished
[Parallel(n_jobs=1)]: Done 150 out of 150 | elapsed:    1.0s finished


Predicting On 5


[Parallel(n_jobs=1)]: Done 150 out of 150 | elapsed:    1.0s finished
[Parallel(n_jobs=1)]: Done 150 out of 150 | elapsed:    1.0s finished
[Parallel(n_jobs=1)]: Done 150 out of 150 | elapsed:    1.0s finished


Predicting On 6


[Parallel(n_jobs=1)]: Done 150 out of 150 | elapsed:    1.1s finished
[Parallel(n_jobs=1)]: Done 150 out of 150 | elapsed:    1.0s finished
[Parallel(n_jobs=1)]: Done 150 out of 150 | elapsed:    1.0s finished


Predicting On 7


[Parallel(n_jobs=1)]: Done 150 out of 150 | elapsed:    1.0s finished
[Parallel(n_jobs=1)]: Done 150 out of 150 | elapsed:    1.0s finished
[Parallel(n_jobs=1)]: Done 150 out of 150 | elapsed:    1.0s finished


Predicting On 8


[Parallel(n_jobs=1)]: Done 150 out of 150 | elapsed:    1.0s finished
[Parallel(n_jobs=1)]: Done 150 out of 150 | elapsed:    1.0s finished
[Parallel(n_jobs=1)]: Done 150 out of 150 | elapsed:    1.0s finished


Predicting On 9


[Parallel(n_jobs=1)]: Done 150 out of 150 | elapsed:    1.0s finished
[Parallel(n_jobs=1)]: Done 150 out of 150 | elapsed:    1.0s finished
[Parallel(n_jobs=1)]: Done 150 out of 150 | elapsed:    1.0s finished


Predicting On 10


[Parallel(n_jobs=1)]: Done 150 out of 150 | elapsed:    1.1s finished
[Parallel(n_jobs=1)]: Done 150 out of 150 | elapsed:    1.0s finished
[Parallel(n_jobs=1)]: Done 150 out of 150 | elapsed:    1.0s finished


Predicting On 11


[Parallel(n_jobs=1)]: Done 150 out of 150 | elapsed:    1.0s finished
[Parallel(n_jobs=1)]: Done 150 out of 150 | elapsed:    1.0s finished
[Parallel(n_jobs=1)]: Done 150 out of 150 | elapsed:    1.0s finished


Predicting On 12


[Parallel(n_jobs=1)]: Done 150 out of 150 | elapsed:    1.0s finished
[Parallel(n_jobs=1)]: Done 150 out of 150 | elapsed:    1.0s finished
[Parallel(n_jobs=1)]: Done 150 out of 150 | elapsed:    1.0s finished


Predicting On 13


[Parallel(n_jobs=1)]: Done 150 out of 150 | elapsed:    1.0s finished
[Parallel(n_jobs=1)]: Done 150 out of 150 | elapsed:    1.0s finished
[Parallel(n_jobs=1)]: Done 150 out of 150 | elapsed:    1.0s finished


Predicting On 14


[Parallel(n_jobs=1)]: Done 150 out of 150 | elapsed:    1.0s finished
[Parallel(n_jobs=1)]: Done 150 out of 150 | elapsed:    1.0s finished
[Parallel(n_jobs=1)]: Done 150 out of 150 | elapsed:    1.0s finished


Predicting On 15


[Parallel(n_jobs=1)]: Done 150 out of 150 | elapsed:    1.0s finished
[Parallel(n_jobs=1)]: Done 150 out of 150 | elapsed:    1.0s finished
[Parallel(n_jobs=1)]: Done 150 out of 150 | elapsed:    1.0s finished


Predicting On 16


[Parallel(n_jobs=1)]: Done 150 out of 150 | elapsed:    1.0s finished
[Parallel(n_jobs=1)]: Done 150 out of 150 | elapsed:    1.0s finished
[Parallel(n_jobs=1)]: Done 150 out of 150 | elapsed:    1.0s finished


Predicting On 17


[Parallel(n_jobs=1)]: Done 150 out of 150 | elapsed:    1.0s finished
[Parallel(n_jobs=1)]: Done 150 out of 150 | elapsed:    1.0s finished
[Parallel(n_jobs=1)]: Done 150 out of 150 | elapsed:    1.0s finished


Predicting On 18


[Parallel(n_jobs=1)]: Done 150 out of 150 | elapsed:    1.0s finished
[Parallel(n_jobs=1)]: Done 150 out of 150 | elapsed:    1.0s finished
[Parallel(n_jobs=1)]: Done 150 out of 150 | elapsed:    1.0s finished


Predicting On 19


[Parallel(n_jobs=1)]: Done 150 out of 150 | elapsed:    1.0s finished
[Parallel(n_jobs=1)]: Done 150 out of 150 | elapsed:    1.0s finished
[Parallel(n_jobs=1)]: Done 150 out of 150 | elapsed:    1.0s finished


Predicting On 20


[Parallel(n_jobs=1)]: Done 150 out of 150 | elapsed:    1.0s finished
[Parallel(n_jobs=1)]: Done 150 out of 150 | elapsed:    1.0s finished
[Parallel(n_jobs=1)]: Done 150 out of 150 | elapsed:    1.0s finished


Predicting On 21


[Parallel(n_jobs=1)]: Done 150 out of 150 | elapsed:    1.0s finished
[Parallel(n_jobs=1)]: Done 150 out of 150 | elapsed:    1.0s finished
[Parallel(n_jobs=1)]: Done 150 out of 150 | elapsed:    1.0s finished


Predicting On 22


[Parallel(n_jobs=1)]: Done 150 out of 150 | elapsed:    1.0s finished
[Parallel(n_jobs=1)]: Done 150 out of 150 | elapsed:    1.0s finished
[Parallel(n_jobs=1)]: Done 150 out of 150 | elapsed:    1.0s finished


Predicting On 23


[Parallel(n_jobs=1)]: Done 150 out of 150 | elapsed:    1.0s finished
[Parallel(n_jobs=1)]: Done 150 out of 150 | elapsed:    1.0s finished
[Parallel(n_jobs=1)]: Done 150 out of 150 | elapsed:    1.0s finished


Predicting On 24


[Parallel(n_jobs=1)]: Done 150 out of 150 | elapsed:    1.0s finished
[Parallel(n_jobs=1)]: Done 150 out of 150 | elapsed:    1.0s finished
[Parallel(n_jobs=1)]: Done 150 out of 150 | elapsed:    1.0s finished


Predicting On 25


[Parallel(n_jobs=1)]: Done 150 out of 150 | elapsed:    1.0s finished
[Parallel(n_jobs=1)]: Done 150 out of 150 | elapsed:    1.0s finished
[Parallel(n_jobs=1)]: Done 150 out of 150 | elapsed:    1.0s finished


In [11]:

output = pd.DataFrame({'ParcelId': properties['parcelid'].astype(np.int32),
        '201610': rf10_predictions, '201611': rf11_predictions, '201612': rf12_predictions,
        '201710': rf10_predictions, '201711': rf11_predictions, '201712': rf12_predictions})
# set col 'ParceID' to first col
cols = output.columns.tolist()
cols = cols[-1:] + cols[:-1]
output = output[cols]
from datetime import datetime

print( "\nWriting results to disk ..." )
output.to_csv('sub{}.csv'.format(datetime.now().strftime('%Y%m%d_%H%M%S')), index=False)
    


Writing results to disk ...


In [7]:
#### AdaBoost
ada = AdaBoostRegressor(n_estimators = 250)

ada_param_dists = {'learning_rate':[0.01,.02,.04,.08,.12,.16,.2,
                                   0.25,0.35,0.5,0.75,1.0,1.25,1.5,2]}

ada_optimal = RandomizedSearchCV(ada, ada_param_dists, scoring='neg_mean_absolute_error')

In [8]:
ada_optimal.fit(X_train, y_train)
print(ada_optimal.best_params_)
print(ada_optimal.best_score_)

{'learning_rate': 0.02}
-0.0444358096508


In [13]:
gbm_search.fit(X_train, y_train)

RandomizedSearchCV(cv=None, error_score='raise',
          estimator=GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=3, max_features=None,
             max_leaf_nodes=None, min_impurity_split=1e-07,
             min_samples_leaf=1, min_samples_split=2,
             min_weight_fraction_leaf=0.0, n_estimators=100,
             presort='auto', random_state=None, subsample=1.0, verbose=0,
             warm_start=False),
          fit_params={}, iid=True, n_iter=10, n_jobs=1,
          param_distributions={'max_depth': [2, 3, 4, 5], 'min_samples_split': [2, 3, 4, 5], 'min_samples_leaf': [1, 2, 3], 'min_impurity_split': [0.001, 0.0001, 1e-05, 1e-06, 1e-07, 1e-08, 1e-09, 1e-10], 'loss': ['ls', 'lad', 'huber'], 'learning_rate': [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1]},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score=True, scoring='neg_mean_absolute_error',
          ver

In [22]:
print(gbm_search.best_params_)
print(gbm_search.best_score_)
gbm_optimal = gbm_optimal.best_estimator_

NameError: name 'gbm_search' is not defined

In [23]:
gbm_optimal.fit(X_train, y_train)

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='lad', max_depth=5, max_features=None,
             max_leaf_nodes=None, min_impurity_split=1e-05,
             min_samples_leaf=2, min_samples_split=5,
             min_weight_fraction_leaf=0.0, n_estimators=100,
             presort='auto', random_state=None, subsample=1.0, verbose=0,
             warm_start=False)

In [24]:
gb10_predictions, gb11_predictions, gb12_predictions = get_predictions(gbm_optimal, X_test, X_train)

Predicting On 1
Predicting On 2
Predicting On 3
Predicting On 4
Predicting On 5
Predicting On 6
Predicting On 7
Predicting On 8
Predicting On 9
Predicting On 10
Predicting On 11
Predicting On 12
Predicting On 13
Predicting On 14
Predicting On 15
Predicting On 16
Predicting On 17
Predicting On 18
Predicting On 19
Predicting On 20
Predicting On 21
Predicting On 22
Predicting On 23
Predicting On 24
Predicting On 25


In [26]:
output = pd.DataFrame({'ParcelId': properties['parcelid'].astype(np.int32),
        '201610': gbm_predictions, '201611': gbm_predictions, '201612': gbm_predictions,
        '201710': gbm_predictions, '201711': gbm_predictions, '201712': gbm_predictions})
# set col 'ParceID' to first col
cols = output.columns.tolist()
cols = cols[-1:] + cols[:-1]
output = output[cols]
from datetime import datetime

print( "\nWriting results to disk ..." )
output.to_csv('sub{}.csv'.format(datetime.now().strftime('%Y%m%d_%H%M%S')), index=False)


Writing results to disk ...


In [19]:
sample_submission = pd.read_csv('sample_submission.csv')

In [20]:
lin = LinearRegression()





(2985217, 7)