In [56]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import ElasticNet
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import cross_val_score
from collections import defaultdict
from numpy import log, abs, sqrt, exp

In [57]:
def cross_val(clf, train, cv=5):
    scores = cross_val_score(clf, train.drop(["SalePrice", "Id"], axis=1), train["SalePrice"], cv=cv, scoring='neg_mean_squared_error', n_jobs=-1)
    return sqrt(abs(scores)).mean()

def get_onehots(df, cols):
    """
    Creat a dictionary of lists, where each list corresponds to the unique
    non-null values in a particular column of the dataframe.
    """
    vals = defaultdict(list)
    for c in cols:
        for v in df[c].dropna().unique():
            vals[c].append(v)
    return dict(vals)

def set_onehots(df, vals, drop=True):
    """
    Take a dictionary as created by get_onehots and create one-hot encoded
    columns for each value of each column of interest.
    """
    for c in vals.keys():
        for v in vals[c]:
            df[c + '_' + str(v)] = df[c].apply(lambda x: 1 if x == v else 0)
        if drop:
            df.drop(c, inplace=True, axis=1)
            

In [58]:
def get_data():
    train = pd.read_csv('../data/train.csv')
    test = pd.read_csv('../data/test.csv')
    
    #Log time
    train["SalePrice"] = log(train["SalePrice"])
    
    # Get rid of the bullshit
    train.drop("Id", axis=1)
    test.drop("GarageYrBlt", axis=1)
    train.drop("GarageYrBlt", axis=1)
    
    # Do numerical processing on these assholes
    ncols = [c for c, d in zip(train.columns, train.dtypes) if str(d) in ["float64", "int64"]]
    ncols.remove("Id")
    ncols.remove("SalePrice")
    for c in ncols:
        train[c].fillna(0, inplace=True)
        test[c].fillna(0, inplace=True)
    
    cats = [c for c, d in zip(train.columns, train.dtypes) if str(d) == 'object']
    # Turn these fuckers into strings

    for c in cats:
        train[c] = train[c].astype(str)
        test[c] = test[c].astype(str)
        
    # One hot these bitches (not in a sexist way)
    onehotvals = get_onehots(train, cats)
    set_onehots(train, onehotvals, drop=True)
    set_onehots(test, onehotvals, drop=True)
    
    
    maxs = [train[c].max() for c in ncols]
    mins = [train[c].min() for c in ncols]
    for c, mx, mn in zip(ncols, maxs, mins):
        train[c] = (train[c] - mn) / (mx - mn)
        test[c] = (test[c] - mn) / (mx - mn)
    

    return train, test        

        


train, test = get_data()

In [59]:
best_ratio = .73
best_alpha = 0.00058

lr = []

def grad(learning_rate=0.1, n_estimators=100):
    clf3 = GradientBoostingRegressor(learning_rate, n_estimators)
    cv = cross_val(clf3, train)
    return cv



GradientBoosting
Expected performance: 0.125798746874

Elastic Net: alpha=0.00058, l1_ratio=0.73
Expected performance: 0.132932338117

RandomForest: n_estimators=100
Expected performance: 0.143217562358



In [None]:

def elastic():
    clf1 = ElasticNet(alpha=best_alpha, l1_ratio=best_ratio)
    cv = cross_val(clf1, train)
    return cv
    #print("Elastic Net: alpha={}, l1_ratio={}".format(best_alpha, best_ratio))
    #print("Expected performance: {}".format(cv))
    #print("")
    
def rrf():
    clf2 = RandomForestRegressor(100)
    cv = cross_val(clf2, train)
    return cv
    #print("RandomForest: n_estimators={}".format(100))
    #print("Expected performance: {}".format(cv))
    #print("")