In [54]:
%matplotlib inline
# standard
import sys
import os
import re

# pandas
import pandas as pd

# numpy, matplotlib, seaborn
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import fancyimpute


from sklearn.preprocessing import FunctionTransformer, LabelEncoder, OneHotEncoder, Imputer, Normalizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, tree
from sklearn.linear_model import Lasso, SGDRegressor, LinearRegression
from sklearn.svm import SVR, LinearSVR
from sklearn.model_selection import learning_curve
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import RFE

from scipy import sparse

# needed for project imports
sys.path.append(os.path.join(os.getcwd(), "../.."))

# project imports
from housepredictor.extractor import extract_examples, FeatureExtractor, num_cols, extract_textual, extract_dates, extract_categorical
# this styling is purely my preference
# less chartjunk
sns.set_context('notebook', font_scale=1.5, rc={'line.linewidth': 2.5})
sns.set(style='ticks', palette='Set2')

RANDOM_STATE = 123

In [55]:
# extract data, features and expected values
raw_data = pd.read_json('../data/raw/scrape-results.json')
X, y = extract_examples(raw_data['data'])
X.head()

Unnamed: 0,AantalBadkamers,AantalKamers,AantalWoonlagen,Aanvaarding,Adres,AfgekochtDatum,BalkonDakterras,BijdrageVVE,Bijzonderheden,Bouwjaar,...,SoortWoning,TuinLigging,Verwarming,VolledigeOmschrijving,Voorzieningen,WGS84_X,WGS84_Y,WarmWater,WoonOppervlakte,Woonoppervlakte
0,2.0,6.0,3 woonlagen,In overleg,Nico Jessekade 27,/Date(2742764400000+0100)/,,,,2008,...,"eengezinswoning, geschakelde woning",gelegen op het noordoosten,stadsverwarming,**PERFECT FAMILIEHUIS VAN CIRCA 157 M² WAAR U...,mechanische ventilatie en TV kabel,4.999718,52.35061,centrale voorziening,157.0,157.0
1,1.0,4.0,2 woonlagen en een zolder,In overleg,Bombraak 31,/Date(2302210800000+0100)/,,,,1993,...,"eengezinswoning, tussenwoning",gelegen op het zuiden,C.V.-ketel,Stel je voor: gezellig samen genieten van het ...,"alarminstallatie, buitenzonwering, jacuzzi, me...",4.891165,52.42331,C.V.-ketel,147.0,147.0
2,2.0,6.0,3 woonlagen,In overleg,Raphaëlplein 39,,balkon aanwezig,,,1932,...,"herenhuis, hoekwoning",,C.V.-ketel,"Een karatiristiek herenhuis uit de jaren 30, g...","mechanische ventilatie, rolluiken, alarminstal...",4.870361,52.347355,C.V.-ketel,250.0,250.0
3,1.0,6.0,1 woonlaag,In overleg,Stadionweg 198 III,/Date(2699215200000+0200)/,balkon aanwezig,200.0,,1931,...,bovenwoning (appartement),,blokverwarming,Zeer ruim en sfeervol 5-kamerappartement met z...,TV kabel,4.869973,52.346794,centrale voorziening,118.0,118.0
4,1.0,5.0,1 woonlaag,In overleg,Ben van Meerendonkstraat 293,/Date(2796588000000+0200)/,,280.0,,2001-2010,...,bovenwoning (appartement),,stadsverwarming en vloerverwarming geheel,"Ben van Meerendonkstraat 293, 1087 LN AMSTERDA...","mechanische ventilatie, TV kabel en lift",5.011224,52.34952,centrale voorziening,194.0,194.0


In [56]:
X = FeatureExtractor(use_text=True, text_use_long=False).transform(X)
X = Imputer().fit_transform(X)
X = Normalizer().fit_transform(X)
X.shape



(3038, 27449)

In [57]:
# split them, use a 80/20 split
# the random state is preinitialized to have reproductible results
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE)

In [65]:
svr_reg = Lasso(normalize=True, alpha=30)
selector = RFE(svr_reg, step=0.5, verbose=True)
selector = selector.fit(X, y)

# print('TRAINING SCORE:', svr_reg.score(X_train, y_train))
# print('TEST SCORE:', svr_reg.score(X_test, y_test))

Fitting estimator with 27449 features.




Fitting estimator with 13725 features.




In [66]:
selector.estimator.fit(X_train, y_train)

print('TRAINING SCORE:', selector.estimator.score(X_train, y_train))
print('TEST SCORE:', selector.estimator.score(X_test, y_test))

TRAINING SCORE: 0.987356872387
TEST SCORE: -0.129053939943




In [63]:
reg = Lasso(alpha=10)
reg.fit(X_train, y_train)

print('TRAINING SCORE:', reg.score(X_train, y_train))
print('TEST SCORE:',  reg.score(X_test, y_test))

TRAINING SCORE: 0.00259863588088
TEST SCORE: 0.00109169014477


In [53]:
selector.estimator.predict(X_test[:10, :]) - y_test[:10]

1679    181979.489964
319     148943.247745
673    -148443.441687
601     229988.927356
1079    -17210.495915
1806    540557.930926
2792      6817.968917
2197     96385.148466
597     -21456.086643
1067    174715.897513
Name: Koopprijs, dtype: float64

In [49]:
y_test[:10]

1679     459800.0
319      525000.0
673      399000.0
601     2250000.0
1079     470000.0
1806      21000.0
2792     190000.0
2197     735000.0
597      398000.0
1067     885000.0
Name: Koopprijs, dtype: float64

In [None]:
train_sizes, train_scores, valid_scores = model_selection.learning_curve(
     SVR(kernel='rbf'), X, y, train_sizes=[50, 80, 110, 500, 1000, 2000], cv=5)

In [None]:
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
                        n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5)):
    """
    Generate a simple plot of the test and training learning curve.

    Parameters
    ----------
    estimator : object type that implements the "fit" and "predict" methods
        An object of that type which is cloned for each validation.

    title : string
        Title for the chart.

    X : array-like, shape (n_samples, n_features)
        Training vector, where n_samples is the number of samples and
        n_features is the number of features.

    y : array-like, shape (n_samples) or (n_samples, n_features), optional
        Target relative to X for classification or regression;
        None for unsupervised learning.

    ylim : tuple, shape (ymin, ymax), optional
        Defines minimum and maximum yvalues plotted.

    cv : int, cross-validation generator or an iterable, optional
        Determines the cross-validation splitting strategy.
        Possible inputs for cv are:
          - None, to use the default 3-fold cross-validation,
          - integer, to specify the number of folds.
          - An object to be used as a cross-validation generator.
          - An iterable yielding train/test splits.

        For integer/None inputs, if ``y`` is binary or multiclass,
        :class:`StratifiedKFold` used. If the estimator is not a classifier
        or if ``y`` is neither binary nor multiclass, :class:`KFold` is used.

        Refer :ref:`User Guide <cross_validation>` for the various
        cross-validators that can be used here.

    n_jobs : integer, optional
        Number of jobs to run in parallel (default 1).
    """
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.grid()

    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1,
                     color="r")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1, color="g")
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
             label="Training score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
             label="Cross-validation score")

    plt.legend(loc="best")
    return plt

In [None]:
plot_learning_curve(SVR(kernel="poly"), 'lc', X_train, y_train, cv=5, n_jobs=4, train_sizes=[50, 80, 110, 500, 1000, 1900])

In [None]:
y_train.median()