In [9]:
import warnings
#warnings.filterwarnings('ignore')
warnings.filterwarnings(action='once')

from skimage.io import imread
from skimage.transform import resize
from keras.applications.vgg16 import VGG16
from keras.preprocessing import image
from keras.applications.vgg16 import preprocess_input
from keras.layers.normalization import BatchNormalization

from keras.models import Sequential
from keras.layers import Dense, Embedding, Reshape, Activation,  Merge, Reshape, Dropout, Flatten
from keras.applications.vgg16 import VGG16, preprocess_input
from keras.optimizers import SGD
from keras.optimizers import Adam
from keras.optimizers import RMSprop
from keras.optimizers import Adagrad
from keras.optimizers import Adadelta
from keras.models import Model


from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

from sklearn.model_selection import KFold
from sklearn.ensemble import GradientBoostingRegressor

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_absolute_error

from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.linear_model import Lasso
from sklearn.model_selection import cross_val_score


import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.utils.validation import column_or_1d

In [2]:
def lasso_fit(X, y, alphas):
    pipeline = Pipeline([('scaler', StandardScaler()), ('normalizer', Normalizer())])
    X = pd.DataFrame(pipeline.fit_transform(X))
    y = np.log1p(y)
        
    error = list(map(abs, 
                     [cross_val_score(Lasso(alpha=alpha, normalize=True), X, y,
                      scoring='neg_mean_absolute_error', cv=5).mean() for alpha in alphas]
                    )
                )
    plt.plot(alphas, error)
    plt.show()
    return list(zip(error, alphas))


def regressor(X, y, model, n):
    
    # use all features without feature selection
    if n == -1:
        preproc_pipe = [('scaler', StandardScaler()), # column-wise
                    ('normalizer', Normalizer())] # row-wise
    else:
        preproc_pipe = [('scaler', StandardScaler()),
                    ('lsa', TruncatedSVD(n)),
                    ('normalizer', Normalizer())]
        
    preproc_pipe = Pipeline(preproc_pipe)
    X = pd.DataFrame(preproc_pipe.fit_transform(X))
    
    pipeline = []
    if model == 'GB':
        pipeline.append(('estimator', GradientBoostingRegressor(n_estimators = 300)))
    elif model == 'LASSO':
        
        pipeline.append(('estimator', Lasso(alpha=2e-5, max_iter=2000)))
    else:
        print('Error Model')
        return None
    pipeline = Pipeline(pipeline) 
    
    MAE_train = []
    MAE_test = []
    
    kf = KFold(n_splits=5, shuffle = True)
    for train_index, test_index in kf.split(X):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        
        pipeline.fit(X_train, np.log1p(y_train))
        y_train_predict = np.expm1(pipeline.predict(X_train))
        y_test_predict = np.expm1(pipeline.predict(X_test))
        
        MAE_train.append(mean_absolute_error(y_train, y_train_predict))
        MAE_test.append(mean_absolute_error(y_test, y_test_predict))      

    return np.mean(MAE_test)


## Load the images

In [3]:
import os
import numpy as np

def load_img(dir_name):
    X = []
    y = []
    for file_name in os.listdir(dir_name):
        house_id, house_price, _, image_format = file_name.split(".")
        if(image_format != "jpg"):
            continue
        try:
            house_price = float(house_price)
        except ValueError:
            continue
        img = image.load_img(dir_name+"/"+file_name, target_size=(400, 267))
        data = np.expand_dims(image.img_to_array(img), axis=0)
        X.append(data[0])
        y.append(house_price)
    return np.array(X), np.array(y)
            
X, y = load_img("data/images")

In [4]:
print(X.shape)
print(y.shape)

(4171, 400, 267, 3)
(4171,)


## Use VGG16  

In [5]:
base_model = VGG16(weights='imagenet', include_top=False, input_shape = (400, 267, 3))
for layer in base_model.layers:
    layer.trainable = False
    
top_model = Sequential()
top_model.add(Flatten(input_shape=base_model.output_shape[1:]))
model_vgg = Model(inputs=base_model.input, outputs=top_model(base_model.output))

new_vgg_features = model_vgg.predict(X)

In [6]:
# train_X = pd.DataFrame(new_vgg_features[:int(len(new_vgg_features)*.7)])
# test_X = pd.new_vgg_features[int(len(new_vgg_features)*.7):]

# train_y = y[:int(len(new_vgg_features)*.7)]
# test_y = y[int(len(new_vgg_features)*.7):]

X = pd.DataFrame(new_vgg_features)
y = pd.DataFrame(y, columns=['price'])

## Use feature extracted by VGG16 to do prediction

In [7]:
ns = range(40, 60, 2)
gb_tests = []
lasso_tests = []

for n in ns:
    gb_tests.append((regressor(X, y, 'GB', n), n))
    lasso_tests.append((regressor(X, y, 'LASSO', n), n))
gb_tests.sort()
print(gb_tests[:10])
lasso_tests.sort()
print(lasso_tests[:10])

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[(90094.57010306857, 46), (90357.43022657008, 56), (90384.15494233165, 54), (90616.54388442481, 48), (90697.45038827912, 58), (90883.9916594375, 42), (90959.31764030733, 52), (91084.61959061347, 44), (91275.23135784877, 50), (91305.86568410855, 40)]
[(90339.28771158394, 58), (90351.82248387932, 52), (90373.80919018356, 54), (90452.41794065069, 50), (90534.79562487648, 56), (91502.60373323166, 44), (91654.19787346669, 46), (91773.66422330088, 48), (92046.71457099442, 42), (92091.2856252524, 40)]
