In [1]:
import pandas as pd
import numpy as np
import pickle 
from sklearn.linear_model import ElasticNetCV, LassoCV, RidgeCV, BayesianRidge, HuberRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split, KFold
from sklearn.svm import SVR, LinearSVR
from sklearn.preprocessing import RobustScaler
from sklearn.kernel_ridge import KernelRidge
from sklearn.decomposition import PCA
from sklearn.ensemble import BaggingRegressor, ExtraTreesRegressor, AdaBoostRegressor
from datetime import datetime
import os

In [2]:
# Load the data sets: the target column comes from the cleaned CSV, the
# feature matrix from the second CSV.
df = pd.read_csv("../../machine-learning/cleanedData.csv")
y = df["precio"]

X = pd.read_csv("../../machine-learning/xgboost-x.csv")

In [3]:
# Attach the target to the features, keep only rows without missing values,
# then draw a reproducible 500-row sample to work with.
aux = (
    X.join(y)
    .dropna()
    .sample(n=500, random_state=42)
)

In [4]:
# Separate the target from the predictors; the 'Unnamed: 0' and 'id' columns
# are removed from the predictors together with the target.
y_aux = aux["precio"]
X_aux = aux.drop(columns=["precio", "Unnamed: 0", "id"])

In [5]:
# One-column frame holding the sampled target. The train/test split cell
# rebinds `y_test`, so this value is only visible until that cell runs.
y_test = y_aux.to_frame()

In [6]:
# Hold out 33% of the sample for validation; the fixed seed keeps the
# partition reproducible.
seed = 7
test_size = 0.33
X_train, X_test, y_train, y_test = train_test_split(
    X_aux,
    y_aux,
    test_size=test_size,
    random_state=seed,
)

In [7]:
def add_more_features(self, df):
    """Add two derived columns to ``df`` in place and return the same frame.

    ``patio`` is ``metrostotales - metroscubiertos`` and ``ambientes`` is
    ``habitaciones + banos + garages``.

    Parameters
    ----------
    self : unused
        Not read by the function; kept so existing call sites keep working.
    df : pandas.DataFrame
        Frame containing the five source columns named above.

    Returns
    -------
    pandas.DataFrame
        The frame passed in, now carrying the two extra columns.
    """
    uncovered_area = df['metrostotales'].sub(df['metroscubiertos'])
    room_count = df['habitaciones'].add(df['banos']).add(df['garages'])

    df['patio'] = uncovered_area
    df['ambientes'] = room_count
    return df

In [120]:
def save_prediction(y_test, ids, model):
    """Write predictions to ``predictions/submit-<model>.csv``.

    Parameters
    ----------
    y_test : array-like
        Predicted values; written to the ``target`` column.
    ids : array-like
        Identifiers aligned with ``y_test``; written to the ``id`` column.
    model : str
        Label inserted into the output file name.

    Raises
    ------
    OSError
        If the ``predictions`` directory cannot be created or the file
        cannot be written.
    """
    # exist_ok=True tolerates an already existing directory while still
    # surfacing real failures (permissions, a file with the same name, ...),
    # which a bare `except: pass` would hide.
    os.makedirs('predictions', exist_ok=True)

    submit = pd.DataFrame({'id': ids, 'target': y_test})
    submit.to_csv('predictions/submit-' + model + '.csv', index=False)

In [70]:
def timer(start_time=None):
    """Start a stopwatch, or report the time elapsed since it was started.

    Called without a (truthy) ``start_time`` it returns ``datetime.now()``.
    Called with a start time it prints the elapsed hours, minutes and
    seconds and returns ``None``.
    """
    if start_time:
        elapsed = (datetime.now() - start_time).total_seconds()
        hours, remainder = divmod(elapsed, 3600)
        minutes, seconds = divmod(remainder, 60)
        print('Time taken: %i hours %i minutes and %s seconds.' % (hours, minutes, round(seconds, 2)))
        return None
    return datetime.now()

## Lasso

In [115]:
def train_LassoCV(data):
    """Fit a LassoCV model, report it, pickle it and predict on validation data.

    Parameters
    ----------
    data : tuple
        ``((x_tr, y_tr), (x_val, y_val))``: the training pair followed by
        the validation pair.

    Returns
    -------
    The fitted model's predictions for ``x_val``.

    Side effects
    ------------
    Prints progress and metrics, and writes the fitted estimator to
    ``pickles/LassoCV.pkl``.
    """
    (x_tr, y_tr), (x_val, y_val) = data

    print('Start training LassoCV...')
    start_time = timer()

    # NOTE(review): `normalize` was deprecated in scikit-learn 1.0 and removed
    # in 1.2 -- confirm the installed version before re-running this cell.
    lasso = LassoCV(n_alphas=100, cv=10, normalize=True, verbose=True)
    lasso.fit(x_tr, y_tr)

    # This R2 is measured on the same data the model was fitted on.
    print("The R2 is: {}".format(lasso.score(x_tr, y_tr)))
    print("The alpha choose by CV is:{}".format(lasso.alpha_))
    timer(start_time)

    print("Making prediction on validation data")
    y_val_pred = lasso.predict(x_val)
    # nan_to_num replaces NaN with zero and inf with finite numbers so the
    # metric can always be computed.
    mae = mean_absolute_error(np.nan_to_num(y_val), np.nan_to_num(y_val_pred))

    print("El mean absolute error de es {}".format(mae))
    print('Saving model into a pickle')

    # exist_ok=True tolerates an existing directory but lets real failures
    # surface instead of swallowing every exception.
    os.makedirs('pickles', exist_ok=True)
    with open('pickles/LassoCV.pkl', 'wb') as f:
        pickle.dump(lasso, f)

    return y_val_pred

In [116]:
# Bundle the split as ((train pair), (validation pair)), fit the Lasso model
# and store its validation predictions.
train, test = (X_train, y_train), (X_test, y_test)
data = (train, test)
y_val_pred = train_LassoCV(data)

# NOTE(review): these ids are the index labels of the validation rows, not
# the 'id' column that was dropped from the features -- confirm this is intended.
ids = y_test.index
save_prediction(y_test=y_val_pred, ids=ids, model='Lasso')

Start training LassoCV...


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
.....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................

The R2 is: 0.7416877725912367
The alpha choose by CV is:2320.866146031475
Time taken: 0 hours 0 minutes and 0.93 seconds.
Making prediction on validation data
El mean absolute error de es 578807.0385349854
Saving model into a pickle


...................................................................................................................[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.9s finished


In [None]:
#
# HOW DOES LASSO TREAT THE ENCODED FEATURES? (ciudad, provincia, tipodepropiedad, etc.)
# COULD A DIFFERENT TYPE OF ENCODING IMPROVE THE RESULTS?
#
# https://stats.idre.ucla.edu/spss/faq/coding-systems-for-categorical-variables-in-regression-analysis-2/

In [None]:
#
# Sources
#
# https://stackoverflow.com/questions/24233981/how-does-lassocv-in-scikit-learn-partition-data
# https://scikit-learn.org/stable/auto_examples/linear_model/plot_lasso_model_selection.html#sphx-glr-auto-examples-linear-model-plot-lasso-model-selection-py
# https://scikit-learn.org/stable/auto_examples/exercises/plot_cv_diabetes.html#sphx-glr-auto-examples-exercises-plot-cv-diabetes-py

In [89]:
# Nested cross-validation: for each of 3 outer folds, LassoCV picks alpha on
# the fold's training part (10 inner folds) and is scored on the held-out part.
# The closing "Answer" lines are fixed text, not derived from the results.
k_fold = KFold(3)
alphas = []

print("How much can you trust the selection of alpha?")
print()
print("Alpha parameters maximising the generalization score on different")
print("subsets of the data:")

for k, (train, test) in enumerate(k_fold.split(X_train, y_train)):
    lasso_cv = LassoCV(n_alphas=100, cv=10, normalize=True, verbose=False)
    lasso_cv.fit(X_train.iloc[train], y_train.iloc[train])
    alphas.append(lasso_cv.alpha_)

    print(
        "[fold {0}] alpha: {1:.5f}, score: {2:.5f}".format(
            k,
            lasso_cv.alpha_,
            lasso_cv.score(X_train.iloc[test], y_train.iloc[test]),
        )
    )

print("alpha mean: {0:.5f}".format(np.mean(alphas)))

print()
print("Answer: Not very much since we obtained different alphas for different")
print("subsets of the data and moreover, the scores for these alphas differ")
print("quite substantially.")

How much can you trust the selection of alpha?

Alpha parameters maximising the generalization score on different
subsets of the data:
[fold 0] alpha: 3456.25424, score: 0.67086
[fold 1] alpha: 2634.46487, score: 0.40059
[fold 2] alpha: 5703.98459, score: 0.49294
alpha mean: 3931.56790

Answer: Not very much since we obtained different alphas for different
subsets of the data and moreover, the scores for these alphas differ
quite substantially.


# Ridge

In [130]:
def train_ridgeCV(data):
    """Fit a RidgeCV model, report it, pickle it and predict on validation data.

    Parameters
    ----------
    data : tuple
        ``((x_tr, y_tr), (x_val, y_val))``: the training pair followed by
        the validation pair.

    Returns
    -------
    The fitted model's predictions for ``x_val``.

    Side effects
    ------------
    Prints progress and metrics, and writes the fitted estimator to
    ``pickles/RidgeCV.pkl``.
    """
    (x_tr, y_tr), (x_val, y_val) = data

    print('Start training RidgeCV...')
    start_time = timer()

    # Only one alpha candidate is supplied, so the "chosen" alpha printed
    # below is always this value.
    # NOTE(review): `normalize` was deprecated in scikit-learn 1.0 and removed
    # in 1.2 -- confirm the installed version before re-running this cell.
    ridge = RidgeCV(normalize=True, alphas=[0.0000999], cv=10)
    ridge.fit(x_tr, y_tr)

    # This R2 is measured on the same data the model was fitted on.
    print("The R2 is: {}".format(ridge.score(x_tr, y_tr)))
    print("The alpha choose by CV is:{}".format(ridge.alpha_))
    timer(start_time)

    print("Making prediction on validation data")
    y_val_pred = ridge.predict(x_val)
    # nan_to_num replaces NaN with zero and inf with finite numbers so the
    # metric can always be computed.
    mae = mean_absolute_error(np.nan_to_num(y_val), np.nan_to_num(y_val_pred))

    print("El mean absolute error de es {}".format(mae))
    print('Saving model into a pickle')

    # exist_ok=True tolerates an existing directory but lets real failures
    # surface instead of swallowing every exception.
    os.makedirs('pickles', exist_ok=True)
    with open('pickles/RidgeCV.pkl', 'wb') as f:
        pickle.dump(ridge, f)

    return y_val_pred

In [131]:
# Bundle the split as ((train pair), (validation pair)), fit the Ridge model
# and store its validation predictions.
train, test = (X_train, y_train), (X_test, y_test)
data = (train, test)
y_val_pred = train_ridgeCV(data)

# NOTE(review): these ids are the index labels of the validation rows, not
# the 'id' column that was dropped from the features -- confirm this is intended.
ids = y_test.index
save_prediction(y_test=y_val_pred, ids=ids, model='Ridge')

Start training RidgeCV...
The R2 is: 0.780026576429594
The alpha choose by CV is:9.99e-05
Time taken: 0 hours 0 minutes and 0.11 seconds.
Making prediction on validation data
El mean absolute error de es 690595.3366460925
Saving model into a pickle


In [12]:
#NESTED CROSS VALIDATION
k_fold = KFold(3)
alphas = []
scores = []

print("How much can you trust the selection of alpha?")
print()
print("Alpha parameters maximising the generalization score on different")
print("subsets of the data:")

for k, (train, test) in enumerate(k_fold.split(X_train, y_train)):
    ridge = RidgeCV(
        normalize=True,
        alphas=[0.0000999],
        cv=10
    )
    ridge.fit(X_train.iloc[train], y_train.iloc[train])
    alphas.append(ridge.alpha_)
    
    score = ridge.score(X_train.iloc[test], y_train.iloc[test])
    scores.append(score)

    print("[fold {0}] alpha: {1:.5f}, score: {2:.5f}".
          format(k, ridge.alpha_, score))

print("alpha mean: {0:.5f}".format(np.mean(alphas)))
print("score mean: {0:.5f}".format(np.mean(scores)))

print()
print("Answer: More reliable than Lasso experiment since we obtained same alphas for different")
print("subsets of the data BUT the scores for these alphas differ")
print("quite substantially.")

How much can you trust the selection of alpha?

Alpha parameters maximising the generalization score on different
subsets of the data:
[fold 0] alpha: 0.00010, score: 0.63716
[fold 1] alpha: 0.00010, score: 0.24311
[fold 2] alpha: 0.00010, score: 0.60722
alpha mean: 0.00010
score mean: 0.49583

Answer: More reliable than Lasso experiment since we obtained same alphas for different
subsets of the data BUT the scores for these alphas differ
quite substantially.


# Elastic Net

In [159]:
def train_elasticNetCV(data):
    """Fit an ElasticNetCV model, report it, pickle it and predict on validation data.

    Parameters
    ----------
    data : tuple
        ``((x_tr, y_tr), (x_val, y_val))``: the training pair followed by
        the validation pair.

    Returns
    -------
    The fitted model's predictions for ``x_val``.

    Side effects
    ------------
    Prints progress and metrics, and writes the fitted estimator to
    ``pickles/enetCV.pkl``.
    """
    (x_tr, y_tr), (x_val, y_val) = data

    print('Start training enetCV...')
    start_time = timer()

    # NOTE(review): `normalize` was deprecated in scikit-learn 1.0 and removed
    # in 1.2 -- confirm the installed version before re-running this cell.
    enet = ElasticNetCV(normalize=True, n_alphas=2000, max_iter=2000, cv=10)
    enet.fit(x_tr, y_tr)

    # This R2 is measured on the same data the model was fitted on.
    print("The R2 is: {}".format(enet.score(x_tr, y_tr)))
    print("The alpha choose by CV is:{}".format(enet.alpha_))
    timer(start_time)

    print("Making prediction on validation data")
    y_val_pred = enet.predict(x_val)
    # nan_to_num replaces NaN with zero and inf with finite numbers so the
    # metric can always be computed.
    mae = mean_absolute_error(np.nan_to_num(y_val), np.nan_to_num(y_val_pred))

    print("El mean absolute error de es {}".format(mae))
    print('Saving model into a pickle')

    # exist_ok=True tolerates an existing directory but lets real failures
    # surface instead of swallowing every exception.
    os.makedirs('pickles', exist_ok=True)
    with open('pickles/enetCV.pkl', 'wb') as f:
        pickle.dump(enet, f)

    return y_val_pred

In [158]:
# Bundle the split as ((train pair), (validation pair)), fit the ElasticNet
# model and store its validation predictions.
train, test = (X_train, y_train), (X_test, y_test)
data = (train, test)
y_val_pred = train_elasticNetCV(data)

# NOTE(review): these ids are the index labels of the validation rows, not
# the 'id' column that was dropped from the features -- confirm this is intended.
ids = y_test.index
save_prediction(y_test=y_val_pred, ids=ids, model='elastic')

Start training enetCV...
The R2 is: 0.0002157436852815442
The alpha choose by CV is:114.97925311512337
Time taken: 0 hours 0 minutes and 10.52 seconds.
Making prediction on validation data
El mean absolute error de es 1114335.4774411486
Saving model into a pickle


In [14]:
#NESTED CROSS VALIDATION
k_fold = KFold(3)
alphas = []
scores = []

print("How much can you trust the selection of alpha?")
print()
print("Alpha parameters maximising the generalization score on different")
print("subsets of the data:")

for k, (train, test) in enumerate(k_fold.split(X_train, y_train)):
    enet = ElasticNetCV(
        normalize=True,
        n_alphas=2000,
        max_iter = 2000,
        cv=10
    )
    enet.fit(X_train.iloc[train], y_train.iloc[train])
    alphas.append(ridge.alpha_)
    
    score = enet.score(X_train.iloc[test], y_train.iloc[test])
    scores.append(score)
    
    print("[fold {0}] alpha: {1:.5f}, score: {2:.5f}".
          format(k, ridge.alpha_, score))

print("alpha mean: {0:.5f}".format(np.mean(alphas)))
print("scores mean: {0:.5f}".format(np.mean(scores)))

print()
print("Answer: More reliable than Lasso experiment since we obtained same alphas for different")
print("subsets of the data BUT the scores for these alphas differ quite substantially.")
print("")
print("Ridge experiment got a higher mean score value")

How much can you trust the selection of alpha?

Alpha parameters maximising the generalization score on different
subsets of the data:
[fold 0] alpha: 0.00010, score: -0.00289
[fold 1] alpha: 0.00010, score: -0.07631
[fold 2] alpha: 0.00010, score: -0.04488
alpha mean: 0.00010
scores mean: -0.04136

Answer: More reliable than Lasso experiment since we obtained same alphas for different
subsets of the data BUT the scores for these alphas differ quite substantially.

Ridge experiment got a higher mean score value


# ElasticNet  Randomized search

In [142]:
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
import time

In [161]:
# Candidate settings for the ElasticNetCV search: 1 x 2 x 2 = 4 combinations.
param_grid = dict(
    normalize=[True],
    n_alphas=[1000, 2000],
    max_iter=[1000, 2000],
)

In [162]:
# Base estimator handed to the randomized search; built with default arguments.
clf = ElasticNetCV()

In [163]:
# Randomized search over param_grid with 10-fold CV, scored by negated MSE.
# n_iter=40 exceeds the 4 combinations in param_grid; the recorded run
# reports fitting 4 candidates.
rs_clf = RandomizedSearchCV(
    clf,
    param_grid,
    n_iter=40,
    verbose=2,
    cv=10,
    scoring='neg_mean_squared_error',
    random_state=42,
)

In [164]:
# Run the search on the training split and print the elapsed wall-clock
# time in seconds.
print("Randomized search..")
search_time_start = time.time()
rs_clf.fit(X_train, y_train)
print("Randomized search time:", time.time() - search_time_start)

Randomized search..
Fitting 10 folds for each of 4 candidates, totalling 40 fits
[CV] normalize=True, n_alphas=1000, max_iter=1000 ....................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ..... normalize=True, n_alphas=1000, max_iter=1000, total=   2.3s
[CV] normalize=True, n_alphas=1000, max_iter=1000 ....................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    2.3s remaining:    0.0s


[CV] ..... normalize=True, n_alphas=1000, max_iter=1000, total=   2.2s
[CV] normalize=True, n_alphas=1000, max_iter=1000 ....................
[CV] ..... normalize=True, n_alphas=1000, max_iter=1000, total=   2.4s
[CV] normalize=True, n_alphas=1000, max_iter=1000 ....................
[CV] ..... normalize=True, n_alphas=1000, max_iter=1000, total=   2.3s
[CV] normalize=True, n_alphas=1000, max_iter=1000 ....................
[CV] ..... normalize=True, n_alphas=1000, max_iter=1000, total=   2.3s
[CV] normalize=True, n_alphas=1000, max_iter=1000 ....................
[CV] ..... normalize=True, n_alphas=1000, max_iter=1000, total=   2.4s
[CV] normalize=True, n_alphas=1000, max_iter=1000 ....................
[CV] ..... normalize=True, n_alphas=1000, max_iter=1000, total=   2.4s
[CV] normalize=True, n_alphas=1000, max_iter=1000 ....................
[CV] ..... normalize=True, n_alphas=1000, max_iter=1000, total=   2.2s
[CV] normalize=True, n_alphas=1000, max_iter=1000 ....................
[CV] .

[Parallel(n_jobs=1)]: Done  40 out of  40 | elapsed:  2.3min finished


Randomized search time: 141.1281771659851


In [165]:
# Report the winning configuration. The search was scored with
# 'neg_mean_squared_error', so the best score is a negated MSE.
best_score = rs_clf.best_score_
best_params = rs_clf.best_params_

print("Best score: {}".format(best_score))
print("Best params: ")
for param_name in sorted(best_params):
    print('%s: %r' % (param_name, best_params[param_name]))

Best score: -2943637415972.0703
Best params: 
max_iter: 1000
n_alphas: 1000
normalize: True
