In [20]:
import pandas as pd
import numpy as np
pd.set_option("display.max_columns", None) 
pd.set_option("display.max_rows", None)

model_var = ['Year','HHHolidays_Yes', 'HHFood_Yes', 'HHReserves_Yes', 'HHComputer_Yes', 'HHCar_Yes', 'HousingCost_HighImpactHH',
             'HousingCost_MediumImpactHH', 'HHHeath_Yes', 'vhPobreza_vhPobreza_Yes', 'vhMATDEP_vhMATDEP_Yes','vhRentaa',
             'CHealth','AREMonth', 'CrConditions_NChronic','HLimitations_NoLimited','LifeSatisfaction0','LifeSatisfaction2']


df = pd.read_csv('./Files/ECV_2004_2018.csv.gz', sep = ';', compression='gzip', usecols = model_var).dropna()
df_model = df[(df['Year'] == 2018) | (df['Year'] == 2013)]

df_model.head()

Unnamed: 0,Year,AREMonth,vhRentaa,CHealth,HHHolidays_Yes,HHFood_Yes,HHReserves_Yes,HHComputer_Yes,HHCar_Yes,HousingCost_HighImpactHH,HousingCost_MediumImpactHH,HHHeath_Yes,vhPobreza_vhPobreza_Yes,vhMATDEP_vhMATDEP_Yes,CrConditions_NChronic,HLimitations_NoLimited,LifeSatisfaction0,LifeSatisfaction2
267835,2013,5.0,88450.0,1.0,1,1,1,1,1,1,0,1,0,0,1,1,8.0,8.0
267836,2013,5.0,88450.0,2.0,1,1,1,1,1,1,0,1,0,0,1,1,8.0,8.0
267837,2013,5.0,88450.0,2.0,1,1,1,1,1,1,0,1,0,0,1,1,6.5,6.72817
267838,2013,3.0,30558.69,1.0,0,1,1,1,1,0,1,1,0,0,1,1,8.5,8.540183
267839,2013,3.0,30558.69,2.0,0,1,1,1,1,0,1,1,0,0,1,1,7.5,7.996524


# Librerías

In [5]:
# Modelos
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

# Métricas
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

# Selección de variables
import statsmodels.api as sm

# Hyperparameter tunning 
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

# Validación 
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

# Variables 

In [21]:
# Variables
X = df_model[['HHHolidays_Yes', 'HHFood_Yes', 'HHReserves_Yes', 'HHComputer_Yes', 'HHCar_Yes', 'HousingCost_HighImpactHH',
              'HousingCost_MediumImpactHH', 'HHHeath_Yes', 'vhPobreza_vhPobreza_Yes', 'vhMATDEP_vhMATDEP_Yes','vhRentaa',
              'CrConditions_NChronic','HLimitations_NoLimited','CHealth','AREMonth']]

# variable output con media aritmética
y1 = df_model['LifeSatisfaction0']
# variable output con predominio de variables de satisfacción con la vida
y2 = df_model['LifeSatisfaction2']

# Modelos

In [22]:
# Train - Test Split y1
X_train1, X_test1, y_train1, y_test1 = train_test_split(
    X,y1, test_size = 0.2, random_state = 42)
# Train - Test Split y2
X_train2, X_test2, y_train2, y_test2 = train_test_split(
    X,y2, test_size = 0.2, random_state = 42)

* ### Linear Regression

#### a) Variable y1

In [23]:
# Instancia
reg1 = LinearRegression()
# Fit del modelo
reg1.fit(X_train1, y_train1)
# Predicción de los valores en test
pred1=reg1.predict(X_test1)

#### b) Variable y2

In [24]:
# Instancia
reg2 = LinearRegression()
# Fit del modelo
reg2.fit(X_train2, y_train2)
# Predicción de los valores en test
pred2=reg2.predict(X_test2)

* ### Kneighbors

#### a) Variable y1

In [25]:
# Instancia
regk_1 = KNeighborsRegressor(n_neighbors=298)

# Fit del modelo
regk_1.fit(X_train1, y_train1)

# Predicción de los valores en test
predK1=regk_1.predict(X_test1)

#### b) Variable y2

In [26]:
# Instancia
regk_2 = KNeighborsRegressor(n_neighbors=298)

# Fit del modelo
regk_2.fit(X_train2, y_train2)

# Predicción de los valores en test
predk2=regk_2.predict(X_test2)

* ### Decision Tree

#### a) Variable y1

In [27]:
# Instancia
regd_1 = DecisionTreeRegressor(min_samples_split = 6,
                             max_depth = 12,
                             min_samples_leaf = 4)

# Fit del modelo
regd_1.fit(X_train1, y_train1)

# Predicción de los valores en test
predd1=regd_1.predict(X_test1)

#### b) Variable y2

In [28]:
# Instancia
regd_2 = DecisionTreeRegressor(min_samples_split = 6,
                             max_depth = 11,
                             min_samples_leaf = 12)

# Fit del modelo
regd_2.fit(X_train2, y_train2)

# Predicción de los valores en test
predd2=regd_2.predict(X_test2)

* ### Random Forest

#### a) Variable y1

In [29]:
# Instancia
rdfreg1 = RandomForestRegressor(n_estimators = 100, 
                               min_samples_split = 30, 
                               min_samples_leaf = 60, 
                               max_depth = 10)
# Fit del modelo
rdfreg1.fit(X_train1, y_train1)

# Predicción de los valores en test
predrdf1=rdfreg1.predict(X_test1)

#### b) Variable y2

In [30]:
rdfreg2 = RandomForestRegressor(n_estimators = 100, 
                               min_samples_split = 30, 
                               min_samples_leaf = 60, 
                               max_depth = 10)
# Fit del modelo
rdfreg2.fit(X_train2, y_train2)

# Predicción de los valores en test
predrdf2=rdfreg2.predict(X_test2)

# Resultados

In [31]:
# Regresión Lineal variable 1
print('Resultados Regresión Lineal con variable de Satisfacción 0:')
print('MAPE: ' + str(np.mean(np.abs(y_test1-pred1))))
print('MAE: ' + str(mean_absolute_error(y_test1,pred1)))
print('RMSE: ' + str(np.sqrt(mean_squared_error(reg1.predict((X_test1)),y_test1))))
print('correlation coefficient: ' + str(np.corrcoef(reg1.predict(X_test1), y_test1)[0][1]))
print('R2_score: ' + str(r2_score(y_test1,pred1,multioutput='variance_weighted')))
print('-------------')
# Regresión Lineal variable 2 
print('Resultados Regresión Lineal con variable de Satisfacción 2:')
print('MAPE: ' + str(np.mean(np.abs(y_test2-pred2))))
print('MAE: ' + str(mean_absolute_error(y_test2,pred2)))
print('RMSE: ' + str(np.sqrt(mean_squared_error(reg2.predict((X_test2)),y_test2))))
print('correlation coefficient: ' + str(np.corrcoef(reg2.predict(X_test2), y_test2)[0][1]))
print('R2_score: ' + str(r2_score(y_test2,pred2,multioutput='variance_weighted')))
print('----------------------------')
print('----------------------------')
# Kneighbors variable 1
print('Resultados Kneighbors con variable de Satisfacción 0:')
print('MAPE: ' + str(np.mean(np.abs(y_test1-predK1))))
print('MAE: ' + str(mean_absolute_error(y_test1,predK1)))
print('RMSE: ' + str(np.sqrt(mean_squared_error(regk_1.predict((X_test1)),y_test1))))
print('-------------')
# Kneighbors variable 2
print('Resultados Kneighbors con variable de Satisfacción 1:')
print('MAPE: ' + str(np.mean(np.abs(y_test2-predk2))))
print('MAE: ' + str(mean_absolute_error(y_test2,predk2)))
print('RMSE: ' + str(np.sqrt(mean_squared_error(regk_2.predict((X_test2)),y_test2))))
print('----------------------------')
print('----------------------------')
# Decision tree variable 1
print('Resultados Decision Tree con variable de Satisfacción 0:')
print('MAPE: ' + str(np.mean(np.abs(y_test1-predd1))))
print('MAE: ' + str(mean_absolute_error(y_test1,predd1)))
print('RMSE: ' + str(np.sqrt(mean_squared_error(regd_1.predict((X_test1)),y_test1))))
print('-------------')
# Decision tree variable 2
print('Resultados Decision Tree con variable de Satisfacción 1:')
print('MAPE: ' + str(np.mean(np.abs(y_test2-predd2))))
print('MAE: ' + str(mean_absolute_error(y_test2,predd2)))
print('RMSE: ' + str(np.sqrt(mean_squared_error(regd_2.predict((X_test2)),y_test2))))
# Random Forest variable 1
print('----------------------------')
print('----------------------------')
print('Resultados Random Forest con variable de Satisfacción 0:')
print('MAPE: ' + str(np.mean(np.abs(y_test1-predrdf1))))
print('MAE: ' + str(mean_absolute_error(y_test1,predrdf1)))
print('RMSE: ' + str(np.sqrt(mean_squared_error(rdfreg1.predict((X_test1)),y_test1))))
print('-------------')
# Random Forest variable 1
print('Resultados Random Forest con variable de Satisfacción 1:')
print('MAPE: ' + str(np.mean(np.abs(y_test2-predrdf2))))
print('MAE: ' + str(mean_absolute_error(y_test2,predrdf2)))
print('RMSE: ' + str(np.sqrt(mean_squared_error(rdfreg2.predict((X_test2)),y_test2))))

Resultados Regresión Lineal con variable de Satisfacción 0:
MAPE: 0.9865638849308722
MAE: 0.9865638849308732
RMSE: 1.27069016481389
correlation coefficient: 0.5310483664550603
R2_score: 0.28126389047407496
-------------
Resultados Regresión Lineal con variable de Satisfacción 2:
MAPE: 0.9781555415583661
MAE: 0.9781555415583668
RMSE: 1.2707319416569327
correlation coefficient: 0.5648195017266774
R2_score: 0.31841627632611713
----------------------------
----------------------------
Resultados Kneighbors con variable de Satisfacción 0:
MAPE: 1.126348775164632
MAE: 1.126348775164633
RMSE: 1.4490189288398527
-------------
Resultados Kneighbors con variable de Satisfacción 1:
MAPE: 1.1352512101996934
MAE: 1.135251210199695
RMSE: 1.474600701264951
----------------------------
----------------------------
Resultados Decision Tree con variable de Satisfacción 0:
MAPE: 1.0239223160037918
MAE: 1.0239223160037925
RMSE: 1.3181105901421115
-------------
Resultados Decision Tree con variable de Sati

# Feature Selection 

# Hyperparameter tuning

### Kneighbors

##### Kneighbors y1

In [None]:
%%time
regk1 = GridSearchCV(KNeighborsRegressor(),
                  param_grid={"n_neighbors":np.arange(4,300)},
                  cv=5,
                  scoring="neg_mean_absolute_error")


regk1.fit(X,y1)

print(regk1.best_params_)

##### Kneighbors y2

In [None]:
%%time
regk2 = GridSearchCV(KNeighborsRegressor(),
                  param_grid={"n_neighbors":np.arange(4,300)},
                  cv=5,
                  scoring="neg_mean_absolute_error")


regk2.fit(X,y2)

print(regk2.best_params_)

### Decision Tree

##### Decision Tree y1

In [None]:
%%time

regd1 = GridSearchCV(DecisionTreeRegressor(),
                  param_grid={"min_samples_split":np.arange(4,15),
                              "max_depth":np.arange(4,15),
                             'min_samples_leaf':np.arange(4,15)},
                  cv=5,
                  scoring="neg_mean_absolute_error")


regd1.fit(X,y1)

print(regd1.best_params_)

##### Decision Tree y2

In [None]:
%%time

regd2 = GridSearchCV(DecisionTreeRegressor(),
                  param_grid={"min_samples_split":np.arange(4,15),
                              "max_depth":np.arange(4,15),
                             'min_samples_leaf':np.arange(4,15)},
                  cv=5,
                  scoring="neg_mean_absolute_error")


regd2.fit(X,y2)

print(regd2.best_params_)

### Random Forest

In [19]:
# Hiperparametros
# n_estimators - number of intances - Number of trees
n_estimators = [50,100,200,400,600]

# max_depth: Maximun number of levels in the tree
max_depth = [10,20,30,40,50]

# min_sample_split: Min number of samples to split a node
min_samples_split = [20,30,40,50,60]

# min_sample_leaf: Min number of samples at each leaf node
min_samples_leaf = [20,30,40,50,60]

# min_sample_leaf: Min number of samples at each leaf node
grid_param = {'n_estimators':n_estimators, 'max_depth':max_depth, 'min_samples_split': min_samples_split,
              'min_samples_leaf':min_samples_leaf}

##### Random Forest y1

In [None]:
%%time

rscv1 = RandomizedSearchCV(estimator = RandomForestRegressor(), param_distributions = grid_param, 
                        n_iter = 100, cv = 5, verbose = 2, random_state = 33, 
                        n_jobs = -1)

rscv1.fit(X,y1)

print(rscv1.best_params_)

##### Random Forest y2

In [None]:
%%time

rscv1 = RandomizedSearchCV(estimator = RandomForestRegressor(), param_distributions = grid_param, 
                        n_iter = 100, cv = 5, verbose = 2, random_state = 33, 
                        n_jobs = -1)

rscv2.fit(X,y2)

print(rscv2.best_params_)