### =====================================================================
### IMPORTACIÓN GENERAL DE LA INFORMACIÓN.
### =====================================================================

In [1]:
## IMPORTACIÓN GENERAL DE LIBRERIAS Y VISUALIZACIÓN DE DATOS (matplotlib y seaborn)

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as DT
import warnings
import descartes
import geopandas as gpd
import json
import requests
import geocoder

# Random Forest.
from sklearn.ensemble import RandomForestRegressor
from shapely.geometry import Point, Polygon
from urllib2 import urlopen

# XGBoost.
from xgboost import XGBClassifier
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

%matplotlib inline
warnings.filterwarnings('ignore')
plt.style.use('default') 
sns.set(style="whitegrid") 
plt.rcParams['figure.figsize'] = (15, 10)

### =====================================================================
### ALGORITMOS DE MACHINE LEARNING:
### =====================================================================

In [2]:
# LECTURAS DE CSV YA PROCESADOS.
train = pd.read_csv('DATA/train_procesado.csv')
test = pd.read_csv('DATA/test_procesado.csv')

In [3]:
Identificador = pd.DataFrame()
Identificador['id'] = test['id']

In [4]:
train = train.drop('id', axis = 1)
test = test.drop('id', axis = 1)

train = train.drop('mean_2016', axis = 1)
test = test.drop('mean_2016', axis = 1)

train = train.drop('median_2016', axis = 1)
test = test.drop('median_2016', axis = 1)

In [5]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 240000 entries, 0 to 239999
Columns: 187 entries, habitaciones to mean_2016_agrupado_9
dtypes: int64(187)
memory usage: 342.4 MB


In [6]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60000 entries, 0 to 59999
Columns: 186 entries, habitaciones to mean_2016_agrupado_9
dtypes: int64(186)
memory usage: 85.1 MB


In [7]:
# Resto de los labels.
Y = np.array(train['precio'])
X = train.drop('precio', axis = 1)

### =====================================================================
### XGBoost.
### =====================================================================

In [8]:
## MODELO ENSAMBLE 1.
trainX_M1, testX_M1, trainY_M1, testY_M1 = train_test_split(X, Y, test_size=0.3, random_state=45)
data_dmatrix_inst = xgb.DMatrix(data=trainX_M1,label=trainY_M1)
model1 = xgb.XGBRegressor(objective = 'reg:linear', 
                          n_estimators = 500,
                          min_child_weight = 5,
                          learning_rate = 0.05017181127931773,
                          gamma = 10,
                          reg_lambda = 3,
                          max_depth = 9,
                          colsample_bytree = 0.7585033814547916, 
                          subsample_bytree = 0.9779760690574663)
model1.fit(trainX_M1,trainY_M1)
pred_M1 = model1.predict(testX_M1)

In [9]:
# MODELO ENSAMBLE 2.
trainX_M2, testX_M2, trainY_M2, testY_M2 = train_test_split(X, Y, test_size=0.3, random_state=40)
data_dmatrix_inst = xgb.DMatrix(data=trainX_M2,label=trainY_M2)
model2 = xgb.XGBRegressor(objective = 'reg:linear', 
                          n_estimators = 500,
                          min_child_weight = 5,
                          learning_rate = 0.1,
                          gamma = 9,
                          reg_lambda = 2,
                          max_depth = 7,
                          colsample_bytree = 0.6, 
                          subsample_bytree = 0.8)
model2.fit(trainX_M2,trainY_M2)
pred_M2 = model2.predict(testX_M2)

In [10]:
# MODELO ENSAMBLE 3.
trainX_M3, testX_M3, trainY_M3, testY_M3 = train_test_split(X, Y, test_size=0.3, random_state=42)
data_dmatrix_inst = xgb.DMatrix(data=trainX_M3,label=trainY_M3)
model3 = xgb.XGBRegressor(objective = 'reg:linear', 
                          n_estimators = 500,
                          min_child_weight = 5,
                          learning_rate = 0.05017181127931773,
                          gamma = 9,
                          reg_lambda = 2,
                          max_depth = 6,
                          colsample_bytree = 0.7585033814547916, 
                          subsample_bytree = 0.9779760690574663)
model3.fit(trainX_M3,trainY_M3)
pred_M3 = model3.predict(testX_M3)

In [11]:
# MODELO ENSAMBLE 4.
trainX_M4, testX_M4, trainY_M4, testY_M4 = train_test_split(X, Y, test_size=0.3, random_state=40)
data_dmatrix_inst = xgb.DMatrix(data=trainX_M4,label=trainY_M4)
model4 =  RandomForestRegressor(n_estimators = 200, random_state = 100)
model4.fit(trainX_M4,trainY_M4)
pred_M4 = model4.predict(testX_M4)

In [12]:
# MODELO ENSAMBLE 5.
trainX_M5, testX_M5, trainY_M5, testY_M5 = train_test_split(X, Y, test_size=0.3, random_state=40)
data_dmatrix_inst = xgb.DMatrix(data=trainX_M5,label=trainY_M5)
model5 = xgb.XGBRegressor(objective = 'reg:linear', 
                          n_estimators = 500,
                          min_child_weight = 5,
                          learning_rate = 0.05017181127931773,
                          gamma = 9,
                          reg_lambda = 2,
                          max_depth = 6,
                          colsample_bytree = 0.7585033814547916, 
                          subsample_bytree = 0.9779760690574663)
model5.fit(trainX_M5,trainY_M5)
pred_M5 = model5.predict(testX_M5)

In [13]:
y_pred=(pred_M1+pred_M2+pred_M3+pred_M4+pred_M5)/5
testY = (testY_M1+testY_M2+testY_M3+testY_M4+testY_M5)/5
# y_pred = xg_reg.predict(testX)

In [14]:
# y_pred = xg_reg.predict(testX)
# Calculamos el error absoluto.
errors = abs(y_pred - testY)
# Imprimimos el error.
print('Error:', round(np.mean(errors), 2), 'grados.')

('Error:', 343031.81, 'grados.')


In [15]:
# Calculamos el porcentaje de error.
mape = 100 * (errors / testY)
# Calculate la precisión.
accuracy = 100 - np.mean(mape)
print('Precision:', round(accuracy, 2), '%.')

('Precision:', 85.8, '%.')


In [16]:
#prediccion = xg_reg.predict(test)

pred_M1_f = model1.predict(test)
pred_M2_f = model2.predict(test)
pred_M3_f = model3.predict(test)
pred_M4_f = model4.predict(test)
pred_M5_f = model5.predict(test)

#prediccion = np.array([])
#for i in range(0,len(test)):
#    prediccion = np.append(prediccion, mode([pred_M1_f[i], pred_M2_f[i], pred_M3_f[i], pred_M4_f[i], pred_M5_f[i]]))

prediccion =(pred_M1_f+pred_M2_f+pred_M3_f+pred_M4_f+pred_M5_f)/5

In [17]:
## =================================================================================================
## ARMAMOS EN BASE A LA PREDICCIÓN QUE TENEMOS UN CSV PARA SUBIR A KAGGLE CON EL FORMATO INDICADO!
## =================================================================================================
submission = pd.DataFrame({ 'id': Identificador['id'], 'target': prediccion })
submission.to_csv("SUBMITS/012_G34_PrecioAgrupado_Ensambles_02.csv", index=False)