### =====================================================================
### IMPORTACIÓN GENERAL DE LA INFORMACIÓN.
### =====================================================================

In [16]:
## IMPORTACIÓN GENERAL DE LIBRERIAS Y VISUALIZACIÓN DE DATOS (matplotlib y seaborn)

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as DT
import warnings
import descartes
import geopandas as gpd
import json
import requests
import geocoder

# Random Forest.
from sklearn.ensemble import RandomForestRegressor
from shapely.geometry import Point, Polygon
from urllib2 import urlopen

# XGBoost.
from xgboost import XGBClassifier
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

%matplotlib inline
warnings.filterwarnings('ignore')
plt.style.use('default') 
sns.set(style="whitegrid") 
plt.rcParams['figure.figsize'] = (15, 10)

### =====================================================================
### ALGORITMOS DE MACHINE LEARNING:
### =====================================================================

In [17]:
# LECTURAS DE CSV YA PROCESADOS.
train = pd.read_csv('DATA/train.csv')
test = pd.read_csv('DATA/test.csv')

In [18]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 240000 entries, 0 to 239999
Data columns (total 80 columns):
id                               240000 non-null int64
antiguedad                       240000 non-null float64
habitaciones                     240000 non-null float64
garages                          240000 non-null float64
banos                            240000 non-null float64
metroscubiertos                  240000 non-null float64
metrostotales                    240000 non-null float64
gimnasio                         240000 non-null float64
usosmultiples                    240000 non-null float64
piscina                          240000 non-null float64
escuelascercanas                 240000 non-null float64
centroscomercialescercanos       240000 non-null float64
precio                           240000 non-null float64
excelente                        240000 non-null int64
vigilancia                       240000 non-null int64
esquina                          240000 

In [19]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60000 entries, 0 to 59999
Data columns (total 79 columns):
id                               60000 non-null int64
antiguedad                       60000 non-null float64
habitaciones                     60000 non-null float64
garages                          60000 non-null float64
banos                            60000 non-null float64
metroscubiertos                  60000 non-null float64
metrostotales                    60000 non-null float64
gimnasio                         60000 non-null float64
usosmultiples                    60000 non-null float64
piscina                          60000 non-null float64
escuelascercanas                 60000 non-null float64
centroscomercialescercanos       60000 non-null float64
excelente                        60000 non-null int64
vigilancia                       60000 non-null int64
esquina                          60000 non-null int64
remodelada                       60000 non-null int64
lujo 

In [20]:
train = train[[c for c in train if c not in ['HOSPEDAJE', 'GARAGE']] + ['HOSPEDAJE', 'GARAGE']]

In [21]:
# Segmentamos una parte para entrenar y constatar.
# testX = train.loc[train['2016'] == 1]
# trainX = train.loc[train['2016'] == 0]

In [22]:
# Label a predecir.
# testY = np.array(train_2016['precio'])
# testX = testX.drop('precio', axis = 1)

In [23]:
# Resto de los labels.
Y = np.array(train['precio'])
X = train.drop('precio', axis = 1)

In [24]:
trainX, testX, trainY, testY = train_test_split(X, Y, test_size=0.2, random_state=42)

### =====================================================================
### XGBoost.
### =====================================================================

In [25]:
data_dmatrix_inst = xgb.DMatrix(data=trainX,label=trainY)
#X_train_inst, X_test_inst, y_train_inst, y_test_inst = train_test_split(trainX, trainY, test_size=0.25, random_state=100)
xg_reg = xgb.XGBRegressor(objective = 'reg:linear', 
                          colsample_bytree = 0.3, 
                          learning_rate = 0.05,
                          max_depth = 10, 
                          alpha = 7, 
                          n_estimators = 500)

In [26]:
xg_reg.fit(trainX,trainY)

XGBRegressor(alpha=7, base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.3, gamma=0, learning_rate=0.05, max_delta_step=0,
       max_depth=10, min_child_weight=1, missing=None, n_estimators=500,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [27]:
y_pred = xg_reg.predict(testX)
# Calculamos el error absoluto.
errors = abs(y_pred - testY)
# Imprimimos el error.
print('Error:', round(np.mean(errors), 2), 'grados.')

('Error:', 464511.21, 'grados.')


In [28]:
# Calculamos el porcentaje de error.
mape = 100 * (errors / testY)
# Calculate la precisión.
accuracy = 100 - np.mean(mape)
print('Precision:', round(accuracy, 2), '%.')

('Precision:', 78.13, '%.')


In [29]:
prediccion = xg_reg.predict(test)

In [30]:
## =================================================================================================
## ARMAMOS EN BASE A LA PREDICCIÓN QUE TENEMOS UN CSV PARA SUBIR A KAGGLE CON EL FORMATO INDICADO!
## =================================================================================================
submission = pd.DataFrame({ 'id': test['id'], 'target': prediccion })
submission.to_csv("SUBMITS/007_G34_XGBoost.csv", index=False)