### =====================================================================
### IMPORTACIÓN GENERAL DE LA INFORMACIÓN.
### =====================================================================

In [1]:
## IMPORTACIÓN GENERAL DE LIBRERIAS Y VISUALIZACIÓN DE DATOS (matplotlib y seaborn)

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as DT
import warnings
import descartes
import geopandas as gpd
import json
import requests
import geocoder

from sklearn.ensemble import RandomForestRegressor
from shapely.geometry import Point, Polygon
from urllib2 import urlopen

# Notebook-wide display configuration.
# Render matplotlib figures inline in the cell output.
%matplotlib inline
# Suppress every warning for the rest of the session.
warnings.filterwarnings('ignore')
# Reset matplotlib to its default style, then apply seaborn's "whitegrid" theme.
plt.style.use('default')
sns.set(style="whitegrid")
# Default figure size as (width, height).
plt.rcParams['figure.figsize'] = (15, 10)

### =====================================================================
### ALGORITMOS DE MACHINE LEARNING:
### =====================================================================

### =====================================================================
### RANDOM FOREST.
### =====================================================================

In [2]:
# Load the already-processed train and test CSV files.
train, test = pd.read_csv('DATA/train.csv'), pd.read_csv('DATA/test.csv')

In [3]:
# Summarize the training frame: row range, column count, dtypes and memory usage.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 240000 entries, 0 to 239999
Columns: 149 entries, id to 2016
dtypes: float64(12), int64(137)
memory usage: 272.8 MB


In [4]:
# Summarize the test frame: row range, column count, dtypes and memory usage.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60000 entries, 0 to 59999
Columns: 148 entries, id to garage
dtypes: float64(11), int64(137)
memory usage: 67.7 MB


In [5]:
# Split the training data by year: rows from 2016 are held out for validation,
# rows from earlier years are used to fit the model.
train_2016 = train[train['anio'] == 2016]
train_PREV = train[train['anio'] < 2016]

In [6]:
# Target to predict ('precio'), extracted as a numpy array for each split.
labels_2016, labels_PREV = (
    np.array(part['precio']) for part in (train_2016, train_PREV)
)

In [7]:
# Keep only the feature columns: remove the target ('precio') and the column
# used for the split ('anio') from both training partitions.
train_2016 = train_2016.drop(['precio', 'anio'], axis=1)
train_PREV = train_PREV.drop(['precio', 'anio'], axis=1)
# Only 'anio' is removed from the test frame.
test = test.drop('anio', axis=1)

In [8]:
# Names of the feature columns, in the order the model will be trained on.
feature_list = train_PREV.columns.tolist()

In [9]:
# Convert both feature frames to plain numpy arrays (column names are lost here;
# only the positional column order remains).
train_2016, train_PREV = np.array(train_2016), np.array(train_PREV)

In [10]:
# Report the dimensions of the features and targets in each partition.
for caption, data in (
    ('Training Features Shape:', train_PREV),
    ('Training Labels Shape:', labels_PREV),
    ('Testing Features Shape:', train_2016),
    ('Testing Labels Shape:', labels_2016),
):
    print(caption, data.shape)

('Training Features Shape:', (145962, 147))
('Training Labels Shape:', (145962,))
('Testing Features Shape:', (94038, 147))
('Testing Labels Shape:', (94038,))


In [11]:
# Build a random forest regressor with 150 trees and a fixed seed (every other
# hyperparameter is left at the library default), then fit it on the pre-2016
# partition. fit() returns the estimator itself, so it can be chained.
rf = RandomForestRegressor(random_state=75, n_estimators=150).fit(train_PREV, labels_PREV)

In [12]:
# Predict on the held-out 2016 partition.
predictions = rf.predict(train_2016)
# Absolute error of each prediction against the true price.
errors = np.abs(predictions - labels_2016)
# Report the mean absolute error. It is expressed in the same units as the
# 'precio' target, not in degrees ('grados').
print('Error:', round(np.mean(errors), 2), 'unidades de precio.')

('Error:', 870769.69, 'grados.')


In [13]:
# Absolute percentage error of each prediction, relative to the true price.
# NOTE(review): a zero in labels_2016 would make this division yield inf — confirm none exist.
mape = (errors / labels_2016) * 100
# Reported figure is 100 minus the mean absolute percentage error.
accuracy = 100 - mape.mean()
print('Precision:', round(accuracy, 2), '%.')

('Precision:', 67.69, '%.')


In [14]:
# Predict on the test set. The forest was fitted on a positional numpy array,
# so the test columns must first be put in the training column order
# (feature_list). The two CSVs do not share the same column order: the train
# frame ends in '2016' while the test frame ends in 'garage'. Selecting by
# feature_list also raises a KeyError if the test frame lacks a training column,
# instead of silently predicting on misaligned features.
prediccion = rf.predict(test[feature_list])

In [15]:
## =================================================================================================
## BUILD THE KAGGLE SUBMISSION CSV (COLUMNS 'id' AND 'target') FROM THE TEST-SET PREDICTION.
## =================================================================================================
submission = pd.DataFrame(dict(id=test['id'], target=prediccion))
submission.to_csv("SUBMITS/001_G34_RandomForest.csv", index=False)