In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
import seaborn as sns
import datetime as dat
import matplotlib as mplt
import datetime as dt
import xgboost as xgb
import functools as functools
%matplotlib inline
sns.set()

In [2]:
auctions = pd.read_csv("data/auctions.csv", dtype={'date': 'str','device_id': 'category', 'ref_type_id': 'category','source_id':'category'}, parse_dates = ['date'])
auctions.head(5)

Unnamed: 0,date,device_id,ref_type_id,source_id
0,2019-04-23 18:58:00.842116,2564673204772915246,1,0
1,2019-04-23 18:58:01.530771,4441121667607578179,7,0
2,2019-04-23 18:58:01.767562,7721769811471055264,1,0
3,2019-04-23 18:58:02.363468,6416039086842158968,1,0
4,2019-04-23 18:58:02.397559,1258642015983312729,1,0


In [3]:
auctions.dtypes

date           datetime64[ns]
device_id            category
ref_type_id          category
source_id            category
dtype: object

In [4]:
#Chequeo si hay valores nulos
auctions.isnull().any()

date           False
device_id      False
ref_type_id    False
source_id      False
dtype: bool

In [5]:
#Los ordeno por fecha para mas adelante poder armar las "ventanas"
auctions = auctions.sort_values(by='date')
auctions.head()

Unnamed: 0,date,device_id,ref_type_id,source_id
10129864,2019-04-18 00:00:00.015050,1826643666390887030,7,0
25285906,2019-04-18 00:00:00.029014,7037174172278258682,1,0
3946062,2019-04-18 00:00:00.057540,3392065368947589877,1,1
3946063,2019-04-18 00:00:00.126828,1228982273563226229,1,1
24607726,2019-04-18 00:00:00.132510,4123059034628125459,1,8


In [6]:
#Cuento cantidad de aparaciones hasta la i-esima entrada
auctions['entradas_previas'] = auctions.groupby('device_id').cumcount()
auctions['entradas_previas'].head()

10129864    0
25285906    0
3946062     0
3946063     0
24607726    0
Name: entradas_previas, dtype: int64

In [7]:
#Armo la ventana numero 1
fecha_minima = pd.to_datetime('2019-04-18 00:00:00')
fecha_tope = pd.to_datetime('2019-04-21 00:00:00')

auctions_ventana1 = auctions.loc[auctions['date'] < fecha_tope].copy(deep = False)

#Agrego el target
#la division es para pasar la diferencia de tiempso de timedelta a float de dias
auctions_ventana1['tiempo_hasta_aparicion'] = (auctions_ventana1['date']-fecha_minima)/ np.timedelta64(1, 's')

#Agrego algunos features
auctions_ventana1['std_hour'] = auctions_ventana1['date'].dt.hour
auctions_ventana1['mean_hour'] = auctions_ventana1['date'].dt.hour
auctions_ventana1['std_dia'] = auctions_ventana1['date'].dt.day
auctions_ventana1['mean_dia'] = auctions_ventana1['date'].dt.day

In [8]:
auctions_ventana1_reducido = auctions_ventana1.groupby(['device_id']).agg({'date' : min,\
                                                                         'std_hour' : lambda x: np.std(x),\
                                                                         'mean_hour' : 'mean',\
                                                                         'std_dia' : lambda x: np.std(x),\
                                                                         'mean_dia' : 'mean',\
                                                                         'tiempo_hasta_aparicion' : 'min',\
                                                                         'entradas_previas' : min})\
                                                                            .reset_index('device_id')
#entradas previas no se debe usar para predecir en esta ventana, sino para la siguiente              
auctions_ventana1_reducido=auctions_ventana1_reducido.dropna()
auctions_ventana1_reducido.head()

Unnamed: 0,device_id,date,std_hour,mean_hour,std_dia,mean_dia,tiempo_hasta_aparicion,entradas_previas
1,1000400432115255220,2019-04-19 19:13:44.164198,0.0,19.0,0.0,19.0,155624.164198,0.0
2,1001123163431776865,2019-04-18 00:24:02.033706,5.157447,18.25,0.806548,19.235294,1442.033706,0.0
3,1001144380199556647,2019-04-18 00:32:57.283070,5.108182,16.217949,0.778681,18.782051,1977.28307,0.0
5,1001650136929210538,2019-04-18 00:05:54.783433,7.878987,11.998648,0.726314,18.910751,354.783433,0.0
6,1001893648440705480,2019-04-18 01:38:20.040410,8.492952,10.082192,0.999906,18.986301,5900.04041,0.0


In [9]:
#Armo la ventana numero 2
fecha_minima = pd.to_datetime('2019-04-21 00:00:00')
fecha_tope = pd.to_datetime('2019-04-24 00:00:00')

auctions_ventana2 = auctions.loc[((auctions['date'] > fecha_minima) & (auctions['date'] < fecha_tope) )].copy(deep=False)

#Agrego el target
#la division es para pasar la diferencia de tiempso de timedelta a float de dias
auctions_ventana2['tiempo_hasta_aparicion'] = (auctions_ventana2['date']-fecha_minima)/ np.timedelta64(1, 's')

#Agrego algunos features
auctions_ventana2['std_hour'] = auctions_ventana2['date'].dt.hour
auctions_ventana2['mean_hour'] = auctions_ventana2['date'].dt.hour
auctions_ventana2['std_dia'] = auctions_ventana2['date'].dt.day
auctions_ventana2['mean_dia'] = auctions_ventana2['date'].dt.day

In [10]:
auctions_ventana2_reducido = auctions_ventana2.groupby(['device_id']).agg({'date' : min,\
                                                                         'std_hour' : lambda x: np.std(x),\
                                                                         'mean_hour' : 'mean',\
                                                                         'std_dia' : lambda x: np.std(x),\
                                                                         'mean_dia' : 'mean',\
                                                                         'tiempo_hasta_aparicion' : 'min',\
                                                                         'entradas_previas' : min})\
                                                                            .reset_index('device_id')
#entradas previas no se debe usar para predecir en esta ventana, sino para la siguiente              
auctions_ventana2_reducido=auctions_ventana2_reducido.dropna()
auctions_ventana2_reducido.head()

Unnamed: 0,device_id,date,std_hour,mean_hour,std_dia,mean_dia,tiempo_hasta_aparicion,entradas_previas
0,100033926124811452,2019-04-23 03:10:50.587009,0.0,3.0,0.0,23.0,184250.587009,0.0
1,1000400432115255220,2019-04-21 02:41:03.200096,6.9052,18.102564,0.923077,21.615385,9663.200096,1.0
2,1001123163431776865,2019-04-21 01:41:47.707241,7.428712,10.653061,0.781928,21.795918,6107.707241,68.0
3,1001144380199556647,2019-04-21 15:12:55.733409,7.204915,16.72067,0.83549,22.01676,54775.733409,78.0
4,1001358436431521709,2019-04-21 03:39:07.387349,5.8746,17.193548,0.962351,21.903226,13147.387349,0.0


In [11]:
#Armo la ventana numero 3
fecha_minima = pd.to_datetime('2019-04-22 00:00:00')
fecha_tope = pd.to_datetime('2019-04-25 00:00:00')

auctions_ventana3 = auctions.loc[((auctions['date'] > fecha_minima) & (auctions['date'] < fecha_tope) )].copy(deep=False)

#Agrego el target
#la division es para pasar la diferencia de tiempso de timedelta a float de dias
auctions_ventana3['tiempo_hasta_aparicion'] = (auctions_ventana3['date']-fecha_minima)/ np.timedelta64(1, 's')

#Agrego algunos features
auctions_ventana3['std_hour'] = auctions_ventana3['date'].dt.hour
auctions_ventana3['mean_hour'] = auctions_ventana3['date'].dt.hour
auctions_ventana3['std_dia'] = auctions_ventana3['date'].dt.day
auctions_ventana3['mean_dia'] = auctions_ventana3['date'].dt.day

In [12]:
auctions_ventana3_reducido = auctions_ventana3.groupby(['device_id']).agg({'date' : min,\
                                                                         'std_hour' : lambda x: np.std(x),\
                                                                         'mean_hour' : 'mean',\
                                                                         'std_dia' : lambda x: np.std(x),\
                                                                         'mean_dia' : 'mean',\
                                                                         'tiempo_hasta_aparicion' : 'min',\
                                                                         'entradas_previas' : min})\
                                                                            .reset_index('device_id')
#entradas previas no se debe usar para predecir en esta ventana, sino para la siguiente              
auctions_ventana3_reducido=auctions_ventana3_reducido.dropna()
auctions_ventana3_reducido.head()

Unnamed: 0,device_id,date,std_hour,mean_hour,std_dia,mean_dia,tiempo_hasta_aparicion,entradas_previas
0,100033926124811452,2019-04-23 03:10:50.587009,0.0,3.0,0.0,23.0,97850.587009,0.0
1,1000400432115255220,2019-04-23 06:13:31.087868,7.399024,11.846154,0.266469,23.076923,108811.087868,28.0
2,1001123163431776865,2019-04-22 00:17:35.199447,7.053178,8.621951,0.849817,22.902439,1055.199447,110.0
3,1001144380199556647,2019-04-22 00:02:49.574286,8.10767,15.492188,0.617929,22.65625,169.574286,139.0
4,1001358436431521709,2019-04-22 08:56:22.433293,6.839753,18.466667,0.61554,23.433333,32182.433293,32.0


In [13]:
#Armo la ventana numero 4
fecha_minima = pd.to_datetime('2019-04-23 00:00:00')
fecha_tope = pd.to_datetime('2019-04-26 00:00:00')

auctions_ventana4 = auctions.loc[((auctions['date'] > fecha_minima) & (auctions['date'] < fecha_tope) )].copy(deep=False)

#Agrego el target
#la division es para pasar la diferencia de tiempso de timedelta a float de dias
auctions_ventana4['tiempo_hasta_aparicion'] = (auctions_ventana4['date']-fecha_minima)/ np.timedelta64(1, 's')

#Agrego algunos features
auctions_ventana4['std_hour'] = auctions_ventana4['date'].dt.hour
auctions_ventana4['mean_hour'] = auctions_ventana4['date'].dt.hour
auctions_ventana4['std_dia'] = auctions_ventana4['date'].dt.day
auctions_ventana4['mean_dia'] = auctions_ventana4['date'].dt.day

In [14]:
auctions_ventana4_reducido = auctions_ventana4.groupby(['device_id']).agg({'date' : min,\
                                                                         'std_hour' : lambda x: np.std(x),\
                                                                         'mean_hour' : 'mean',\
                                                                         'std_dia' : lambda x: np.std(x),\
                                                                         'mean_dia' : 'mean',\
                                                                         'tiempo_hasta_aparicion' : 'min',\
                                                                         'entradas_previas' : min})\
                                                                            .reset_index('device_id')
#entradas previas no se debe usar para predecir en esta ventana, sino para la siguiente              
auctions_ventana4_reducido=auctions_ventana4_reducido.dropna()
auctions_ventana4_reducido.head()

Unnamed: 0,device_id,date,std_hour,mean_hour,std_dia,mean_dia,tiempo_hasta_aparicion,entradas_previas
0,100033926124811452,2019-04-23 03:10:50.587009,0.0,3.0,0.0,23.0,11450.587009,0.0
1,1000400432115255220,2019-04-23 06:13:31.087868,6.722374,11.529412,0.848365,23.529412,22411.087868,28.0
2,1001123163431776865,2019-04-23 01:12:10.022592,5.387543,5.522388,0.780984,23.955224,4330.022592,144.0
3,1001144380199556647,2019-04-23 03:14:56.952125,5.01754,19.296703,0.789528,23.483516,11696.952125,193.0
4,1001358436431521709,2019-04-23 02:51:19.590342,9.312562,11.612613,0.813309,24.261261,10279.590342,36.0


In [15]:
#Armo la ventana numero 5
fecha_minima = pd.to_datetime('2019-04-24 00:00:00')
fecha_tope = pd.to_datetime('2019-04-27 00:00:00')

auctions_ventana5 = auctions.loc[((auctions['date'] > fecha_minima) & (auctions['date'] < fecha_tope) )].copy(deep=False)

#Agrego el target
#la division es para pasar la diferencia de tiempso de timedelta a float de dias
auctions_ventana5['tiempo_hasta_aparicion'] = (auctions_ventana5['date']-fecha_minima)/ np.timedelta64(1, 's')

#Agrego algunos features
auctions_ventana5['std_hour'] = auctions_ventana5['date'].dt.hour
auctions_ventana5['mean_hour'] = auctions_ventana5['date'].dt.hour
auctions_ventana5['std_dia'] = auctions_ventana5['date'].dt.day
auctions_ventana5['mean_dia'] = auctions_ventana5['date'].dt.day

In [16]:
auctions_ventana5_reducido = auctions_ventana5.groupby(['device_id']).agg({'date' : min,\
                                                                         'std_hour' : lambda x: np.std(x),\
                                                                         'mean_hour' : 'mean',\
                                                                         'std_dia' : lambda x: np.std(x),\
                                                                         'mean_dia' : 'mean',\
                                                                         'tiempo_hasta_aparicion' : 'min',\
                                                                         'entradas_previas' : min})\
                                                                            .reset_index('device_id')
#entradas previas no se debe usar para predecir en esta ventana, sino para la siguiente              
auctions_ventana5_reducido=auctions_ventana5_reducido.dropna()
auctions_ventana5_reducido.head()

Unnamed: 0,device_id,date,std_hour,mean_hour,std_dia,mean_dia,tiempo_hasta_aparicion,entradas_previas
1,1000400432115255220,2019-04-24 22:18:41.231385,5.6,12.8,0.4,24.8,80321.231385,40.0
2,1001123163431776865,2019-04-24 01:34:20.785346,5.023255,5.323944,0.8558,25.0,5660.785346,166.0
3,1001144380199556647,2019-04-24 02:52:27.414060,3.573084,16.9375,0.77027,25.229167,10347.41406,257.0
4,1001358436431521709,2019-04-24 01:08:48.217577,9.105351,11.378151,0.732588,25.033613,4128.217577,62.0
5,1001650136929210538,2019-04-24 00:22:27.574516,8.269658,11.987402,0.834854,25.047244,1347.574516,2138.0


In [17]:
auctions_ventana2_reducido.head()

Unnamed: 0,device_id,date,std_hour,mean_hour,std_dia,mean_dia,tiempo_hasta_aparicion,entradas_previas
0,100033926124811452,2019-04-23 03:10:50.587009,0.0,3.0,0.0,23.0,184250.587009,0.0
1,1000400432115255220,2019-04-21 02:41:03.200096,6.9052,18.102564,0.923077,21.615385,9663.200096,1.0
2,1001123163431776865,2019-04-21 01:41:47.707241,7.428712,10.653061,0.781928,21.795918,6107.707241,68.0
3,1001144380199556647,2019-04-21 15:12:55.733409,7.204915,16.72067,0.83549,22.01676,54775.733409,78.0
4,1001358436431521709,2019-04-21 03:39:07.387349,5.8746,17.193548,0.962351,21.903226,13147.387349,0.0


In [18]:
competencia = pd.read_csv("data/target_competencia_ids.csv", dtype={'ref_hash': 'category'}) 

In [19]:
auctions_ventana5_reducido.head()

Unnamed: 0,device_id,date,std_hour,mean_hour,std_dia,mean_dia,tiempo_hasta_aparicion,entradas_previas
1,1000400432115255220,2019-04-24 22:18:41.231385,5.6,12.8,0.4,24.8,80321.231385,40.0
2,1001123163431776865,2019-04-24 01:34:20.785346,5.023255,5.323944,0.8558,25.0,5660.785346,166.0
3,1001144380199556647,2019-04-24 02:52:27.414060,3.573084,16.9375,0.77027,25.229167,10347.41406,257.0
4,1001358436431521709,2019-04-24 01:08:48.217577,9.105351,11.378151,0.732588,25.033613,4128.217577,62.0
5,1001650136929210538,2019-04-24 00:22:27.574516,8.269658,11.987402,0.834854,25.047244,1347.574516,2138.0


In [20]:
competencia[['ref_hash','prediccion']] = competencia['ref_hash'].str.split('_',expand=True)
competencia.head()

Unnamed: 0,ref_hash,obj,prediccion
0,1000169251625791246,0,sc
1,1000169251625791246,0,st
2,1000395625957344683,0,sc
3,1000395625957344683,0,st
4,1003027494996471685,0,sc


In [21]:
competencia = competencia.loc[competencia['prediccion'] =='sc']

In [22]:
competencia.head()

Unnamed: 0,ref_hash,obj,prediccion
0,1000169251625791246,0,sc
2,1000395625957344683,0,sc
4,1003027494996471685,0,sc
6,1006670001679961544,0,sc
8,1007573308966476713,0,sc


In [23]:
auctions_ventana5_reducido = auctions_ventana5_reducido.merge(competencia,how='right',left_on='device_id',right_on='ref_hash')

In [24]:
auctions_ventana5_reducido.head()

Unnamed: 0,device_id,date,std_hour,mean_hour,std_dia,mean_dia,tiempo_hasta_aparicion,entradas_previas,ref_hash,obj,prediccion
0,1026825193119998092,2019-04-24 06:20:42.709101,0.0,6.0,0.0,24.0,22842.709101,4.0,1026825193119998092,0,sc
1,1032938399279723623,2019-04-24 02:58:32.169083,2.650143,4.0,0.694567,25.511628,10712.169083,113.0,1032938399279723623,0,sc
2,1071701264087131067,2019-04-24 00:00:23.704470,9.993208,7.443478,0.870608,25.182609,23.70447,107.0,1071701264087131067,0,sc
3,1083324672128707147,2019-04-24 04:52:51.179495,7.13115,12.681818,0.65081,24.409091,17571.179495,50.0,1083324672128707147,0,sc
4,109389620877102250,2019-04-24 04:28:55.803587,7.320574,12.129482,0.528009,24.707171,16135.803587,373.0,109389620877102250,0,sc


In [25]:
auctions_ventana5_reducido['tiempo_hasta_aparicion'] = auctions_ventana5_reducido['tiempo_hasta_aparicion'].fillna(259200)#3 dias en segundos

In [26]:
auctions_ventana5_reducido.head()

Unnamed: 0,device_id,date,std_hour,mean_hour,std_dia,mean_dia,tiempo_hasta_aparicion,entradas_previas,ref_hash,obj,prediccion
0,1026825193119998092,2019-04-24 06:20:42.709101,0.0,6.0,0.0,24.0,22842.709101,4.0,1026825193119998092,0,sc
1,1032938399279723623,2019-04-24 02:58:32.169083,2.650143,4.0,0.694567,25.511628,10712.169083,113.0,1032938399279723623,0,sc
2,1071701264087131067,2019-04-24 00:00:23.704470,9.993208,7.443478,0.870608,25.182609,23.70447,107.0,1071701264087131067,0,sc
3,1083324672128707147,2019-04-24 04:52:51.179495,7.13115,12.681818,0.65081,24.409091,17571.179495,50.0,1083324672128707147,0,sc
4,109389620877102250,2019-04-24 04:28:55.803587,7.320574,12.129482,0.528009,24.707171,16135.803587,373.0,109389620877102250,0,sc


In [27]:
auctions_ventana5_reducido = auctions_ventana5_reducido.drop(['date','obj','prediccion'],axis=1)

In [28]:
auctions_ventana5_reducido['device_id'] = auctions_ventana5_reducido['device_id'].fillna(auctions_ventana5_reducido['ref_hash'])

In [29]:
auctions_ventana5_reducido = auctions_ventana5_reducido.set_index('device_id')

In [30]:
#target

auctions_ventana5_reducido = auctions_ventana5_reducido.drop(['ref_hash'],axis=1)
auctions_ventana5_reducido.head()

Unnamed: 0_level_0,std_hour,mean_hour,std_dia,mean_dia,tiempo_hasta_aparicion,entradas_previas
device_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1026825193119998092,0.0,6.0,0.0,24.0,22842.709101,4.0
1032938399279723623,2.650143,4.0,0.694567,25.511628,10712.169083,113.0
1071701264087131067,9.993208,7.443478,0.870608,25.182609,23.70447,107.0
1083324672128707147,7.13115,12.681818,0.65081,24.409091,17571.179495,50.0
109389620877102250,7.320574,12.129482,0.528009,24.707171,16135.803587,373.0


In [31]:

auctions_ventana2_reducido.head()

Unnamed: 0,device_id,date,std_hour,mean_hour,std_dia,mean_dia,tiempo_hasta_aparicion,entradas_previas
0,100033926124811452,2019-04-23 03:10:50.587009,0.0,3.0,0.0,23.0,184250.587009,0.0
1,1000400432115255220,2019-04-21 02:41:03.200096,6.9052,18.102564,0.923077,21.615385,9663.200096,1.0
2,1001123163431776865,2019-04-21 01:41:47.707241,7.428712,10.653061,0.781928,21.795918,6107.707241,68.0
3,1001144380199556647,2019-04-21 15:12:55.733409,7.204915,16.72067,0.83549,22.01676,54775.733409,78.0
4,1001358436431521709,2019-04-21 03:39:07.387349,5.8746,17.193548,0.962351,21.903226,13147.387349,0.0


In [32]:
auctions_ventana2_reducido = auctions_ventana2_reducido.drop(['date'],axis=1)
auctions_ventana2_reducido.head()

Unnamed: 0,device_id,std_hour,mean_hour,std_dia,mean_dia,tiempo_hasta_aparicion,entradas_previas
0,100033926124811452,0.0,3.0,0.0,23.0,184250.587009,0.0
1,1000400432115255220,6.9052,18.102564,0.923077,21.615385,9663.200096,1.0
2,1001123163431776865,7.428712,10.653061,0.781928,21.795918,6107.707241,68.0
3,1001144380199556647,7.204915,16.72067,0.83549,22.01676,54775.733409,78.0
4,1001358436431521709,5.8746,17.193548,0.962351,21.903226,13147.387349,0.0


In [33]:
#PARA TRAINEAR

columnsTitles = ['device_id','std_hour','mean_hour','std_dia','mean_dia','entradas_previas','tiempo_hasta_aparicion']
auctions_ventana2_reducido = auctions_ventana2_reducido.reindex(columns=columnsTitles)
auctions_ventana2_reducido = auctions_ventana2_reducido.set_index('device_id')
train_features = auctions_ventana2_reducido.drop(['tiempo_hasta_aparicion'],axis=1)
train_features.reset_index().to_csv('xgb/train_features_prediccion_auctions.csv')
train_labels = auctions_ventana2_reducido.drop(['entradas_previas','mean_hour','std_hour','std_dia','mean_dia'],axis=1)
train_labels.reset_index().to_csv('xgb/train_labels.csv')
print(train_features.head())
print(train_labels.head())
auctions_ventana2_reducido.head()

                     std_hour  mean_hour   std_dia   mean_dia  \
device_id                                                       
100033926124811452   0.000000   3.000000  0.000000  23.000000   
1000400432115255220  6.905200  18.102564  0.923077  21.615385   
1001123163431776865  7.428712  10.653061  0.781928  21.795918   
1001144380199556647  7.204915  16.720670  0.835490  22.016760   
1001358436431521709  5.874600  17.193548  0.962351  21.903226   

                     entradas_previas  
device_id                              
100033926124811452                0.0  
1000400432115255220               1.0  
1001123163431776865              68.0  
1001144380199556647              78.0  
1001358436431521709               0.0  
                     tiempo_hasta_aparicion
device_id                                  
100033926124811452            184250.587009
1000400432115255220             9663.200096
1001123163431776865             6107.707241
1001144380199556647            54775.733409


Unnamed: 0_level_0,std_hour,mean_hour,std_dia,mean_dia,entradas_previas,tiempo_hasta_aparicion
device_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
100033926124811452,0.0,3.0,0.0,23.0,0.0,184250.587009
1000400432115255220,6.9052,18.102564,0.923077,21.615385,1.0,9663.200096
1001123163431776865,7.428712,10.653061,0.781928,21.795918,68.0,6107.707241
1001144380199556647,7.204915,16.72067,0.83549,22.01676,78.0,54775.733409
1001358436431521709,5.8746,17.193548,0.962351,21.903226,0.0,13147.387349


In [34]:
#PARA TESTEAR

columnsTitles = ['device_id','std_hour','mean_hour','std_dia','mean_dia','entradas_previas','tiempo_hasta_aparicion']
auctions_ventana5_reducido = auctions_ventana5_reducido.reindex(columns=columnsTitles)
test_features = auctions_ventana2_reducido.drop(['tiempo_hasta_aparicion'],axis=1)
test_features.reset_index().to_csv('xgb/test_features_prediccion_auctions.csv')
test_labels = auctions_ventana2_reducido.drop(['entradas_previas','mean_hour','std_hour','std_dia','mean_dia'],axis=1)
test_labels.reset_index().to_csv('xgb/test_labels.csv')
print(test_features.head())
print(test_labels.head())
auctions_ventana5_reducido.head()

                     std_hour  mean_hour   std_dia   mean_dia  \
device_id                                                       
100033926124811452   0.000000   3.000000  0.000000  23.000000   
1000400432115255220  6.905200  18.102564  0.923077  21.615385   
1001123163431776865  7.428712  10.653061  0.781928  21.795918   
1001144380199556647  7.204915  16.720670  0.835490  22.016760   
1001358436431521709  5.874600  17.193548  0.962351  21.903226   

                     entradas_previas  
device_id                              
100033926124811452                0.0  
1000400432115255220               1.0  
1001123163431776865              68.0  
1001144380199556647              78.0  
1001358436431521709               0.0  
                     tiempo_hasta_aparicion
device_id                                  
100033926124811452            184250.587009
1000400432115255220             9663.200096
1001123163431776865             6107.707241
1001144380199556647            54775.733409


Unnamed: 0_level_0,device_id,std_hour,mean_hour,std_dia,mean_dia,entradas_previas,tiempo_hasta_aparicion
device_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1026825193119998092,,0.0,6.0,0.0,24.0,4.0,22842.709101
1032938399279723623,,2.650143,4.0,0.694567,25.511628,113.0,10712.169083
1071701264087131067,,9.993208,7.443478,0.870608,25.182609,107.0,23.70447
1083324672128707147,,7.13115,12.681818,0.65081,24.409091,50.0,17571.179495
109389620877102250,,7.320574,12.129482,0.528009,24.707171,373.0,16135.803587


In [35]:
xg_reg = xgb.XGBRegressor(objective ='reg:linear', 
                colsample_bytree = 0.3, learning_rate = 0.1,
                max_depth = 5, alpha = 10, n_estimators = 10)

In [36]:
xg_reg.fit(train_features,train_labels)

XGBRegressor(alpha=10, base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.3, gamma=0, importance_type='gain',
       learning_rate=0.1, max_delta_step=0, max_depth=5,
       min_child_weight=1, missing=None, n_estimators=10, n_jobs=1,
       nthread=None, objective='reg:linear', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=True,
       subsample=1)

In [37]:
predicted_labels = xg_reg.predict(test_features)

In [38]:
rmse = np.sqrt(mean_squared_error(test_labels, predicted_labels))
print("RMSE: %f" % (rmse))

RMSE: 63975.950418
