# Análisis superficial del csv de auctions
### En este notebook exploramos los registros del csv con el objetivo de buscar relaciones entre las distintas variables

In [1]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

## Leemos el csv y establecemos el tipo de dato de cada columna

In [2]:
# Load the raw auctions log. The three id columns are read as pandas
# categoricals; 'date' is declared as a string and converted by parse_dates.
auctions = pd.read_csv(
    "../data/auctions.csv",
    dtype=dict(
        date='str',
        device_id='category',
        ref_type_id='category',
        source_id='category',
    ),
    parse_dates=['date'],
)
# Preview the first rows (head() defaults to 5 rows).
auctions.head()

Unnamed: 0,date,device_id,ref_type_id,source_id
0,2019-04-23 18:58:00.842116,2564673204772915246,1,0
1,2019-04-23 18:58:01.530771,4441121667607578179,7,0
2,2019-04-23 18:58:01.767562,7721769811471055264,1,0
3,2019-04-23 18:58:02.363468,6416039086842158968,1,0
4,2019-04-23 18:58:02.397559,1258642015983312729,1,0


In [3]:
# Derive time-based columns from the parsed auction timestamp.
timestamps = auctions['date']
auctions['day'] = timestamps.dt.date

# Despite its name, 'hour' holds the whole seconds elapsed since midnight.
seconds_since_midnight = (
    timestamps.dt.hour * 3600
    + timestamps.dt.minute * 60
    + timestamps.dt.second
)
auctions['hour'] = seconds_since_midnight

# Starts as a plain copy of the timestamp; it is turned into an offset in
# seconds once a time window has been selected.
auctions['tiempo_desde_inicio_de_ventana'] = timestamps

# Expose the device id under the name 'ref_hash'.
auctions.rename(columns={"device_id": "ref_hash"}, inplace=True)

## Elegimos las columnas con menor entropía o pocos valores diferentes y nos quedamos con los 5 valores más comunes (actualmente desactivado: la celda solo descarta `ref_type_id` y `source_id`)

In [4]:
# Disabled experiment, kept for reference: collect the 5 most frequent values
# of each low-cardinality column.
#grupos = ['ref_type_id', 'source_id', 'hour', 'day']
#top5 = {}
#for i in grupos:
#    top5[i]= auctions[i].value_counts().head(5).index.tolist()
#top5

# Discard the two id columns that are not used as features.
# errors='ignore' keeps the cell re-runnable: without it, executing the cell a
# second time raises KeyError because the columns are already gone.
auctions = auctions.drop(columns=['ref_type_id', 'source_id'], errors='ignore')

## Solo correr uno de los siguientes a la vez: es muy grande, y el código está diseñado para realizar solo uno

In [8]:
# Boundaries of the time window to process.
fecha_minima = pd.to_datetime('2019-04-24 00:00:00')
fecha_tope = pd.to_datetime('2019-04-27 00:00:00')

# Half-open interval [fecha_minima, fecha_tope): the lower bound is inclusive
# so an auction stamped exactly at the window start is not dropped, while the
# upper bound stays exclusive.
en_ventana = (auctions['date'] >= fecha_minima) & (auctions['date'] < fecha_tope)
# .copy() so the column assignments below do not touch the full `auctions`.
auctions_ventana = auctions.loc[en_ventana].copy()

# Convert the timestamp copy into seconds elapsed since the window start.
auctions_ventana['tiempo_desde_inicio_de_ventana'] = (
    auctions_ventana['tiempo_desde_inicio_de_ventana'] - fecha_minima
) / np.timedelta64(1, 's')

# Window identifier; it becomes part of the exported features file name.
numero_de_ventana = 7

In [9]:
# Constant column of ones: summing it per ref_hash yields the number of
# auctions of each device inside the window.
auctions_ventana['repeticiones'] = 1

# Column-less frame indexed by the ref_hash of every auction in the window.
device_ids = auctions_ventana[['ref_hash']].set_index('ref_hash')

# Disabled experiments, kept for reference:
#auctions = auctions.sort_values(by = 'date')
#auctions = auctions.sort_values(by = 'ref_hash')
#auctions = pd.get_dummies(auctions, columns = grupos)
#for column in grupos:
#    for value in top5[column]:
#        auctions[column + str(value)] = (auctions[column] == value)*1

## Esto sí se corre siempre

In [10]:
# Statistics computed per device for every numeric column.
features = ['sum', 'mean', 'std', 'min', 'max']

# Aggregate the numeric columns explicitly. The frame also carries 'date'
# (datetime) and 'day' (date objects), which cannot be summed or averaged:
# older pandas silently dropped such columns, newer releases raise instead.
columnas_numericas = ['hour', 'tiempo_desde_inicio_de_ventana', 'repeticiones']

# observed=True: 'ref_hash' is categorical, so without it the groupby emits a
# row for every device category of the full dataset, not only for the devices
# present in this window.
# fillna(0) mainly covers 'std', which is NaN for devices with one auction.
features_ventana = (
    auctions_ventana
    .groupby('ref_hash', observed=True)[columnas_numericas]
    .agg(features)
    .fillna(0)
)

# Flatten the (column, statistic) MultiIndex into names such as 'hour_sum'.
level0 = features_ventana.columns.get_level_values(0)
level1 = features_ventana.columns.get_level_values(1)
features_ventana.columns = level0 + "_" + level1

# Guard: keep only devices with at least one auction in the window.
features_ventana = features_ventana.loc[features_ventana['repeticiones_sum'] > 0]

# Export the per-device features of this window and preview them.
features_ventana.to_csv('../xgb/features_auctions_lucas_ventana' + str(numero_de_ventana) + '.csv')
features_ventana.head()

Unnamed: 0_level_0,hour_sum,hour_mean,hour_std,hour_min,hour_max,tiempo_desde_inicio_de_ventana_sum,tiempo_desde_inicio_de_ventana_mean,tiempo_desde_inicio_de_ventana_std,tiempo_desde_inicio_de_ventana_min,tiempo_desde_inicio_de_ventana_max,repeticiones_sum,repeticiones_mean,repeticiones_std,repeticiones_min,repeticiones_max
ref_hash,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1000400432115255220,242173,48434.6,22387.255243,23966.0,80321.0,587775.9,117555.184124,24833.274298,80321.231385,140229.094239,5,1.0,0.0,1.0,1.0
1001123163431776865,1475353,20779.619718,18147.67075,3754.0,60816.0,7609789.0,107180.122329,75388.991698,5660.785346,218900.053176,71,1.0,0.0,1.0,1.0
1001144380199556647,3024623,63012.979167,12879.951484,10347.0,80833.0,8122249.0,169213.516468,71041.513542,10347.41406,251430.589605,48,1.0,0.0,1.0,1.0
1001358436431521709,5082546,42710.470588,33086.182284,3.0,86142.0,15709800.0,132015.153008,66451.390274,4128.217577,252718.82138,119,1.0,0.0,1.0,1.0
1001650136929210538,28596702,45034.176378,29934.03209,30.0,86292.0,86053030.0,135516.57646,67668.524343,1347.574516,255028.872025,635,1.0,0.0,1.0,1.0
