# Análisis superficial del csv de auctions
### En este notebook exploramos los registros del csv con el objetivo de buscar relaciones entre las distintas variables

In [1]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

## Leemos el csv y establecemos el tipo de dato de cada columna

In [2]:
# Load the auctions csv: identifier columns are read as categoricals and
# 'date' is parsed as a timestamp. 'device_id' is renamed to 'ref_hash', and
# the calendar day and hour of each auction are derived from 'date'.
auctions = (
    pd.read_csv(
        "../data/auctions.csv",
        dtype={
            'date': 'str',
            'device_id': 'category',
            'ref_type_id': 'category',
            'source_id': 'category',
        },
        parse_dates=['date'],
    )
    .rename(columns={"device_id": "ref_hash"})
    .assign(
        day=lambda frame: frame['date'].dt.date,
        hour=lambda frame: frame['date'].dt.hour,
    )
)
auctions.head(5)

Unnamed: 0,date,ref_hash,ref_type_id,source_id,day,hour
0,2019-04-23 18:58:00.842116,2564673204772915246,1,0,2019-04-23,18
1,2019-04-23 18:58:01.530771,4441121667607578179,7,0,2019-04-23,18
2,2019-04-23 18:58:01.767562,7721769811471055264,1,0,2019-04-23,18
3,2019-04-23 18:58:02.363468,6416039086842158968,1,0,2019-04-23,18
4,2019-04-23 18:58:02.397559,1258642015983312729,1,0,2019-04-23,18


## Elegimos las columnas con menor entropía o pocos valores diferentes y nos quedamos con los 5 valores más comunes

In [3]:
# Columns for which indicator features are built.
grupos = ['ref_type_id', 'source_id', 'hour', 'day']

# For each column, the (at most) five most frequent values, most common first.
top5 = {
    columna: auctions[columna].value_counts().index[:5].tolist()
    for columna in grupos
}
top5

{'day': [datetime.date(2019, 4, 22),
  datetime.date(2019, 4, 18),
  datetime.date(2019, 4, 23),
  datetime.date(2019, 4, 24),
  datetime.date(2019, 4, 25)],
 'hour': [3, 4, 2, 1, 23],
 'ref_type_id': ['1', '7'],
 'source_id': ['1', '0', '3', '7', '6']}

## Correr solo una de las siguientes celdas a la vez: el csv es muy grande y el código está diseñado para procesar una sola ventana

In [4]:
# Time window processed in this run. Both bounds are exclusive (strict
# comparisons), so a record stamped exactly at either bound is left out.
fecha_minima = pd.to_datetime('2019-04-24 00:00:00')
fecha_tope = pd.to_datetime('2019-04-27 00:00:00')

# Keep only the auctions inside the window. The explicit .copy() makes the
# result an independent frame, so the columns added to it afterwards are not
# assignments on a slice of the full dataset (no SettingWithCopyWarning).
auctions = auctions.loc[
    (auctions['date'] > fecha_minima) & (auctions['date'] < fecha_tope)
].copy()

# Window identifier; it becomes part of the exported csv file names.
numero_de_ventana = 7

## Esto sí se corre siempre

In [5]:
# Constant 1 per row, so that summing it per device counts its auctions.
auctions['repeticiones'] = 1

# Frame with no data columns, indexed by the ref_hash of every row.
device_ids = auctions[['ref_hash']].set_index('ref_hash')

# One 0/1 indicator column per (column, frequent value) pair, named
# '<column><value>' (e.g. 'hour3', 'source_id0').
for columna in grupos:
    for valor in top5[columna]:
        coincide = auctions[columna] == valor
        auctions[columna + str(valor)] = coincide.astype('int64')

In [6]:
drop_columns_sc = [
    'hour_sum',
    'hour_mean',
    'hour_std',
    'hour_min',
    'hour_max',
    'repeticiones_sum',
    'repeticiones_mean',
    'repeticiones_std',
    'repeticiones_min',
    'repeticiones_max',
    'ref_type_id1_sum',
    'ref_type_id1_mean',
    'ref_type_id1_std',
    'ref_type_id1_min',
    'ref_type_id1_max',
    'ref_type_id7_sum',
    'ref_type_id7_mean',
    'ref_type_id7_std',
    'ref_type_id7_min',
    'ref_type_id7_max',
    'source_id1_sum',
    'source_id1_mean',
    'source_id1_std',
    'source_id1_min',
    'source_id1_max',
    'source_id0_sum',
    'source_id0_mean',
    'source_id0_std',
    'source_id0_min',
    'source_id0_max',
    'source_id3_sum',
    'source_id3_mean',
    'source_id3_std',
    'source_id3_min',
    'source_id3_max',
    'source_id7_sum',
    'source_id7_mean',
    'source_id7_std',
    'source_id7_min',
    'source_id7_max',
    'source_id6_sum',
    'source_id6_mean',
    'source_id6_std',
    'source_id6_min',
    'source_id6_max',
    'hour3_sum',
    'hour3_mean',
    'hour3_std',
    'hour3_min',
    'hour3_max',
    'hour4_sum',
    'hour4_mean',
    'hour4_std',
    'hour4_min',
    'hour4_max',
    'hour2_sum',
    'hour2_mean',
    'hour2_std',
    'hour2_min',
    'hour2_max',
    'hour1_sum',
    'hour1_mean',
    'hour1_std',
    'hour1_min',
    'hour1_max',
    'hour23_sum',
    'hour23_std',
    'hour23_min',
    'hour23_max',
    'day2019-04-22_sum',
    'day2019-04-22_mean',
    'day2019-04-22_std',
    'day2019-04-22_min',
    'day2019-04-22_max',
    'day2019-04-18_sum',
    'day2019-04-18_std',
    'day2019-04-18_max',
    'day2019-04-23_sum',
    'day2019-04-23_mean',
    'day2019-04-23_std',
    'day2019-04-23_min',
    'day2019-04-23_max',
    'day2019-04-24_sum',
    'day2019-04-24_mean',
    'day2019-04-24_std',
    'day2019-04-24_min',
    'day2019-04-24_max',
    'day2019-04-25_sum',
    'day2019-04-25_mean',
    'day2019-04-25_std',
    'day2019-04-25_min',
    'day2019-04-25_max']

In [7]:
# Aggregated columns to drop for the "st" feature set, written as
# (column prefix, statistics to drop) pairs and expanded in order into
# '<prefix>_<stat>' names.
_st_todas = ('sum', 'mean', 'std', 'min', 'max')
_st_spec = (
    ('repeticiones', ('std',)),
    ('ref_type_id1', ('mean', 'std', 'min')),
    ('ref_type_id7', ('sum', 'std', 'min', 'max')),
    ('source_id1', ('sum', 'mean', 'min', 'max')),
    ('source_id0', _st_todas),
    ('source_id3', _st_todas),
    ('source_id7', _st_todas),
    ('source_id6', ('sum', 'mean', 'std', 'max')),
    ('hour3', ('sum', 'std', 'min', 'max')),
    ('hour4', ('mean',)),
    ('hour2', ('sum', 'mean', 'std', 'min')),
    ('hour1', ('sum', 'mean', 'std', 'min')),
    ('hour23', ('min',)),
    ('day2019-04-22', _st_todas),
    ('day2019-04-18', ('max',)),
    ('day2019-04-23', _st_todas),
    ('day2019-04-24', _st_todas),
    ('day2019-04-25', _st_todas),
)

drop_columns_st = [
    prefijo + '_' + estadistico
    for prefijo, estadisticos in _st_spec
    for estadistico in estadisticos
]

In [8]:
# Statistics computed per device for every numeric column.
features = ['sum', 'mean', 'std', 'min', 'max']

# Aggregate only the numeric columns. The frame also holds a timestamp, a
# date-object column and categoricals, for which 'sum'/'mean'/'std' are not
# defined; older pandas dropped such columns silently, pandas >= 2.0 raises.
columnas_numericas = [
    columna
    for columna in auctions.select_dtypes(include='number').columns
    if columna != 'ref_hash'
]

# observed=True: 'ref_hash' is categorical, so without it every category
# (including devices with no rows in this window) would get an output row.
features_ventana = (
    auctions.groupby('ref_hash', observed=True)[columnas_numericas]
    .agg(features)
    .fillna(0)  # std is NaN for devices with a single auction
)

# Flatten the (column, statistic) MultiIndex into '<column>_<statistic>'.
features_ventana.columns = [
    columna + "_" + estadistico for columna, estadistico in features_ventana.columns
]

# Guard: keep only devices that actually have auctions in the window.
features_ventana = features_ventana.loc[features_ventana['repeticiones_sum'] > 0]

# Export the two feature subsets, tagged with the window number.
features_st = features_ventana.drop(columns=drop_columns_st)
features_st.to_csv('../xgb/features_auctions_gonzalo_st_ventana' + str(numero_de_ventana) + '.csv')
features_sc = features_ventana.drop(columns=drop_columns_sc)
features_sc.to_csv('../xgb/features_auctions_gonzalo_sc_ventana' + str(numero_de_ventana) + '.csv')