# Análisis superficial del csv de events
### En este notebook exploramos los registros del csv con el objetivo de buscar relaciones entre las distintas variables

In [1]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

## Leemos el csv y establecemos el tipo de dato de cada columna

In [2]:
# Explicit dtype for every column of events.csv, so pandas does not infer them.
dtypes_events = {
    'date': 'str',
    'event_id': np.int16,
    'ref_type': 'category',
    'ref_hash': 'int64',
    'application_id': np.int16,
    'attributed': 'bool',
    'device_countrycode': 'int64',
    'device_os_version': 'float64',
    'device_brand': 'float64',
    'device_model': 'float64',
    'device_city': 'float64',
    'session_user_agent': 'float64',
    'trans_id': 'str',
    'user_agent': 'float64',
    'event_uuid': 'str',
    'carrier': 'float64',
    'kind': 'float64',
    'device_os': 'category',
    'wifi': np.float16,
    'connection_type': 'str',
    'ip_address': 'int64',
    'device_language': 'float64',
}

# Load the events; 'date' is additionally parsed into a datetime column.
events = pd.read_csv("../data/events.csv", dtype=dtypes_events, parse_dates=['date'])

# Derive the calendar day and the hour of day from the event timestamp.
events['day'] = events['date'].dt.date
events['hour'] = events['date'].dt.hour

# Preview the first five rows.
events.head()

Unnamed: 0,index,date,event_id,ref_type,ref_hash,application_id,attributed,device_countrycode,device_os_version,device_brand,...,event_uuid,carrier,kind,device_os,wifi,connection_type,ip_address,device_language,day,hour
0,2130678,2019-04-20 01:42:49.120,0,1891515180541284343,5857744372586891366,210,False,6287817205707153877,,,...,5b506964-5f47-4b28-a8c2-8a92d6c23379,,5.882882e+18,,0.0,,7544543351571901618,3.301378e+18,2019-04-20,1
1,2130680,2019-04-20 01:42:49.340,1,1891515180541284343,7642521036780133571,210,False,6287817205707153877,,,...,f1fb9d15-1a7b-4116-8d3b-c4c403e197e2,,4.017674e+18,,0.0,,6949523255335024165,,2019-04-20,1
2,2130681,2019-04-20 01:42:49.365,1,1891515180541284343,2548841562898283198,210,False,6287817205707153877,,,...,c85a0b15-a5d7-472e-8116-6bfa3db19687,,4.017674e+18,,0.0,,6428537280982666957,,2019-04-20,1
3,2130684,2019-04-20 01:42:51.438,2,1891515180541284343,609402887625919085,210,False,6287817205707153877,,,...,f4aa0a97-2de6-4f22-95c6-1b3150112cb9,,6.168309e+18,,0.0,,7607371352198017145,,2019-04-20,1
4,2130688,2019-04-20 01:42:51.838,1,1891515180541284343,9114651763556439823,210,False,6287817205707153877,,,...,08e2f7f7-875f-4aa0-b337-b9b87b0d83ea,,4.017674e+18,,0.0,,2901772839007473756,,2019-04-20,1


In [3]:
# Latest event timestamp present in the data.
events['date'].max()

Timestamp('2019-04-26 23:59:59.881000')

In [4]:
# Earliest event timestamp present in the data.
events['date'].min()

Timestamp('2019-04-18 00:00:00.027000')

## Revisamos el tipo de cada columna

In [5]:
# Show the dtype pandas ended up with for each column of `events`.
events.dtypes

index                          int64
date                  datetime64[ns]
event_id                       int16
ref_type                    category
ref_hash                       int64
application_id                 int16
attributed                      bool
device_countrycode             int64
device_os_version            float64
device_brand                 float64
device_model                 float64
device_city                  float64
session_user_agent           float64
trans_id                      object
user_agent                   float64
event_uuid                    object
carrier                      float64
kind                         float64
device_os                   category
wifi                         float16
connection_type               object
ip_address                     int64
device_language              float64
day                           object
hour                           int64
dtype: object

## Elegimos las columnas con menor entropía o pocos valores diferentes y nos quedamos con los 5 valores más comunes

In [6]:
# Columns for which the most frequent values are collected.
grupos = ['event_id', 'ref_type','application_id', 'device_os', 'wifi', 'connection_type', 'hour', 'day', 'attributed']

# Map each column to its (at most) five most frequent values, most common first.
top5 = {
    columna: events[columna].value_counts().index[:5].tolist()
    for columna in grupos
}
top5

{'application_id': [210, 122, 121, 65, 26],
 'attributed': [False, True],
 'connection_type': ['Cable/DSL', 'Cellular', 'Corporate', 'Dialup'],
 'day': [datetime.date(2019, 4, 26),
  datetime.date(2019, 4, 25),
  datetime.date(2019, 4, 24),
  datetime.date(2019, 4, 23),
  datetime.date(2019, 4, 22)],
 'device_os': ['7.531669329342818e+18', '6.941824626260379e+18'],
 'event_id': [1, 15, 23, 2, 115],
 'hour': [0, 2, 1, 23, 22],
 'ref_type': ['1891515180541284343', '1494519392962156891'],
 'wifi': [0.0, 1.0]}

In [7]:
# Order the events by device and, within each device, chronologically.
# A single multi-key sort is used because sorting by 'date' and then by
# 'ref_hash' does not keep the date order: the default sort algorithm is not
# stable, so the first sort would simply be thrown away.
events = events.sort_values(by=['ref_hash', 'date'])

# Constant column of ones: summing it per group counts the rows in the group.
events['repeticiones'] = 1

# Empty frame indexed by the ref_hash of every event (one row per event).
# NOTE(review): not used by the visible cells - confirm whether it is still needed.
device_ids = events['ref_hash'].to_frame().set_index('ref_hash')

# One 0/1 indicator column per (column, top-5 value) pair, named
# '<column><value>' (e.g. 'hour0'). Only the most frequent values collected in
# `top5` get a column, instead of one dummy per distinct value.
for column in grupos:
    for value in top5[column]:
        events[column + str(value)] = (events[column] == value).astype('int64')

In [8]:
# Window 1: events from 2019-04-18 up to (not including) 2019-04-21.
# The interval is half-open, [fecha_minima, fecha_tope): the lower bound is
# inclusive so that an event stamped exactly at midnight is kept in the window
# that starts on that day.
fecha_minima = pd.to_datetime('2019-04-18 00:00:00')
fecha_tope = pd.to_datetime('2019-04-21 00:00:00')
events_ventana1 = events.loc[(events['date'] >= fecha_minima) & (events['date'] < fecha_tope)].copy(deep=False)

In [9]:
# Window 2: events from 2019-04-19 up to (not including) 2019-04-22.
# The interval is half-open, [fecha_minima, fecha_tope): the lower bound is
# inclusive so that an event stamped exactly at midnight is kept in the window
# that starts on that day.
fecha_minima = pd.to_datetime('2019-04-19 00:00:00')
fecha_tope = pd.to_datetime('2019-04-22 00:00:00')
events_ventana2 = events.loc[(events['date'] >= fecha_minima) & (events['date'] < fecha_tope)].copy(deep=False)

In [10]:
# Window 3: events from 2019-04-20 up to (not including) 2019-04-23.
# The interval is half-open, [fecha_minima, fecha_tope): the lower bound is
# inclusive so that an event stamped exactly at midnight is kept in the window
# that starts on that day.
fecha_minima = pd.to_datetime('2019-04-20 00:00:00')
fecha_tope = pd.to_datetime('2019-04-23 00:00:00')
events_ventana3 = events.loc[(events['date'] >= fecha_minima) & (events['date'] < fecha_tope)].copy(deep=False)

In [11]:
# Window 4: events from 2019-04-21 up to (not including) 2019-04-24.
# The interval is half-open, [fecha_minima, fecha_tope): the lower bound is
# inclusive so that an event stamped exactly at midnight is kept in the window
# that starts on that day.
fecha_minima = pd.to_datetime('2019-04-21 00:00:00')
fecha_tope = pd.to_datetime('2019-04-24 00:00:00')
events_ventana4 = events.loc[(events['date'] >= fecha_minima) & (events['date'] < fecha_tope)].copy(deep=False)

In [12]:
# Window 7: events from 2019-04-24 up to (not including) 2019-04-27.
# The interval is half-open, [fecha_minima, fecha_tope): the lower bound is
# inclusive so that an event stamped exactly at midnight is kept in the window
# that starts on that day.
fecha_minima = pd.to_datetime('2019-04-24 00:00:00')
fecha_tope = pd.to_datetime('2019-04-27 00:00:00')
events_ventana7 = events.loc[(events['date'] >= fecha_minima) & (events['date'] < fecha_tope)].copy(deep=False)

In [13]:
# Aggregations computed per device (ref_hash) for every column of a window.
features = ['sum', 'mean', 'std', 'min', 'max']

# Window frames and the number used in each output file name, in matching order.
ventanas = [events_ventana1, events_ventana2, events_ventana3, events_ventana4, events_ventana7]
numero_de_ventanas = [1, 2, 3, 4, 7]

# Pair each window with its own number. Indexing `numero_de_ventanas` with a
# counter that had already been incremented labelled every file with the next
# window's number and ran past the end of the list on the last window.
for numero, ventana in zip(numero_de_ventanas, ventanas):
    # One row per ref_hash; missing aggregates are replaced by 0.
    # NOTE(review): the aggregations are applied to every column of the window,
    # including non-numeric ones - confirm this is intended for the pandas
    # version in use.
    features_ventana = ventana.groupby('ref_hash').agg(features).fillna(0)

    # Flatten the (column, aggregation) MultiIndex into '<column>_<aggregation>'.
    level0 = features_ventana.columns.get_level_values(0)
    level1 = features_ventana.columns.get_level_values(1)
    features_ventana.columns = level0 + "_" + level1

    features_ventana.to_csv('../xgb/features_events_gonzalo_ventana' + str(numero) + '.csv')