# Imports

In [1]:
import pandas as pd
import numpy as np

---

In [16]:
# Only the columns listed here are read from the CSV.
columnas_cargar = [
    "date",
    "event_id",
    "ref_type",
    "ref_hash",
    "application_id",
    "attributed",
    "session_user_agent",
    "kind",
    "wifi",
]
# Explicit dtypes for the columns that get one; the remaining columns are
# left to pandas' type inference.
tipos = dict(
    event_id="int16",
    ref_hash="int64",
    application_id="int16",
    attributed="bool",
    wifi="bool",
)

events = pd.read_csv(
    "../../../../data/tp2/events.csv",
    usecols=columnas_cargar,
    dtype=tipos,
    parse_dates=["date"],
)


### Formateo el sistema operativo

In [17]:
# Binary flag stored as int8: 1 when ref_type equals the id below, 0 otherwise
# (the notebook's notes label 1 as Android and 0 as iOS).
events["os"] = events["ref_type"].eq(1891515180541284343).astype("int8")

In [13]:
# Drop ref_type now that the "os" flag has been derived from it.
# errors="ignore" keeps this cell re-runnable: without it a second execution
# raises KeyError because the column is already gone.
events = events.drop(columns=["ref_type"], errors="ignore")

In [18]:
# Number of rows for each value of the "os" flag.
events["os"].value_counts()

1    6421584
0    1322997
Name: os, dtype: int64

### 1 ---> ANDROID 
### 0 ---> IOS

In [42]:
# Order the events by their "date" column, modifying the frame in place.
events.sort_values(by = "date",inplace = True)

In [62]:
# Day of the week of every event, stored as int8.
events["dayofweek"] = events["date"].dt.dayofweek.astype("int8")

In [26]:
def armar_ventanas(dataset, col_fecha, dia_inicio=18, largo=3, cantidad=5):
    """Split a dataset into overlapping windows of consecutive days.

    Window ``i`` (0-based) keeps the rows whose day of the month lies in
    ``[dia_inicio + i, dia_inicio + i + largo - 1]``, both ends included.
    With the default arguments this yields five windows covering days
    18-20, 19-21, 20-22, 21-23 and 22-24.

    Only the day of the month is compared; month and year are ignored.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame to slice.
    col_fecha : str
        Name of the datetime column used to select the rows.
    dia_inicio : int, default 18
        Day of the month on which the first window starts.
    largo : int, default 3
        Number of days covered by each window.
    cantidad : int, default 5
        Number of windows; each one starts a day after the previous one.

    Returns
    -------
    list of pandas.DataFrame
        One frame per window, in chronological order of their start day.
    """
    # Extract the day once instead of twice per window.
    dias = dataset[col_fecha].dt.day
    return [
        dataset.loc[dias.between(dia_inicio + i, dia_inicio + i + largo - 1)]
        for i in range(cantidad)
    ]

In [70]:
def target_ventanas(ventanas):
    """Build one device-indexed frame per window.

    For every frame in ``ventanas`` the distinct values of its ``ref_hash``
    column are taken, in order of first appearance, and used as the index
    (named ``device_id``) of a new frame without columns.

    Returns a list with one such frame per input window, in the same order.
    """
    return [
        ventana["ref_hash"]
        .drop_duplicates()
        .rename("device_id")
        .to_frame()
        .set_index("device_id")
        for ventana in ventanas
    ]

In [63]:
# Slice events into day windows using its "date" column.
ventanas = armar_ventanas(events,"date")

In [71]:
# One device_id-indexed frame per window.
targets = target_ventanas(ventanas)

## Comienzo a trabajar con la primera ventana

In [64]:
# Event rows belonging to the first window.
ventana1 = ventanas[0]

In [105]:
# Device index of the first window; features are merged onto it further down.
entrenamiento1 = targets[0]

In [86]:
# Preview the first rows of the first window.
ventana1.head()

Unnamed: 0,date,event_id,ref_type,ref_hash,application_id,attributed,session_user_agent,kind,wifi,os,dayofweek
170092,2019-04-18 00:00:00.027,13,1891515180541284343,4314242242878368557,65,False,3.819516e+18,4.853173e+18,True,1,3
947821,2019-04-18 00:00:00.071,158,1494519392962156891,2068670382964280534,230,False,3.819516e+18,8.370513e+18,True,0,3
1724915,2019-04-18 00:00:00.102,15,1891515180541284343,726218951290512705,65,False,3.819516e+18,5.500848e+18,True,1,3
2794789,2019-04-18 00:00:00.309,0,1891515180541284343,1329210878469536477,263,False,3.819516e+18,5.882882e+18,False,1,3
5526010,2019-04-18 00:00:00.486,2,1891515180541284343,5661732594213028190,210,False,3.819516e+18,6.168309e+18,False,1,3


## Promedio de eventos por día

In [100]:
# Count the events of each device per day of the week, then average those
# counts per device (only days on which the device has events take part).
feature = (
    ventana1.groupby(["ref_hash", "dayofweek"]).size()
    .groupby("ref_hash").mean()
    .to_frame(name="promedio_eventos")
)

In [104]:
# Store the per-device average as int64; the cast discards the fractional part.
feature["promedio_eventos"] = feature["promedio_eventos"].astype("int64")

In [108]:
# Left merge on the index: every device already in entrenamiento1 is kept and
# receives the columns of feature.
entrenamiento1 = entrenamiento1.merge(feature, how = 'left',left_index = True,right_index = True)

## Evento más frecuente

In [149]:
# Number of occurrences of each event_id per device; the count ends up in column 0.
feature = ventana1.groupby(["ref_hash","event_id"]).size().to_frame()
                  

In [158]:
# Keep, for every device, only the event_id rows whose count equals that
# device's highest count; tied events are all kept.
# "max" is passed by name because handing the builtin max to transform is
# deprecated in recent pandas releases.
idx = feature.groupby("ref_hash")[0].transform("max") == feature[0]
feature = feature[idx]

### ¿Hay dispositivos con más de un evento empatado como el más frecuente?

In [201]:
# Number of devices that still have more than one row in feature.
feature.groupby("ref_hash")[0].count().gt(1).sum()

27801

### ¿Cuál es el mayor número de eventos empatados como más frecuentes en un mismo dispositivo?

In [196]:
# Largest number of rows that a single device has in feature.
feature.groupby("ref_hash")[0].count().max()

10

### ¿Qué criterio se toma para decidir con cual evento quedarse? 

In [167]:
# Mask of the rows whose count equals their device's highest count.
# "max" is passed by name because handing the builtin max to transform is
# deprecated in recent pandas releases.
idx = feature.groupby("ref_hash")[0].transform("max") == feature[0]

# Per column of feature, the number of devices with more than one row.
(feature.groupby(["ref_hash"]).count() > 1).sum()

0    27801
dtype: int64

### Voy a crear una lista ordenada de los eventos más presentes por dispositivo, contando cada dispositivo una sola vez por evento para no desbalancear si un dispositivo realiza 10000 veces el mismo evento.
### Esto podría dar un orden de importancia entre eventos

In [184]:
# For every event_id, the number of distinct devices that triggered it at
# least once, ordered from the most to the least widespread event.
top_eventos = (
    ventana1[["ref_hash", "event_id"]]
    .drop_duplicates()
    .groupby("event_id")
    .size()
    .sort_values(ascending=False)
)

In [185]:
# First five entries of the descending ranking.
top_eventos.head()

event_id
23    63062
15    45078
7     32720
0     25747
2     20475
dtype: int64

In [189]:
# event_id values in ranking order, as a plain Python list.
top_eventos.index.to_list()

[23,
 15,
 7,
 0,
 2,
 1,
 106,
 101,
 13,
 100,
 245,
 3,
 104,
 14,
 287,
 118,
 39,
 28,
 24,
 120,
 246,
 108,
 115,
 16,
 116,
 105,
 18,
 27,
 288,
 121,
 17,
 363,
 109,
 157,
 26,
 296,
 25,
 110,
 342,
 249,
 19,
 248,
 216,
 341,
 162,
 4,
 247,
 102,
 211,
 103,
 345,
 117,
 119,
 123,
 452,
 364,
 159,
 286,
 163,
 454,
 160,
 122,
 12,
 381,
 158,
 172,
 173,
 453,
 161,
 155,
 64,
 177,
 289,
 124,
 198,
 432,
 171,
 48,
 486,
 129,
 255,
 455,
 218,
 212,
 77,
 184,
 521,
 114,
 346,
 30,
 187,
 217,
 213,
 380,
 180,
 319,
 74,
 66,
 185,
 329,
 164,
 181,
 175,
 431,
 70,
 22,
 67,
 174,
 186,
 555,
 65,
 365,
 21,
 176,
 73,
 63,
 53,
 518,
 126,
 290,
 199,
 76,
 516,
 156,
 71,
 328,
 182,
 170,
 178,
 80,
 165,
 556,
 69,
 250,
 68,
 33,
 522,
 142,
 344,
 113,
 306,
 307,
 503,
 302,
 347,
 81,
 308,
 227,
 183,
 304,
 83,
 241,
 229,
 330,
 506,
 387,
 75,
 189,
 358,
 188,
 82,
 399,
 128,
 433,
 508,
 72,
 398,
 179,
 396,
 531,
 434,
 92,
 517,
 219,
 252,
 78