# Imports

In [1]:
import pandas as pd
import numpy as np

---

In [2]:
# Columns to read from the events CSV; every other column is skipped.
columnas_cargar = [
    "date",
    "event_id",
    "ref_type",
    "ref_hash",
    "application_id",
    "attributed",
    "session_user_agent",
    "kind",
    "wifi",
]
# Explicit dtypes for part of those columns; the remaining ones
# (ref_type, session_user_agent, kind) are left to pandas' inference.
tipos = dict(
    event_id="int16",
    ref_hash="int64",
    application_id="int16",
    attributed="bool",
    wifi="bool",
)

events = pd.read_csv(
    "../../../../data/tp2/events.csv",
    usecols=columnas_cargar,
    dtype=tipos,
    parse_dates=["date"],
)


### Formateo el sistema operativo

In [3]:
# Binary OS flag: 1 when ref_type equals the hash below, 0 otherwise
# (the notebook labels 1 as Android and 0 as iOS).
events["os"] = events["ref_type"].eq(1891515180541284343).astype("int8")

In [4]:
# ref_type is now encoded in the `os` column, so the raw hash column is dropped.
events.drop(columns=["ref_type"], inplace=True)

In [5]:
events["os"].value_counts()

1    6421584
0    1322997
Name: os, dtype: int64

### 1 ---> ANDROID 
### 0 ---> IOS

In [6]:
# Order events chronologically. A stable sort is requested explicitly because
# several rows share the exact same timestamp and later cells keep "the first row"
# per device; with the default (unstable) sort the relative order of those tied
# rows is not guaranteed, whereas mergesort preserves their original file order.
events.sort_values(by="date", kind="mergesort", inplace=True)

In [7]:
# Day of the week of each event (Monday=0 ... Sunday=6), stored compactly as int8.
events["dayofweek"] = events["date"].dt.weekday.astype("int8")

In [8]:
def armar_ventanas(dataset, col_fecha, dia_inicio=18, cantidad=5, largo=3):
    """Split ``dataset`` into overlapping sliding windows of consecutive days.

    Window ``i`` (0-based) keeps the rows whose day of month lies in
    ``[dia_inicio + i, dia_inicio + i + largo - 1]`` (both ends inclusive).
    With the defaults this yields the five windows 18-20, 19-21, ..., 22-24.

    Only the day of month is compared; month and year are ignored, so the
    data is expected to lie within a single calendar month.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame containing a datetime column.
    col_fecha : str
        Name of the datetime column used to build the windows.
    dia_inicio : int, default 18
        Day of month on which the first window starts.
    cantidad : int, default 5
        Number of windows to build; each one starts a day after the previous.
    largo : int, default 3
        Length of every window, in days.

    Returns
    -------
    list of pandas.DataFrame
        One frame per window. Each frame is an independent copy, so adding
        columns to a window neither touches ``dataset`` nor raises
        ``SettingWithCopyWarning``.
    """
    # The day of month is the same for every window, so compute it only once.
    dias = dataset[col_fecha].dt.day

    ventanas = []
    for desplazamiento in range(cantidad):
        primer_dia = dia_inicio + desplazamiento
        ultimo_dia = primer_dia + largo - 1
        en_ventana = (primer_dia <= dias) & (dias <= ultimo_dia)
        ventanas.append(dataset.loc[en_ventana].copy())
    return ventanas

In [9]:
def target_ventanas(ventanas):
    """Build one empty, device-indexed frame per window.

    For each window the distinct ``ref_hash`` values are taken in order of
    first appearance and become the index (named ``device_id``) of a frame
    without columns, ready to receive features through index-based merges.

    Parameters
    ----------
    ventanas : iterable of pandas.DataFrame
        Windows, each holding a ``ref_hash`` column.

    Returns
    -------
    list of pandas.DataFrame
        One frame per window, in the same order as ``ventanas``.
    """
    return [
        ventana[["ref_hash"]]
        .drop_duplicates()
        .rename(columns={"ref_hash": "device_id"})
        .set_index("device_id")
        for ventana in ventanas
    ]

In [10]:
ventanas = armar_ventanas(events,"date")

In [11]:
targets = target_ventanas(ventanas)

## Comienzo a trabajar con la primera ventana

In [12]:
ventana1 = ventanas[0]

In [13]:
entrenamiento1 = targets[0]

In [14]:
ventana1.head()

Unnamed: 0,date,event_id,ref_hash,application_id,attributed,session_user_agent,kind,wifi,os,dayofweek
170092,2019-04-18 00:00:00.027,13,4314242242878368557,65,False,3.819516e+18,4.853173e+18,True,1,3
947821,2019-04-18 00:00:00.071,158,2068670382964280534,230,False,3.819516e+18,8.370513e+18,True,0,3
1724915,2019-04-18 00:00:00.102,15,726218951290512705,65,False,3.819516e+18,5.500848e+18,True,1,3
2794789,2019-04-18 00:00:00.309,0,1329210878469536477,263,False,3.819516e+18,5.882882e+18,False,1,3
5526010,2019-04-18 00:00:00.486,2,5661732594213028190,210,False,3.819516e+18,6.168309e+18,False,1,3


## Promedio de eventos por día

In [15]:
# Events per (device, weekday), then the mean of those counts per device.
# Only weekdays on which the device has at least one event contribute to the mean.
eventos_por_dia = ventana1.groupby(["ref_hash", "dayofweek"]).size()
feature = eventos_por_dia.groupby("ref_hash").mean().to_frame("promedio_eventos")

In [16]:
# Store the mean as an integer; the cast to int64 discards the fractional part.
feature = feature.astype({"promedio_eventos": "int64"})

In [17]:
# Attach the feature by device id (index-to-index); devices without a value get NaN.
entrenamiento1 = pd.merge(
    entrenamiento1,
    feature,
    how="left",
    left_index=True,
    right_index=True,
)

## Evento más frecuente

In [18]:
# Number of occurrences of every (device, event) pair; the single column is named 0.
conteo_por_evento = ventana1.groupby(["ref_hash", "event_id"]).size()
feature = conteo_por_evento.to_frame()
                  

In [19]:
# Keep, for every device, the (device, event) rows whose count equals that
# device's maximum count; ties produce more than one row per device.
# The aggregation is passed by name ("max") instead of the Python builtin:
# recent pandas versions deprecate builtin callables in groupby transform
# and will eventually call them directly instead of using the optimized path.
idx = feature.groupby("ref_hash")[0].transform("max") == feature[0]
feature = feature[idx]

In [20]:
# Give the count column (named 0 so far) a descriptive name.
feature = feature.rename(columns={0: "cant_ocurrencias"})

In [21]:
feature.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,cant_ocurrencias
ref_hash,event_id,Unnamed: 2_level_1
41863526108385,1,52
161514654074162,23,4
186034136943920,1,10
360710529886978,23,2
365882020742330,15,41


### ¿Hay dispositivos que tengan más de un evento empatado como el más frecuente?

In [25]:
((feature.groupby(["ref_hash"]).count()["cant_ocurrencias"]) > 1).sum()

27801

### ¿Cuál es el máximo número de eventos empatados como más frecuentes por dispositivo?

In [26]:
(feature.groupby(["ref_hash"]).count()).max()

cant_ocurrencias    10
dtype: int64

### ¿Qué criterio se toma para decidir con cuál evento quedarse?

#### Voy a crear una lista de eventos ordenada por la cantidad de dispositivos en los que aparecen, contando cada dispositivo una sola vez por evento para que un dispositivo que repite 10000 veces el mismo evento no desbalancee el conteo. Esto podría dar un orden de importancia entre eventos

In [203]:
# Number of distinct devices on which each event appears (every device counts
# once per event), sorted from the most to the least widespread event.
pares_unicos = ventana1[["ref_hash", "event_id"]].drop_duplicates()
top_eventos = pares_unicos.groupby("event_id").size().sort_values(ascending=False)

In [204]:
top_eventos.head()

event_id
23    63062
15    45078
7     32720
0     25747
2     20475
dtype: int64

In [202]:
# Keep only the event ids, preserving the ranking order of the Series.
top_eventos = top_eventos.index.tolist()

## Último evento

In [75]:
# Select, for every device, the rows stamped with that device's latest date;
# a device can keep several rows when they share the same timestamp.
# "max" is passed by name instead of the Python builtin: recent pandas versions
# deprecate builtin callables in groupby transform.
idx = ventana1.groupby("ref_hash")["date"].transform("max") == ventana1["date"]
feature = ventana1[idx]

In [76]:
len(feature)

166018

In [77]:
feature["ref_hash"].nunique()

166002

### Hay dispositivos con más de una fila en su última fecha: comparten exactamente el mismo instante, pero difieren en el evento y en el kind, por ejemplo:

In [64]:
feature[feature["ref_hash"] == 2831641632794815413]

Unnamed: 0,date,event_id,ref_hash,application_id,attributed,session_user_agent,kind,wifi,os,dayofweek
7488551,2019-04-20 06:55:29.193,109,2831641632794815413,116,False,5.783151e+18,1.020653e+18,False,0,5
7488552,2019-04-20 06:55:29.193,108,2831641632794815413,116,False,5.783151e+18,2.37675e+18,False,0,5


In [65]:
feature.groupby("ref_hash").size().sort_values(ascending = False).head()

ref_hash
2831641632794815413    2
4088908661713804281    2
8328199773971797530    2
3511963389841035668    2
6044545373594806384    2
dtype: int64

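#### Solo me importa el evento, así que me quedo con una única fila por dispositivo: `drop_duplicates` conserva la primera aparición (no es un descarte al azar). Ojo: las filas empatadas pueden tener distinto `event_id`, por lo que el "último evento" depende del orden de las filas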

In [87]:
# Keep the columns of interest and a single row per device; drop_duplicates
# retains the first row found for each ref_hash.
columnas_ultimo = ["ref_hash", "date", "event_id", "application_id"]
feature = feature[columnas_ultimo].drop_duplicates(subset="ref_hash")

In [92]:
# Rename to the feature names used in the training frame.
feature = feature.rename(
    columns={
        "ref_hash": "device_id",
        "event_id": "ult_ev",
        "date": "fecha_ult_ev",
        "application_id": "ult_app",
    }
)

In [94]:
# Index by device so the feature can be merged index-to-index.
feature = feature.set_index("device_id")

In [95]:
feature.head()

Unnamed: 0_level_0,fecha_ult_ev,ult_ev,ult_app
device_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
6444651396290009624,2019-04-18 00:00:02.052,245,21
3601178220858933758,2019-04-18 00:00:07.957,246,21
3233293871630648142,2019-04-18 00:00:09.612,15,65
4452972490570146531,2019-04-18 00:00:12.303,7,122
5451417563610992673,2019-04-18 00:00:19.597,106,121


In [96]:
# Attach the last-event features by device id (index-to-index, left join).
entrenamiento1 = pd.merge(
    entrenamiento1,
    feature,
    how="left",
    left_index=True,
    right_index=True,
)

## ¿Cuál es la hora del día con más actividad para cada usuario?

In [101]:
# ventana1 was obtained by filtering `events`, so assigning a new column to it
# directly raises SettingWithCopyWarning. Detach an explicit copy first and
# add the hour of day (0-23) of each event to that copy.
ventana1 = ventana1.copy()
ventana1["hora"] = ventana1["date"].dt.hour

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [105]:
# Events per (device, hour), then keep for every device the hour(s) whose count
# equals that device's maximum; ties leave more than one hour per device.
# "max" is passed by name instead of the Python builtin: recent pandas versions
# deprecate builtin callables in groupby transform.
feature = ventana1.groupby(["ref_hash", "hora"]).size()
idx = feature.groupby("ref_hash").transform("max") == feature
feature = feature[idx]

In [122]:
# Flatten the (ref_hash, hora) index into columns; the unnamed count column is labelled 0.
feature = feature.reset_index()

In [124]:
len(feature)

193332

In [125]:
feature["ref_hash"].nunique()

166002

### Hay dispositivos que comparten varias horas, habrá que decidir el criterio de elección

In [126]:
feature.head()

Unnamed: 0,ref_hash,hora,0
0,41863526108385,5,64
1,161514654074162,17,8
2,186034136943920,16,13
3,360710529886978,18,2
4,365882020742330,18,8
