In [1]:
# IMPORTS
import pandas as pd
import json

In [21]:
# CARGA DE DATOS
installs = pd.read_csv("../../data/installs.csv")

In [22]:
# CARGA DE DESCRIPCION DE COLUMNAS
with open("../../data/desc.json") as json_file:
    data = json.load(json_file)
installs_data = data.get("installs")

In [23]:
def imprimir_info_columna(columna, info):
    print(columna)
    for x, y in info.items():
        print("\t" + x + ": ", end="")
        print(y)
    

# Tipos para cada columna, cantidad de nulos

In [24]:
installs.dtypes

created                object
application_id          int64
ref_type                int64
ref_hash                int64
click_hash            float64
attributed               bool
implicit                 bool
device_countrycode      int64
device_brand          float64
device_model          float64
session_user_agent     object
user_agent             object
event_uuid             object
kind                   object
wifi                   object
trans_id               object
ip_address              int64
device_language       float64
dtype: object

In [34]:
installs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3412 entries, 0 to 3411
Data columns (total 18 columns):
created               3412 non-null datetime64[ns]
application_id        3412 non-null int64
ref_type              3412 non-null int64
ref_hash              3412 non-null int64
click_hash            0 non-null float64
attributed            3412 non-null bool
implicit              3412 non-null bool
device_countrycode    3412 non-null int64
device_brand          1047 non-null float64
device_model          3411 non-null float64
session_user_agent    3364 non-null object
user_agent            1729 non-null object
event_uuid            865 non-null object
kind                  865 non-null object
wifi                  1729 non-null object
trans_id              6 non-null object
ip_address            3412 non-null int64
device_language       3378 non-null float64
dtypes: bool(2), datetime64[ns](1), float64(4), int64(5), object(6)
memory usage: 433.2+ KB


In [44]:
installs.isnull().agg("sum")

created                  0
application_id           0
ref_type                 0
ref_hash                 0
click_hash            3412
attributed               0
implicit                 0
device_countrycode       0
device_brand          2365
device_model             1
session_user_agent      48
user_agent            1683
event_uuid            2547
kind                  2547
wifi                  1683
trans_id              3406
ip_address               0
device_language         34
dtype: int64

# • Para la columna created

In [25]:
imprimir_info_columna("created", installs_data.get("created"))

created
	desc: creation date for the install
	transformation: None


### La convierto a fecha

In [26]:
installs["created"] = pd.to_datetime(installs["created"])

In [59]:
fechas = installs["created"]

In [60]:
fechas.describe()

count                           3412
unique                          3412
top       2019-03-09 04:34:41.291000
freq                               1
first     2019-03-05 00:00:38.219000
last      2019-03-13 23:54:00.526000
Name: created, dtype: object

# • Para la columna application_id

In [28]:
imprimir_info_columna("application_id", installs_data.get("application_id"))

application_id
	desc: internal id for the installed app
	transformation: LabelEncoding


### Comentario: Label Encoding puede confundir a los algoritmos clasificadores

In [65]:
installs["application_id"].value_counts().size

31

### Se podría cargar como categórica, entero chico o incluso con One-Hot

# • Para la columna ref_type

In [30]:
imprimir_info_columna("ref_type", installs_data.get("ref_type"))

ref_type
	desc: either apple_ifa or google_advertising_id
	transformation: string hashing


In [69]:
installs["ref_type"].value_counts().size

2

### Podría cargarse como categórica o One-Hot

# • Para la columna ref_hash

In [45]:
imprimir_info_columna("ref_hash", installs_data.get("ref_hash"))

ref_hash
	desc: device's apple_ifa or google_advertising_id
	transformation: string hashing


In [66]:
installs["ref_hash"].value_counts().size

3008

### Hay 3008 identificadores (dos diferentes podrían corresponder a un mismo dispositivo). One-Hot Encoding ya no parece viable, pero podría reducirse el tamaño si se carga como categórica o entera chica

# • Para la columna click_hash

In [46]:
imprimir_info_columna("click_hash", installs_data.get("click_hash"))

click_hash
	desc: hash for the install (like an id)
	transformation: None


In [47]:
installs["click_hash"].describe()

count    0.0
mean     NaN
std      NaN
min      NaN
25%      NaN
50%      NaN
75%      NaN
max      NaN
Name: click_hash, dtype: float64

### Comentario: Todos los valores son nulos, podría descartarse

# • Para la columna attributed

In [51]:
imprimir_info_columna("attributed", installs_data.get("attributed"))

attributed
	desc: whether the install was attributed to Jampp
	transformation: None


In [52]:
df["attributed"].head()

0    False
1    False
2    False
3    False
4    False
Name: attributed, dtype: bool

# • Para la columna implicit

In [53]:
imprimir_info_columna("implicit", installs_data.get("implicit"))

implicit
	desc: whether the install is implicit (the install was made by a device that has not installed according to the tracking platform)
	transformation: None


# • Para la columna device_countrycode

In [56]:
imprimir_info_columna("device_countrycode", installs_data.get("device_countrycode"))

device_countrycode
	desc: country code for the device
	transformation: string hashing


In [67]:
installs["device_countrycode"].value_counts().size

2

### Podría codificarse mediante One-Hot. float64 no tiene sentido

# • Para la columna device_brand

In [61]:
imprimir_info_columna("device_brand", installs_data.get("device_brand"))

device_brand
	desc: device's brand
	transformation: string hashing


In [62]:
installs["device_brand"].head()

0    3.083059e+17
1    3.083059e+17
2    5.137992e+17
3    5.137992e+17
4    1.083369e+18
Name: device_brand, dtype: float64

In [70]:
installs["device_brand"].value_counts().size

27

### Hay 27 "brands" diferentes, podrían cargarse según One-Hot o categóricas. Se entiende que las "brands" o marcas son Samsung, Motorola, Xiaomi, etc.

# • Para la columna device_model

In [71]:
imprimir_info_columna("device_model", installs_data.get("device_model"))

device_model
	desc: device's model
	transformation: string hashing


In [72]:
installs["device_brand"].value_counts().size

27

### Teniendo en cuenta que no puede haber menos modelos que marcas, y cada marca tiene un modelo, esta columna parece ser redundante (con la marca o "brand" ya alcanza)

# • Para la columna session_user_agent

In [73]:
imprimir_info_columna("session_user_agent", installs_data.get("session_user_agent"))

session_user_agent
	desc: user-agent used for the install
	transformation: None


In [74]:
installs["session_user_agent"].value_counts().size

12

In [75]:
installs["session_user_agent"].isnull().sum()

48

In [81]:
installs["session_user_agent"].value_counts()

http-kit/2.0                                                                                                                                                                                                                                                                  1729
adjust.com                                                                                                                                                                                                                                                                    1592
HasOffers Mobile AppTracking v1.0                                                                                                                                                                                                                                               22
Mozilla/5.0 (iPhone; CPU iPhone OS 12_1_4 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0 Mobile/15E148 Safari/604.1                                      

# • Para la columna user_agent

In [82]:
imprimir_info_columna("user_agent", installs_data.get("user_agent"))

user_agent
	desc: user agent related to the device
	transformation: None


In [83]:
installs["user_agent"].value_counts().size

334

In [84]:
installs["user_agent"].isnull().sum()

1683

In [85]:
installs["user_agent"].value_counts().head(7)

MercadoPago/2.58.0.2 CFNetwork/976 Darwin/18.2.0                  393
MercadoPago/2.59.0.1 CFNetwork/976 Darwin/18.2.0                  141
MercadoPago/2.57.0.2 CFNetwork/976 Darwin/18.2.0                   89
Dalvik/2.1.0 (Linux; U; Android 8.1.0; SM-G610M Build/M1AJQ)       37
Dalvik/2.1.0 (Linux; U; Android 6.0.1; SM-J500M Build/MMB29M)      36
Dalvik/2.1.0 (Linux; U; Android 6.0.1; SM-G532MT Build/MMB29T)     36
Dalvik/2.1.0 (Linux; U; Android 8.0.0; SM-G570M Build/R16NW)       28
Name: user_agent, dtype: int64

# • Para la columna event_uuid

In [86]:
imprimir_info_columna("event_uuid", installs_data.get("event_uuid"))

event_uuid
	desc: uuid4 generated for the event
	transformation: None


In [89]:
installs["event_uuid"].value_counts().size

865

In [90]:
installs["event_uuid"].isnull().sum()

2547

In [94]:
installs["event_uuid"].value_counts().head(3)

fec15d42-3486-4ed9-838c-ea1aef7f5111    1
ecfe6061-4012-4135-9fa9-8414b4caefdc    1
7649efcd-851e-40f6-abe6-67f24a2e1fcc    1
Name: event_uuid, dtype: int64

In [95]:
installs["event_uuid"].describe()

count                                      865
unique                                     865
top       fec15d42-3486-4ed9-838c-ea1aef7f5111
freq                                         1
Name: event_uuid, dtype: object

### Podría cargarse como categórica, o entero pequeño ya que sus valores no se repiten

# • Para la columna kind

In [96]:
imprimir_info_columna("kind", installs_data.get("kind"))

kind
	desc: kind of install
	transformation: None


In [100]:
installs["kind"].value_counts().size

20

In [101]:
installs["kind"].value_counts()

af_app_opened                 428
app_open                      212
EVENT_Homepage                 70
open                           36
EVENT_OPEN_APP                 33
Open                           22
login_success                  17
account_summary_first_step     13
EVENT_HOMEPAGE                  6
EVENT_LISTINGVIEW_FLIGHT        6
deeplink                        5
reattribution                   4
payment method add              3
registration                    3
journey first drop off          2
af_list_view                    1
EVENT_PROMO                     1
journey drop off                1
install                         1
journey reserved                1
Name: kind, dtype: int64

In [99]:
installs["kind"].isnull().sum()

2547

### Podría cargarse como categórica o entero pequeño

# • Para la columna wifi

In [102]:
imprimir_info_columna("wifi", installs_data.get("wifi"))

wifi
	desc: boolean flagging if the install was performed on a wifi connection
	transformation: None


In [104]:
installs["wifi"].value_counts()

True     1377
False     352
Name: wifi, dtype: int64

In [105]:
installs["wifi"].isnull().sum()

1683

### Debería cargarse como booleana. Por otro lado, tiene muchos nulos

# • Para la columna trans_id

In [106]:
imprimir_info_columna("trans_id", installs_data.get("trans_id"))

trans_id
	desc: transaction id, used for joining
	transformation: None


In [107]:
installs["trans_id"].value_counts()

{hash}                                  3
22380348598                             1
89bda305-bff1-4c22-ae23-1a237eac0002    1
44aeb6df-8db4-4b3f-ac67-0a802d479c86    1
Name: trans_id, dtype: int64

In [109]:
installs["trans_id"].isnull().sum()

3406

### La mayoria de sus valores son nulos. Habría que ver a qué se refiere con la descripción de la columna

# • Para la columna ip_address

In [110]:
imprimir_info_columna("ip_address", installs_data.get("ip_address"))

ip_address
	desc: ip addres through which the install was performed
	transformation: string hashing


In [111]:
installs["ip_address"].value_counts()

1992632945768888579    33
2403839579048525602    26
2394721645995561079    17
589886713227299026     16
8262991154202232888    14
8010533080341024188    13
8531002764348621869    12
7726314782926052608    12
7481987231813232849    11
2441645233986204328    11
2706357072127963511     9
3162368506662488065     9
6586849487961128061     7
7052859104363282878     7
9040150555093642379     7
627974920620404284      7
4895833194809562361     7
4371364616797404439     6
3222899837426106171     5
1037699812701179191     5
5437394128393850342     5
739514463662096135      5
6324037615828123965     5
8235954648733428120     5
7950883564880561905     5
3927367081536317814     5
7174799503430545910     4
2988221197536288219     4
6452955265823183819     4
1933486918445844180     4
                       ..
1488715796382824376     1
8582476476054892626     1
7766233397987027039     1
2978724023808640249     1
7488499899710199132     1
1515684618293858224     1
8370884033418651464     1
708253280447

### No tiene valores nulos, pero sería interesante ver cuántos dispositivos se conectaron con varias IP. También es raro que haya 33 instalaciones de una misma IP

# • Para la columna device_language

In [114]:
imprimir_info_columna("device_language", installs_data.get("device_language"))

device_language
	desc: language related to the device
	transformation: string hashing


In [119]:
installs["device_language"].value_counts().size

30

In [120]:
installs["device_language"].isnull().sum()

34

### Casi nungún valor es null, hay 30 países diferentes.