# Analisis de superficial del csv de clicks
### En este notebook exploramos los registros del csv con el objetivo de buscar relaciones entre las distintas variables

In [1]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

## Leemos el csv, y dropeamos las columnas action_id por estar llena de nans y wifi_connection por estar llena de False

In [2]:
clicks = pd.read_csv("../data/clicks.csv", dtype={'advertiser_id': 'category', 'action_id': 'float64', 'source_id': 'category',
                                               'created': 'str', 'country_code':'category', 'latitude': 'float64',
                                               'longitude':'float64', 'wifi_connection':'bool' , 'carrier_id':np.float16,
                                               'trans_id':'str', 'os_minor':'float64', 'agent_device':'float64', 'os_major':'float64',
                                               'specs_brand':'int64', 'brand':'category', 'timeToClick':'float64',
                                               'ref_type':'category', 'ref_hash':'int64'}, parse_dates = ['created'])
clicks.drop('action_id', axis=1, inplace = True)
clicks.drop('wifi_connection', axis=1, inplace = True) #false
clicks['day'] = clicks['created'].dt.date
clicks['hour'] = clicks['created'].dt.hour
clicks.head(5)

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,advertiser_id,source_id,created,country_code,latitude,longitude,carrier_id,trans_id,os_minor,agent_device,os_major,specs_brand,brand,timeToClick,touchX,touchY,ref_type,ref_hash,day,hour
0,1,2,2019-04-18 05:27:42.197000+00:00,6287817205707153877,1.714547,0.871535,3.0,9JMAfrb-b9cSEVCJb0P9JfihGthaS7E,1.517644e+18,,5.131616e+18,71913840936116953,0.0,2.317,0.968,0.503,1891515180541284343,1293710398598742392,2019-04-18,5
1,1,1,2019-04-18 05:27:03.164000+00:00,6287817205707153877,1.714512,0.871062,2.0,r3xtTRv2lInfiXG8JI3NQsNcBo8GyFQ,1.288578e+18,,3.90839e+18,3576558787748411622,1.0,7.653,0.712,1.689,1891515180541284343,1663930990551616564,2019-04-18,5
2,1,1,2019-04-18 05:42:07.926000+00:00,6287817205707153877,1.714547,0.871535,4.0,WOnHFqQtY48z_ygKZ-030U_g0TMGVMw,2.238736e+18,,3.581233e+18,3576558787748411622,,464.796,0.227,0.251,1891515180541284343,8488038938665586188,2019-04-18,5
3,1,1,2019-04-18 05:26:04.446000+00:00,6287817205707153877,1.708041,0.870772,1.0,wQMLLmYqiFhSuha9p9B13PMtcyBW_vM,2.41164e+18,,3.90839e+18,3576558787748411622,,225.311,0.696,6.587,1891515180541284343,6488361690105189959,2019-04-18,5
4,1,1,2019-04-18 05:23:37.764000+00:00,6287817205707153877,1.715514,0.870772,2.0,GeFoyBzMA7taylMxxjzlNPTU-n4FXFs,1.517644e+18,,5.131616e+18,3576558787748411622,0.0,84.736,0.059,0.142,1891515180541284343,1348993302102753419,2019-04-18,5


In [3]:
clicks.dtypes

advertiser_id               category
source_id                   category
created          datetime64[ns, UTC]
country_code                category
latitude                     float64
longitude                    float64
carrier_id                   float16
trans_id                      object
os_minor                     float64
agent_device                 float64
os_major                     float64
specs_brand                    int64
brand                       category
timeToClick                  float64
touchX                        object
touchY                        object
ref_type                    category
ref_hash                       int64
day                           object
hour                           int64
dtype: object

## Realizamos un histograma del TimeToClick

In [4]:
clicks_acotado = clicks.dropna(subset= ['timeToClick'])

## Creamos una columna categorica del TimeToClick

In [5]:
bins = pd.IntervalIndex.from_tuples([(600, 120000), (180, 600), (60, 180), (15, 60), (3, 15), (0, 3)])
clicks_acotado['time_categ'] = pd.cut(clicks_acotado['timeToClick'],bins)
clicks_acotado['time_categ'].cat.categories = ['10 a 2000min','3 a 10min','1 a 3 min','15 a 60seg','3 a 15seg','0 a 3 seg']

clicks['time_categ'] = pd.cut(clicks['timeToClick'],bins)
clicks['time_categ'].cat.categories = ['10 a 2000min','3 a 10min','1 a 3 min','15 a 60seg', '3 a 15seg','0 a 3 seg']

clicks_acotado['time_categ'].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


0 a 3 seg       12856
3 a 15seg        9285
15 a 60seg       6707
1 a 3 min        3992
3 a 10min        2907
10 a 2000min     2431
Name: time_categ, dtype: int64

## Elegimos las columnas con menor entropia o pocos valores diferentes y nos quedamos con los 5 valores mas comunes

In [6]:
grupos = ['advertiser_id', 'carrier_id', 'os_minor', 'agent_device', 'os_major', 'specs_brand', 'ref_type', 'hour', 'day', 'time_categ']
top5 = {}
for i in grupos:
    top5[i]= clicks[i].value_counts().head(5).index.tolist()
top5

{'advertiser_id': ['2', '1', '0', '3', '4'],
 'agent_device': [6.794880020077884e+18,
  9.17380693425856e+18,
  9.186120447236368e+18,
  1.9540924917876943e+18,
  2.8270958855039072e+17],
 'carrier_id': [1.0, 7.0, 2.0, 0.0, 13.0],
 'day': [datetime.date(2019, 4, 26),
  datetime.date(2019, 4, 25),
  datetime.date(2019, 4, 23),
  datetime.date(2019, 4, 24),
  datetime.date(2019, 4, 22)],
 'hour': [3, 4, 2, 1, 16],
 'os_major': [5.131615556736863e+18,
  3.9083902007568794e+18,
  5.754947116114108e+18,
  5.648867414868049e+18,
  3.5812325749809167e+18],
 'os_minor': [1.5176438893491397e+18,
  6.795761880764845e+18,
  3.5759630297247805e+18,
  1.2885781261232225e+18,
  7.531669329342817e+18],
 'ref_type': ['1891515180541284343', '1494519392962156891'],
 'specs_brand': [71913840936116953,
  3576558787748411622,
  6341583823913642480,
  784329784168794382,
  4222063286888578800],
 'time_categ': ['0 a 3 seg',
  '3 a 15seg',
  '15 a 60seg',
  '1 a 3 min',
  '3 a 10min']}

In [7]:
clicks = clicks.sort_values(by = 'created')
clicks = clicks.sort_values(by = 'ref_hash')
clicks['cumcount'] = 1
device_ids = clicks['ref_hash'].to_frame()
clicks = pd.get_dummies(clicks, columns = grupos)

In [8]:
fecha_minima=pd.to_datetime('2019-04-18 00:00:00', utc = True)
fecha_minima
fecha_tope=pd.to_datetime('2019-04-21 00:00:00', utc = True)
clicks_ventana1=clicks.loc[((clicks['created']>fecha_minima) & (clicks['created']<fecha_tope) )].copy(deep=False)
features_ventana1 = clicks_ventana1.groupby('ref_hash').agg('cumsum').join(device_ids, how = 'inner').set_index('ref_hash')

In [9]:
fecha_minima=pd.to_datetime('2019-04-21 00:00:00', utc = True)
fecha_minima
fecha_tope=pd.to_datetime('2019-04-24 00:00:00', utc = True)
clicks_ventana4=clicks.loc[((clicks['created']>fecha_minima) & (clicks['created']<fecha_tope) )].copy(deep=False)
features_ventana4 = clicks_ventana4.groupby('ref_hash').agg('cumsum').join(device_ids, how = 'inner').set_index('ref_hash')

In [10]:
fecha_minima=pd.to_datetime('2019-04-24 00:00:00', utc = True)
fecha_minima
fecha_tope=pd.to_datetime('2019-04-27 00:00:00', utc = True)
clicks_ventana7=clicks.loc[((clicks['created']>fecha_minima) & (clicks['created']<fecha_tope) )].copy(deep=False)
features_ventana7 = clicks_ventana7.groupby('ref_hash').agg('cumsum').join(device_ids, how = 'inner').set_index('ref_hash')

In [11]:
train_features = features_ventana4
train_features.to_csv('../xgb/train_features_clicks_gonzalo.csv')

In [12]:
test_features = features_ventana7
test_features.to_csv('../xgb/test_features_clicks_gonzalo.csv')

In [13]:
train_features

Unnamed: 0_level_0,latitude,longitude,timeToClick,cumcount,advertiser_id_0,advertiser_id_1,advertiser_id_2,advertiser_id_3,advertiser_id_4,carrier_id_0.0,...,day_2019-04-23,day_2019-04-24,day_2019-04-25,day_2019-04-26,time_categ_10 a 2000min,time_categ_3 a 10min,time_categ_1 a 3 min,time_categ_15 a 60seg,time_categ_3 a 15seg,time_categ_0 a 3 seg
ref_hash,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
693609737448534,1.729651,0.849176,40.210,1,0,0,1,0,0,0,...,1,0,0,0,0,0,0,1,0,0
693609737448534,3.459301,1.698351,,2,0,0,2,0,0,0,...,1,0,0,0,0,0,0,1,0,0
2204225481747532,1.714512,0.871062,2.963,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
10954770305551958,1.744788,0.859627,0.195,1,0,1,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
12552176992342593,1.714512,0.871062,2.511,1,0,0,1,0,0,1,...,1,0,0,0,0,0,0,0,0,1
12552176992342593,3.429023,1.742124,155.309,2,0,1,1,0,0,1,...,1,0,0,0,0,0,1,0,0,1
12552176992342593,5.143535,2.613186,289.450,3,0,2,1,0,0,1,...,1,0,0,0,0,0,2,0,0,1
12552176992342593,6.858046,3.484248,386.223,4,0,3,1,0,0,1,...,1,0,0,0,0,0,3,0,0,1
12552176992342593,8.572558,4.355310,410.977,5,0,4,1,0,0,1,...,1,0,0,0,0,0,3,1,0,1
12552176992342593,10.287069,5.226372,413.056,6,0,5,1,0,0,2,...,2,0,0,0,0,0,3,1,0,2


In [14]:
test_features

Unnamed: 0_level_0,latitude,longitude,timeToClick,cumcount,advertiser_id_0,advertiser_id_1,advertiser_id_2,advertiser_id_3,advertiser_id_4,carrier_id_0.0,...,day_2019-04-23,day_2019-04-24,day_2019-04-25,day_2019-04-26,time_categ_10 a 2000min,time_categ_3 a 10min,time_categ_1 a 3 min,time_categ_15 a 60seg,time_categ_3 a 15seg,time_categ_0 a 3 seg
ref_hash,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
7164788605058735,1.735196,0.861823,2.175,1,0,0,1,0,0,0,...,0,0,0,1,0,0,0,0,0,1
7429113196145773,1.792518,0.866689,13.488,1,0,0,1,0,0,0,...,0,0,1,0,0,0,0,0,1,0
8452408857001723,1.701285,0.869528,0.570,1,0,1,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1
8577232270715133,1.712736,0.869157,,1,0,0,1,0,0,0,...,0,0,1,0,0,0,0,0,0,0
10261987748713353,1.714241,0.860456,0.631,1,0,1,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1
12173926012891980,1.714512,0.871062,1.755,1,0,1,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1
12240396390252142,1.884068,0.810355,0.239,1,0,1,0,0,0,1,...,0,1,0,0,0,0,0,0,0,1
12552176992342593,1.714512,0.871062,56.411,1,0,1,0,0,0,0,...,0,0,0,1,0,0,0,1,0,0
12552176992342593,3.429023,1.742124,126.835,2,0,2,0,0,0,0,...,0,0,0,2,0,0,1,1,0,0
12552176992342593,5.143535,2.613186,132.119,3,0,3,0,0,0,0,...,0,0,1,2,0,0,1,1,1,0
