# Imports

In [1]:
import pandas as pd
import numpy as np

---

# Carga de datos

In [2]:
installs = pd.read_pickle("../../../../data/tp2/installs_tp2_formateado.pkl")

---

# Armado de ventanas

In [3]:
for ventana_nro in range(1,6):
    installs["ventana_{}".format(ventana_nro)] = (((17+ventana_nro) <= installs["created"].dt.day) & ((installs["created"].dt.day) <= (19+ventana_nro)))

---

# Creación de features

## Creación de sets de entrenamiento

In [4]:
Xs = {}
for ventana_nro in range(1,6):
    Xs[ventana_nro] = installs.loc[installs["ventana_{}".format(ventana_nro)]]["ref_hash"].drop_duplicates().to_frame().set_index("ref_hash")

---

## Función generadora de features

In [5]:
def generar_feature_en_ventanas(dataframe, generador_feature, destinos):
    for ventana_nro in range(1, 6):
        feature = generador_feature(dataframe.loc[dataframe["ventana_{}".format(ventana_nro)]], ventana_nro)
        destinos[ventana_nro] = destinos[ventana_nro].merge(feature, left_index=True, right_index=True)

---

## Cantidad de instalaciones dentro de la ventana

In [6]:
def cantidad_de_instalaciones(dataframe, nro_ventana):
    return dataframe.groupby("ref_hash").agg({"created" : "count"}).rename(columns={"created": "cantidad_instalaciones"})

In [8]:
generar_feature_en_ventanas(installs, cantidad_de_instalaciones, Xs)

---

## Tiempo desde la última aparición hasta el fin de la ventana

In [20]:
def tiempo_desde_ult_instalacion_hasta_fin_ventana(dataframe, nro_ventana):
    return (np.datetime64("2019-04-2{}".format(nro_ventana)) - dataframe.groupby("ref_hash").agg({"created" : "max"})["created"]).dt.total_seconds().to_frame().rename(columns={"created": "tiempo_ultima_instalacion_hasta_fin_ventana"})

In [21]:
generar_feature_en_ventanas(installs, tiempo_desde_ult_instalacion_hasta_fin_ventana, Xs)

---

## Cantidad de aplicaciones distintas instaladas

In [26]:
def cantidad_aplicaciones_distintas_instaladas(dataframe, nro_ventana):
    return dataframe.groupby("ref_hash").agg({"application_id" : "nunique"}).rename(columns={"application_id": "cantidad_aplicaciones_diferentes"})

In [27]:
generar_feature_en_ventanas(installs, cantidad_aplicaciones_distintas_instaladas, Xs)

---

In [None]:
def promedio_hora_instalacion(dataframe, nro_ventana):
    return dataframe.groupby("ref_hash").agg({"application_id" : "nunique"}).rename(columns={"application_id": "cantidad_aplicaciones_diferentes"})

# Prueba momentanea, baseline para ver que tal anda con solo la cantidad de instalaciones

In [76]:
features_v1_auctions = pd.read_pickle("../../features/auctions_train_ventana_1.pkl")

In [77]:
features_v1_events = pd.read_pickle("../../features/events_train_ventana_1.pkl")
features_v1_clicks = pd.read_pickle("../../features/clicks_train_ventana_1.pkl")

In [79]:
features_v1 = Xs[1]
features_v1 = features_v1.merge(features_v1_auctions, left_index=True, right_index=True, suffixes=("_a", "_b"))
features_v1 = features_v1.merge(features_v1_events, left_index=True, right_index=True, suffixes=("_c", "_d"))
features_v1 = features_v1.merge(features_v1_clicks, left_index=True, right_index=True, suffixes=("_e", "_f"))

In [103]:
competencia = pd.read_pickle("../../../../data/tp2/ref_hashes_target.pkl")

In [107]:
features_v1 = Xs[1]

In [155]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error



targets = pd.read_pickle("../../targets/targets_ventana_1_installs.pkl")
train = features_v1.merge(targets, left_index=True, right_index=True)

X, y = train.iloc[:,:-1], train.iloc[:,-1]
X_train, X_test, y_train, y_test = \
    train_test_split(X, y, test_size=0.2, random_state=123)

dtrain = xgb.DMatrix(data=X_train, label=y_train)
dtest = xgb.DMatrix(data=X_test, label=y_test)

param = {'max_depth':3,
         'eta':0.3,
         'silent':1,
         'objective':'reg:linear',
         'eval_metric' : 'rmse',
         'obj' : 'reg:squarederror',
         'colsample_bytree' : 0.7,
         'min_child_weight' : 0.5,
         'random_state' : 0,
         'reg_alpha' : 0,
         'reg_lambda' : 1,
         'scale_pos_weight' : 14,
         'learning_rate' : 0.1}


num_round = 5000
stopping = 20
bst = xgb.train(param, dtrain, num_round, evals=[(dtrain, "train"), (dtest, "test")], early_stopping_rounds=stopping)

[0]	train-rmse:231187	test-rmse:231121
Multiple eval metrics have been passed: 'test-rmse' will be used for early stopping.

Will train until test-rmse hasn't improved in 20 rounds.
[1]	train-rmse:208402	test-rmse:208341
[2]	train-rmse:187932	test-rmse:187877
[3]	train-rmse:169549	test-rmse:169500
[4]	train-rmse:153049	test-rmse:153006
[5]	train-rmse:138247	test-rmse:138212
[6]	train-rmse:124980	test-rmse:124953
[7]	train-rmse:113097	test-rmse:113080
[8]	train-rmse:102467	test-rmse:102460
[9]	train-rmse:92969.9	test-rmse:92974.5
[10]	train-rmse:84498.2	test-rmse:84514.4
[11]	train-rmse:76955.5	test-rmse:76984.9
[12]	train-rmse:70254.9	test-rmse:70298.3
[13]	train-rmse:64316.5	test-rmse:64376.6
[14]	train-rmse:59069.9	test-rmse:59147.5
[15]	train-rmse:54450.6	test-rmse:54546.5
[16]	train-rmse:50399.6	test-rmse:50515.6
[17]	train-rmse:46861.9	test-rmse:46998.2
[18]	train-rmse:43787.3	test-rmse:43945.1
[19]	train-rmse:41128.4	test-rmse:41307.2
[20]	train-rmse:38842.3	test-rmse:39040.6
[21

In [96]:
import seaborn as sns

In [110]:
reales = targets.merge(competencia, left_index=True, right_index=True)["target"]

In [113]:
pre_rmse = reales - 259200

In [117]:
sqrt(pre_rmse.apply(lambda x: x**2).sum()/len(pre_rmse))

44349.72172890477

In [127]:
de_la_comp = competencia.merge(Xs[1], left_index=True, right_index=True).merge(targets, left_index=True, right_index=True)

In [134]:
de_la_comp["instalo"] = de_la_comp["instalo"].astype(np.int8)

In [156]:
model = xgb.XGBClassifier(scale_pos_weight=13)
X, y = de_la_comp.iloc[:,:-2], de_la_comp["instalo"]
X_train, X_test, y_train, y_test = \
    train_test_split(X, y, test_size=0.5, random_state=123)

model.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=13, seed=None,
       silent=True, subsample=1)

In [157]:
predichos = model.predict(X_test)

  if diff:


In [158]:
y_test.value_counts()

0    616
1     46
Name: instalo, dtype: int64

In [159]:
pd.Series(predichos).value_counts()

0    503
1    159
dtype: int64

In [164]:
pd.Series(pd.Series(predichos).values == y_test.values).value_counts()

True     479
False    183
dtype: int64