# Imports

In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt
from matplotlib import pyplot
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials
import seaborn as sns

---

# Carga de datos

In [2]:
train_auctions = pd.read_csv("../../features/entrenar_auctions_final.csv")

In [3]:
train_auctions.set_index("device_id", inplace=True)

In [4]:
train_auctions = train_auctions.reindex(sorted(train_auctions.columns), axis=1)

---

In [5]:
X_auctions, y_auctions = train_auctions.drop("target", axis=1), train_auctions["target"]
X_train_auctions, X_test_auctions, y_train_auctions, y_test_auctions = \
    train_test_split(X_auctions, y_auctions, test_size=0.33, random_state=0)

dtrain_auctions = xgb.DMatrix(data=X_train_auctions, label=y_train_auctions)
dtest_auctions = xgb.DMatrix(data=X_test_auctions, label=y_test_auctions)

---

In [5]:
train_installs = pd.read_csv("../../features/entrenar_installs_final.csv")
train_installs.set_index("device_id", inplace=True)
train_installs = train_installs.reindex(sorted(train_installs.columns), axis=1)

---

In [28]:
X_installs, y_installs = train_installs.drop("target", axis=1), train_installs["target"]
X_train_installs, X_test_installs, y_train_installs, y_test_installs = \
    train_test_split(X_installs, y_installs, test_size=0.33, random_state=0)

dtrain_installs = xgb.DMatrix(data=X_train_installs, label=y_train_installs)
dtest_installs = xgb.DMatrix(data=X_test_installs, label=y_test_installs)

---

# Entrenamiento - Auctions

In [8]:
space = {
        'n_estimators': hp.choice('n_estimators', np.arange(100, 500+1)),
        'eta': hp.quniform('eta', 0.025, 0.5, 0.025),
        'max_depth':  hp.choice('max_depth', np.arange(1, 9, dtype=int)),
        'min_child_weight': hp.quniform('min_child_weight', 0.5, 6, 1),
        'gamma': hp.quniform('gamma', 0.5, 1, 0.05),
        'colsample_bytree': hp.quniform('colsample_bytree', 0.5, 1, 0.05),
        'eval_metric': 'rmse',
        'obj': 'reg:squarederror',
        'nthread': 6,
        'booster': 'gbtree',
        'tree_method': 'exact',
        'silent': 1,
}


def objective(space):
    model = xgb.train(space, dtrain_auctions,evals=[(dtest_auctions, "test")], verbose_eval=False, )
    pred = model.predict(dtest_auctions)
    rmse = np.sqrt(mean_squared_error(y_test_auctions, pred))
    return{'loss':rmse, 'status': STATUS_OK }


trials = Trials()
best = fmin(fn=objective,
            space=space,
            algo=tpe.suggest,
            max_evals=100,
            trials=trials)

print(best)
#100%|██████████| 100/100 [06:18<00:00,  4.06s/it, best loss: 66516.76448390858]
# {'colsample_bytree': 0.9500000000000001, 'eta': 0.35000000000000003, 'gamma': 0.9, 'max_depth': 6, 'min_child_weight': 2.0, 'n_estimators': 168}

100%|██████████| 100/100 [06:18<00:00,  4.06s/it, best loss: 66516.76448390858]
{'colsample_bytree': 0.9500000000000001, 'eta': 0.35000000000000003, 'gamma': 0.9, 'max_depth': 6, 'min_child_weight': 2.0, 'n_estimators': 168}


---

# Entrenamiento - installs

In [29]:
space = {
        'n_estimators': hp.choice('n_estimators', np.arange(100, 500+1)),
        'eta': hp.quniform('eta', 0.025, 0.5, 0.025),
        'max_depth':  hp.choice('max_depth', np.arange(1, 9, dtype=int)),
        'min_child_weight': hp.quniform('min_child_weight', 0.5, 6, 1),
        'gamma': hp.quniform('gamma', 0.5, 1, 0.05),
        'colsample_bytree': hp.quniform('colsample_bytree', 0.5, 1, 0.05),
        'eval_metric': 'rmse',
        'obj': 'reg:squarederror',
        'nthread': 6,
        'booster': 'gbtree',
        'tree_method': 'exact',
        'silent': 1,
}


def objective(space):
    model = xgb.train(space, dtrain_installs, evals=[(dtest_installs, "test")], verbose_eval=False, )
    pred = model.predict(dtest_installs)
    rmse = np.sqrt(mean_squared_error(y_test_installs, pred))
    return{'loss':rmse, 'status': STATUS_OK }


trials = Trials()
best = fmin(fn=objective,
            space=space,
            algo=tpe.suggest,
            max_evals=100,
            trials=trials)

print(best)
#100%|██████████| 100/100 [01:02<00:00,  2.16it/s, best loss: 76354.42748888404]
#{'colsample_bytree': 0.8500000000000001, 'eta': 0.4, 'gamma': 0.65, 'max_depth': 2, 'min_child_weight': 2.0, 'n_estimators': 67}

 15%|█▌        | 15/100 [00:07<00:35,  2.39it/s, best loss: 76407.71525081745]


KeyboardInterrupt: 

---

# Prediccion

### Cargo features de los dispositivos de la competencia

In [6]:
predecir_auctions = pd.read_csv("../../features/predecir_auctions_final.csv", index_col=0)
predecir_auctions = predecir_auctions.reindex(sorted(predecir_auctions.columns), axis=1)

In [10]:
dpredecir_auctions = xgb.DMatrix(data=predecir_auctions)

In [7]:
predecir_installs = pd.read_csv("../../features/predecir_installs_final.csv", index_col=0)
predecir_installs = predecir_installs.reindex(sorted(predecir_installs.columns), axis=1)

In [12]:
dpredecir_installs = xgb.DMatrix(data=predecir_installs)

---

In [13]:
dfinal_auctions = xgb.DMatrix(data=X_auctions, label=y_auctions)

In [14]:
# {'colsample_bytree': 0.9500000000000001, 'eta': 0.35000000000000003, 'gamma': 0.9, 'max_depth': 6, 'min_child_weight': 2.0, 'n_estimators': 168}
best_auctions = {
        'n_estimators': 168,
        'eta': 0.35,
        'max_depth': 6,
        'min_child_weight': 2.0,
        'gamma': 0.9,
        'colsample_bytree': 0.95,
        'eval_metric': 'rmse',
        'obj': 'reg:squarederror',
        'nthread': 6,
        'booster': 'gbtree',
        'tree_method': 'exact',
        'silent': 1,
}
model_auctions = xgb.train(best_auctions, dfinal_auctions, verbose_eval=False)

In [15]:
predecir_auctions["label"] = model_auctions.predict(dpredecir_auctions)

---

In [30]:
dfinal_installs = xgb.DMatrix(data=X_installs, label=y_installs)

In [31]:
best_installs = {
        'n_estimators': 67,
        'eta': 0.4,
        'max_depth': 2,
        'min_child_weight': 2.0,
        'gamma': 0.65,
        'colsample_bytree': 0.85,
        'eval_metric': 'rmse',
        'obj': 'reg:squarederror',
        'nthread': 6,
        'booster': 'gbtree',
        'tree_method': 'exact',
        'silent': 1,
}
model_installs = xgb.train(best_installs, dfinal_installs, verbose_eval=False)

In [18]:
predecir_installs["label"] = model_installs.predict(dpredecir_installs)

---

In [19]:
def corregir(x):
    if x > 259200:
        return 259200
    elif x < 0:
        return 0
    else:
        return x

In [20]:
predecir_auctions["label"] = predecir_auctions["label"].apply(lambda x: corregir(x))

In [21]:
predecir_installs["label"] = predecir_installs["label"].apply(lambda x: corregir(x))

---

### Cargo archivo de submissions

In [22]:
submissions = pd.read_csv("../../../../data/tp2/target.csv")

In [23]:
predecir_auctions.reset_index(inplace=True)

In [24]:
predecir_auctions["ref_hash"] = predecir_auctions["ref_hash"].apply(lambda x: str(x)+"_st")

In [25]:
predecir_installs.reset_index(inplace=True)

In [26]:
predecir_installs["ref_hash"] = predecir_installs["ref_hash"].apply(lambda x: str(x)+"_sc")

In [27]:
rename_dict_auctions = predecir_auctions.set_index('ref_hash').to_dict()['label']

In [28]:
rename_dict_installs = predecir_installs.set_index('ref_hash').to_dict()['label']

In [29]:
tmp = submissions.copy()

In [30]:
tmp["obj"] = tmp["ref_hash"]

In [31]:
tmp["obj"] = tmp["obj"].replace(rename_dict_auctions)

In [32]:
tmp["obj"] = tmp["obj"].replace(rename_dict_installs)

In [33]:
tmp.set_index("ref_hash").to_csv("submission3_con_filter_en_los_2.csv")

---

# Persistencia de modelos

In [38]:
model_installs.save_model("modelo_installs_xgb.model")

In [39]:
model_installs.dump_model("modelo_installs_xgb.dump", with_stats=True)

---

In [40]:
model_auctions.save_model("modelo_auctions_xgb.model")

In [41]:
model_auctions.dump_model("model_auctions_xgb.dump", with_stats=True)

---