# Imports

In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt
from matplotlib import pyplot
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials
import seaborn as sns

---

# Carga de datos

In [2]:
train_auctions = pd.read_csv("../../features/entrenar_auctions_final.csv")

In [3]:
train_auctions.set_index("device_id", inplace=True)

In [4]:
#train_auctions = train_auctions.reindex(sorted(train_auctions.columns), axis=1)

In [5]:
# Filtro
#train_auctions = train_auctions.loc[(train_auctions.T.agg("sum") != 0)]

---

In [6]:
X_auctions, y_auctions = train_auctions.drop("target", axis=1), train_auctions["target"]
X_train_auctions, X_test_auctions, y_train_auctions, y_test_auctions = \
    train_test_split(X_auctions, y_auctions, test_size=0.33, random_state=0)

"""filtro_auctions = (X_auctions.T.agg("sum") != 0)
X_auctions = X_auctions.loc[filtro_auctions]
y_auctions = y_auctions.loc[filtro_auctions]"""

dtrain_auctions = xgb.DMatrix(data=X_train_auctions, label=y_train_auctions)
dtest_auctions = xgb.DMatrix(data=X_test_auctions, label=y_test_auctions)

---

In [7]:
train_installs = pd.read_csv("../../features/entrenar_installs_final.csv")
train_installs.set_index("device_id", inplace=True)
#train_installs = train_installs.reindex(sorted(train_installs.columns), axis=1)

In [8]:
# Filtro
#train_installs = train_installs.loc[train_installs.T.agg("sum") != 0]

---

In [9]:
X_installs, y_installs = train_installs.drop("target", axis=1), train_installs["target"]
X_train_installs, X_test_installs, y_train_installs, y_test_installs = \
    train_test_split(X_installs, y_installs, test_size=0.33, random_state=0)

"""filtro_installs = (X_installs.T.agg("sum") != 0)
X_installs = X_installs.loc[filtro_installs]
y_installs = y_installs.loc[filtro_installs]"""

dtrain_installs = xgb.DMatrix(data=X_train_installs, label=y_train_installs)
dtest_installs = xgb.DMatrix(data=X_test_installs, label=y_test_installs)

---

# Entrenamiento - Auctions

In [10]:
space = {
        'n_estimators': hp.choice('n_estimators', np.arange(100, 500+1)),
        'eta': hp.quniform('eta', 0.025, 0.5, 0.025),
        'max_depth':  hp.choice('max_depth', np.arange(1, 9, dtype=int)),
        'min_child_weight': hp.quniform('min_child_weight', 0.5, 6, 1),
        'gamma': hp.quniform('gamma', 0.5, 1, 0.05),
        'colsample_bytree': hp.quniform('colsample_bytree', 0.5, 1, 0.05),
        'eval_metric': 'rmse',
        'obj': 'reg:squarederror',
        'nthread': 5,
        'booster': 'gbtree',
        'tree_method': 'exact',
        'silent': 1,
}


def objective(space):
    model = xgb.train(space, dtrain_auctions,evals=[(dtest_auctions, "test")], verbose_eval=False, )
    pred = model.predict(dtest_auctions)
    rmse = np.sqrt(mean_squared_error(y_test_auctions, pred))
    return{'loss':rmse, 'status': STATUS_OK }


trials = Trials()
best = fmin(fn=objective,
            space=space,
            algo=tpe.suggest,
            max_evals=25,
            trials=trials)

print(best)
#100%|██████████| 10/10 [00:42<00:00,  5.18s/it, best loss: 62323.934330109085]
# {'colsample_bytree': 0.7000000000000001, 'eta': 0.35000000000000003, 'gamma': 0.5, 'max_depth': 6, 'min_child_weight': 6.0, 'n_estimators': 349}

100%|██████████| 25/25 [01:08<00:00,  3.81s/it, best loss: 59674.30289383396]
{'colsample_bytree': 0.7000000000000001, 'eta': 0.35000000000000003, 'gamma': 0.5, 'max_depth': 6, 'min_child_weight': 6.0, 'n_estimators': 349}


---

# Entrenamiento - installs

In [11]:
space = {
        'n_estimators': hp.choice('n_estimators', np.arange(100, 500+1)),
        'eta': hp.quniform('eta', 0.025, 0.5, 0.025),
        'max_depth':  hp.choice('max_depth', np.arange(1, 9, dtype=int)),
        'min_child_weight': hp.quniform('min_child_weight', 0.5, 6, 1),
        'gamma': hp.quniform('gamma', 0.5, 1, 0.05),
        'colsample_bytree': hp.quniform('colsample_bytree', 0.5, 1, 0.05),
        'eval_metric': 'rmse',
        'obj': 'reg:squarederror',
        'nthread': 5,
        'booster': 'gbtree',
        'tree_method': 'exact',
        'silent': 1,
}


def objective(space):
    model = xgb.train(space, dtrain_installs, evals=[(dtest_installs, "test")], verbose_eval=False, )
    pred = model.predict(dtest_installs)
    rmse = np.sqrt(mean_squared_error(y_test_installs, pred))
    return{'loss':rmse, 'status': STATUS_OK }


trials = Trials()
best = fmin(fn=objective,
            space=space,
            algo=tpe.suggest,
            max_evals=100,
            trials=trials)

print(best)
#100%|██████████| 64/64 [01:01<00:00,  1.05it/s, best loss: 75360.45606226214]
#{'colsample_bytree': 0.7000000000000001, 'eta': 0.42500000000000004, 'gamma': 0.75, 'max_depth': 1, 'min_child_weight': 6.0, 'n_estimators': 220}

100%|██████████| 100/100 [00:54<00:00,  1.74it/s, best loss: 74621.95048098643]
{'colsample_bytree': 0.65, 'eta': 0.5, 'gamma': 0.9, 'max_depth': 1, 'min_child_weight': 4.0, 'n_estimators': 69}


---

# Prediccion

### Cargo features de los dispositivos de la competencia

In [12]:
predecir_auctions = pd.read_csv("../../features/predecir_auctions_final.csv", index_col=0)
#predecir_auctions = predecir_auctions.reindex(sorted(predecir_auctions.columns), axis=1)

In [13]:
dpredecir_auctions = xgb.DMatrix(data=predecir_auctions)

In [14]:
predecir_installs = pd.read_csv("../../features/predecir_installs_final.csv", index_col=0)
#predecir_installs = predecir_installs.reindex(sorted(predecir_installs.columns), axis=1)

In [15]:
dpredecir_installs = xgb.DMatrix(data=predecir_installs)

---

In [16]:
dfinal_auctions = xgb.DMatrix(data=X_auctions, label=y_auctions)

In [17]:
# {'colsample_bytree': 0.7000000000000001, 'eta': 0.35000000000000003, 'gamma': 0.5, 'max_depth': 6, 'min_child_weight': 6.0, 'n_estimators': 349}
best_auctions = {
        'n_estimators': 349,
        'eta': 0.35,
        'max_depth': 6,
        'min_child_weight': 6.0,
        'gamma': 0.9,
        'colsample_bytree': 0.7,
        'eval_metric': 'rmse',
        'obj': 'reg:squarederror',
        'nthread': 6,
        'booster': 'gbtree',
        'tree_method': 'exact',
        'silent': 1,
}
model_auctions = xgb.train(best_auctions, dfinal_auctions, verbose_eval=False)

In [18]:
predecir_auctions["label"] = model_auctions.predict(dpredecir_auctions)

---

In [19]:
dfinal_installs = xgb.DMatrix(data=X_installs, label=y_installs)

In [20]:
# {'colsample_bytree': 0.65, 'eta': 0.5, 'gamma': 0.9, 'max_depth': 1, 'min_child_weight': 4.0, 'n_estimators': 69}
best_installs = {
        'n_estimators': 69,
        'eta': 0.5,
        'max_depth': 1,
        'min_child_weight': 4.0,
        'gamma': 0.9,
        'colsample_bytree': 0.65,
        'eval_metric': 'rmse',
        'obj': 'reg:squarederror',
        'nthread': 6,
        'booster': 'gbtree',
        'tree_method': 'exact',
        'silent': 1,
}
model_installs = xgb.train(best_installs, dfinal_installs, verbose_eval=False)

In [21]:
predecir_installs["label"] = model_installs.predict(dpredecir_installs)

---

In [22]:
def corregir(x):
    if x > 259200:
        return 259200
    elif x < 0:
        return 0
    else:
        return x

In [23]:
predecir_auctions["label"] = predecir_auctions["label"].apply(lambda x: corregir(x))

In [24]:
predecir_installs["label"] = predecir_installs["label"].apply(lambda x: corregir(x))

---

### Cargo archivo de submissions

In [25]:
submissions = pd.read_csv("../../../../data/tp2/target.csv")

In [26]:
predecir_auctions.reset_index(inplace=True)

In [27]:
predecir_auctions["ref_hash"] = predecir_auctions["ref_hash"].apply(lambda x: str(x)+"_st")

In [28]:
predecir_installs.reset_index(inplace=True)

In [29]:
predecir_installs["ref_hash"] = predecir_installs["ref_hash"].apply(lambda x: str(x)+"_sc")

In [30]:
rename_dict_auctions = predecir_auctions.set_index('ref_hash').to_dict()['label']

In [31]:
rename_dict_installs = predecir_installs.set_index('ref_hash').to_dict()['label']

In [32]:
tmp = submissions.copy()

In [33]:
tmp["obj"] = tmp["ref_hash"]

In [34]:
tmp["obj"] = tmp["obj"].replace(rename_dict_auctions)

In [35]:
tmp["obj"] = tmp["obj"].replace(rename_dict_installs)

In [36]:
tmp.set_index("ref_hash").to_csv("hasta_v5_con_outer_join_solo_en_installs.csv")

---

# Persistencia de modelos

In [37]:
model_installs.save_model("hasta_v5_con_outer_join_solo_en_installs.model")

In [38]:
model_installs.dump_model("hasta_v5_con_outer_join_solo_en_installs.dump", with_stats=True)

---

In [39]:
model_auctions.save_model("hasta_v5_con_outer_join_solo_en_installs.model")

In [40]:
model_auctions.dump_model("hasta_v5_con_outer_join_solo_en_installs.dump", with_stats=True)

---