In [1]:
from tqdm.autonotebook import tqdm
import logging



In [3]:
import pandas as pd
import numpy as np
import pycm

In [None]:
from sklearn import metrics as skmetrics
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import mutual_info_classif, mutual_info_regression
from sklearn.preprocessing import label_binarize, QuantileTransformer

In [None]:
from skopt import gp_minimize
from skopt.space import Real, Integer, Categorical
from skopt.utils import use_named_args

In [None]:
from matplotlib import pyplot as plt
import matplotlib.gridspec as gridspec
import seaborn as sns
%matplotlib inline

In [None]:
# Work on a 5000-row random sample of the dataset to keep the notebook fast.
df = pd.read_parquet("./dataset.parquet").sample(5000)

# Columns that get one-hot encoded before modelling.
categorical_columns = [
    'partner',
    'device',
    'gender',
    'state',
    'channel',
]

# Base names of the model inputs; one-hot encoded columns are matched
# against these names by prefix further down.
feature_columns = [
    'channel',
    'partner',
    'device',
    'age',
    'gender',
    'state',
    'has_marketplace',
    'has_crossdocking',
    'has_private_label',
    'has_brands',
    'gmv',
    'fst_sale_in_black_friday_days',
    'snd_sale_in_black_friday_days',
]

In [None]:
# One-hot encode the categorical columns (dropping the first level of each),
# then keep only the rows with a strictly positive waiting time.
df = (
    pd.get_dummies(df, columns=categorical_columns, drop_first=True)
    .loc[lambda frame: frame.waiting_time > 0]
)

In [None]:
# A column is a model input when its name starts with one of the base feature
# names; this picks up the one-hot columns produced by get_dummies as well.
feature_prefixes = tuple(feature_columns)
features = [column for column in df.columns if column.startswith(feature_prefixes)]

X = df[features]
y = df['has_second_sale_within_year']

In [None]:
# Hold out 30% of the rows for the final evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# The split frames are derived from X; take explicit copies so the column
# assignments below neither trigger SettingWithCopyWarning nor depend on
# whether pandas handed back a view.
X_train = X_train.copy()
X_test = X_test.copy()

quantile_columns = ["age", "gmv"]

# Fit the transformer on the training rows only, so no information from the
# held-out set leaks into the preprocessing.
qt = QuantileTransformer()
qt.fit(X_train[quantile_columns])

# Replace the columns wholesale (frame[cols] = ...) instead of writing through
# .loc[:, cols]: the transformed values are floats, and an in-place write into
# a column stored as integers is deprecated by recent pandas releases.
X_train[quantile_columns] = qt.transform(X_train[quantile_columns])
X_test[quantile_columns] = qt.transform(X_test[quantile_columns])

In [None]:
# Search space explored by gp_minimize. Every dimension name is forwarded as
# a keyword argument to train_model. max_depth, n_estimators and
# scale_pos_weight are not searched here; train_model sets them itself.
HYPERPARAMETER_SPACE = [
    Integer(1, 10, name='min_child_weight'),
    Real(1e-5, 1.0, prior="uniform", name='learning_rate'),
    Real(0.1, 10, prior="log-uniform", name='gamma'),
    Real(0.05, 1, name='subsample'),
    Real(0.05, 1, name='colsample_bytree'),
    Real(0.01, 0.99, name="base_score"),
    Real(0, 5, name="reg_alpha"),
    Real(0, 5, name="reg_lambda"),
]

# Dimension names in the same order as the space, used to label the plain
# value lists returned by the optimizer.
HYPERPARAMETER_NAMES = [dimension.name for dimension in HYPERPARAMETER_SPACE]

In [None]:
# Seed passed to the classifier and to gp_minimize.
RANDOM_STATE = 939_568_576
# Tree construction algorithm handed to the classifier ('exact' is the alternative).
TREE_METHOD = "hist"
# Value used for the classifier's n_jobs.
NUM_PROCESSORS = 8
# Number of random train/validation splits averaged per hyperparameter candidate.
NUM_CROSS_VALIDATION_STEPS = 1
# Total number of objective evaluations performed by gp_minimize.
NUM_GP_OPTIMIZATION_STEPS = 20

In [None]:
def get_logger(name):
    """Return an INFO-level logger that writes timestamped messages to a stream.

    ``logging.getLogger`` returns the same object for the same name, so the
    stream handler is attached only when the logger has no handler yet.
    Without that guard, every re-execution of this cell would add another
    handler and each message would be printed once per handler.

    Parameters
    ----------
    name : str
        Name of the logger to fetch or create.

    Returns
    -------
    logging.Logger
        The configured logger.
    """
    logger = logging.getLogger(name)
    if not logger.handlers:
        stream_handler = logging.StreamHandler()
        stream_handler.setFormatter(logging.Formatter('[%(asctime)s] %(message)s'))
        logger.addHandler(stream_handler)
    logger.setLevel(logging.INFO)
    return logger


logger = get_logger(__name__)

In [None]:
def train_model(X, y, **kwargs):
    """Fit an ``XGBClassifier`` on ``(X, y)`` and return the fitted estimator.

    Depth, number of estimators, class weighting, seed, tree method and
    parallelism are fixed here; any other estimator argument is supplied by
    the caller through ``kwargs``. Passing one of the fixed names in
    ``kwargs`` raises ``TypeError`` (duplicate keyword argument).

    NOTE(review): ``XGBClassifier`` is not imported in the visible cells;
    it must be brought into scope before this function is called.
    """
    fixed_params = dict(
        max_depth=5,
        n_estimators=1000,
        scale_pos_weight=1,
        random_state=RANDOM_STATE,
        tree_method=TREE_METHOD,
        n_jobs=NUM_PROCESSORS,
    )
    classifier = XGBClassifier(**fixed_params, **kwargs)
    classifier.fit(X, y)
    return classifier

def evaluate_model(X, y, **kwargs):
    """Score one hyperparameter configuration by its mean held-out log-likelihood.

    For each of ``NUM_CROSS_VALIDATION_STEPS`` rounds, 10% of the rows are
    held out at random, a model is trained on the rest with ``kwargs`` as
    hyperparameters, and the average log-probability assigned to the true
    class of the held-out rows is recorded.

    Parameters
    ----------
    X, y
        Features and labels. The labels are used as column indices into the
        ``predict_proba`` output, so they are assumed to be 0-based integer
        (or boolean) class codes in the estimator's class order.
    **kwargs
        Hyperparameters forwarded to ``train_model``.

    Returns
    -------
    float
        Mean log-likelihood over the rounds (higher is better, at most 0).
    """
    # Lower bound applied before taking the log: a probability of exactly 0
    # would yield -inf and hand the optimizer a non-finite objective value.
    min_probability = 1e-15

    log_likelihoods = []
    for _ in range(NUM_CROSS_VALIDATION_STEPS):
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)
        model = train_model(X_train, y_train, **kwargs)
        # Use float64 so the lower bound above is representable.
        score = np.asarray(model.predict_proba(X_test), dtype=np.float64)
        # Plain integer array: a boolean or pandas-indexed label vector must
        # not be interpreted as a mask or aligned by index.
        labels = np.asarray(y_test).astype(int)
        true_class_probability = score[np.arange(labels.size), labels]
        log_like = np.log(np.clip(true_class_probability, min_probability, 1.0)).mean()
        log_likelihoods.append(log_like)
    logger.info(f"Values: {log_likelihoods}")
    return np.mean(log_likelihoods)


def train_optimized_model(X, y):
    """Tune hyperparameters with Gaussian-process optimization, then fit a final model.

    ``gp_minimize`` searches ``HYPERPARAMETER_SPACE`` for
    ``NUM_GP_OPTIMIZATION_STEPS`` evaluations, minimizing the negated
    ``evaluate_model`` score. The best point found is then used to train a
    model on all of ``(X, y)``.

    Parameters
    ----------
    X, y
        Training features and labels.

    Returns
    -------
    The model returned by ``train_model`` for the best hyperparameters.
    """

    @use_named_args(HYPERPARAMETER_SPACE)
    def loss(**kwargs):
        # gp_minimize minimizes, evaluate_model is "higher is better".
        return -evaluate_model(X, y, **kwargs)

    def fix_type(x):
        """Convert NumPy scalars to built-in Python numbers; pass anything else through.

        The checks use ``isinstance`` rather than ``np.dtype(x)``: calling
        ``np.dtype`` on a plain Python number raises ``TypeError``, and the
        ``np.float_`` alias no longer exists in NumPy 2.0.
        """
        # bool must be tested first: Python's bool is a subclass of int.
        if isinstance(x, (bool, np.bool_)):
            return bool(x)
        if isinstance(x, (int, np.integer)):
            return int(x)
        if isinstance(x, (float, np.floating)):
            return float(x)
        return x

    def get_optimization_callback():
        message = "Hyperparameter optimization iteration {:d}/{:d}. Current: {:5.3f}. Best: {:5.3f}."

        def callback(res):
            # func_vals holds the minimized loss; negate to report the score.
            current_value = -res.func_vals[-1]
            best_value = -res.func_vals.min()
            logger.info(message.format(
                len(res.func_vals),
                NUM_GP_OPTIMIZATION_STEPS,
                current_value,
                best_value,
            ))

        return callback

    logger.info("Will start hyperparameter optimization.")
    optimization_results = gp_minimize(
        loss,
        HYPERPARAMETER_SPACE,
        # Half of the budget, but between 1 and 10 evaluations, is spent on
        # random points before the surrogate model drives the search.
        n_random_starts=max(1, min(10, int(NUM_GP_OPTIMIZATION_STEPS / 2))),
        n_calls=NUM_GP_OPTIMIZATION_STEPS,
        random_state=RANDOM_STATE,
        callback=get_optimization_callback(),
    )
    logger.info("Finished hyperparameter optimization.")
    best_parameters = dict(
        zip(HYPERPARAMETER_NAMES, map(fix_type, optimization_results.x))
    )
    logger.info("Starting training of final model.")
    return train_model(X, y, verbosity=1, **best_parameters)


In [None]:
# Run the hyperparameter search and fit the final model on the training split;
# the %time magic reports how long the whole call took.
%time best_model = train_optimized_model(X_train, y_train)

In [None]:
# Per-class probabilities and hard labels for the held-out rows.
s_pred = best_model.predict_proba(X_test)
y_pred = best_model.predict(X_test)

# Precision / recall / F1 of the model's own label predictions.
default_report = skmetrics.classification_report(y_test, y_pred)
print(default_report)

In [None]:
# Confusion matrix of true vs. predicted labels, printed with rows normalized.
actual_labels = y_test.values
cm = pycm.ConfusionMatrix(
    actual_vector=actual_labels,
    predict_vector=y_pred,
)
cm.print_normalized_matrix()

In [None]:
# roc_curve expects a 1-D score per sample, while predict_proba returns one
# column per class. Column 1 is used as the positive-class probability
# (assumes the labels are 0/1 so that column 1 is the positive class).
fpr, tpr, roc_thres = skmetrics.roc_curve(y_test, s_pred[:, 1])

In [None]:
# Left panel: ROC curve. Right panel: TPR and FPR as functions of the threshold.
fig, (roc_ax, rates_ax) = plt.subplots(1, 2, figsize=(12, 4))

roc_ax.plot(fpr, tpr)
roc_ax.set_title("ROC Curve")

rates_ax.plot(roc_thres, tpr, label="TPR")
rates_ax.plot(roc_thres, fpr, label="FPR")
rates_ax.set_title("Positive Rates")
# Restrict the x axis to [0, 1].
rates_ax.set_xlim(0, 1)
rates_ax.legend()

fig.tight_layout()

In [None]:
# Estimate the mutual information between each probability column and the true
# label, map it through sqrt(1 - exp(-2 * MI)), and keep the smaller value.
score_label_mi = mutual_info_classif(s_pred, y_test)
total_correlation = np.sqrt(1 - np.exp(-2 * score_label_mi)).min()
print(total_correlation)

In [None]:
# The target is a single binary column, so the AUC is computed directly from
# the positive-class probability (column 1, assuming 0/1 labels). The previous
# label_binarize call relied on `code2period`, which this notebook never
# defines. NOTE(review): for a binary target the `average` setting is not
# expected to change the value.
skmetrics.roc_auc_score(y_test, s_pred[:, 1], average='micro')

In [None]:
# Same AUC with macro averaging requested. The positive-class probability
# (column 1, assuming 0/1 labels) replaces the label_binarize call, which
# depended on the undefined name `code2period`. NOTE(review): for a binary
# target the `average` setting is not expected to change the value.
skmetrics.roc_auc_score(y_test, s_pred[:, 1], average='macro')

In [None]:
# Number of thresholds to sweep: at most 100, and never more than the number
# of distinct predicted values.
n_points = min(100, np.unique(s_pred).size)

In [None]:
# Sweep decision thresholds over the positive-class probability. predict_proba
# returns one column per class; comparing the whole 2-D array against a
# threshold would not give one prediction per row. Column 1 is taken as the
# positive class (assumes 0/1 labels).
positive_scores = s_pred[:, 1]

# Stop just below the maximum score so that at least one row is still
# predicted positive at the last threshold.
thresholds = np.linspace(positive_scores.min(), positive_scores.max() * 0.999, n_points)

metrics = pd.DataFrame(
    [
        skmetrics.precision_recall_fscore_support(
            y_test, positive_scores >= threshold, average='binary'
        )
        for threshold in thresholds
    ],
    columns=["precision", "recall", "f1", "support"],
    index=thresholds,
)

# idxmax returns the index label, i.e. the threshold itself; argmax would
# return the integer position of the best row instead.
optimal_threshold = metrics.f1.idxmax()


In [None]:
# Left panel: precision against recall. Right panel: precision, recall and F1
# along the threshold sweep, with a vertical line at the selected threshold.
fig, (pr_ax, sweep_ax) = plt.subplots(1, 2, figsize=(12, 4))

pr_ax.plot(metrics.precision, metrics.recall)
pr_ax.set_title("Precision x Recall Curve")

swept_thresholds = metrics.index
for column, label in (("precision", "Precision"), ("recall", "Recall"), ("f1", "F_1 Score")):
    sweep_ax.plot(swept_thresholds, metrics[column], label=label)
sweep_ax.set_title("Positive Rates")
sweep_ax.axvline(optimal_threshold)
sweep_ax.legend()

fig.tight_layout()

In [None]:
# idxmax yields the threshold value (the index label of the best F1 row);
# argmax would yield its integer position, which is neither a valid label for
# metrics.loc nor a meaningful probability cut-off.
optimal_threshold = metrics.f1.idxmax()
print(metrics.loc[optimal_threshold])

# Re-label the held-out rows with the tuned threshold. Only the positive-class
# column (column 1, assuming 0/1 labels) is compared, giving one label per row,
# and `>=` matches the comparison used when the threshold sweep was scored.
y_pred = (s_pred[:, 1] >= optimal_threshold).astype(int)

In [None]:
# Classification report for the current y_pred against the held-out labels.
thresholded_report = skmetrics.classification_report(y_test, y_pred)
print(thresholded_report)

In [None]:
# Persist the fitted model. Prefer `save_artifact` when the estimator provides
# it, otherwise fall back to `save_model`.
# NOTE(review): the visible cells never import XGBClassifier; if it is
# xgboost's scikit-learn estimator, that class offers `save_model` and has no
# `save_artifact`, so the original call would raise AttributeError.
save_fitted_model = getattr(best_model, "save_artifact", None) or best_model.save_model
save_fitted_model("./within_year_model.pred")