In [None]:
import numpy as np
import pandas as pd
# https://www.kaggle.com/c/meta-kaggle
# Load the Meta Kaggle competitions table and count, per year, how often each
# evaluation metric was used in Featured/Research competitions since 2015.
comps = pd.read_csv("/kaggle/input/meta-kaggle/Competitions.csv")

# Columns describing the evaluation metric.
evaluation = [
    "EvaluationAlgorithmAbbreviation",
    "EvaluationAlgorithmName",
    "EvaluationAlgorithmDescription",
]
# Columns describing the competition itself.
compt = ["Title", "EnabledDate", "HostSegmentTitle"]

df = comps[compt + evaluation].copy()
df["year"] = pd.to_datetime(df.EnabledDate).dt.year.values
# Every row contributes 1, so summing "comps" counts rows.
df["comps"] = 1

# Boolean row filters; later cells reuse them.
time_select = df.year >= 2015
competition_type_select = df.HostSegmentTitle.isin(["Featured", "Research"])

# aggfunc="sum" (string) instead of np.sum: passing the NumPy callable emits a
# FutureWarning on recent pandas releases.
# The "All" margin row is dropped by label rather than by position, so the
# result no longer depends on the sort placing it first.
(
    pd.pivot_table(
        df[time_select & competition_type_select],
        values="comps",
        index=["EvaluationAlgorithmAbbreviation"],
        columns=["year"],
        fill_value=0.0,
        aggfunc="sum",
        margins=True,
    )
    .sort_values(by="All", ascending=False)
    .drop(index="All")
    .head(20)
)

In [None]:
# Preview the first five rows of the working frame.
df.head(n=5)

In [None]:
# List title and year of the selected competitions scored with one metric.
metric = "AUC"
metric_select = df["EvaluationAlgorithmAbbreviation"].eq(metric)
row_filter = time_select & competition_type_select & metric_select
print(df.loc[row_filter, ["Title", "year"]])

In [None]:
# Share of competitions per year whose evaluation metric was used exactly once
# among the selected (year / host segment filtered) competitions.
selected = df[time_select & competition_type_select]

# Kept as a GroupBy object because a later cell calls `counts.sum()` again.
counts = selected.groupby('EvaluationAlgorithmAbbreviation')

# numeric_only=True restricts the sums to the numeric columns ("year",
# "comps"). Since pandas 2.0 the default also tries to sum the text columns.
total_comps_per_year = selected.groupby('year').sum(numeric_only=True)
metric_totals = counts.sum(numeric_only=True)

# For a metric used once, the summed "year" is just that competition's year,
# so grouping the filtered rows by it counts single-use metrics per year.
single_metrics_per_year = (
    metric_totals[metric_totals.comps == 1].groupby('year').count()
)

# NOTE: despite its name, "pct_comps" is a fraction in [0, 1], not a percentage.
pct_single = (
    single_metrics_per_year['comps'] / total_comps_per_year['comps']
).rename('pct_comps')
table = total_comps_per_year.rename(columns={'comps': 'n_comps'}).join(pct_single)

print(table[['n_comps', 'pct_comps']])

In [None]:
# Abbreviations of the metrics whose summed "comps" equals 1, i.e. metrics
# that appear in exactly one of the selected competitions.
per_metric_sums = counts.sum()
used_once = per_metric_sums.comps == 1
print(per_metric_sums[used_once].index.values)

# Classification code

In [None]:
# Reference signature of scikit-learn's confusion matrix helper:
#
#     sklearn.metrics.confusion_matrix(
#         y_true, y_pred, *, labels=None, sample_weight=None, normalize=None
#     )
#
# The cell previously contained that definition signature as if it were a
# call. A bare `*` (keyword-only marker) is a SyntaxError in a call, and
# `sklearn`, `y_true` and `y_pred` were never defined, so it could not run.
# Display the signature of the installed version instead.
import inspect

from sklearn.metrics import confusion_matrix

inspect.signature(confusion_matrix)

## Function that you can use as a model for your own custom functions

In [None]:
from scipy.misc import derivative
import xgboost as xgb
def focal_loss(alpha, gamma):
    """Build a binary focal-loss objective for XGBoost's scikit-learn API.

    The per-sample loss on a raw score (logit) ``x`` with label ``y`` is::

        -(alpha*y + (1-alpha)*(1-y)) * (1 - p_t)**gamma * log(p_t)

    where ``p = sigmoid(x)`` and ``p_t = y*p + (1-y)*(1-p)``.

    Parameters
    ----------
    alpha : float
        Weight given to samples with ``y == 1``; samples with ``y == 0``
        are weighted ``1 - alpha``.
    gamma : float
        Exponent applied to ``1 - p_t``; ``gamma == 0`` reduces the loss to
        an alpha-weighted log loss.

    Returns
    -------
    callable
        ``loss_func(y_true, y_pred) -> (grad, hess)``. XGBoost's scikit-learn
        wrapper calls a custom objective with the labels first and the raw
        predictions second, so the parameters are declared in that order.
        ``grad`` and ``hess`` are the first and second derivatives of the loss
        with respect to ``y_pred``, element-wise.
    """
    # Finite-difference step; same value that was previously passed as `dx`.
    step = 1e-6

    def get_loss(logits, labels):
        # log(sigmoid(x)) and log(1 - sigmoid(x)) via logaddexp: stays finite
        # for saturated logits, where log(p) / log(1 - p) would produce -inf
        # and `0 * -inf` would turn the loss into NaN.
        log_p = -np.logaddexp(0.0, -logits)
        log_not_p = -np.logaddexp(0.0, logits)
        p = np.exp(log_p)

        class_weight = alpha * labels + (1 - alpha) * (1 - labels)
        p_correct = labels * p + (1 - labels) * (1 - p)
        log_likelihood = labels * log_p + (1 - labels) * log_not_p
        return -class_weight * (1 - p_correct) ** gamma * log_likelihood

    def loss_func(y_true, y_pred):
        labels = np.asarray(y_true, dtype=float)
        logits = np.asarray(y_pred, dtype=float)

        # Three-point central differences, written out explicitly so the
        # function does not depend on `scipy.misc.derivative` (removed from
        # SciPy in 1.12). Same stencil as derivative(..., order=3).
        loss_minus = get_loss(logits - step, labels)
        loss_mid = get_loss(logits, labels)
        loss_plus = get_loss(logits + step, labels)

        grad = (loss_plus - loss_minus) / (2 * step)
        hess = (loss_plus - 2 * loss_mid + loss_minus) / step ** 2
        return grad, hess

    return loss_func

# Classifier trained with the custom focal-loss objective.
# NOTE(review): this rebinds `xgb` from the xgboost module alias to the
# classifier instance; after this line `xgb.<module attribute>` no longer
# works. Name kept because code outside this cell may rely on it.
focal_objective = focal_loss(alpha=0.25, gamma=1)
xgb = xgb.XGBClassifier(objective=focal_objective)

### The solution, when your predicted probabilities are misaligned with the training distribution of the target, is to use the calibration function provided by Scikit-learn, CalibratedClassifierCV:

In [None]:
# Reference signature of scikit-learn's probability calibration wrapper:
#
#     sklearn.calibration.CalibratedClassifierCV(
#         base_estimator=None, *, method='sigmoid', cv=None, n_jobs=None,
#         ensemble=True
#     )
#
# The cell previously contained that definition signature as if it were a
# call. A bare `*` (keyword-only marker) is a SyntaxError in a call, and
# `sklearn` was never imported, so it could not run.
# NOTE(review): newer scikit-learn releases name the first parameter
# `estimator` instead of `base_estimator` - confirm against the installed
# version, whose signature is displayed below.
import inspect

from sklearn.calibration import CalibratedClassifierCV

inspect.signature(CalibratedClassifierCV)

# Chapter 6 - Designing Good Validation

## Adversarial validation

In [None]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import cross_val_predict
from sklearn.metrics import roc_auc_score
# Read the train and test tables of the playground competition, in that order.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/tabular-playground-series-jan-2021/train.csv",
        "/kaggle/input/tabular-playground-series-jan-2021/test.csv",
    )
)

In [None]:
# Build the adversarial-validation dataset: stack train and test features and
# label each row by its origin (0 = train, 1 = test).
#
# `train` / `test` are rebound to their feature-only versions, so on a second
# run of this cell "id" / "target" are already gone. errors="ignore" makes the
# drop a no-op in that case instead of raising KeyError.
train = train.fillna(-1).drop(columns=["id", "target"], errors="ignore")
test = test.fillna(-1).drop(columns=["id"], errors="ignore")

X = pd.concat([train, test], ignore_index=True)
y = [0] * len(train) + [1] * len(test)

In [None]:
# Preview the first five rows of the stacked feature frame.
X.head(n=5)

In [None]:
# Out-of-fold class probabilities for the train-vs-test classifier.
# random_state is fixed so the forest, and therefore the AUC computed from
# these predictions, is the same on every run.
model = RandomForestClassifier(random_state=0)
cv_preds = cross_val_predict(
    model,
    X,
    y,
    cv=5,
    n_jobs=-1,
    method='predict_proba',
)

In [None]:
# Score the train-vs-test classifier using the second probability column
# (index 1) of the cross-validated predictions.
test_membership_proba = cv_preds[:, 1]
adversarial_auc = roc_auc_score(y_true=y, y_score=test_membership_proba)
print(adversarial_auc)

# The Tabular Playground Series