In [17]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from functools import partial

from sklearn.datasets import make_hastie_10_2, make_classification
from sklearn.model_selection import GridSearchCV, cross_validate, KFold
from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score, f1_score, precision_score, roc_auc_score, average_precision_score, recall_score, precision_recall_curve
from sklearn.ensemble import RandomForestClassifier

# Metrics to measure
- Overall F1 Score
- Precision @0.1 increments of probability
- Recall @0.1 increments of probability
- Count trades @0.1 increments of probability
- Mean trade returns @0.1 increments of probability
- 25/50/75 percentile trade returns @0.1 increments of probability
- min/max @0.1 increments of probability
- cum prod returns (from 1.0 to 0.0) @0.1 increments of probability (based on last value in window)
- threshold that reflects max cum prod returns

In [18]:
X, y = make_classification(n_samples=2000, n_features=10, n_informative=5, n_classes=3, )

In [19]:
col_names = ["gfs", "gh", "lo", "we", "cv", "az", "my", "qd", "xf", "ie"]

In [20]:
X = pd.DataFrame(X, columns=col_names)
y = pd.Series(y)

In [21]:
y

0       1
1       1
2       2
3       0
4       0
       ..
1995    2
1996    0
1997    1
1998    1
1999    0
Length: 2000, dtype: int64

In [22]:
returns = pd.DataFrame(np.random.random(size=len(X))/10 - 0.05, columns=["returns"], index=X.index)
returns

Unnamed: 0,returns
0,-0.017060
1,0.010479
2,-0.013460
3,0.043824
4,-0.008341
...,...
1995,-0.043810
1996,-0.002460
1997,0.023086
1998,0.028992


In [50]:
returns_df = pd.DataFrame(index=returns.index)
returns_df["returns_long"] = returns + 1.0
returns_df["returns_short"] = -returns + 1.0
returns_df

Unnamed: 0,returns_long,returns_short
0,0.982940,1.017060
1,1.010479,0.989521
2,0.986540,1.013460
3,1.043824,0.956176
4,0.991659,1.008341
...,...,...
1995,0.956190,1.043810
1996,0.997540,1.002460
1997,1.023086,0.976914
1998,1.028992,0.971008


In [24]:
y = y.map({0: "long", 1: "short", 2: "loss"})
y

0       short
1       short
2        loss
3        long
4        long
        ...  
1995     loss
1996     long
1997    short
1998    short
1999     long
Length: 2000, dtype: object

In [127]:
def get_mc_describe_returns(estimator, X, y, returns_df, title):
    classes_to_describe = [c for c in estimator.classes_ if c not in ["loss", "not_minmax_slow"]]
    for idx, c in enumerate(classes_to_describe):
        X_copy = X.copy()
        X_copy["pred_proba"] = estimator.predict_proba(X)[:, idx]

        if "short" in c: return_col = "returns_short"
        else: return_col = "returns_long"

        returns = returns_df.loc[X_copy.index, return_col]
        bins = np.linspace(start=0, stop=1, num=11)
        stats = returns.groupby(pd.cut(X_copy["pred_proba"], bins=bins)).agg("describe")
        print(f"Trade Returns by Threshold: {title} ({c})")
        print(stats)

    return 0


def get_mc_describe_cumprod_returns(estimator, X, y, returns_df, title):
    classes_to_describe = [c for c in estimator.classes_ if c not in ["loss", "not_minmax_slow"]]
    for idx, c in enumerate(classes_to_describe):
        X_copy = X.copy()
        X_copy["pred_proba"] = estimator.predict_proba(X)[:, idx]
        X_copy = X_copy.sort_values(by="pred_proba", ascending=False)

        if "short" in c: return_col = "returns_short"
        else: return_col = "returns_long"

        returns = returns_df.loc[X_copy.index, return_col]
        X_copy["cumprod_returns"] = returns.cumprod()

        bins = np.linspace(start=0, stop=1, num=11)
        stats = X_copy["cumprod_returns"].groupby(
            pd.cut(X_copy["pred_proba"], bins=bins)).agg("describe")
        print(f"Cumulative Product Returns by Threshold: {title} ({c})")
        print(stats)

    return 0

def get_mc_threshold_max_cumprod_returns(estimator, X, y, returns_df, target):
    classes_to_describe = [c for c in estimator.classes_]
    for idx, c in enumerate(classes_to_describe):
    
        if target == c:
            X_copy = X.copy()
            X_copy["pred_proba"] = estimator.predict_proba(X_copy)[:, idx]
            X_copy = X_copy.sort_values(by="pred_proba", ascending=False)

            if "short" in c:
                return_col = "returns_short"
            else:
                return_col = "returns_long"

            returns = returns_df.loc[X_copy.index, return_col]
            X_copy["cumprod_returns"] = returns.cumprod()

            max_cumprod_returns_idx = X_copy["cumprod_returns"].argmax()
            max_cumprod_returns_threshold = X_copy.iloc[max_cumprod_returns_idx]["pred_proba"]

            return max_cumprod_returns_threshold
        else:
            return -999

In [87]:
def get_mc_describe_returns(estimator, X, y, returns, title):
    classes_to_describe = [c for c in estimator.classes_ if c not in ["loss", "not_minmax_fast"]]
    for idx, c in enumerate(classes_to_describe):
    
        X_copy = X.copy()
        X_copy["pred_proba"] = estimator.predict_proba(X)[:, idx]
        returns = returns.loc[X_copy.index]
        bins = np.linspace(start=0, stop=1, num=11)
        stats = returns["returns"].groupby(pd.cut(X_copy["pred_proba"], bins=bins)).agg("describe")
        print(f"Trade Returns by Threshold: {title} ({c})")
        print(stats)
    return 0

In [88]:
def get_mc_describe_precision_recall(estimator, X, y, title):
    classes_to_describe = [c for c in estimator.classes_ if "loss" not in c]
    for idx, c in enumerate(classes_to_describe):
    
        precision, recall, thresholds = precision_recall_curve(
            y, estimator.predict_proba(X)[:, idx], pos_label=c)

        pr_summary = np.column_stack([precision, recall, np.append(thresholds, [1])])
        pr_summary_df = pd.DataFrame(pr_summary, columns=["precision", "recall", "thresholds"])
        bins = np.linspace(start=0, stop=1, num=11)

        pr_agg_df = pr_summary_df[["precision", "recall"]].groupby(
            pd.cut(pr_summary_df["thresholds"], bins=bins)).agg(["mean", "median", "std"])
        print(f"Precision Recall by Threshold: {title} ({c})")
        print(pr_agg_df)
    
    return 0

In [89]:
def get_describe_cumprod_returns(estimator, X, y, returns, title):
    classes_to_describe = [c for c in estimator.classes_ if "loss" not in c]
    for idx, c in enumerate(classes_to_describe):
    
        X_copy = X.copy()
        X_copy["pred_proba"] = estimator.predict_proba(X)[:, idx]
        X_copy = X_copy.sort_values(by="pred_proba", ascending=False)

        returns = returns.loc[X_copy.index]
        X_copy["cumprod_returns"] = returns.cumprod()

        bins = np.linspace(start=0, stop=1, num=11)
        stats = X_copy["cumprod_returns"].groupby(pd.cut(X_copy["pred_proba"], bins=bins)).agg("describe")
        print(f"Cumulative Product Returns by Threshold: {title} ({c})")
        print(stats)
    return 0

In [128]:
mc_scores = {
    "EmptyMCDescribeCumProdReturns": partial(get_mc_describe_cumprod_returns, returns_df=returns_df, title="SOLUSDT Short"),
    "EmptyMCDescribeReturns": partial(get_mc_describe_returns, returns_df=returns_df, title="SOLUSDT Short"),
    "ThresholdMCMaxCumProdReturns": partial(get_mc_threshold_max_cumprod_returns, returns_df=returns_df, target="longg"),
}

In [129]:
cv = KFold(2)
clf = RandomForestClassifier(random_state=42)
cross_validate(estimator=clf, X=X, y=y, scoring=mc_scores, cv=cv)

Cumulative Product Returns by Threshold: SOLUSDT Short (long)
            count      mean       std       min       25%       50%       75%  \
pred_proba                                                                      
(0.0, 0.1]  395.0  1.162776  0.349298  0.529383  0.908287  1.126604  1.509932   
(0.1, 0.2]   82.0  1.273692  0.110775  1.063852  1.196619  1.274764  1.326349   
(0.2, 0.3]   37.0  1.020721  0.114379  0.865802  0.923170  0.992211  1.118049   
(0.3, 0.4]   21.0  0.904937  0.043780  0.832072  0.871144  0.913588  0.939418   
(0.4, 0.5]   35.0  0.814303  0.041103  0.748108  0.785038  0.811963  0.841462   
(0.5, 0.6]   40.0  0.767863  0.066726  0.660460  0.712007  0.757699  0.798547   
(0.6, 0.7]   45.0  0.858422  0.042770  0.771521  0.827509  0.848032  0.892382   
(0.7, 0.8]   70.0  0.851190  0.053176  0.754614  0.812915  0.850643  0.887250   
(0.8, 0.9]  108.0  0.926892  0.082696  0.740576  0.886545  0.933136  0.990009   
(0.9, 1.0]   79.0  1.009849  0.051966  0.927921

{'fit_time': array([0.16421294, 0.14098001]),
 'score_time': array([0.08299589, 0.07940412]),
 'test_EmptyMCDescribeCumProdReturns': array([0, 0]),
 'test_EmptyMCDescribeReturns': array([0, 0]),
 'test_ThresholdMCMaxCumProdReturns': array([-999, -999])}

# Binary Class

In [None]:
def get_precision_at_threshold(estimator, X, y, desired_threshold):
    pos_label = np.sort(y.unique())[0]
    precision, recall, thresholds = precision_recall_curve(
        y, estimator.predict_proba(X)[:, 0], pos_label=pos_label)
    
    desired_threshold_idx = np.argmax(thresholds >= desired_threshold)

    return precision[desired_threshold_idx]

In [None]:
def get_recall_at_threshold(estimator, X, y, desired_threshold):
    pos_label = np.sort(y.unique())[0]
    precision, recall, thresholds = precision_recall_curve(
        y, estimator.predict_proba(X)[:, 0], pos_label=pos_label)
    
    desired_threshold_idx = np.argmax(thresholds >= desired_threshold)

    return recall[desired_threshold_idx]

In [None]:
def get_count(estimator, X, y, lower, upper):
    X_copy = X.copy()
    X_copy["pred_proba"] = estimator.predict_proba(X)[:, 0]
    idx = (X_copy["pred_proba"] >= lower) & (X_copy["pred_proba"] <= upper)
    count = np.sum(idx)

    return count

In [None]:
def get_mean_returns(estimator, X, y, returns, lower, upper):
    X_copy = X.copy()
    X_copy["pred_proba"] = estimator.predict_proba(X)[:, 0]
    returns = returns.loc[X_copy.index]
    idx = (X_copy["pred_proba"] >= lower) & (X_copy["pred_proba"] <= upper)
    return returns.loc[idx, "returns"].mean()

In [None]:
def get_prod_returns(estimator, X, y, returns, lower, upper):
    X_copy = X.copy()
    X_copy["pred_proba"] = estimator.predict_proba(X)[:, 0]
    returns = returns.loc[X_copy.index]
    idx = (X_copy["pred_proba"] >= lower) & (X_copy["pred_proba"] <= upper)
    return returns.loc[idx, "returns"].product()

In [None]:
def get_threshold_max_cumprod_returns(estimator, X, y, returns):
    X_copy = X.copy()
    X_copy["pred_proba"] = estimator.predict_proba(X)[:, 0]
    returns = returns.loc[X_copy.index]
    X_copy = X_copy.sort_values(by="pred_proba", ascending=False)
    X_copy["cumprod_returns"] = returns["returns"].cumprod()
    max_cumprod_returns_idx = X_copy["cumprod_returns"].argmax()
    
    return X_copy.iloc[max_cumprod_returns_idx]["pred_proba"]

In [None]:
def get_value_max_cumprod_returns(estimator, X, y, returns):
    X_copy = X.copy()
    X_copy["pred_proba"] = estimator.predict_proba(X)[:, 0]
    returns = returns.loc[X_copy.index]
    X_copy = X_copy.sort_values(by="pred_proba", ascending=False)
    X_copy["cumprod_returns"] = returns["returns"].cumprod()
    max_cumprod_returns_idx = X_copy["cumprod_returns"].argmax()
    
    return X_copy.iloc[max_cumprod_returns_idx]["cumprod_returns"]

In [None]:
def get_describe_returns(estimator, X, y, returns, title):
    pos_label = np.sort(y.unique())[0]
    X_copy = X.copy()
    X_copy["pred_proba"] = estimator.predict_proba(X)[:, 0]
    returns = returns.loc[X_copy.index]
    bins = np.linspace(start=0, stop=1, num=11)
    stats = returns["returns"].groupby(pd.cut(X_copy["pred_proba"], bins=bins)).agg("describe")
    print(f"Trade Returns by Threshold: {title} ({pos_label})")
    print(stats)
    return 0

In [None]:
def get_describe_precision_recall(estimator, X, y, title):
    pos_label = np.sort(y.unique())[0]
    precision, recall, thresholds = precision_recall_curve(
        y, estimator.predict_proba(X)[:, 0], pos_label=pos_label)
    
    pr_summary = np.column_stack([precision, recall, np.append(thresholds, [1])])
    pr_summary_df = pd.DataFrame(pr_summary, columns=["precision", "recall", "thresholds"])
    bins = np.linspace(start=0, stop=1, num=11)
    
    pr_agg_df = pr_summary_df[["precision", "recall"]].groupby(
        pd.cut(pr_summary_df["thresholds"], bins=bins)).agg(["mean", "median", "std"])
    print(f"Precision Recall by Threshold: {title} ({pos_label})")
    print(pr_agg_df)
    
    return 0

In [None]:
test_scores = {
    "Precision[0.25-1.0]": partial(get_precision_at_threshold, desired_threshold=0.25),
    "Precision[0.50-1.0]": partial(get_precision_at_threshold, desired_threshold=0.50),
    "Precision[0.75-1.0]": partial(get_precision_at_threshold, desired_threshold=0.75),
    "Recall[0.25-1.0]": partial(get_recall_at_threshold, desired_threshold=0.25),
    "Recall[0.50-1.0]": partial(get_recall_at_threshold, desired_threshold=0.50),
    "Recall[0.75-1.0]": partial(get_recall_at_threshold, desired_threshold=0.75),
    "Count[0.25-1.0]": partial(get_count, lower=0.25, upper=1.0),
    "Count[0.50-1.0]": partial(get_count, lower=0.50, upper=1.0),
    "Count[0.75-1.0]": partial(get_count, lower=0.75, upper=1.0),
    "AvgReturns[0.25-1.0]": partial(get_mean_returns, returns=returns, lower=0.25, upper=1.0),
    "AvgReturns[0.50-1.0]": partial(get_mean_returns, returns=returns, lower=0.50, upper=1.0),
    "AvgReturns[0.75-1.0]": partial(get_mean_returns, returns=returns, lower=0.75, upper=1.0),
    "ProdReturns[0.25-1.0]": partial(get_prod_returns, returns=returns, lower=0.25, upper=1.0),
    "ProdReturns[0.50-1.0]": partial(get_prod_returns, returns=returns, lower=0.50, upper=1.0),
    "ProdReturns[0.75-1.0]": partial(get_prod_returns, returns=returns, lower=0.75, upper=1.0),
    "ThresholdMaxCumProdReturns": partial(get_threshold_max_cumprod_returns, returns=returns),
    "ValueMaxCumProdReturns": partial(get_value_max_cumprod_returns, returns=returns),
    "EmptyDescribeReturns": partial(get_describe_returns, returns=returns, title="SOLUSDT Short"),
    "EmptyDescribePrecisionRecall": partial(get_describe_precision_recall, title="SOLUSD Short")
}

In [13]:
cv = KFold(5)
clf = RandomForestClassifier()
cross_validate(estimator=clf, X=X, y=y, scoring=test_scores, cv=cv)

NameError: name 'test_scores' is not defined

In [14]:
clf.fit(X, y)

In [15]:
clf.classes_

array(['long', 'loss', 'short'], dtype=object)

In [None]:
bins = np.linspace(start=0, stop=1, num=11)
stats = X["returns"].groupby(pd.cut(X["gfs"], bins=bins)).mean()

In [None]:
stats

In [None]:
X.sort_values(by="gfs").expanding().mean()

In [None]:
X.sort_values(by="gfs")

In [None]:
d = pd.Series(["trtr", "tr", "hy"], name="n")

In [None]:
d.map({"tr": 1})

In [None]:
l = ["a_win", "long_loss"]

In [None]:
np.sort(y.unique())[0]

In [None]:
{v: i for i, v in enumerate(np.flip(np.sort(d.unique())))}

In [None]:
np.flip(np.sort(d.unique()))

In [None]:
X["returns"].product()

In [134]:
t = np.array([-999, 20, -999])

In [135]:
np.mean([t for t in t if t != -999])

20.0

In [136]:
if -999 in t:
    print("y")

y
