In [None]:
import pandas as pd
import statsmodels.api as sm
from sklearn.tree import plot_tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Read the per-batch result CSVs into a single DataFrame.
# The results come from two runs with different file prefixes and a different
# number of subsets per file, so each run is described by one spec:
#   (file prefix, batch indices to read, subsets per batch file)
batch_specs = [
    ('first_big_sim_batch_', range(0, 90), 3),
    ('first_big_sim_batch_small_', range(271, 324), 1),
]
# NOTE(review): assuming file-local subset ids start at 0, the first run
# yields global ids 0..269 and the second 271..323, so id 270 is never
# produced -- confirm that gap is intentional.

# Collect the frames and concatenate once; calling pd.concat inside the loop
# re-copies the accumulated data on every iteration.
frames = []
for file_prefix, batch_indices, batch_size in batch_specs:
    for i in batch_indices:
        df = pd.read_csv('results/' + file_prefix + str(i) + '.csv')
        # Shift the file-local subset id by the batch offset so ids stay
        # distinct across files.
        df['subset'] = df['subset'] + i * batch_size
        frames.append(df)

simulation_result = pd.concat(frames)

In [None]:
# Recreate the swept parameter set: pass the existing `system_params` through
# `create_par_sweep` and rebind the name to the result.
# NOTE(review): neither `system_params` nor `create_par_sweep` is defined or
# imported in the visible part of this notebook, so this cell fails on a fresh
# kernel unless they are provided elsewhere -- confirm and add the missing
# import/definition.
# NOTE(review): the cell overwrites its own input, so re-running it feeds the
# previous result back into `create_par_sweep` -- confirm that is safe.
system_params = create_par_sweep(system_params)

In [None]:
# KPI per parameter subset: True when Agg_APY fell below 0.05 in at least one
# row of that subset's simulation output.
# NOTE(review): the original author was unsure whether this flag should be
# negated (`~`). As written, True means "APY dropped below the threshold",
# while the plots downstream label the True class 'Success' -- confirm the
# intended polarity.
df_kpi = (
    simulation_result
    .groupby('subset')['Agg_APY']
    .apply(lambda apy: (apy < 0.05).any())
    .rename('Agg_APY')  # explicit name so later cells can select the KPI column
    .reset_index()
)

# Attach the swept parameters: for every key, add a column named `key` holding
# system_params[key][subset] for the row's subset id.
# (Joining the unnamed Series returned by DataFrame.apply raises
# "Other Series must have a name", so the columns are assigned directly.)
for key in system_params.keys():
    param_values = system_params[key]
    df_kpi[key] = [param_values[subset] for subset in df_kpi['subset']]


In [None]:
def analyze_success(X, Y, name, random_state=None):
    """Print and plot which predictors are associated with a binary KPI.

    The report has three parts:

    1. A logistic regression of ``Y`` on ``X``; predictors with a p-value
       below 0.05 are printed with their coefficients.
    2. A depth-3 decision tree (plotted) and a depth-6 random forest whose
       feature importances are shown as a bar chart.
    3. One KDE plot per predictor, split by the value of ``Y``.

    Parameters
    ----------
    X : pandas.DataFrame
        Predictor matrix. A column named ``'const'``, if present, is treated
        as the intercept: it is left out of the significant-predictor table
        and of the KDE plots.
    Y : pandas.Series, single-column pandas.DataFrame, or 1-D sequence
        Binary outcome with one value per row of ``X``.
    name : object
        Label printed in the report header.
    random_state : int or None, optional
        Seed forwarded to the decision tree and the random forest so the
        plots can be reproduced. ``None`` (default) keeps them unseeded.

    Raises
    ------
    ValueError
        If ``Y`` is a DataFrame with more than one column.
    """
    # Work with a 1-D target everywhere: sklearn warns on column vectors and
    # seaborn needs a vector for `hue`.
    if isinstance(Y, pd.DataFrame):
        if Y.shape[1] != 1:
            raise ValueError(
                "Y must have exactly one column, got {}".format(Y.shape[1]))
        target = Y.iloc[:, 0]
    elif isinstance(Y, pd.Series):
        target = Y
    else:
        target = pd.Series(Y, index=X.index)

    print("-"*50)
    print("KPI: {}".format(name))
    print("-"*50)
    print()
    print()

    # Fit the logistic regression. Boolean outcomes are passed as 0/1 so the
    # likelihood is computed on numeric data.
    logit_target = target.astype(int) if target.dtype == bool else target
    logit_fit = sm.Logit(logit_target, X).fit()

    # Keep only the statistically significant predictors (intercept excluded;
    # errors='ignore' so an X without a 'const' column does not raise).
    predictors = pd.concat([logit_fit.params, logit_fit.pvalues], axis=1)
    predictors.columns = ['coef', 'p-value']
    predictors = predictors.drop(index='const', errors='ignore')
    predictors = predictors[predictors['p-value'] < .05]
    print("Statistically Significant Predictors (Logistic Regression):")
    print(predictors)
    print()
    print()

    fig, axes = plt.subplots(nrows=2,
                             figsize=(15, 12),
                             gridspec_kw={'height_ratios': [3, 1]})

    tree = DecisionTreeClassifier(max_depth=3, random_state=random_state)
    forest = RandomForestClassifier(max_depth=6, random_state=random_state)
    tree.fit(X, target)
    forest.fit(X, target)

    feature_names = list(X.columns)
    importance = (pd.DataFrame({'features': feature_names,
                                'importance': forest.feature_importances_})
                  .sort_values(by='importance', ascending=False)
                  )

    plot_tree(tree,
              rounded=True,
              proportion=True,
              fontsize=8,
              feature_names=feature_names,
              class_names=['Failure', 'Success'],
              filled=True,
              ax=axes[0])

    axes[0].set_title(
        f'Decision tree, score: {tree.score(X, target) :.0%}. N: {len(X) :.2e}')
    sns.barplot(data=importance,
                x=importance.features,
                y=importance.importance,
                ax=axes[1],
                label='small')
    plt.setp(axes[1].xaxis.get_majorticklabels(), rotation=45)
    axes[1].set_title('Feature Importance')

    plt.show()

    print()
    print()

    # KDE of every predictor split by outcome. The grid is sized from the
    # number of predictors (at most 4 per row) instead of a fixed 4x4 layout,
    # and the data comes from the arguments rather than notebook globals.
    variables = [col for col in X.columns if col != "const"]
    if variables:
        n_cols = min(4, len(variables))
        n_rows = -(-len(variables) // n_cols)  # ceiling division
        fig, axs = plt.subplots(nrows=n_rows, ncols=n_cols,
                                figsize=(5 * n_cols, 5 * n_rows),
                                squeeze=False)  # always a 2-D array of axes
        flat_axes = axs.ravel()
        for ax, var in zip(flat_axes, variables):
            sns.kdeplot(data=X, x=var, hue=target, ax=ax, common_norm=False,
                        common_grid=False)
            ax.set_title("{} KDE by Success".format(var))
        # Hide the unused cells of the last row.
        for ax in flat_axes[len(variables):]:
            ax.set_visible(False)
        plt.show()

    # Vertical spacing between consecutive reports.
    for _ in range(20):
        print()

In [None]:
# Predictors for the analysis, taken from the KPI frame built above (the
# previously referenced `kpis` is not defined anywhere in the visible part of
# this notebook).
# NOTE(review): these three columns are presumably keys of `system_params`
# attached to `df_kpi` -- confirm they exist there.
X = df_kpi[['weekly_lock_prob','weekly_vote_success_prob','weekly_consume_multiple']]

# Scale to standard normal so the logistic-regression coefficients are
# comparable across predictors, then add the intercept column.
X = (X - X.mean()) / X.std()
X['const'] = 1

# Target column name. A plain string (not a one-element list) makes Y a
# 1-D Series and gives a readable label in the report header.
y = 'Agg_APY'
Y = df_kpi[y]

In [None]:
# Run the report (logit summary, tree/forest plots, per-predictor KDEs) on the
# prepared predictors and target; `y` is the label printed in the header.
analyze_success(X, Y, y)