In [17]:
# USE 'BASE' ENVIRONMENT

import numpy as np
import pandas as pd
# from statsmodels.stats.proportion import proportions_ztest
from scipy.stats import fisher_exact
from matplotlib import pyplot as plt

import warnings
warnings.filterwarnings('ignore')

In [18]:
# Preview: read the first two iteration files for the tau_0_05 directory and
# stack them row-wise.
input_path = "../scores/taus/tau_0_05/scores_11_stocks_0_05_iter_1.csv"
df_1 = pd.read_csv(input_path, index_col=0)

input_path = "../scores/taus/tau_0_05/scores_11_stocks_0_05_iter_2.csv"
df_2 = pd.read_csv(input_path, index_col=0)

# No ignore_index here, so each file keeps the row labels it was read with.
df_concated = pd.concat([df_1, df_2])

# Only the last expression of a cell is rendered, so this is the single display.
df_concated

Unnamed: 0,clayton_random+gauss_dist+val,clayton_random+gauss_dist+score,clayton_random+t_dist+val,clayton_random+t_dist+scor,gaussian+gauss_dist+val,gaussian+gauss_dist+score,gaussian+t_dist+val,gaussian+t_dist+scor,t_student+gauss_dist+val,t_student+gauss_dist+score,t_student+t_dist+val,t_student+t_dist+scor
0,0.005712,0.000273,0.005414,0.000263,0.002106,0.000228,0.001827,0.000219,0.001616,0.000215,0.001418,0.000208
1,0.003400,0.000282,0.003240,0.000261,0.000979,0.000240,0.000806,0.000220,0.000525,0.000219,0.000433,0.000202
2,0.003057,0.000332,0.002739,0.000311,0.000068,0.000290,-0.000181,0.000271,0.000017,0.000267,-0.000170,0.000250
3,0.001699,0.000354,0.001528,0.000330,-0.000568,0.000326,-0.000586,0.000306,-0.000823,0.000321,-0.000798,0.000302
4,0.002083,0.000397,0.001678,0.000365,-0.000713,0.000352,-0.000860,0.000328,-0.000903,0.000338,-0.001011,0.000316
...,...,...,...,...,...,...,...,...,...,...,...,...
59,0.001238,0.000514,0.001052,0.000514,-0.000256,0.000520,-0.000433,0.000519,-0.000529,0.000493,-0.000679,0.000493
60,0.001016,0.000518,0.000759,0.000521,-0.000204,0.000513,-0.000450,0.000516,-0.000257,0.000495,-0.000479,0.000499
61,0.000819,0.000507,0.000612,0.000510,-0.000510,0.000505,-0.000719,0.000508,-0.000382,0.000504,-0.000569,0.000507
62,0.001510,0.000538,0.001193,0.000542,-0.000338,0.000531,-0.000640,0.000534,-0.000388,0.000505,-0.000646,0.000509


In [19]:
def get_taus_complex_models_scores(taus, n_iters=10):
    """Load and stack the per-iteration complex-model score files for each tau.

    For every tau the files
    ``../scores/taus/tau_<tau>/scores_11_stocks_<tau>_iter_<i>.csv`` are read
    for ``i = 1 .. n_iters`` and concatenated row-wise, where ``<tau>`` is
    ``str(tau)`` with ``.`` replaced by ``_``.

    Args:
        taus: Iterable of tau values; each value becomes a key of the result.
        n_iters: Number of iteration files to read per tau. Defaults to 10,
            which is what this function always read before the parameter
            existed.

    Returns:
        dict mapping each tau to the concatenated DataFrame of its files.
        Row labels are kept as read (``pd.concat`` without ``ignore_index``).

    Raises:
        Whatever ``pd.read_csv`` raises for a missing/unreadable file, and
        ``ValueError`` from ``pd.concat`` when ``n_iters`` is 0.
    """
    scores = {}
    for tau in taus:
        # Same for every iteration of this tau, so build it once.
        # NOTE: str() of very small floats uses scientific notation
        # (e.g. 1e-05), which contains no "." to replace.
        tau_str = str(tau).replace(".", "_")
        # `iteration` instead of `iter`, which would shadow the builtin.
        tau_dfs_list = [
            pd.read_csv(
                r"../scores/taus/" + f'tau_{tau_str}/scores_11_stocks_{tau_str}_iter_{iteration}.csv',
                index_col=0,
            )
            for iteration in range(1, n_iters + 1)
        ]
        scores[tau] = pd.concat(tau_dfs_list)
    return scores

def get_multivariate_scores_dfs(taus, n_iters=10):
    """Load and stack the per-iteration multivariate score files for each tau.

    For every tau the files
    ``multivariate_scores/taus/tau_<tau>/multivariate_dists_scores_11_stocks_<tau>_iter_<i>.csv``
    are read for ``i = 1 .. n_iters`` and concatenated row-wise, where
    ``<tau>`` is ``str(tau)`` with ``.`` replaced by ``_``. The path is
    relative to the current working directory.

    Args:
        taus: Iterable of tau values; each value becomes a key of the result.
        n_iters: Number of iteration files to read per tau. Defaults to 10,
            which is what this function always read before the parameter
            existed.

    Returns:
        dict mapping each tau to the concatenated DataFrame of its files.
        Row labels are kept as read (``pd.concat`` without ``ignore_index``).

    Raises:
        Whatever ``pd.read_csv`` raises for a missing/unreadable file, and
        ``ValueError`` from ``pd.concat`` when ``n_iters`` is 0.
    """
    scores = {}
    for tau in taus:
        # Same for every iteration of this tau, so build it once.
        # NOTE: str() of very small floats uses scientific notation
        # (e.g. 1e-05), which contains no "." to replace.
        tau_str = str(tau).replace(".", "_")
        # `iteration` instead of `iter`, which would shadow the builtin.
        tau_dfs_list = [
            pd.read_csv(
                r"multivariate_scores/taus/" + f'tau_{tau_str}/multivariate_dists_scores_11_stocks_{tau_str}_iter_{iteration}.csv',
                index_col=0,
            )
            for iteration in range(1, n_iters + 1)
        ]
        scores[tau] = pd.concat(tau_dfs_list)
    return scores

def get_test_results(data, lower_bound, upper_bound, p_value_threshold):
    """Report, for each "val" column, the share of values inside a closed interval.

    Only columns whose name contains the substring "val" are examined.

    Args:
        data: DataFrame holding the columns to check.
        lower_bound: Lower end of the interval (inclusive).
        upper_bound: Upper end of the interval (inclusive).
        p_value_threshold: The observed share is compared against
            ``1 - p_value_threshold``.

    Returns:
        DataFrame with one row per examined column and the columns
        "Column", "Observed Proportion", "Percentile 2.5",
        "Percentile 97.5" and "Reject H0". "Reject H0" is True when the
        observed share is strictly below ``1 - p_value_threshold``.
    """
    # Identical for every column, so it is computed once.
    required_share = 1 - p_value_threshold

    def _summarise(name):
        """Build the result row for one column."""
        values = data[name]
        inside = values.ge(lower_bound) & values.le(upper_bound)
        share = inside.sum() / len(values)
        return {
            "Column": name,
            "Observed Proportion": share,
            "Percentile 2.5": values.quantile(0.025),
            "Percentile 97.5": values.quantile(0.975),
            "Reject H0": share < required_share,
        }

    rows = [_summarise(name) for name in data.columns if "val" in name]
    return pd.DataFrame(rows)

# def get_test_results_with_fisher(data, lower_bound, upper_bound, p_value_threshold):
#     val_columns = [col for col in data.columns if "val" in col]

#     results = []
#     for col in val_columns:
#         in_range_count = ((data[col] >= lower_bound) & (data[col] <= upper_bound)).sum()
#         total_count = len(data[col])
#         out_of_range_count = total_count - in_range_count

#         contingency_table = [[in_range_count, out_of_range_count],
#                              [total_count, 0]]

#         _, p_value = fisher_exact(contingency_table, alternative='two-sided')

#         observed_proportion = in_range_count / total_count
#         results.append({
#             "Column": col,
#             "Observed Proportion": observed_proportion,
#             "P-Value": p_value,
#             "Reject H0": p_value < p_value_threshold
#         })

#     results_df = pd.DataFrame(results)
#     return results_df

In [21]:
# Tau values to evaluate. Each one selects a `tau_<value>` score directory
# (with "." written as "_") when the score files are loaded.
taus = [
    0.0005, 0.001, 0.002, 0.005, 0.01,
    0.02, 0.05, 0.1, 0.15, 0.2,
]

In [42]:

# Score tables for both model families, keyed by tau.
scores = get_taus_complex_models_scores(taus)
multivariate_scores = get_multivariate_scores_dfs(taus)

# Closed interval [a, b] the "val" columns are checked against, and the
# threshold passed on to the check.
a, b = -0.005, 0.005
p_value_threshold = 0.05

for tau in taus:
    tau_score = scores[tau]
    multivariate_tau_score = multivariate_scores[tau]

    results = get_test_results(tau_score, a, b, p_value_threshold)
    multivariate_results = get_test_results(multivariate_tau_score, a, b, p_value_threshold)

    # One call, newline-separated: emits the same text as printing the
    # header, both tables and the blank spacers one by one.
    print(f"Tau: {tau}\n", results, "\n", multivariate_results, "\n", sep="\n")






Tau: 0.0005

                          Column  Observed Proportion  Percentile 2.5  \
0  clayton_random+gauss_dist+val             0.948438       -0.000036   
1      clayton_random+t_dist+val             0.979688       -0.000036   
2        gaussian+gauss_dist+val             1.000000       -0.000081   
3            gaussian+t_dist+val             1.000000       -0.000081   
4       t_student+gauss_dist+val             1.000000       -0.000068   
5           t_student+t_dist+val             1.000000       -0.000068   

   Percentile 97.5  Reject H0  
0         0.005631       True  
1         0.004878      False  
2         0.002742      False  
3         0.002190      False  
4         0.002795      False  
5         0.002310      False  


           Column  Observed Proportion  Percentile 2.5  Percentile 97.5  \
0  gauss_dist+val             0.981250        0.000149         0.003930   
1      t_dist+val             0.334375        0.002826         0.010869   

   Reject H0  
0      F

In [43]:

def get_models_df(taus, a, b, p_value_threshold, scores_by_tau=None, multivariate_scores_by_tau=None):
    """Run ``get_test_results`` for every tau and both model families.

    Args:
        taus: Iterable of tau values to evaluate.
        a: Lower bound passed to ``get_test_results``.
        b: Upper bound passed to ``get_test_results``.
        p_value_threshold: Threshold passed to ``get_test_results``.
        scores_by_tau: Optional dict mapping tau to the complex-model score
            DataFrame. When omitted, the notebook-level ``scores`` dict is
            used, which is what this function always read before.
        multivariate_scores_by_tau: Optional dict mapping tau to the
            multivariate score DataFrame. When omitted, the notebook-level
            ``multivariate_scores`` dict is used.

    Returns:
        Tuple ``(complex_model_df, multivariate_model_df)``. Each is the
        row-wise concatenation (fresh 0..n-1 index) of the per-tau result
        frames, with an added "Tau" column.

    Raises:
        KeyError: if a tau is missing from one of the score dicts.
        ValueError: from ``pd.concat`` when ``taus`` is empty.
    """
    # Passing the dicts explicitly avoids relying on hidden notebook state;
    # the fallback keeps existing four-argument calls working.
    if scores_by_tau is None:
        scores_by_tau = scores
    if multivariate_scores_by_tau is None:
        multivariate_scores_by_tau = multivariate_scores

    # Iterated once per model family, so a one-shot iterator must be materialised.
    taus = list(taus)

    def _results_for(score_dict):
        """Stack the per-tau test results of one model family."""
        per_tau = [
            get_test_results(score_dict[tau], a, b, p_value_threshold).assign(Tau=tau)
            for tau in taus
        ]
        return pd.concat(per_tau, ignore_index=True)

    return _results_for(scores_by_tau), _results_for(multivariate_scores_by_tau)

def summarize_results_with_totals(df, model_type):
    """Total, per model column, the rows where H0 was not rejected.

    Args:
        df: DataFrame with the columns "Tau", "Column" and "Reject H0".
        model_type: Label written into the "Model Type" column of the result.

    Returns:
        DataFrame with the columns "Column", "Total Count" and "Model Type",
        one row per distinct value of ``df["Column"]``. Columns without any
        non-rejected row are included with a total of 0.
    """
    key_cols = ["Tau", "Column"]

    # Non-rejected rows counted for each (tau, column) pair that has any.
    not_rejected = df[df["Reject H0"].eq(False)]
    pair_counts = not_rejected.groupby(key_cols).size().reset_index(name="Count")

    # Pairs with no non-rejected rows are absent above; add them with count 0
    # so every column still shows up in the totals.
    every_pair = pd.MultiIndex.from_product(
        [df["Tau"].unique(), df["Column"].unique()],
        names=key_cols,
    )
    pair_counts = (
        pair_counts
        .set_index(key_cols)
        .reindex(every_pair, fill_value=0)
        .reset_index()
    )

    total_counts = (
        pair_counts
        .groupby("Column")["Count"]
        .sum()
        .reset_index(name="Total Count")
    )
    total_counts["Model Type"] = model_type
    return total_counts




In [59]:
# Closed interval [a, b] the "val" columns are checked against, and the
# threshold passed on to the check.
a = -0.003
b = 0.003
p_value_threshold = 0.05

# Per-tau test results for both model families.
complex_model_df, multivariate_model_df = get_models_df(taus, a, b, p_value_threshold)

# Per model column: number of result rows (across all taus) where H0 was not rejected.
complex_model_summary = summarize_results_with_totals(
    complex_model_df, "Complex Models"
)
multivariate_model_summary = summarize_results_with_totals(
    multivariate_model_df, "Multivariate Models"
)

final_summary = pd.concat(
    (complex_model_summary, multivariate_model_summary),
    ignore_index=True,
)
final_summary

Unnamed: 0,Column,Total Count,Model Type
0,clayton_random+gauss_dist+val,0,Complex Models
1,clayton_random+t_dist+val,0,Complex Models
2,gaussian+gauss_dist+val,9,Complex Models
3,gaussian+t_dist+val,8,Complex Models
4,t_student+gauss_dist+val,8,Complex Models
5,t_student+t_dist+val,7,Complex Models
6,gauss_dist+val,0,Multivariate Models
7,t_dist+val,0,Multivariate Models


In [58]:
# Asymmetric closed interval [a, b] for the "val" columns, and the threshold
# passed on to the check.
a = -0.001
b = 0.003
p_value_threshold = 0.05

# Per-tau test results for both model families.
complex_model_df, multivariate_model_df = get_models_df(taus, a, b, p_value_threshold)

# Per model column: number of result rows (across all taus) where H0 was not rejected.
complex_model_summary = summarize_results_with_totals(
    complex_model_df, "Complex Models"
)
multivariate_model_summary = summarize_results_with_totals(
    multivariate_model_df, "Multivariate Models"
)

final_summary = pd.concat(
    (complex_model_summary, multivariate_model_summary),
    ignore_index=True,
)
final_summary

Unnamed: 0,Column,Total Count,Model Type
0,clayton_random+gauss_dist+val,0,Complex Models
1,clayton_random+t_dist+val,0,Complex Models
2,gaussian+gauss_dist+val,4,Complex Models
3,gaussian+t_dist+val,4,Complex Models
4,t_student+gauss_dist+val,4,Complex Models
5,t_student+t_dist+val,4,Complex Models
6,gauss_dist+val,0,Multivariate Models
7,t_dist+val,0,Multivariate Models


In [64]:
# Asymmetric closed interval [a, b] for the "val" columns, and the threshold
# passed on to the check.
a = -0.004
b = 0.001
p_value_threshold = 0.05

# Per-tau test results for both model families.
complex_model_df, multivariate_model_df = get_models_df(taus, a, b, p_value_threshold)

# Per model column: number of result rows (across all taus) where H0 was not rejected.
complex_model_summary = summarize_results_with_totals(
    complex_model_df, "Complex Models"
)
multivariate_model_summary = summarize_results_with_totals(
    multivariate_model_df, "Multivariate Models"
)

final_summary = pd.concat(
    (complex_model_summary, multivariate_model_summary),
    ignore_index=True,
)
final_summary

Unnamed: 0,Column,Total Count,Model Type
0,clayton_random+gauss_dist+val,0,Complex Models
1,clayton_random+t_dist+val,0,Complex Models
2,gaussian+gauss_dist+val,0,Complex Models
3,gaussian+t_dist+val,0,Complex Models
4,t_student+gauss_dist+val,0,Complex Models
5,t_student+t_dist+val,2,Complex Models
6,gauss_dist+val,0,Multivariate Models
7,t_dist+val,0,Multivariate Models
