<a href="https://colab.research.google.com/github/microprediction/correlationbounds/blob/main/correlation_lower_upper_bounds.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Brute force correlation lower/upper bounds

Laboriously creates a table that can be stored in a database.
Usage: whenever we observe a correlation, we can join against this table to look up the corresponding lower and upper bounds. Note that the tail threshold is fixed at 0.025 here (i.e. a central 95% interval); it could perhaps be stored as a column in the table as well.

In [7]:
import numpy as np
import pandas as pd
from scipy.stats import norm

# Draw a pair of normal samples whose population correlation is true_corr
def generate_correlated_sample(n, true_corr):
    """Return arrays ``(x, y)`` of length ``n`` with population correlation ``true_corr``.

    ``x`` is standard normal noise. ``y`` is a weighted mix of ``x`` and a
    second, independent standard normal draw, with weights chosen so that
    ``y`` has unit variance and covariance ``true_corr`` with ``x``.
    """
    x = np.random.normal(size=n)
    independent_noise = np.random.normal(size=n)
    # Weight on the independent component that keeps y at unit variance
    residual_weight = np.sqrt(1 - true_corr**2)
    y = true_corr * x + residual_weight * independent_noise
    return x, y

# Monte Carlo sampling distribution of the sample correlation for one true correlation
def compute_measured_correlations(true_corr, n, num_simulations=10000):
    """Simulate the sample correlation ``num_simulations`` times.

    Each simulation draws a fresh correlated sample of size ``n`` with true
    correlation ``true_corr`` and records its Pearson correlation (the
    off-diagonal entry of ``np.corrcoef``). Returns the results as a list.
    """
    return [
        np.corrcoef(*generate_correlated_sample(n, true_corr))[0, 1]
        for _ in range(num_simulations)
    ]

# Function to compute lower and upper bounds using Bayes' rule
def bayesian_correlation_bounds(measured_corr, n, true_corrs, num_simulations=10000, threshold=0.025):
    """Return ``(lower_bound, upper_bound)`` for the true correlation.

    Every candidate in ``true_corrs`` gets equal prior weight. For each one,
    the sampling distribution of the sample correlation is simulated and
    approximated by a normal distribution with the simulated mean and
    standard deviation; its density at ``measured_corr`` is the likelihood.
    The normalised likelihoods form a posterior over the grid, and the bounds
    are the first grid points where the cumulative posterior exceeds
    ``threshold`` and ``1 - threshold`` respectively.

    Parameters
    ----------
    measured_corr : float
        The observed sample correlation.
    n : int
        Sample size behind the observation.
    true_corrs : sequence of float
        Candidate true correlations. The cumulative posterior is accumulated
        in the given order, so the grid should be sorted ascending for the
        bounds to be meaningful.
    num_simulations : int, optional
        Number of simulated samples per candidate correlation.
    threshold : float, optional
        Posterior mass left in each tail; the default 0.025 gives a central
        95% interval.

    Returns
    -------
    tuple
        ``(lower_bound, upper_bound)``, both elements of ``true_corrs``.

    Raises
    ------
    ValueError
        If ``threshold`` is not in (0, 0.5), ``true_corrs`` is empty, or the
        likelihood cannot be evaluated for any candidate.
    """
    if not 0.0 < threshold < 0.5:
        raise ValueError("threshold must lie strictly between 0 and 0.5")
    num_candidates = len(true_corrs)
    if num_candidates == 0:
        raise ValueError("true_corrs must contain at least one candidate correlation")

    # Work with log-densities: a measured value far from every candidate would
    # otherwise underflow all densities to 0 and turn the normalisation into 0/0.
    log_likelihoods = np.empty(num_candidates)
    for i, true_corr in enumerate(true_corrs):
        simulated = compute_measured_correlations(true_corr, n, num_simulations)
        log_likelihoods[i] = norm.logpdf(
            measured_corr, loc=np.mean(simulated), scale=np.std(simulated)
        )

    # Candidates whose likelihood could not be evaluated (NaN) get zero weight.
    log_likelihoods = np.where(np.isnan(log_likelihoods), -np.inf, log_likelihoods)
    peak = np.max(log_likelihoods)
    if not np.isfinite(peak):
        raise ValueError("likelihood could not be evaluated for any candidate correlation")

    # Subtracting the peak before exponentiating keeps the largest weight at 1.
    posterior = np.exp(log_likelihoods - peak)
    posterior /= posterior.sum()

    # First index whose cumulative mass strictly exceeds each tail level; the
    # clamp covers the case where rounding leaves the total just below the level.
    cumulative = np.cumsum(posterior)
    last_idx = num_candidates - 1
    lower_bound_idx = min(int(np.searchsorted(cumulative, threshold, side="right")), last_idx)
    upper_bound_idx = min(int(np.searchsorted(cumulative, 1.0 - threshold, side="right")), last_idx)

    lower_bound = true_corrs[lower_bound_idx]
    upper_bound = true_corrs[upper_bound_idx]

    return lower_bound, upper_bound


# Example: bounds on the true correlation for a measured 0.3 from 250 observations
bayesian_correlation_bounds(
    measured_corr=0.3,
    n=250,
    true_corrs=np.linspace(-0.99, 0.99, 101),
)

(0.16500000000000004, 0.4125000000000001)

In [10]:
# Function to generate the correlation bounds table
def generate_correlation_bounds_table(n, measured_correlations, true_corr_range=np.linspace(-0.99, 0.99, 101)):
    """Build a table of correlation bounds for a fixed sample size.

    Parameters
    ----------
    n : int
        Sample size; stored in the ``count`` column of every row.
    measured_correlations : iterable of float
        Measured correlations to tabulate, one output row each.
    true_corr_range : sequence of float, optional
        Grid of candidate true correlations passed to
        ``bayesian_correlation_bounds``.

    Returns
    -------
    pandas.DataFrame
        Columns ``count``, ``measured_correlation``, ``lower_corr_bound`` and
        ``upper_corr_bound``, with the three correlation values rounded to
        3 decimals.
    """
    def _rounded(value):
        # Rounding a tiny negative number gives -0.0; adding 0.0 turns it into
        # 0.0 so the stored value does not carry a stray sign.
        return round(value, 3) + 0.0

    rows = []
    for measured_corr in measured_correlations:
        # NOTE: every call re-runs the simulation for the whole grid, so the
        # total cost grows with len(measured_correlations) * len(true_corr_range).
        lower_bound, upper_bound = bayesian_correlation_bounds(measured_corr, n, true_corr_range)
        rows.append({
            'count': n,
            'measured_correlation': _rounded(measured_corr),
            'lower_corr_bound': _rounded(lower_bound),
            'upper_corr_bound': _rounded(upper_bound),
        })

    # A DataFrame gives a tabular view of the collected rows
    return pd.DataFrame(rows)

# Tabulate bounds for 21 evenly spaced measured correlations at sample size 100
bounds_table = generate_correlation_bounds_table(
    measured_correlations=np.linspace(-0.9, 0.9, 21),
    n=100,
)
print(bounds_table)

    count  measured_correlation  lower_corr_bound  upper_corr_bound
0     100                 -0.90            -0.931            -0.832
1     100                 -0.81            -0.851            -0.713
2     100                 -0.72            -0.792            -0.574
3     100                 -0.63            -0.713            -0.475
4     100                 -0.54            -0.653            -0.356
5     100                 -0.45            -0.574            -0.257
6     100                 -0.36            -0.495            -0.158
7     100                 -0.27            -0.416            -0.079
8     100                 -0.18            -0.356             0.020
9     100                 -0.09            -0.277             0.099
10    100                 -0.00            -0.198             0.178
11    100                  0.09            -0.099             0.277
12    100                  0.18            -0.020             0.356
13    100                  0.27             0.07