In [25]:
import math, copy

import numpy as np
import statsmodels.api as sm

import scipy.stats as st
import pandas as pd

import matplotlib as mpl
import matplotlib.pyplot as plt
import plotly
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

from LoadSynthetic import init as init_synth
from LoadSynthetic import *

# Plotly: initialise notebook rendering (connected = True is forwarded as-is).
plotly.offline.init_notebook_mode(connected = True)

# Matplotlib: route text through LaTeX and set the SVG font type to 'none'.
mpl.rcParams.update({'text.usetex': True, 'svg.fonttype': 'none'})
prop_cycle = mpl.rcParams['axes.prop_cycle']

# Shared colour palette indexed by class number in the figures below.
# These entries are Plotly colour strings.
colors = plotly.colors.DEFAULT_PLOTLY_COLORS

In [26]:
# --- Coverage settings -------------------------------------------------------
# Miscoverage level; the CDF figures draw a horizontal line at 1 - alpha.
alpha = 0.1
# Two-sided standard-normal critical value belonging to alpha.
z = st.norm.ppf(1 - alpha / 2)

# --- Synthetic data settings -------------------------------------------------
test_size = 1000
# Number of feature columns, and the bounds of the uniform feature sampler.
dim = 5
low = 0
high = 10
# Number of variance classes the taxonomy splits the data into.
classes = 3

# Flag kept from the original configuration; nothing in the visible cells reads it.
save = True

# Fixed list of 20 random seeds (not consumed by the visible cells).
seeds = [
    574594, 900164, 33991, 205532, 65012,
    389774, 169373, 769356, 724626, 559168,
    781740, 199310, 84367, 514742, 570831,
    245193, 724009, 801124, 583112, 724682,
]


def inflated_quantile(scores, q = 0.9):
    """Return the finite-sample ("inflated") empirical quantile of `scores`.

    With n = len(scores), the result is the k-th smallest score where
    k = ceil(q * (n + 1)).

    Parameters
    ----------
    scores : array-like
        Scores to take the quantile of. They are sorted with `np.sort`.
    q : float, default 0.9
        Quantile level in [0, 1].

    Returns
    -------
    The k-th smallest score, or `np.inf` when k exceeds n. That covers too few
    scores for the requested level, and empty input. Previously this case
    raised IndexError.

    Raises
    ------
    ValueError
        If `q` is outside [0, 1].
    """
    if not 0 <= q <= 1:
        raise ValueError("q must lie in [0, 1], got " + repr(q))

    n = len(scores)
    # Clamp to rank 1 so q = 0 returns the smallest score. Without the clamp
    # the index would be -1 and silently wrap around to the largest score.
    rank = max(1, math.ceil(q * (1 + n)))
    if rank > n:
        # Not enough scores to support level q: the quantile is unbounded.
        return np.inf
    return np.sort(scores)[rank - 1]

In [27]:
# Seed the global NumPy RNG, then draw 5000 feature rows uniformly on [low, high).
np.random.seed(15)
X = np.random.uniform(low, high, (5000, dim))

# ConditionalNormal / getStatistics are not defined in this notebook
# (presumably they come from the LoadSynthetic star import; semantics not visible here).
# The three returned arrays are used below as targets, means and variances.
sampler = ConditionalNormal(*getStatistics("mean", .1))
y, mean, var = sampler.sample(X)

# Reorder all three arrays together so that the variance is ascending.
order = np.argsort(var)
y, mean, var = (values[order] for values in (y, mean, var))

def splitter(var, n_classes = None):
    """Build an equal-frequency taxonomy over the values in `var`.

    The class boundaries are the i / n_classes quantiles of `var`. The outermost
    boundaries are replaced by -inf and +inf so that every finite value falls
    in some class.

    Parameters
    ----------
    var : array-like
        Values whose quantiles define the class boundaries.
    n_classes : int, optional
        Number of classes. Defaults to the notebook-level `classes` setting.

    Returns
    -------
    checker : callable
        Maps an array of values to an integer array of class indices
        (0 .. n_classes - 1) with the same shape. A value that equals an
        interior boundary is assigned to the higher of the two adjacent classes.
        Values matching no interval (e.g. NaN) keep index 0.
    splits : numpy.ndarray
        The n_classes + 1 boundaries, starting with -inf and ending with +inf.
    """
    if n_classes is None:
        n_classes = classes

    splits = np.quantile(var, [i * (1 / n_classes) for i in range(n_classes + 1)])
    # np.inf instead of np.infty: the latter alias was removed in NumPy 2.0.
    splits[0] = -np.inf
    splits[-1] = np.inf
    # Row i holds the (lower, upper) bounds of class i.
    grid = np.stack([splits[i:i+2] for i in range(len(splits) - 1)], axis = 0)

    def checker(x):
        """Return the class index of every entry of `x`."""
        x = np.asarray(x)
        index = np.zeros_like(x, dtype = int)
        # Both bounds are inclusive, so a value on a shared boundary is matched
        # twice and the later (higher) class wins.
        for i in range(grid.shape[0]):
            index[(x >= grid[i, 0]) & (x <= grid[i, 1])] = i
        return index

    return checker, splits

# Display names of the three variance classes, in class-index order.
labels = ["Low variance", "Medium variance", "High variance"]

# Keep an independent copy of the sampled variance: `var` is overwritten in
# place further down, and the figures label this copy "True variance".
true_var = copy.deepcopy(var)
taxonomy, splits = splitter(true_var)
true_indices = taxonomy(var)

# Oracle scores: absolute residual, and the residual scaled by the standard
# deviation taken from the untouched variance.
true_score = np.abs(mean - y)
true_n_score = true_score / np.sqrt(var)


def selector(x):
    """Boolean mask of the entries whose variance is replaced (x >= 0)."""
    return x >= 0


def var_modifier(x, y):
    """Return the replacement variance 5 * (y - 0.5)**2 + 0.5; `x` is not used."""
    return 5*((y - .5) ** 2) + .5


# Overwrite the selected entries of `var` in place; `true_var` is unaffected.
select = selector(var)
var[select] = var_modifier(mean[select], var[select])

# Base Plotly layout reused by the figures in this cell.
layout = {
    "font": {"size": 20},
    "title_font": {"size": 30},
    "title_x": .5,
    "title_y": .99,
    "width": 1000,
    "margin": {"l": 0, "r": 0, "t": 30, "b": 0},
    "title_automargin": True,
}

# Empirical CDF of the true variance, one marker trace per true class.
fig = go.Figure()
fig.update_layout(layout)
fig.update_layout(
    xaxis_title = "True variance",
    yaxis_title = "CDF",
    title_text = "CDF plot of variance",
    legend = {"yanchor": "top", "y": .3, "xanchor": "left", "x": .815, "title": "True taxonomy:", "font": {"size": 14}},
)
fig.update_xaxes(title_standoff = 20)
fig.update_yaxes(title_standoff = 20)

# `cumsum` counts the points already drawn, so each class continues the rank
# sequence where the previous class stopped.
cumsum = 0
for i in range(classes):
    in_class = true_indices == i
    class_size = np.sum(in_class)
    ranks = np.arange(cumsum, cumsum + class_size)
    fig.add_trace(go.Scatter(
        x = true_var[in_class],
        y = ranks / len(true_var),
        name = labels[i],
        mode = "markers",
        marker = {"color": colors[i], "size": 3},
    ))
    cumsum += class_size

fig.write_image("./PLOTS/CDF_plot_variance.png", scale = 3)
fig.write_image("./PLOTS/CDF_plot_variance.svg", scale = 3)

# Re-sort every per-sample array by the overwritten variance so that they all
# stay aligned with each other.
order = np.argsort(var)
true_var, true_score, true_n_score, true_indices = (
    values[order] for values in (true_var, true_score, true_n_score, true_indices)
)
y, mean, var = (values[order] for values in (y, mean, var))

# Taxonomy derived from the overwritten variance.
taxonomy, splits = splitter(var)
indices = taxonomy(var)

# Scores based on the overwritten variance: absolute residual, residual divided
# by the standard deviation, and residual minus z standard deviations.
score = np.abs(mean - y)
n_score = score / np.sqrt(var)
int_score = score - z * np.sqrt(var)

# Scatter of true versus estimated variance, one marker trace per estimated class.
fig = go.Figure()
fig.update_layout(layout)
fig.update_layout(
    xaxis_title = "True variance",
    yaxis_title = "Estimated variance",
    title_text = "Variance comparison",
    legend = {"yanchor": "top", "y": .3, "xanchor": "left", "x": .78, "title": "Estimated taxonomy:", "font": {"size": 15}},
)
fig.update_xaxes(title_standoff = 20)
fig.update_yaxes(title_standoff = 20)

for i in range(classes):
    in_class = indices == i
    fig.add_trace(go.Scatter(
        x = true_var[in_class],
        y = var[in_class],
        name = labels[i],
        mode = "markers",
        marker = {"color": colors[i], "size": 3},
    ))

fig.write_image("./PLOTS/variance.png", scale = 3)
fig.write_image("./PLOTS/variance.svg", scale = 3)



def _add_cdf_trace(fig, scores, row, col, color, name = None):
    """Add the binned empirical CDF of `scores` to one subplot.

    The scores are put into 40 equal-width bins. The cumulative count after
    each bin, divided by len(scores), is drawn at that bin's RIGHT edge,
    because that is the point up to which the count has been accumulated.
    Drawing it at the left edge would shift the curve left by one bin width.

    Parameters
    ----------
    fig : plotly figure created with make_subplots
    scores : 1-D array of scores
    row, col : subplot position (1-based)
    color : line colour
    name : legend entry; when None the trace is hidden from the legend

    Returns
    -------
    The array of 41 bin edges, so the caller can span the same x-range.
    """
    counts, edges = np.histogram(scores, bins = 40)
    fig.add_trace(
        go.Scatter(
            x = edges[1:],
            y = np.cumsum(counts) / len(scores),
            name = name,
            showlegend = name is not None,
            line = {"color": color, "width": 1},
        ),
        row = row, col = col,
    )
    return edges


fig = make_subplots(2, 2, horizontal_spacing = 0.05, vertical_spacing = 0.15, subplot_titles = ["(a) Oracle residual score", "(b) Estimated residual score", "(c) Oracle normalized score", "(d) Estimated normalized score"])

# One entry per subplot: (scores, class index of every sample, row, col).
# Only the first panel contributes legend entries; the others reuse its colours.
panels = [
    (true_score, true_indices, 1, 1),
    (score, indices, 1, 2),
    (true_n_score, true_indices, 2, 1),
    (n_score, indices, 2, 2),
]

# Class-conditional CDFs (normalised by the size of each class).
for i in range(classes):
    for k, (panel_scores, panel_indices, row, col) in enumerate(panels):
        _add_cdf_trace(fig, panel_scores[panel_indices == i], row, col, colors[i], name = labels[i] if k == 0 else None)

# Marginal CDF plus the horizontal 1 - alpha line in every panel.
for k, (panel_scores, panel_indices, row, col) in enumerate(panels):
    first_panel = k == 0
    edges = _add_cdf_trace(fig, panel_scores, row, col, colors[-1], name = "Marginal" if first_panel else None)
    fig.add_trace(
        go.Scatter(
            x = [edges[0], edges[-1]],
            y = [1 - alpha, 1 - alpha],
            name = "Significance threshold" if first_panel else None,
            showlegend = first_panel,
            mode = "lines",
            line = {"color": "red", "width": 1},
        ),
        row = row, col = col,
    )

fig.update_layout(layout)
fig.update_layout(margin = dict(t = 70), title_text = "CDF plot of nonconformity scores")
fig.update_layout(legend = dict(yanchor="top", y=.32, xanchor="left", x=.78, font = {"size": 15}))
fig.update_xaxes(title_standoff = 20)
fig.update_yaxes(title_standoff = 20)

fig.write_image("./PLOTS/CDF_combi.svg", scale = 3)
fig.write_image("./PLOTS/CDF_combi.png", scale = 3)

In [None]:
x = np.arange(0, 5, 0.0001)

# Matplotlib cannot parse Plotly's "rgb(r, g, b)" strings, which is the format
# of plotly.colors.DEFAULT_PLOTLY_COLORS. Convert those to 0-1 RGB tuples and
# pass every other colour spec (e.g. hex strings) through unchanged, so this
# cell runs whichever palette `colors` currently holds.
mpl_colors = [
    tuple(float(channel) / 255 for channel in c[4:-1].split(",")) if isinstance(c, str) and c.startswith("rgb(") else c
    for c in colors
]


def _plot_density_family(title, density, filename, params = (0.5, 1, 1.5)):
    """Plot `density(x, xi)` on the grid `x` for every xi in `params` and save it.

    Parameters
    ----------
    title : figure title
    density : callable (grid, xi) -> density values on the grid
    filename : path handed to plt.savefig
    params : parameter values, one coloured curve each
    """
    plt.figure(figsize = (12, 8))
    plt.title(title, fontsize = 15)
    for i, xi in enumerate(params):
        plt.scatter(x, density(x, xi), color = mpl_colors[i], label = r"$\xi=" + str(xi) + r"$", s = .01)
    # Dashed zero line across the plotted range.
    plt.plot([0, 5], [0, 0], linestyle = "dashed", color = "grey")

    legend = plt.legend(prop={'size': 15})
    # Legend.legendHandles was renamed to legend_handles (Matplotlib 3.7) and
    # the old name was removed in 3.9; support both.
    handles = getattr(legend, "legend_handles", None)
    if handles is None:
        handles = legend.legendHandles
    # The curves use s = .01 markers, which are invisible in the legend.
    # Enlarge the legend markers through the public setter.
    for handle in handles:
        handle.set_sizes([10])
    plt.xticks(fontsize = 15)
    plt.yticks(fontsize = 15)

    plt.savefig(filename, dpi = 500, bbox_inches = "tight")
    plt.show()
    plt.close()


_plot_density_family(
    "Exponential distribution with rate parameter " + r"$\xi$",
    lambda grid, xi: np.exp(-grid * xi) * xi,
    "Exponential.png",
)

_plot_density_family(
    "Pareto distribution with shape parameter " + r"$\xi$",
    lambda grid, xi: xi / np.power(grid + 1, xi + 1),
    "Pareto.png",
)

In [None]:
# Matplotlib cannot parse Plotly's "rgb(r, g, b)" strings, which is the format
# of plotly.colors.DEFAULT_PLOTLY_COLORS. Convert those to 0-1 RGB tuples and
# pass every other colour spec through unchanged.
mpl_colors = [
    tuple(float(channel) / 255 for channel in c[4:-1].split(",")) if isinstance(c, str) and c.startswith("rgb(") else c
    for c in colors
]


def _advance_until_mass(density, target, position = 0, mass = 0, step = 0.001):
    """Step `position` forward until the accumulated mass reaches `target`.

    Each iteration moves `position` by `step` and adds density(position) * step
    to `mass` (a right-endpoint Riemann sum). Returns the final (position, mass)
    so that a second call can continue where the first one stopped.
    """
    while mass < target:
        position += step
        mass += density(position) * step
    return position, mass


def _plot_interval_figure(title, density, grid, grid_70, grid_80, x_max, filename):
    """Plot `density` on `grid`, shade the areas over `grid_70` / `grid_80`, and save."""
    plt.figure(figsize = (12, 8))
    plt.title(title, fontsize = 15)
    plt.scatter(grid, density(grid), color = mpl_colors[0], s = .01)
    # Raw strings: "\%" is an invalid escape sequence in a normal string literal.
    # The text passed to Matplotlib is unchanged.
    plt.fill_between(grid_70, density(grid_70), alpha = 0.2, label = r"70\% interval")
    plt.fill_between(grid_80, density(grid_80), alpha = 0.2, label = r"80\% interval")
    plt.plot([0, x_max], [0, 0], linestyle = "dashed", color = "grey")

    plt.legend(prop={'size': 15})
    plt.xticks(fontsize = 15)
    plt.yticks(fontsize = 15)

    plt.savefig(filename, dpi = 500, bbox_inches = "tight")
    plt.show()
    plt.close()


# --- Exponential density with rate 0.5 ---------------------------------------
# The accumulator is named `mass` instead of `sum`, so the builtin sum() stays
# usable in later cells.
# y / y2: points where the accumulated mass first reaches 0.7 / 0.8.
y, mass = _advance_until_mass(lambda t: 0.5 * math.e ** (-0.5 * t), 0.7)
y2, mass = _advance_until_mass(lambda t: 0.5 * math.e ** (-0.5 * t), 0.8, position = y, mass = mass)

x = np.arange(0, 5, 0.0001)
x2 = np.arange(0, y, 0.0001)
x3 = np.arange(0, y2, 0.0001)
_plot_interval_figure(
    "Exponential distribution with rate parameter " + r"$\xi=0.5$",
    lambda grid: np.exp(-0.5 * grid) * 0.5,
    x, x2, x3, 5,
    "Exponential_70.png",
)

################################

# --- Pareto density with shape 0.5 -------------------------------------------
y, mass = _advance_until_mass(lambda t: 0.5 / ((t + 1) ** 1.5), 0.7)
y2, mass = _advance_until_mass(lambda t: 0.5 / ((t + 1) ** 1.5), 0.8, position = y, mass = mass)

# The 80% point lies well beyond 5 here, so the grid extends to ceil(y2).
x = np.arange(0, math.ceil(y2), 0.0001)
x2 = np.arange(0, y, 0.0001)
x3 = np.arange(0, y2, 0.0001)
_plot_interval_figure(
    "Pareto distribution with shape parameter " + r"$\xi=0.5$",
    lambda grid: 0.5 / np.power(grid + 1, 1.5),
    x, x2, x3, math.ceil(y2),
    "Pareto_70.png",
)

In [None]:
# Toy example: two subgroups around parallel lines, with noise standard
# deviations 0.1 and 0.5, sharing a single interval half-width.
seed = 1
np.random.seed(seed)

x = np.arange(0, 10)
y = 0.1 * np.arange(0, 10)
# The draw order matters for reproducibility: subgroup 1 first, then subgroup 2.
y0 = y + np.random.normal(0, 0.1, x.shape)
y1 = y + 2 + np.random.normal(0, 0.5, x.shape)

# Pooled absolute residuals of both subgroups around their own line.
a = np.abs(np.concatenate([y0 - y, y1 - y - 2]))
# Inflated 0.8 quantile of the pooled residuals; the level is capped at 1 so
# that the index never runs past the end of the sorted array.
n_residuals = a.shape[0]
level = min((1 + (1 / n_residuals)) * 0.8, 1)
crit = np.sort(a)[math.ceil(level * n_residuals) - 1]

# NOTE: this rebinds the notebook-wide `colors` palette.
colors = plotly.colors.qualitative.Plotly

fig = go.Figure()
# Noisy observations of the two subgroups.
for k, (observed, subgroup_name) in enumerate([(y0, "Subgroup 1"), (y1, "Subgroup 2")]):
    fig.add_trace(go.Scatter(x = x, y = observed, line_color = colors[k], name = subgroup_name))
# Dashed noise-free lines with a constant error bar of half-width `crit`.
for k, centre in enumerate([y, y + 2]):
    fig.add_trace(go.Scatter(
        x = x,
        y = centre,
        error_y = {"type": "constant", "value": crit, "thickness": 1},
        mode = "lines",
        line = {"dash": "dash", "color": colors[k]},
        showlegend = False,
    ))

fig.update_layout({
    "font": {"size": 15},
    "width": 1000,
    "margin": {"l": 0, "r": 0, "t": 0, "b": 0},
    "xaxis_title": r"$x$",
    "yaxis_title": r"$y$",
})
fig.update_layout(legend = {"yanchor": "top", "y": 0.29, "xanchor": "left", "x": .852, "font": {"size": 15}, "borderwidth": 1})
fig.show()

fig.write_image("./PLOTS/Heteroskedasticity.png", scale = 3)
fig.write_image("./PLOTS/Heteroskedasticity.svg", scale = 3)