In [None]:
import numpy as np
import sys
import matplotlib.pyplot as plt
from matplotlib import gridspec
import traceback

In [None]:
# Number of stepsizes drawn per simulated trial; the cells below pass it as
# the `M` argument of the simulation functions and embed it in file names.
__M = 4

In [12]:
def std_after_outlier_removal(N=1000, M=10, outliers=0, distr=("SG",1), rng=None):
    """Simulate how the sample standard deviation changes as extremes are removed.

    For each of ``N`` trials, ``M`` stepsizes are drawn according to ``distr``.
    The sample standard deviation (``ddof=1``) of the full sample is recorded,
    then the value farthest from the current mean is removed ``outliers``
    times, recording the standard deviation after every removal.

    Parameters
    ----------
    N : int
        Number of independent trials.
    M : int
        Number of stepsizes drawn per trial.
    outliers : int
        Number of successive "farthest from the mean" removals per trial.
    distr : tuple
        Distribution specification:

        * ``("SG", s)``: normal values scaled by ``s``;
        * ``("SGp", s, v)``: as ``"SG"``, but the first value is replaced by ``v``;
        * ``("DG", f, s1, s2)``: each value is scaled by ``s1`` with
          probability ``f`` and by ``s2`` otherwise.
    rng : object, optional
        Random source exposing ``random(size=...)`` and ``normal(size=...)``,
        e.g. ``np.random.default_rng(seed)``. When ``None`` (default) the
        global ``np.random`` state is used, exactly as before this parameter
        existed.

    Returns
    -------
    dict
        ``stds[k]`` is the list (length ``N``) of standard deviations after
        ``k`` removals, for ``k = 0 .. outliers``. Entries are ``nan`` when
        fewer than two values remain, because ``ddof=1`` is used.

    Raises
    ------
    ValueError
        If ``distr[0]`` is not one of the known names (raised when the first
        trial is drawn).
    """
    # Both np.random (legacy global state) and Generator objects provide
    # random(size=...) and normal(size=...), so one code path serves both.
    gen = np.random if rng is None else rng

    stds = {k: [] for k in range(outliers+1)}

    for _ in range(N):

        if distr[0]=="DG":
            # True where the first width (distr[2]) applies.
            select = gen.random(size=M)<distr[1]
            stepsizes = gen.normal(size=M) * distr[2] * select + (1-select)*gen.normal(size=M) * distr[3]
        elif distr[0]=="SG":
            stepsizes = gen.normal(size=M)*distr[1]
        elif distr[0]=="SGp":
            stepsizes = gen.normal(size=M)*distr[1]
            # Plant a fixed value in the first slot.
            stepsizes[0] = distr[2]
        else:
            raise ValueError("unknown distribution")

        remaining = np.array(stepsizes)
        stds[0].append(np.std(remaining, ddof=1))

        for k in range(outliers):
            # Drop the single value farthest from the current mean.
            i_extremum = np.argmax(np.abs(remaining - np.mean(remaining)))
            remaining = np.delete(remaining, i_extremum)
            stds[k+1].append(np.std(remaining, ddof=1))

    return stds


In [None]:
N = 10000


def _new_std_change_axes(n_trials, n_steps):
    """Create one white 10x10 figure with the shared axis styling and N/M annotation."""
    fig = plt.figure(figsize=(10,10))
    fig.patch.set_facecolor('white')
    ax = fig.add_subplot(111)
    ax.set_xlabel("Change in std %", fontsize=16)
    ax.tick_params(axis='y', which='major', direction="in", labelsize=16, pad = 8)
    ax.tick_params(axis='x', which='major', direction="in", labelsize=16, pad = 12)

    ax.text(0.02, 1-0.01, "N={} M={}".format(n_trials, n_steps),
                            horizontalalignment='left',
                            verticalalignment='top',
                            transform=ax.transAxes,
                            fontname='sans-serif',
                            fontweight='bold',
                            fontsize=14)
    return fig, ax


# fig0: histogram of the relative change in std after removing one extreme.
# fig1: running (cumulative) fraction of the same histogram counts.
fig0, ax0 = _new_std_change_axes(N, __M)
fig1, ax1 = _new_std_change_axes(N, __M)

# Shared histogram bin edges and the threshold used for the annotated fraction.
bins = np.linspace(-100, 0, 20)
p=-65

for idistr, distr in enumerate([
        ("SG", 1),
        ("DG", 0.9, 1, 10),
        ("SGp",  1, 3),
        ("SGp",  1, 7),
        ("SGp",  1, 10),
    ]):

    d = std_after_outlier_removal(N=N, M=__M, outliers=1, distr=distr)

    # Percent change of the std after one removal relative to the full sample.
    x = 100*(np.array(d[1])/d[0] - 1)

    if idistr == 0:
        # The first distribution is drawn filled, the others as outlines.
        h = ax0.hist(x, bins=bins, lw=2, density=False, alpha=0.2, align='mid', label=str(distr))
        ax1.bar(h[1][:-1], np.cumsum(h[0])/N, alpha=0.2, width=h[1][1]-h[1][0])
    else:
        h = ax0.hist(x, bins=bins, lw=2, density=False, histtype='step', align='mid', label=str(distr))
        ax1.step(h[1][:-1], np.cumsum(h[0])/N)

    # Raw string: "\i" is not a valid Python escape, and a non-raw literal
    # triggers a SyntaxWarning on recent Python versions.
    ax0.text(0.3, 1-0.04-0.07*idistr, r"$\int_{-100}^{%d}f_{%s}=%.3f$" % (p,  str(distr), np.sum(x<p)/len(x)),
                        horizontalalignment='left',
                        verticalalignment='top',
                        transform=ax0.transAxes,
                        fontname='sans-serif',
                        fontweight='bold',
                        fontsize=14)

# Leave head-room above the tallest bar for the text annotations.
ax0.set_ylim(0, ax0.get_ylim()[1]*1.15)
ax0.legend(fontsize=14)


fig0.savefig("hist_std_{}_M{}.png".format(N,__M))
fig1.savefig("hist_std_cum_{}_M{}.png".format(N,__M))


In [None]:
def removed(N=1000,M=10,outliers=0, sigma_multiplier=3, distr=("SG",1), rng=None):
    """Count how many stepsizes a sigma cut excludes in each simulated trial.

    For each of ``N`` trials, ``M`` stepsizes are drawn according to ``distr``.
    The value farthest from the mean is removed ``outliers`` times; the mean
    and sample standard deviation (``ddof=1``) of what remains define the cut
    ``|x - mean| <= sigma_multiplier * std``, which is then applied to the
    full, original sample.

    Parameters
    ----------
    N : int
        Number of independent trials.
    M : int
        Number of stepsizes drawn per trial.
    outliers : int
        Number of extremes removed before estimating mean and std.
    sigma_multiplier : float
        Width of the acceptance window in units of the estimated std.
    distr : tuple
        ``("SG", s)``, ``("SGp", s, v)`` (first value replaced by ``v``) or
        ``("DG", f, s1, s2)`` (scale ``s1`` with probability ``f``, else ``s2``).
    rng : object, optional
        Random source exposing ``random(size=...)`` and ``normal(size=...)``,
        e.g. ``np.random.default_rng(seed)``. When ``None`` (default) the
        global ``np.random`` state is used, exactly as before this parameter
        existed.

    Returns
    -------
    list of int
        Number of stepsizes falling outside the cut, one entry per trial.

    Raises
    ------
    ValueError
        If ``distr[0]`` is not one of the known names (raised when the first
        trial is drawn).
    """
    # np.random and Generator objects share the two methods used below.
    gen = np.random if rng is None else rng

    n_removed = []

    for _ in range(N):

        if distr[0]=="DG":
            # True where the first width (distr[2]) applies.
            select = gen.random(size=M)<distr[1]
            stepsizes = gen.normal(size=M) * distr[2] * select + (1-select)*gen.normal(size=M) * distr[3]
        elif distr[0]=="SG":
            stepsizes = gen.normal(size=M)*distr[1]
        elif distr[0]=="SGp":
            stepsizes = gen.normal(size=M)*distr[1]
            stepsizes[0] = distr[2]
        else:
            raise ValueError("unknown distribution")

        # Estimate mean/std on a copy with the most extreme values stripped.
        trimmed = np.array(stepsizes)
        for _k in range(outliers):
            i_extremum = np.argmax(np.abs(trimmed - np.mean(trimmed)))
            trimmed = np.delete(trimmed, i_extremum)

        avg = np.mean(trimmed)
        lim = sigma_multiplier*np.std(trimmed, ddof=1)

        # Apply the cut to the original sample; count via the "kept" mask so a
        # nan limit still counts every value as removed, as a boolean-index
        # selection would.
        kept = np.abs(stepsizes - avg) <= lim
        n_removed.append(len(stepsizes) - int(np.count_nonzero(kept)))

    return n_removed

In [None]:
# Histogram of the per-trial number of excluded stepsizes for a 3-sigma cut
# estimated after stripping one extreme value; bins are centred on 0..9.
removed_counts = removed(N=1000, M=__M, outliers=1, sigma_multiplier=3)
integer_centred_bins = np.arange(11) - 0.5
plt.hist(removed_counts, bins=integer_centred_bins, histtype='step')

In [None]:
N = 10000

# Average number of excluded stepsizes as a function of the sigma multiplier.
fig = plt.figure(figsize=(10,10))
fig.patch.set_facecolor('white')
ax = fig.add_subplot(111)

ax.set_xlabel("Sigma multiplyer", fontsize=16)
ax.set_ylabel("Average number of excluded stepsizes", fontsize=16)
for axis_name, label_pad in (('y', 8), ('x', 12)):
    ax.tick_params(axis=axis_name, which='major', direction="in", labelsize=16, pad=label_pad)

x = np.linspace(1, 5, 25)

# (distribution, line style, vertical offset added to the curve, legend template).
# The first entry is drawn with the default style and fixes the colour that
# the remaining curves of the same `outliers` setting reuse.
curve_specs = (
    (("SG", 1),          None, 0.0,  "outliers={} (SG,1)"),
    (("DG", 0.9, 1, 10), "--", 0.0,  "outliers={} (DG,0.9,1,10)"),
    (("SGp", 1, 3),      "-.", 0.05, "outliers={} (SGp,1, 3) +0.05"),
    (("SGp", 1, 10),     ":",  0.1,  "outliers={} (SGp,1, 10) +0.1"),
)

for n_stripped in range(2):

    group_color = None
    for spec, line_fmt, shift, legend_tpl in curve_specs:
        y = [
            np.mean(removed(N=N, M=__M, outliers=n_stripped, sigma_multiplier=mult, distr=spec))
            for mult in x
        ]
        legend_text = legend_tpl.format(n_stripped)

        if line_fmt is None:
            p = ax.plot(x, y, label=legend_text)
            group_color = p[0].get_color()
        else:
            p = ax.plot(x, np.array(y) + shift, line_fmt, color=group_color, label=legend_text)

ax.legend(fontsize=16)

# Reference level: one excluded stepsize on average.
ax.plot([x[0], x[-1]], [1, 1], "r--")

fig.savefig("avg_removed_N{}_M{}.png".format(N, __M))

In [None]:
def removed2(N=1000,M=10,outliers=0, sigma_multiplier=3, distr=("SG",1), r=0.35, rng=None):
    """Count stepsizes excluded by a sigma cut with conditional extreme removal.

    For each of ``N`` trials, ``M`` stepsizes are drawn according to ``distr``.
    Up to ``outliers`` times, the value farthest from the mean is tentatively
    removed; the removal is kept only if it shrinks the sample standard
    deviation (``ddof=1``) to less than ``r`` times its previous value,
    otherwise removal stops. The mean and std of the remaining values define
    the cut ``|x - mean| <= sigma_multiplier * std``, which is applied to the
    full, original sample.

    Parameters
    ----------
    N : int
        Number of independent trials.
    M : int
        Number of stepsizes drawn per trial.
    outliers : int
        Maximum number of extremes that may be removed before estimating
        mean and std.
    sigma_multiplier : float
        Width of the acceptance window in units of the estimated std.
    distr : tuple
        ``("SG", s)``, ``("SGp", s, v)`` (first value replaced by ``v``) or
        ``("DG", f, s1, s2)`` (scale ``s1`` with probability ``f``, else ``s2``).
    r : float
        A removal is accepted only when ``std_new / std_old < r``.
    rng : object, optional
        Random source exposing ``random(size=...)`` and ``normal(size=...)``,
        e.g. ``np.random.default_rng(seed)``. When ``None`` (default) the
        global ``np.random`` state is used, exactly as before this parameter
        existed.

    Returns
    -------
    list of int
        Number of stepsizes falling outside the cut, one entry per trial.

    Raises
    ------
    ValueError
        If ``distr[0]`` is not one of the known names (raised when the first
        trial is drawn).
    """
    # np.random and Generator objects share the two methods used below.
    gen = np.random if rng is None else rng

    n_removed = []

    for _ in range(N):

        if distr[0]=="DG":
            # True where the first width (distr[2]) applies.
            select = gen.random(size=M)<distr[1]
            stepsizes = gen.normal(size=M) * distr[2] * select + (1-select)*gen.normal(size=M) * distr[3]
        elif distr[0]=="SG":
            stepsizes = gen.normal(size=M)*distr[1]
        elif distr[0]=="SGp":
            stepsizes = gen.normal(size=M)*distr[1]
            stepsizes[0] = distr[2]
        else:
            raise ValueError("unknown distribution")

        trimmed = np.array(stepsizes)

        for _k in range(outliers):
            i_extremum = np.argmax(np.abs(trimmed - np.mean(trimmed)))
            candidate = np.delete(trimmed, i_extremum)
            # Keep the removal only if it shrinks the std by more than the
            # factor r; a nan ratio compares False and therefore stops here.
            if np.std(candidate, ddof=1)/np.std(trimmed, ddof=1) < r:
                trimmed = candidate
            else:
                break

        avg = np.mean(trimmed)
        lim = sigma_multiplier*np.std(trimmed, ddof=1)

        # Apply the cut to the original sample and count what falls outside.
        kept = np.abs(stepsizes - avg) <= lim
        n_removed.append(len(stepsizes) - int(np.count_nonzero(kept)))

    return n_removed

In [None]:
N = 10000

# (distribution, line style, legend template). The first entry is drawn with
# the default style and fixes the colour reused by the other curves that
# belong to the same `outliers` setting.
removed2_curves = [
    (("SG", 1),          None, "outliers={} (SG,1)"),
    (("DG", 0.9, 1, 10), "--", "outliers={} (DG,0.9,1,10)"),
    (("SGp", 1, 3),      "-.", "outliers={} (SGp,1, 3)"),
    (("SGp", 1, 10),     ":",  "outliers={} (SGp,1, 10)"),
]

# One figure per acceptance ratio r of the conditional extreme removal.
for r in [0.25, 0.35, 0.5]:

    fig = plt.figure(figsize=(10,10))
    fig.patch.set_facecolor('white')
    ax = fig.add_subplot(111)

    ax.set_xlabel("Sigma multiplyer", fontsize=16)
    ax.set_ylabel("Average number of excluded stepsizes", fontsize=16)
    ax.tick_params(axis='y', which='major', direction="in", labelsize=16, pad = 8)
    ax.tick_params(axis='x', which='major', direction="in", labelsize=16, pad = 12)

    ax.text(0.02, 1-0.02, "N={} M={}".format(N, __M),
                        horizontalalignment='left',
                        verticalalignment='top',
                        transform=ax.transAxes,
                        fontname='sans-serif',
                        fontweight='normal',
                        fontsize=16)

    # Raw string: "\s" is not a valid Python escape, and a non-raw literal
    # triggers a SyntaxWarning on recent Python versions.
    ax.text(0.02, 1-0.06, r"remove if $\sigma_{new}/\sigma_{old}<$%0.2f" % r,
                            horizontalalignment='left',
                            verticalalignment='top',
                            transform=ax.transAxes,
                            fontname='sans-serif',
                            fontweight='normal',
                            fontsize=16)

    x = np.linspace(1, 5, 25)

    for i in range(2):

        group_color = None
        for curve_distr, line_fmt, legend_tpl in removed2_curves:
            # Mean number of excluded stepsizes at every sigma multiplier.
            y = [
                np.mean(removed2(N=N, M=__M, outliers=i, sigma_multiplier=sigma_multiplier, distr=curve_distr, r=r))
                for sigma_multiplier in x
            ]

            if line_fmt is None:
                p = ax.plot(x, y, label=legend_tpl.format(i))
                group_color = p[0].get_color()
            else:
                p = ax.plot(x, np.array(y), line_fmt, color=group_color, label=legend_tpl.format(i))

    ax.legend(fontsize=16)

    # Reference level: one excluded stepsize on average.
    ax.plot([x[0],x[-1]], [1,1], "r--")

    fig.savefig("avg_removed2_N{}_M{}_r{}.png".format(N,__M,r))

In [None]:
N = 1000

# (distribution, line style, legend template). The first entry is drawn with
# the default style and fixes the colour reused by the other curves that
# belong to the same `outliers` setting.
removed2_curves = [
    (("SG", 1),          None, "outliers={} (SG,1)"),
    (("DG", 0.9, 1, 10), "--", "outliers={} (DG,0.9,1,10)"),
    (("SGp", 1, 3),      "-.", "outliers={} (SGp,1, 3)"),
    (("SGp", 1, 10),     ":",  "outliers={} (SGp,1, 10)"),
]

# One figure per acceptance ratio r, scanned over sigma multipliers 1..25.
for r in [0.25, 0.35, 0.5]:

    fig = plt.figure(figsize=(10,10))
    fig.patch.set_facecolor('white')
    ax = fig.add_subplot(111)

    ax.set_xlabel("Sigma multiplyer", fontsize=16)
    ax.set_ylabel("Average number of excluded stepsizes", fontsize=16)
    ax.tick_params(axis='y', which='major', direction="in", labelsize=16, pad = 8)
    ax.tick_params(axis='x', which='major', direction="in", labelsize=16, pad = 12)

    ax.text(0.02, 1-0.02, "N={} M={}".format(N, __M),
                        horizontalalignment='left',
                        verticalalignment='top',
                        transform=ax.transAxes,
                        fontname='sans-serif',
                        fontweight='normal',
                        fontsize=16)

    # Raw string: "\s" is not a valid Python escape, and a non-raw literal
    # triggers a SyntaxWarning on recent Python versions.
    ax.text(0.02, 1-0.06, r"remove if $\sigma_{new}/\sigma_{old}<$%0.2f" % r,
                            horizontalalignment='left',
                            verticalalignment='top',
                            transform=ax.transAxes,
                            fontname='sans-serif',
                            fontweight='normal',
                            fontsize=16)

    x = np.linspace(1, 25, 25)

    for i in range(2):

        group_color = None
        for curve_distr, line_fmt, legend_tpl in removed2_curves:
            # Mean number of excluded stepsizes at every sigma multiplier.
            y = [
                np.mean(removed2(N=N, M=__M, outliers=i, sigma_multiplier=sigma_multiplier, distr=curve_distr, r=r))
                for sigma_multiplier in x
            ]

            if line_fmt is None:
                p = ax.plot(x, y, label=legend_tpl.format(i))
                group_color = p[0].get_color()
            else:
                p = ax.plot(x, np.array(y), line_fmt, color=group_color, label=legend_tpl.format(i))

    ax.legend(fontsize=16)

    # Reference level: one excluded stepsize on average.
    ax.plot([x[0],x[-1]], [1,1], "r--")

    fig.savefig("avg_removed2_N{}_M{}_r{}_wide.png".format(N,__M,r))

In [None]:
# Spot check of removed2 for one double-Gaussian setting: print the mean number
# of excluded stepsizes, then histogram the per-trial counts (bins centred on 0..9).
x = removed2(
    N=1000,
    M=__M,
    outliers=1,
    sigma_multiplier=2.5,
    distr=("DG", 0.9, 1, 3),
)
mean_excluded = np.mean(x)
print(mean_excluded)
integer_centred_bins = np.arange(11) - 0.5
plt.hist(x, bins=integer_centred_bins, histtype='step')