In [61]:
import numpy as np
from scipy import stats

In [62]:
# Significance level: a regression p-value at or below this threshold is
# treated as a significant linear dependency by the cells below.
STAT_SIGNIFICANCE = 0.05

In [63]:
def drop_element_if_not_linear(x, y, significance=None):
    """Greedily keep the points of (x, y) that preserve a significant linear fit.

    Points are visited in order. Each one is tentatively appended to the kept
    set, a linear regression is run on the kept set, and the point is removed
    again if the regression p-value exceeds the significance level.

    The first two points are kept without a test: with a single point the
    regression is undefined, and two points always lie exactly on a line, so
    a p-value only carries information from three points onwards.

    The regression is recomputed on the whole kept set for every candidate
    point, so the running time grows quadratically with the input length.

    Parameters
    ----------
    x, y : array_like
        One-dimensional samples of equal length.
    significance : float, optional
        Largest p-value still treated as significant. Defaults to the
        module-level ``STAT_SIGNIFICANCE``.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        The kept x values and y values, in their original order.

    Raises
    ------
    ValueError
        If ``x`` and ``y`` are not one-dimensional arrays of the same shape.
    """
    x = np.asarray(x)
    y = np.asarray(y)
    if x.ndim != 1 or x.shape != y.shape:
        raise ValueError("x and y must be one-dimensional and of equal length")
    if significance is None:
        significance = STAT_SIGNIFICANCE

    min_points = 3  # smallest sample for which the p-value is informative
    xs = []
    ys = []
    for x_i, y_i in zip(x, y):
        xs.append(x_i)
        ys.append(y_i)
        if len(xs) < min_points:
            # Too few points to test; accept the candidate unconditionally.
            continue
        _, _, _, p_value, _ = stats.linregress(xs, ys)
        if p_value > significance:
            # The candidate destroyed the significance of the fit: undo it.
            xs.pop()
            ys.pop()
    return np.array(xs), np.array(ys)

In [64]:
# Default number of additional points dropped per attempt.
STEP = 10

def drop_corner_elements(x, y, step=None, significance=None):
    """Drop the points farthest from the diagonal until the fit is significant.

    Every point is ranked by its squared difference ``(x - y)**2``, largest
    first (ties: larger index first). The function then removes the top
    ``step``, ``2 * step``, ... ranked points and returns the first remainder
    whose linear-regression p-value is at or below the significance level.

    A remainder of fewer than three points is never tested: two points always
    lie exactly on a line, so such a "fit" says nothing about the data.

    Parameters
    ----------
    x, y : array_like
        One-dimensional samples of equal length.
    step : int, optional
        Number of additional points dropped per attempt. Defaults to the
        module-level ``STEP``.
    significance : float, optional
        Largest p-value still treated as significant. Defaults to the
        module-level ``STAT_SIGNIFICANCE``.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray) or None
        The remaining x and y values, or ``None`` when no tested number of
        dropped points gives a significant fit (this includes inputs with no
        more than ``step`` points). Callers must check for ``None`` before
        unpacking.

    Raises
    ------
    ValueError
        If ``x`` and ``y`` are not one-dimensional arrays of the same shape.
    """
    x = np.asarray(x)
    y = np.asarray(y)
    if x.ndim != 1 or x.shape != y.shape:
        raise ValueError("x and y must be one-dimensional and of equal length")
    if step is None:
        step = STEP
    if significance is None:
        significance = STAT_SIGNIFICANCE

    n = x.size
    min_points = 3  # smallest remainder for which the p-value is informative
    sq_dist = (x - y) ** 2
    # lexsort orders ascending by (sq_dist, index); reversing it gives the
    # descending (distance, index) ranking.
    ranking = np.lexsort((np.arange(n), sq_dist))[::-1]

    # Try out increasingly long prefixes of the ranking to find the minimum
    # number of dropped points that makes the fit significant.
    for prefix_length in range(step, n, step):
        if n - prefix_length < min_points:
            break
        dropped = ranking[:prefix_length]
        xs = np.delete(x, dropped)
        ys = np.delete(y, dropped)
        _, _, _, p_value, _ = stats.linregress(xs, ys)
        if p_value <= significance:
            return xs, ys
    return None

In [65]:
# Experiment: draw pairs of independent uniform samples, trim each pair with
# drop_corner_elements, and verify that every trimmed pair shows a
# statistically significant linear dependency. Also track the smallest
# trimmed sample size seen.
SEED = 0  # fixed seed so a fresh "Restart & Run All" reproduces the output

checked = True
iters = 100
N = 10000
min_size = N
rng = np.random.RandomState(SEED)
for j in range(iters):
    n = N
    x = rng.uniform(size=n)
    y = rng.uniform(size=n)
    trimmed = drop_corner_elements(x, y)
    if trimmed is None:
        # The helper found no number of dropped points that gives a
        # significant fit; unpacking None would raise a TypeError.
        checked = False
        break
    x, y = trimmed
    min_size = min(min_size, x.size)

    # Re-check the trimmed sample independently of the helper.
    slope, intercept, r_value, p_value, std_err = stats.linregress(x, y)
    if p_value > STAT_SIGNIFICANCE:
        checked = False
        break

print("Linear dependency:", checked)
print("Min data volume:", min_size)

Linear dependency: True
Min data volume: 9840
