In [3]:
from helper import timeit
from scipy.stats import f
from scipy.stats import f_oneway, levene, gaussian_kde
from numpy import random
import numpy as np

# "F test for Variance"

@timeit
def test_equality_of_variance(data1, data2):
    n1 = len(data1)
    n2 = len(data2)
    var1 = sum([(x - sum(data1)/n1)**2 for x in data1])/(n1 - 1)
    print("var1 summed")
    var2 = sum([(x - sum(data2)/n2)**2 for x in data2])/(n2 - 1)
    print("var2 summed")
    F = var1/var2
    print('F-ratio:', F)
    
    alpha = 0.05
    if F < 1:
        F = 1/F
    df1 = n1 - 1
    df2 = n2 - 1
    p_value = 1 - f.cdf(F, df1, df2)
    print('p-value:', p_value)
    
    if p_value > alpha:
        print('The variance in the two groups is equal (fail to reject H0)')
    else:
        print('The variance in the two groups is not equal (reject H0)')


In [36]:
@timeit
def test_equality_of_mean_and_variance(data1, data2, alpha=0.05):
    statistic, pvalue_anova = levene(data1, data2)
    statistic, pvalue_levene = levene(data1, data2)
    if pvalue_levene > alpha:
        print('The variance is likely equal (fail to reject H0)')
    else:
        print('The variance is likely not equal (reject H0)')
   


In [27]:
data1 = tuple(random.normal(0,2,25000))
data2 = tuple(random.normal(40,2,25000))

In [37]:
test_equality_of_mean_and_variance(data1, data2)

The variance is likely equal (fail to reject H0)
Function test_equality_of_mean_and_variance took 0.012729 seconds


In [20]:
# Function test_equality_of_variance took 36.279090 seconds
sample = np.random.normal(10,10,10)
print(gaussian_kde(sample).weights)


[0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1]


In [2]:
100*(1+0.03)**10

134.39163793441222

In [74]:
import math, numpy
from helper import timeit

interquartileRange = lambda array: numpy.quantile(array, 0.75)-numpy.quantile(array, 0.25)

@timeit
def silvermansBandwidth(data):
  return 0.9 * min(numpy.std(data), interquartileRange(data)) * (len(data) ** -0.20)

@timeit
def epanechnikov(bandwidth):
  def kernel(x):
    if abs(x/bandwidth) <= 1:
      return 0.75 * (1 - x * x) / bandwidth
    else:
      return 0
  return kernel


generate_thresholds = lambda array: numpy.linspace(numpy.min(array), numpy.max(array), 10)

@timeit
def kde(kernel, thresholds, data):
  def calc_mean(t):
    return sum(kernel(t - d) for d in data) / len(data)
  return [(t, calc_mean(t)) for t in thresholds]

In [69]:
data = numpy.random.normal(0,1,1_000_000)

In [67]:
bandwidth = silvermansBandwidth(data)
thresholds = sorted(generate_thresholds(data))

In [75]:
kde(epanechnikov(bandwidth),thresholds, data)

Function epanechnikov took 0.000001 seconds
Function kde took 4.810963 seconds


[(-5.759347130475287, 0.0),
 (-4.4790331661189295, 3.956124837232064e-05),
 (-3.198719201762572, 0.003905042363852214),
 (-1.9184052374062146, 0.09638123310421638),
 (-0.6380912730498576, 0.4861610085291486),
 (0.6422226913064994, 0.4851170214590126),
 (1.9225366556628574, 0.09542728155969954),
 (3.2028506200192135, 0.003984074384838433),
 (4.483164584375571, 3.9556842338140555e-05),
 (5.763478548731929, 0.0)]

In [58]:
from scipy.stats import gaussian_kde


In [76]:
tuple(zip(thresholds,gaussian_kde(data, bandwidth).evaluate(thresholds)))

((-5.759347130475287, 2.335764603226546e-53),
 (-4.4790331661189295, 2.388013947434793e-05),
 (-3.198719201762572, 0.002553250088841075),
 (-1.9184052374062146, 0.06426199607240565),
 (-0.6380912730498576, 0.3241262401366447),
 (0.6422226913064994, 0.3229418677099397),
 (1.9225366556628574, 0.06344610376992335),
 (3.2028506200192135, 0.002609358121615744),
 (4.483164584375571, 2.3894970160758568e-05),
 (5.763478548731929, 5.527754575013309e-109))