In [1]:
import numpy as np
import holoviews as hv
import pandas as pd
from scipy import stats
from scipy import special

In [2]:
#!conda install holoviews

In [3]:
#!jupyter labextension install @pyviz/jupyterlab_pyviz


In [4]:
hv.extension('bokeh')

In [5]:
from collections import Counter

In [6]:
from matplotlib import pyplot as plt
%matplotlib inline

In [7]:

def vcorrcoef(X,y):
    Xm = np.reshape(np.mean(X,axis=1),(X.shape[0],1))
    ym = np.mean(y)
    r_num = np.sum((X-Xm)*(y-ym),axis=1)
    r_den = np.sqrt(np.sum((X-Xm)**2,axis=1)*np.sum((y-ym)**2))
    r = r_num/r_den
    return r


def prepare_cov_matrix(rho):
    return np.array([[1., rho], [rho, 1.]])

In [8]:
### TO move
def task1_analytical(p_a=1e-4, n=100):
    x1 = (1 - 0.5 * p_a) ** (n-1) * p_a
    x2 = ( 0.5 * (1 - p_a)) ** (n-1) * (1 - p_a)
    res = x1 / (x1 + x2)
    print x1, x2
    return res

task1_analytical()

9.95062107917e-05 1.56202243507e-30


1.0

# Задание 1

In [9]:
def calculate_posterior_probabilities(n, prob_A, p1, p2):
    """
    compute P(A|B)
    
    A - [person is oracle]
    B - [n successes in n Bernouli experiments]
    p1 = P(B|A)
    p2 = P(B|~A)
    """
    vec_likelihood = np.array([p1**n, p2**n])
    vec_prior_prob = np.array([prob_A, 1 - prob_A])
    vec = vec_prior_prob * vec_likelihood
    return vec / vec.sum()


calculate_posterior_probabilities(10, 1e-4, 0.9, 0.5)

array([0.03447713, 0.96552287])

In [10]:
def task1_simulate_one_experiment(n, k, prob_oracle, p1, p2):
    """
    consider `n` people, each making `k` predictions.
    i-th person is oracle with probability `prob_oracle`
    if he is an oracle, then he makes a right guess in j-th experiment with probability `p1`
    otherwise with probability `p2`
    
    Let:
      x_i ~ Bern(prob_oracle)
      y_i|x_i ~ Bin(0.5 + 0.4 * x, k)
      z = argmax(y_i)       # we select the best person
          
    what is the histogram of y_z?
    what is     P(x_z == 1 | y, x) ?
    """
    x = np.random.binomial(n=1, p=prob_oracle, size=n)
    y = np.random.binomial(n=k, p=p2 + (p1 - p2) * x)
#    print "x.sum():", x.sum()
#    print "np.where(x)", np.where(x)[0]
    z = np.argmax(y)
    y_z = y[z]
    x_z = x[z]
    return z, y_z, x_z

In [11]:
def task1_simulate_many(k, n, ITER=10000):
    data = []
    for i in range(ITER):
        z, y_z, x_z = task1_simulate_one_experiment(n=n, k=k, prob_oracle=1e-4, p1=0.9, p2=0.5)
        data.append((z, y_z, x_z))
    
    df = pd.DataFrame(data, columns = ["z", "y_z", "x_z"])
    return df

In [13]:
data1 = []
for k in [1,2,3,4,5, 10, 20, 50, 100, 200, 400, 600, 800, 1000]:
    df = task1_simulate_many(k=k, n=100)
    prob_oracle = float(df["x_z"].sum()) / df["x_z"].size
    data1.append((k, prob_oracle))
    print k

1
2
3
4
5
10
20
50
100
200
400
600
800
1000


In [14]:
data2 = []
for n in [1,2,3,4,5, 10, 20, 50, 100, 200, 400, 600, 800, 1000]:
    df = task1_simulate_many(k=100, n=n)
    prob_oracle = float(df["x_z"].sum()) / df["x_z"].size
    data2.append((n, prob_oracle))
    print n

1
2
3
4
5
10
20
50
100
200
400
600
800
1000


In [15]:
%%opts Curve [show_grid=True tools=["hover"]]

df1 = pd.DataFrame(data1, columns=["k", "prob"])
df2 = pd.DataFrame(data2, columns=["n", "prob"])


c1 = hv.Curve(df1, kdims=["k"], vdims=["prob"]).relabel("n=100")
c1 = c1.redim.range(prob=((0, 0.02)))
c2 = hv.Curve(df2, kdims=["n"], vdims=["prob"]).relabel("k=100")
c2 = c2 #.redim.range(prob=((0, 0.02)))

c1 + c2

In [16]:


%%opts Histogram Bars [width=400 xrotation=90] {+axiswise}

hv.Histogram(np.histogram(df["z"], bins=100) ) + \
hv.Histogram(np.histogram(df["y_z"], bins=100) ) + \
hv.Histogram(np.histogram(df["x_z"], bins=100) )

# Задание 3

In [17]:
class StatisticsSlide8a(stats.rv_continuous):    
    """Let X, Y ~ N([0,0]', [[1, rho], [rho, 1]])
    Define:
    T(Z) = 1. / (2*n) * sum (x_i - y_i) ** 2
    
    Then:
    T(Z) = (1 - rho) / n * xi, where xi ~ chi2(n)
    """
    def __init__(self, rho, n, **kwargs):
        stats.rv_continuous.__init__(self, a=0, **kwargs)
        self.rho = rho
        self.n = n
        scale = (1-rho) / float(n)
        self._rv = stats.chi2(df=n, scale=scale)
    
    def _pdf(self, t):
        return self._rv.pdf(t)
    
    def _cdf(self, t):
        return self._rv.cdf(t)

    def _rvs(self, *vargs, **kwargs):
        n = self.n
        rho = self.rho
        shape = (n,) + self._size
        z = np.random.multivariate_normal(mean=np.zeros(2), cov=prepare_cov_matrix(rho=rho), size=shape)
        z = np.rollaxis(z, len(z.shape)-1, 0)
        
        x = z[0,:]
        y = z[1,:]
        res = 1. / (2. * n) * np.sum((x - y)**2, axis=0)
        return res


class SampleCorrelationOfTwoIndependentNormals(stats.rv_continuous):
    
    """In the special case when {\displaystyle \,\rho =0} \,\rho =0
    https://en.wikipedia.org/wiki/Pearson_correlation_coefficient#Using_the_exact_distribution
    """
    def __init__(self, rho, n, **kwargs):
        stats.rv_continuous.__init__(self, a=-1, b=1, **kwargs)
        self.rho = rho
        self.n = n
    
    def _pdf(self, r):
        assert self.rho == 0
        n = self.n
        return (1 - r**2) ** (0.5 * (n-4)) / special.beta(0.5, 0.5 * (n-2))
    
    def _rvs(self, *vargs, **kwargs):
        n = self.n
        rho = self.rho
        shape = (n,) + self._size
        z = np.random.multivariate_normal(mean=np.zeros(2), cov=prepare_cov_matrix(rho=rho), size=shape)
        z = np.rollaxis(z, len(z.shape)-1, 0)
        x = z[0,:]
        y = z[1,:]
        res = 1. / n * np.sum(x * y, axis=0)
        return res

In [18]:
rv = SampleCorrelationOfTwoIndependentNormals(0., 100)

rv.rvs(size=10)

array([-0.04126727,  0.02332062,  0.04256878, -0.00470834, -0.14931025,
        0.31091468, -0.17780925,  0.03334715,  0.06710224,  0.01049417])

In [19]:
    


StatisticsSlide8b = SampleCorrelationOfTwoIndependentNormals

rv1 = StatisticsSlide8a(n=100, rho=0)
rv2 = StatisticsSlide8a(n=100, rho=0.5)


xs = np.linspace(-1, 3, 101)
ys1 = rv1.pdf(xs)
ys2 = rv2.pdf(xs)

%time sample1 = rv1.rvs(size=(55, 10, 5)) 


#hv.Curve((xs, ys1), label="rho=0.") * hv.Curve((xs, ys2), label="rho=0.5")

print sample1.shape

hist1 = hv.Distribution(sample1.ravel())
hist1

Wall time: 50 ms
(55L, 10L, 5L)


In [21]:



def compute_T_of_Z(rho, n):
    z = np.random.multivariate_normal(mean=np.zeros(2), cov=prepare_cov_matrix(rho=rho), size=n)
    x = z[:, 0]
    y = z[:, 1]
#    print z.shape
#    rv = StatisticsSlide8a(n=n, rho=rho)
    res = {}
    res["rho"] = rho
    res["slide7"] = 1. / np.sqrt(2. * n) * (x - y).sum()
    res["slide8a"] = 1. / (2. * n) * np.sum((x - y)**2)
    res["slide8b"] = np.sum(x * y) / float(n)
#    res["slide8a_"] = rv.rvs()
    return res

In [22]:
ITER = 10000 # number of experiments


N = 100

data = []
for i in range(ITER):
    x = compute_T_of_Z(rho=0.5, n=N)
    data.append(x)
    x = compute_T_of_Z(rho=0.0, n=N)
    data.append(x)

    
df = pd.DataFrame(data)
df.head()

Unnamed: 0,rho,slide7,slide8a,slide8b
0,0.5,-0.174742,0.483413,0.543496
1,0.0,-0.73545,1.012035,-0.086067
2,0.5,-0.086423,0.562352,0.513728
3,0.0,-2.265286,1.112887,-0.178823
4,0.5,0.171153,0.322322,0.706523


## Slide 7

In [23]:
layout = []

for rho in [0, 0.5]:
    rv = stats.norm(scale=np.sqrt(1 - rho))
    x = np.linspace(-3, 3, 101)
    y = rv.pdf(x=x)

    fig = (hv.Histogram(np.histogram(df[df.rho == rho]["slide7"], bins=1000, normed=True)) * hv.Curve((x,y))).relabel(str(rho))
    layout.append(fig)

In [24]:
layout[0] + layout[1]

In [25]:
%%opts Curve [width=500]
%%opts VLine.interval (color='black')


def generate_normal_curve(variance):
    scale = np.sqrt(variance)
    rv = stats.norm(scale=scale)
    x = np.linspace(-5 * scale, 5 * scale, 101)
    y = rv.pdf(x=x)
    return (x, y)

def generate_normal_inner_critical_region(variance, alpha):
    scale = np.sqrt(variance)
    rv = stats.norm(scale=scale)
    return (rv.ppf(0.5 - alpha*0.5), rv.ppf(0.5 + alpha*0.5))


rho = 0.1
lv, rv = generate_normal_inner_critical_region(1. - 0., 0.05) 



hv.Curve(generate_normal_curve(1 - 0.), label="rho=0") * hv.Curve(generate_normal_curve(1 - rho), label="rho=%.1f" % rho) * hv.VLine(lv, group="interval") * hv.VLine(rv, group="interval")

In [26]:
%%opts Curve [width=600]

def compute_prob_H1_H1(rho, alpha=0.05):
    lv, rv = generate_normal_inner_critical_region(1. - 0., alpha) 
    scale = np.sqrt(1 - rho)
    rv_H1 = stats.norm(scale=scale)
    return rv_H1.cdf(rv) - rv_H1.cdf(lv)

data = []
for rho in np.linspace(0, 1, 1001):
    res = compute_prob_H1_H1(rho=rho)
    data.append((rho, res))
    
data = pd.DataFrame(data, columns=["rho", "p(H_1|H_1)"])
hv.Curve(data, "rho", )

  x = np.asarray((x - loc)/scale, dtype=dtyp)


## Slide 8

### a

$T(Z) = \frac{1}{2n} \sum_{i=1}^n (x_i - y_i)^2$

In [27]:
%%opts Histogram [width=600] (alpha=0.05)
%%opts Curve.baseline (line_width=5 color='black')

def generate_stat_distrib_slide8a(rho, N):
    rv = stats.chi2(df=N, scale=(1-rho) / float(N))
    x = np.linspace(0, 2, 1001)
    y = rv.pdf(x=x)
    return (x,y)

print rho 
hv.Histogram(np.histogram(df[df.rho == 0.5]["slide8a"], bins=1000, normed=True)) \
* hv.Histogram(np.histogram(df[df.rho == 0]["slide8a"], bins=1000, normed=True)) \
* hv.Curve(generate_stat_distrib_slide8a(rho, N), label="rho=%.2f" % rho) \
* hv.Curve(generate_stat_distrib_slide8a(0, N), label="rho=0", group="baseline") \
* hv.Curve(generate_stat_distrib_slide8a(-rho, N), label="rho=%.2f" % -rho) 



1.0


  x = np.asarray((x - loc)/scale, dtype=dtyp)
  x = np.asarray((x - loc)/scale, dtype=dtyp)
  return (self.a <= x) & (x <= self.b)
  return (self.a <= x) & (x <= self.b)


In [28]:
def compute_prob_H1_H1_slide8a(rho, N, alpha=0.05):
    rv0 = stats.chi2(df=N, scale=(1-0.) / float(N))
    rvA = stats.chi2(df=N, scale=(1-rho) / float(N))
    x0 =  rv0.ppf(alpha)
    return rvA.cdf(x0)

compute_prob_H1_H1_slide8a(0.5, N=1000)   



1.0

In [29]:
data = []
for rho in np.linspace(0, 1, 11)[1:-1]:
    for n in [10, 20, 100, 500, 2500, 10000]:
        power = compute_prob_H1_H1_slide8a(rho, n, 0.05)
        data.append((rho, n, power))
        
df = pd.DataFrame(data, columns=["rho", "n", "power"])
df.head()

Unnamed: 0,rho,n,power
0,0.1,10,0.071318
1,0.1,20,0.085881
2,0.1,100,0.171897
3,0.1,500,0.496445
4,0.1,2500,0.981568


In [30]:
%%opts NdOverlay [legend_position="right" width=600 show_grid=True]
%%opts NdOverlay.a [logx=True]


#df.pivot_table(columns="n", values="power", index="rho")
d = hv.Dataset(df, kdims=["n", "rho"])
d.to.curve(kdims=["n"]).overlay(group="a").relabel("power(n)") + d.to.curve(kdims=["rho"]).overlay(group="b").relabel("power(rho)")

### b
$T(Z) = \frac{1}{n} \sum_{i=1}^n x_i y_i  $

In [31]:
%%opts Histogram [width=600] (alpha=0.05)
%%opts Curve.baseline (line_width=5 color='black')
%%opts Curve [show_grid=True]

def compute_variance_of_product_of_standard_normals(rho):
    """empirically
    
    Analytical formula is 1 + rho**2
    """
    rv = stats.multivariate_normal(cov=prepare_cov_matrix(rho))
    z = rv.rvs(100000)
    x = z[:, 0]
    y = z[:, 1]
    variance = np.var(x * y)
    return variance

def build_analytical_distribution_of_T(rho, N):
#    stddev_xy = np.sqrt(compute_variance_of_product_of_standard_normals(rho))
    stddev_xy = np.sqrt(1 + rho**2)
    rv = stats.norm(loc=rho, scale= stddev_xy / np.sqrt(N))
    return rv


def generate_stat_distrib_slide8b(rho, N, x_min=0, x_max=2.):
    rv = build_analytical_distribution_of_T(rho, N)
    x = np.linspace(x_min, x_max, 1001)
    y = rv.pdf(x=x)
    return (x,y)

print N

x_min = df["slide8b"].min()
x_max = df["slide8b"].max()
print x_min, x_max


hv.Histogram(np.histogram(df[df.rho == 0.5]["slide8b"], bins=1000, normed=True), group="alternative")  \
* hv.Histogram(np.histogram(df[df.rho == 0]["slide8b"], bins=1000, normed=True), group="baseline") \
* hv.Curve(generate_stat_distrib_slide8b(0.5, N, x_min, x_max), label="rho=0.05", group="alternative")  \
* hv.Curve(generate_stat_distrib_slide8b(0, N, x_min, x_max), label="rho=0", group="baseline")


100


KeyError: 'slide8b'

In [32]:
compute_variance_of_product_of_standard_normals(0.5)

1.2474335984524778

In [33]:
df.head()

Unnamed: 0,rho,n,power
0,0.1,10,0.071318
1,0.1,20,0.085881
2,0.1,100,0.171897
3,0.1,500,0.496445
4,0.1,2500,0.981568


In [34]:
def compute_prob_H1_H1_slide8b_analytically(rho, N, alpha=0.05):
    rv0 = build_analytical_distribution_of_T(0., N)
    rvA = build_analytical_distribution_of_T(rho, N)
    x0 =  rv0.ppf(1 - alpha)
    return 1. - rvA.cdf(x0)

def compute_prob_H1_H1_slide8b_empirically(rho, N, alpha=0.05):
    def compute_t_of_z(rho, n):
        rv = stats.multivariate_normal(cov=prepare_cov_matrix(rho))
        z = rv.rvs(N)
        x = z[:, 0]
        y = z[:, 1]
        return np.sum(x * y) / float(n)
    data = []
    for _ in range(1000):
        t_0 = compute_t_of_z(0, N)
        t_rho = compute_t_of_z(rho, N)
        data.append((t_0, t_rho))
    df = pd.DataFrame(data, columns=["H0", "H1"])
    x0 = df["H0"].quantile(1. - alpha)
#    plt.hist(df["H0"], alpha=0.5)
#    plt.hist(df["H1"])

    res = (df["H1"] > x0).sum() / float(len(df["H1"]))
    return res
    
def task3_power_analytical(rho, n):
    rv = stats.norm()
    res = 1. - rv.cdf(1. / np.sqrt(1 + rho**2) * (-np.sqrt(n) * rho + rv.ppf(0.95)))
    return res

print compute_prob_H1_H1_slide8b_analytically(0.1, N=100)   
print compute_prob_H1_H1_slide8b_empirically(0.1, N=100)   



0.260549145391747
0.245


In [196]:
df.rho.value_counts().get(0.2)

In [215]:
Ns = [10, 20, 100, 500, 2500, 10000]
RHOs = list(np.linspace(0, 1, 11)[0:-1]) + [0.05, 0.01, 0.02]
data = []
cached = None #df_power_b
if cached is not None:
    data = cached.values.tolist()
for rho in RHOs[:]:
    if cached is not None and cached.rho.value_counts().get(rho, 0) == len(Ns):
        print "skipping", rho
        continue
    print "starting", rho
    for n in Ns:
        power_analytical = compute_prob_H1_H1_slide8b_analytically(rho, n, 0.05)
        power_empirical  = compute_prob_H1_H1_slide8b_empirically(rho, n, 0.05)
        power_8a  = compute_prob_H1_H1_slide8a(rho, n, 0.05)
        power_analytical_explicit = task3_power_analytical(rho, n)
        data.append([rho, n, power_analytical, power_empirical, power_8a, power_analytical_explicit])
    print "done"
        
df_power_b = pd.DataFrame(data, columns=["rho", "n", "power_a", "power_e", "power_lecture", "power_a_explicit"])
df_power_b["diff(power(lecture),power(hw))"] = df_power_b["power_lecture"] - df_power_b["power_a"]
df_power_b.head()

starting 0.0
done
starting 0.1
done
starting 0.2
done
starting 0.3
done
starting 0.4
done
starting 0.5
done
starting 0.6
done
starting 0.7
done
starting 0.8
done
starting 0.9
done
starting 0.05
done
starting 0.01
done
starting 0.02
done


Unnamed: 0,rho,n,power_a,power_e,power_lecture,power_a_explicit,"diff(power(lecture),power(hw))"
0,0.0,10,0.05,0.05,0.05,0.05,-1.387779e-17
1,0.0,20,0.05,0.042,0.05,0.05,-2.0816680000000002e-17
2,0.0,100,0.05,0.057,0.05,0.05,-5.5511150000000004e-17
3,0.0,500,0.05,0.05,0.05,0.05,-2.0816680000000002e-17
4,0.0,2500,0.05,0.059,0.05,0.05,3.330669e-16


In [237]:
df_power_b = pd.DataFrame(data, columns=["rho", "n", "power_a", "power_e", "power_lecture", "power_a_explicit"])
df_power_b["diff(power(lecture),power(hw))"] = df_power_b["power_lecture"] - df_power_b["power_a"]
df_power_b["diff(power(empirical),power(analytical))"] = df_power_b["power_e"] - df_power_b["power_a"]
df_power_b.head()

Unnamed: 0,rho,n,power_a,power_e,power_lecture,power_a_explicit,"diff(power(lecture),power(hw))","diff(power(empirical),power(analytical))"
0,0.0,10,0.05,0.05,0.05,0.05,-1.387779e-17,-4.1633360000000003e-17
1,0.0,20,0.05,0.042,0.05,0.05,-2.0816680000000002e-17,-0.008
2,0.0,100,0.05,0.057,0.05,0.05,-5.5511150000000004e-17,0.007
3,0.0,500,0.05,0.05,0.05,0.05,-2.0816680000000002e-17,-4.1633360000000003e-17
4,0.0,2500,0.05,0.059,0.05,0.05,3.330669e-16,0.009


In [239]:
%%opts NdOverlay [legend_position="right" width=600 show_grid=True]
%%opts NdOverlay.power_of_n [logx=True logy=False]
%%opts Curve [tools=["hover"]]


interesting_rhos = sorted([0.05] + list(np.linspace(0, 1, 11)))

#df.pivot_table(columns="n", values="power", index="rho")
power_b = hv.Dataset(df_power_b, kdims=["n", "rho"])
(
power_b.select(rho=interesting_rhos).to.curve(kdims=["n"], vdims=["power_a", "power_e", "power_lecture"]).overlay(group="power_of_n").relabel("analytical") 
+ power_b.to.curve(kdims=["rho"], vdims=["power_a", "power_e", "power_lecture"]).overlay(group="power_of_rho").relabel("analytical")
+ power_b.select(rho=interesting_rhos).to.curve(kdims=["n"], vdims=["power_e", "power_a", "power_lecture"]).overlay(group="power_of_n").relabel("empirical") 
+ power_b.to.curve(kdims=["rho"], vdims=["power_e", "power_a", "power_lecture"]).overlay(group="power_of_rho").relabel("empirical")
+ power_b.select(rho=interesting_rhos).to.curve(kdims=["n"], vdims=["power_lecture", "power_e", "power_a"]).overlay(group="power_of_n").relabel("from_lecture") 
+ power_b.to.curve(kdims=["rho"], vdims=["power_lecture", "power_e", "power_a"]).overlay(group="power_of_rho").relabel("from_lecture")

+ power_b.select(rho=interesting_rhos).to.curve(kdims=["n"], vdims=["diff(power(lecture),power(hw))", "power_lecture", "power_e"]).overlay(group="power_of_n").relabel("diff(lecture, hw)") 
+ power_b.to.curve(kdims=["rho"], vdims=["diff(power(lecture),power(hw))","power_lecture", "power_e"]).overlay(group="power_of_rho").relabel("diff(lecture, hw)")
+ power_b.select(rho=interesting_rhos).to.curve(kdims=["n"], vdims=["diff(power(empirical),power(analytical))", "power_lecture", "power_e"]).overlay(group="power_of_n").relabel("diff(empirical, analytical)") 
+ power_b.to.curve(kdims=["rho"], vdims=["diff(power(empirical),power(analytical))","power_lecture", "power_e"]).overlay(group="power_of_rho").relabel("diff(empirical, analytical)")

).cols(2)

In [216]:
df_power_b["diff(power(lecture),power(hw))"] = df_power_b["power_lecture"] - df_power_b["power_a"]
df_power_b

Unnamed: 0,rho,n,power_a,power_e,power_lecture,power_a_explicit,"diff(power(lecture),power(hw))"
0,0.00,10,0.050000,0.050,0.050000,0.050000,-1.387779e-17
1,0.00,20,0.050000,0.042,0.050000,0.050000,-2.081668e-17
2,0.00,100,0.050000,0.057,0.050000,0.050000,-5.551115e-17
3,0.00,500,0.050000,0.050,0.050000,0.050000,-2.081668e-17
4,0.00,2500,0.050000,0.059,0.050000,0.050000,3.330669e-16
5,0.00,10000,0.050000,0.055,0.050000,0.050000,-4.302114e-16
6,0.10,10,0.093079,0.082,0.071318,0.093079,-2.176082e-02
7,0.10,20,0.116690,0.089,0.085881,0.116690,-3.080922e-02
8,0.10,100,0.260549,0.277,0.171897,0.260549,-8.865256e-02
9,0.10,500,0.721828,0.733,0.496445,0.721828,-2.253831e-01


2.2204460492503131e-16

In [210]:
def task3_power_analytical(rho, n):
    rv = stats.norm()
    res = 1. - rv.cdf(1. / np.sqrt(1 + rho**2) * (-np.sqrt(n) * rho + rv.ppf(0.95)))
    return res

task3_power_analytical(0.1, 100)

0.26054914539174701

In [208]:
data2 = []
for rho in RHOs[:]:
    for n in Ns:
        power_8a  = task3_power_analytical(rho, n, 0.05)
        data2.append([rho, n, power_analytical, power_empirical, power_8a])
    print "done"

0.94999999999999996

In [229]:
%%opts Curve [width=500]

rv = stats.norm()
xs = np.linspace(-10, 10, 101)
ys1 = special.expit(1.8 * xs)
ys2 = rv.cdf(xs)

hv.Curve((xs, ys1), label="approx") * hv.Curve((xs, ys2), label="real")

# Задание 2

In [203]:
n = 20
p = 0.1
rv = stats.binom(n=n, p=p)

In [204]:
%%opts VLine (color='red')
x = np.arange(n+1)
y = rv.pmf(x)
hv.Bars((x, y)) * hv.VLine(n * p)

# Задание 4

In [48]:
n = 12
rho = 0.97


def task4_single_rho(n, rho_0, n_iter=100000, step_size=1000):
    rv = stats.norm()
    x = rv.rvs(n)
    i = 0
    data = []
    for i in xrange(n_iter):
        y = rv.rvs((step_size, n))
        #print x
        #print y
        c = vcorrcoef(y, x)
        #print c
        idx = np.where(c > rho_0)[0]
        if len(idx) > 0:
            return i * step_size + idx[0] + 1
    return None

def task4_multiple_rhos(n, rhos, **kwargs):
    from time import time
    start = time()
    res = []
    for rho in rhos:
        k = task4_single_rho(n, rho, **kwargs)
        res.append(k)
    elapsed = time() - start
    print "Elapsed:", elapsed
    return np.array(res)

rhos = np.linspace(0, 0.99, 100)[:-1]
#print len(rhos)
#for i in range(3):
#    %time print i, np.sum(task4_multiple_rhos(n=12, rhos=rhos, n_iter=1000000000, step_size=8000))

    
#%time task4_single_rho(n, 0.97, step_size=10000)

In [49]:
from concurrent.futures import ProcessPoolExecutor, as_completed

ppe = ProcessPoolExecutor()

In [50]:
tasks = []

for i in range(100):
    task = ppe.submit(task4_multiple_rhos,n=12, rhos=rhos, n_iter=10000000000, step_size=8000)
    tasks.append(task)

data = []
for task in as_completed(tasks):
    res = task.result()
    if res is None:
        print "None"
        continue
    data.append(res)
    
        



Elapsed: 64.0836968422
Elapsed: 64.0879459381
Elapsed: 64.1207511425
Elapsed: 64.1907069683
Elapsed: 69.018914938
Elapsed: 69.1955850124
Elapsed: 69.2720820904
Elapsed: 69.3372719288
Elapsed: 14.2615749836
Elapsed: 14.1597380638
Elapsed: 14.3811891079
Elapsed: 14.6177198887
Elapsed: 5.55483198166
Elapsed: 5.5459561348
Elapsed: 5.55857992172
Elapsed: 5.52711296082
Elapsed: 149.011790991
Elapsed: 148.907444
Elapsed: 150.136615038
Elapsed: 150.368600845
Elapsed: 12.0990378857
Elapsed: 12.0744040012
Elapsed: 12.011318922
Elapsed: 12.0358018875
Elapsed: 66.5054020882
Elapsed: 66.599629879
Elapsed: 67.0057828426
Elapsed: 67.0453660488
Elapsed: 44.962857008
Elapsed: 45.2738101482
Elapsed: 45.1175508499
Elapsed: 45.037416935
Elapsed: 62.6123011112
Elapsed: 62.3166451454
Elapsed: 62.8809459209
Elapsed: 63.0645430088
Elapsed: 23.3430700302
Elapsed: 23.1628308296
Elapsed: 23.1454870701
Elapsed: 23.2081699371
Elapsed: 44.3164279461
Elapsed: 44.3223118782
Elapsed: 44.2191791534
Elapsed: 44.45145797

In [51]:
Ks_simulated = np.array(data).mean(0)

In [None]:
f = hv.Histogram(np.histogram(df["c"], bins=100))
f.redim.range(x=(-1, 1))

In [None]:
rv = stats.norm(loc=0, scale=1./np.sqrt(12))

p =  (1. - rv.cdf(0.97))
print "p =", p
print "E =", 1./p
print "med = ", -1 / np.log2(1 - p)

In [71]:
%%opts Curve [logy=True width=500 tools=["hover"]]
%%opts NdOverlay [legend_position='top']
%%opts VLine (color='black' line_width=0.3)


 


def task4_find_analytical(rho, n, method):
    assert method in ["exact", "approximate"]
    if method == "approximate":
        rv = stats.norm(loc=0, scale=1./np.sqrt(n))
    elif method == "exact":
        rv = SampleCorrelationOfTwoIndependentNormals(n=n)
    else:
        raise ValueError, "unsupported method='%s'" % method
    p =  (1. - rv.cdf(rho))
    E = 1. / p
    return E
    


rhos = np.linspace(0, 0.99, 100)
Ks_approx = map(lambda x: task4_find_analytical(x, n=12, method="approximate"), rhos)
Ks_exact = map(lambda x: task4_find_analytical(x, n=12, method="exact"), rhos)

df1 = pd.DataFrame({"method": "approximate", "rho": rhos, "K": Ks_approx})
df2 = pd.DataFrame({"method": "exact", "rho": rhos, "K": Ks_exact})
df3 = pd.DataFrame({"method": "simulated", "rho": rhos[:len(Ks_simulated)], "K": Ks_simulated})
df = pd.concat([df1, df2, df3], ignore_index=True)
df

print task4_find_analytical(rho=0.97, n=12, method="exact")

#f1 = hv.Curve((rhos, Ks_approx), kdims="rho", vdims="K", label="approximate").redim.range(rho=(0, 1))
#f2 = hv.Curve((rhos, Ks_exact), kdims="rho", vdims="K", label="exact").redim.range(rho=(0, 1))
#f3 = hv.Curve((rhos[:len(Ks_simulated)], Ks_simulated), kdims="rho", vdims="K", label="simulated").redim.range(rho=(0, 1))

d_rho = hv.Dataset(df, vdims=["K"], kdims=["rho", "method"])

d_rho.to.curve(groupby="method").overlay().redim.range(rho=(0, 1)) * hv.VLine(0.97)

#f1 * f2 * f3

10990382.1224


Unnamed: 0,K,method,rho
0,2.000000e+00,approximate,0.00
1,2.056839e+00,approximate,0.01
2,2.116928e+00,approximate,0.02
3,2.180477e+00,approximate,0.03
4,2.247710e+00,approximate,0.04
5,2.318867e+00,approximate,0.05
6,2.394208e+00,approximate,0.06
7,2.474014e+00,approximate,0.07
8,2.558586e+00,approximate,0.08
9,2.648249e+00,approximate,0.09


In [None]:
n = 12
rho = 0.97


def task4_find_K_of_rho(rho, n):
    rv = stats.norm()
    x = rv.rvs(n)
    i = 0
    while 1:
        i += 1000
        y = rv.rvs(n)
        #print x
        #print y
        c = np.corrcoef(x, y)[0, 1]
        #print c
        if c > rho:
            return i
        if i > 1e6:
            raise Exception, "ep"

task4_find_K_of_rho(0.9, n)

# Задание 6

In [56]:
def task6_sample_bivariate_normal(n, m):
    dfs = []
    rv0 = SampleCorrelationOfTwoIndependentNormals(rho=0.0, n=N)
    for rho in [0.0, 0.2]:
        rv = SampleCorrelationOfTwoIndependentNormals(rho=rho, n=n)
        df = pd.DataFrame()
        df["x"] = rv.rvs(size=m)
        df["p"] = 1. - rv0.cdf(df["x"])
        df["rho"] = rho
        dfs.append(df)
    
    df_res = pd.concat(dfs)
    return df_res
 
def apply_bh_correction(pvalues):
    pvalues = np.array(pvalues)
    m = len(pvalues)
    res = -np.ones_like(pvalues)
    res[-1] = min(1., pvalues[-1])
    for k in range(len(res) - 1)[::-1]:
        res[k] = min(1., res[k+1], pvalues[k] * float(m) / (k+1))
    assert (pvalues >= 0).all()
    assert (res >= 0).all()
    return res
    
df = task6_sample_bivariate_normal(n=100, m=1000).reset_index(drop=True)
df = df.sort_values("p")
df["p_bon"] = np.minimum(1. , df["p"] * float(len(df)))
#df["p_bon"] = np.minimum(1. , df["p"] * len(df))
df["p_bh"] = apply_bh_correction(df["p"])
#print pd.concat([df.head(10), df.tail(10)])

def compute_stats(df, alpha):
    df0 = df[df["rho"] == 0.0]
    dfA = df[df["rho"] == 0.2]
    print df0.shape
    print dfA.shape

    data = []
    for col in ["p", "p_bon", "p_bh"]:
        false_positives = (df0[col] < alpha).sum()
        data.append(("false_positives", col, false_positives))
        data.append(("true_negatives", col, len(df0) - false_positives))

        
    for col in ["p", "p_bon", "p_bh"]:
        false_negatives = (dfA[col] > alpha).sum()
        data.append(("false_negatives", col, false_negatives))
        data.append(("true_positives", col, len(dfA) - false_negatives))
        
    df_stats = pd.DataFrame(data, columns=["error", "adjustment", "count"])
    df_stats = df_stats.pivot_table(index="error", values="count", columns="adjustment")
    
    return df_stats
    
df = compute_stats(df, 0.05)
df

(1000, 5)
(1000, 5)


adjustment,p,p_bh,p_bon
error,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
false_negatives,378,649,967
false_positives,45,8,0
true_negatives,955,992,1000
true_positives,622,351,33


In [266]:
ITER = 1000 # number of experiments


N = 100

data = []
for i in range(ITER):
    x = task6_sample_bivariate_normal(rho=0.2, n=N)
    data.append(x)
    x = task6_sample_bivariate_normal(rho=0.0, n=N)
    data.append(x)

    
df = pd.DataFrame(data)
df.head()

Unnamed: 0,rho,slide8a,slide8b
0,0.2,0.812337,0.30725
1,0.0,0.989438,0.061589
2,0.2,0.877234,0.143168
3,0.0,1.022884,0.029467
4,0.2,0.674198,0.218885


In [301]:
rv1 = SampleCorrelationOfTwoIndependentNormals(rho=0.0, n=N)
rv2 = SampleCorrelationOfTwoIndependentNormals(rho=0.2, n=N)

x = rv1.rvs(size=ITER)
y = rv2.rvs(size=ITER)

In [302]:
hv.Distribution(x) * hv.Distribution(y)