In [2]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import pattrex.plotting_mpl as plt_rex
import pattrex.preprocessing as pre_rex
import pattrex.fitting as fit_rex

from pattrex.demo_helper import read_whdata

In [3]:
# Read data, remove and save outliers
# (ws, hs, gs are presumably weights, heights and genders -- confirm against
# read_whdata).
ws, hs, gs = read_whdata()
# Stack as a 2 x N float matrix: row 0 holds hs, row 1 holds ws.
# The builtin `float` replaces the `np.float` alias, which was deprecated in
# NumPy 1.20 and removed in NumPy 1.24 (AttributeError on current releases).
# Both resolve to the same float64 dtype.
HW = np.vstack((hs, ws)).astype(float)
# Presumably keeps only the samples whose entries are all positive; neg_idx is
# used below as the collection of indices of the discarded samples.
HW_new, neg_idx = pre_rex.only_all_positive(HW, True, return_neg_idx=True)
# Heights of the discarded samples, kept so their weights can be estimated.
H_unknown = np.array([h for i, h in enumerate(hs) if i in neg_idx])



In [11]:
# Fit an independent normal distribution to each row of the cleaned data
# (row 0 is treated as height, row 1 as weight).
h_new, w_new = HW_new[0, :], HW_new[1, :]
h_mean, h_std, _, _ = fit_rex.fit_normal_distribution(h_new)
w_mean, w_std, _, _ = fit_rex.fit_normal_distribution(w_new)

# Report mean, standard deviation and variance (std squared) for each fit.
h_summary = (h_mean, h_std, np.power(h_std, 2))
w_summary = (w_mean, w_std, np.power(w_std, 2))
print("height: {}, {}, {}".format(*h_summary))
print("weight: {}, {}, {}".format(*w_summary))

height: 173.57142857142858, 7.241227392483947, 52.435374149659864
weight: 71.52380952380952, 14.45722441786392, 209.01133786848075


In [43]:
# Bi-variate Gaussian: per-row moments next to the two flavours of np.cov.
print(HW_new.mean(axis=1))
print(HW_new.var(axis=1))
# Without ddof, np.cov uses the N - 1 normalisation ...
print(np.cov(HW_new))
# ... while ddof=0 divides by N, so its diagonal matches the variances above.
np.cov(HW_new, ddof=0)

[ 173.57142857   71.52380952]
[  52.43537415  209.01133787]
[[  55.05714286   89.08571429]
 [  89.08571429  219.46190476]]


array([[  52.43537415,   84.84353741],
       [  84.84353741,  209.01133787]])

### Effect of `ddof`

In [19]:
# np.cov divides by N - 1 when ddof is omitted; spelled out here for contrast
# with the ddof=0 calls elsewhere in this section.
np.cov(h_new, ddof=1)

array(55.05714285714286)

In [20]:
# Small hand-checkable sample for comparing the variance estimators.
x = [-2.1, -1, 4.3]
# ddof=0 is np.var's default (divide by N); written out to make that visible.
np.var(x, ddof=0)

7.8066666666666658

In [24]:
# With ddof=0, np.cov normalises by N and reproduces the np.var(x) value.
np.cov(x, ddof=0)

array(7.806666666666666)

In [22]:
# Sample mean of x; the manual variance computation further down subtracts it.
np.mean(x)

0.39999999999999991

In [23]:
# Manual population variance: the squared deviations from the sample mean,
# each weighted by 1/N, summed up. The weight is derived from len(x) instead of
# a hard-coded 1/3 so the check stays correct if the sample x is edited.
np.sum(np.power((x - np.mean(x)), 2) * (1 / len(x)))

7.8066666666666658

***

In [54]:
# Let us proceed with the default behaviour of np.var (ddof=0), i.e. the
# divide-by-N estimates, for the parameters of the bivariate model.
bi_mean = np.mean(HW_new, axis=1)
bi_cov = np.cov(HW_new, ddof=0)

# Standard deviations come from the variances on the diagonal only. Taking
# np.sqrt of the whole matrix would also hit the off-diagonal covariance, which
# produces NaN and a RuntimeWarning whenever that covariance is negative.
bi_std = np.sqrt(np.diag(bi_cov))
# Correlation coefficient: covariance over the product of both std deviations.
bi_rho = bi_cov[0, 1] / np.prod(bi_std)

print(bi_std)
bi_rho

[  7.24122739  14.45722442]


0.81044147676186351

In [34]:
# Cross-check: the off-diagonal entry of np.corrcoef should agree with bi_rho.
np.corrcoef(HW_new)

array([[ 1.        ,  0.81044148],
       [ 0.81044148,  1.        ]])

### Predict the weights of the outliers from their heights

In [56]:
# ddof = 0
# Conditional mean of weight given height under the bivariate Gaussian:
#   E[w | h] = mean_w + rho * (std_w / std_h) * (h - mean_h)
slope = bi_rho * (bi_std[1] / bi_std[0])
height_offset = H_unknown - bi_mean[0]
W_unknown = bi_mean[1] + slope * height_offset
W_unknown

array([ 62.50890849,  68.98114513,  60.89084933])

In [53]:
# NOTE(review): this repeats the expression of the preceding cell verbatim; it
# was presumably run against a different ddof setting -- confirm or remove.
# The result does not depend on ddof: rho * (std_w / std_h) equals
# cov(h, w) / var(h), and the normalisation factor cancels in that ratio.
W_unknown = bi_mean[1] + bi_rho * (bi_std[1]/bi_std[0]) * (H_unknown - bi_mean[0])
W_unknown

array([ 62.50890849,  68.98114513,  60.89084933])