# Bias, Variance & Noise

Error is equal to Bias squared plus Variance plus irreducible error.  
  
$\large Y = f(X) + \varepsilon $

$\large Err = E[(Y - \hat{f}(X))^2] = (E[\hat{f}(X)] - f(X))^2 + E[(\hat{f}(X) - E[\hat{f}(X)])^2] + \sigma_\varepsilon^2 $

 ![](biasvariance.png)

## Imports

In [1]:
import numpy as np
import pandas as pd
import random
from sklearn import linear_model
from sklearn.tree import DecisionTreeRegressor

## Calculate MSE

$\large \frac{1}{M} \sum_{i=1}^M (y_i-\hat{y_i})^2$

In [2]:
def calc_mse(yh, y_true=None):
    """Return the mean squared error of predictions against a single target.

    Args:
        yh: Predictions that support subtracting a scalar, e.g. a NumPy
            array or a pandas Series.
        y_true: Scalar target compared against every prediction. When
            omitted, the module-level ``y`` is used.

    Returns:
        The mean of ``(yh - target) ** 2`` as computed by ``np.mean``.
    """
    target = y if y_true is None else y_true
    # Broadcast the scalar target instead of materialising ``[y] * nSim``:
    # the result no longer requires len(yh) == nSim, and squaring makes the
    # sign of the difference irrelevant.
    return np.mean((yh - target) ** 2)

## Calculate Bias
$\large (\frac{1}{M} \sum_{i=1}^M \hat{y_i}) - y $

In [3]:
def calc_bias(yh, y_true=None):
    """Return the bias: mean prediction minus the target value.

    Args:
        yh: Predictions accepted by ``np.mean``.
        y_true: Scalar target to compare the mean prediction against. When
            omitted, the module-level ``y`` is used.

    Returns:
        ``np.mean(yh) - target`` (signed, not squared).
    """
    target = y if y_true is None else y_true
    return np.mean(yh) - target
 

## Calculate Variance
$\large \mu = \frac{1}{M} \sum_{i=1}^M \hat{y_i} $  
  
$\large \frac{1}{M} \sum_{i=1}^M (\hat{y_i} - \mu)^2 $

In [4]:
def calc_var(yh):
    """Return the population variance of the predictions.

    Computes the mean squared deviation of ``yh`` from its own mean
    (divisor M, not M - 1).
    """
    centered = yh - np.mean(yh)
    return np.mean(centered ** 2)

## Testing

In [5]:
def get_population():
    """Simulate a 5000-row population with five features and a noisy target.

    The features X1..X5 are drawn, in that order, from the global NumPy
    random generator via ``np.random.rand``. The target is a fixed
    non-linear combination of the features plus Gaussian noise with
    mean -5 and standard deviation 10.

    Returns:
        A DataFrame with columns ``target``, ``X1`` .. ``X5``.
    """
    size = 5000
    # One draw per column, in X1..X5 order, so the global RNG stream is
    # consumed in a fixed sequence.
    feats = {
        name: np.random.rand(size)
        for name in ('X1', 'X2', 'X3', 'X4', 'X5')
    }

    signal = (
        1.1
        + 2.2 * feats['X1']
        + 3.3 * (feats['X2'] ** 2)
        + 4.4 * (feats['X3'] * feats['X4'])
        + 5.5 * feats['X4']
        + 6.6 * feats['X5']
    )

    # The noise generator is re-created with the same seed on every call, so
    # the noise vector is identical across calls; only the features vary.
    noise = np.random.RandomState(1234).normal(-5, 10, size)

    return pd.DataFrame({'target': signal + noise, **feats})

In [14]:
def calculate(lm, dt):
    """Refit both models on ``nSim`` fresh populations and predict ``X_test``.

    Reads the module-level ``nSim`` and ``X_test`` and calls
    ``get_population()`` once per simulation. Both estimators are refitted
    in place on every iteration.

    Args:
        lm: Estimator exposing ``fit(X, Y)`` and ``predict(X)``.
        dt: Estimator exposing ``fit(X, Y)`` and ``predict(X)``.

    Returns:
        A DataFrame with one row per simulation: column 0 is the simulation
        index, column 1 the ``lm`` prediction and column 2 the ``dt``
        prediction, each stored as returned by ``predict``.
    """
    feature_cols = ['X1', 'X2', 'X3', 'X4', 'X5']
    # Build the one-row query frame once, using the training column names,
    # so the frame passed to predict() matches the one passed to fit() and
    # is not rebuilt twice per simulation.
    x_query = pd.DataFrame([X_test], columns=feature_cols)

    rows = []
    for i in range(nSim):
        population = get_population()
        features = population[feature_cols]
        target = population['target']
        lm.fit(features, target)
        dt.fit(features, target)
        rows.append((i, lm.predict(x_query), dt.predict(x_query)))
    return pd.DataFrame(rows)

def evaluate(pred):
    """Print MSE, squared bias and variance for the two prediction columns.

    Args:
        pred: Table indexable by ``1`` (linear-model predictions) and ``2``
            (decision-tree predictions), as returned by ``calculate``.

    Note:
        The value printed under the ``Bias`` heading is the squared bias.
    """
    # Compute every metric before printing anything, so a failure in any
    # metric leaves no partial table on screen.
    rows = []
    for label, column in (('LM\t', 1), ('DT\t', 2)):
        yh = pred[column]
        mse = calc_mse(yh)[0]
        bias_sq = (calc_bias(yh)[0]) ** 2
        var = calc_var(yh)[0]
        rows.append((label, mse, bias_sq, var))

    fmt = "{0:.1f}".format
    print('Mod\t Mse\t Bias\t Var')
    for label, mse, bias_sq, var in rows:
        print(label, fmt(mse), '\t', fmt(bias_sq), '\t', fmt(var))
  

In [15]:
# Seed the global NumPy generator so the query point and every simulated
# population are reproducible from run to run.
np.random.seed(22)
X_test = np.random.rand(5)  # single query point: one value per feature
# Reference target: the 'target' value in row 0 of one freshly drawn population.
y = get_population().loc[0, 'target']

nSim = 100  # number of simulated populations per configuration
tree_depths = (3, 4, 6, 8, 9, 10)
for dt_depth in tree_depths:
    print("\n Depth: ", dt_depth, '\n')
    # Fresh estimators per depth; the linear model is the fixed baseline.
    lm = linear_model.LinearRegression()
    dt = DecisionTreeRegressor(max_depth=dt_depth)

    result = calculate(lm, dt)
    evaluate(result)


 Depth:  3 

Mod	 Mse	 Bias	 Var
LM	 86.5 	 86.5 	 0.1
DT	 79.0 	 77.4 	 1.6

 Depth:  4 

Mod	 Mse	 Bias	 Var
LM	 86.4 	 86.3 	 0.1
DT	 81.2 	 78.3 	 2.9

 Depth:  6 

Mod	 Mse	 Bias	 Var
LM	 86.1 	 86.0 	 0.1
DT	 90.3 	 86.8 	 3.5

 Depth:  8 

Mod	 Mse	 Bias	 Var
LM	 86.3 	 86.2 	 0.1
DT	 101.2 	 89.6 	 11.6

 Depth:  9 

Mod	 Mse	 Bias	 Var
LM	 85.9 	 85.9 	 0.1
DT	 108.3 	 92.2 	 16.1

 Depth:  10 

Mod	 Mse	 Bias	 Var
LM	 86.7 	 86.7 	 0.1
DT	 101.1 	 84.6 	 16.5


## Credits & Links

https://towardsdatascience.com/end-your-bias-about-bias-and-variance-67b16f0eb1e6
https://towardsdatascience.com/understanding-the-bias-variance-tradeoff-165e6942b229
https://www3.nd.edu/~rwilliam/stats1/x12.pdf