# Extrapolation Detection (4.3)
Ant Ngo

In [9]:
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import os

In [10]:
exec(open("../../header.py").read())

**Model distribution:**

$Y = 10X^{2}_{1} + \mathbb{1}_{X_{2} \geq 0} + \varepsilon$,

$\varepsilon \overset{\text{iid}}{\sim} \mathcal{N}(0, .1^{2}), \begin{bmatrix}
X_1\\
X_2
\end{bmatrix}
\sim \begin{cases}
U(-1,0), U(-1,0) &\text{w.p. $\frac{1}{3}$}\\
U(0,1), U(-1,0) &\text{w.p. $\frac{1}{3}$}\\
U(-1,0), U(0,1) &\text{w.p. $\frac{1}{3}$}\\
U(0,1), U(0,1) &\text{w.p. $0$}\\
\end{cases}$

In [11]:
def extrap_ex_sim(N, seed = None):
    '''
    #' Generates simulated data in 
    #' Section 4.3 Extrapolation Detection
    #' @param N Number of samples to generate.
    #' @param seednum Random seed for reproducibility.
    #' @return Simulated data (dataframe) with N observations.
    #' @examples
    #' extrap_ex_sim(1000,1)
    #' extrap_ex_sim(3000,420)
    '''
    # force seed if not defined
    if seed is None:
        np.random.seed(seed)
    
    def simulate():
        '''
        #' Helper to simulate one observation
        #' @return Single randomly generated observation
        #' based on 4.3 distribution.
        '''

        # generate random probability (0-1)
        s = np.random.uniform(low = 0, high = 1)

        # get distribution for x1, x2
        if s < (1/3):
            x1 = np.random.uniform(low = -1, high = 0, size = 1)
            x2 = np.random.uniform(low = -1, high = 0, size = 1)
        else:
            if s < (2/3):
                x1 = np.random.uniform(low = -1, high = 0, size = 1)
                x2 = np.random.uniform(low = 0, high = 1, size = 1)
            else:
                x1 = np.random.uniform(low = 0, high = 1, size = 1)
                x2 = np.random.uniform(low = -1, high = 0, size = 1)

        # generate single observation
        single_obs = np.concatenate((x1,x2), axis = 0)

        return single_obs
 
    
    # set distribution params
    b1, b2, sd = 10, 1, 0.1
    
    # matrix initialization
    X = np.full([N, 2], None)
    
    # simulate feature matrix
    for i in range(0,N):
        X[i,] = simulate()
        
    # create epsilon (noise)
    noise = sp.stats.norm.rvs(size=N)
    
    # generate Y
    y = b1 * (X[:,0]**2) + b2 *(X[:,1]>0) + noise
    y = y.reshape(-1,1)
    
    # concat feature matrix and target vector
    df = np.concatenate([X,y], axis = 1)
    
    # convert to df
    df = pd.DataFrame(df, columns = ['x1','x2','y'])
    
    return df

In [12]:
df = extrap_ex_sim(1000)
df.head()

Unnamed: 0,x1,x2,y
0,0.610726,-0.54074,5.24315
1,-0.110693,-0.830327,-0.864087
2,0.655724,-0.0714597,5.31649
3,-0.0854465,-0.50566,0.0166089
4,-0.466717,-0.764031,3.17603


In [13]:
X = df[['x1','x2']]
b1, b2, sd = 10, 1, 0.1
noise = sp.stats.norm.rvs(size=1000)
y = b1 * (df.x1**2) + b2 * (df.x2 > 0) + noise

# Save

In [14]:
folder_name = 'extrapolation_detection'

In [15]:
try:
    os.mkdir(processed_root(folder_name))
except FileExistsError:
    print("Folder already exists")

In [16]:
X.to_csv(processed_root(f"{folder_name}/X.csv"), index = False)
y.to_csv(processed_root(f"{folder_name}/y.csv"), index = False)