## Exercício Proposto na disciplina de ML

### 1. Porposition

We must:

 - Simulate the difference between Error in and Error out from a (pre-established) universe and a well-defined and finite set of hypotheses.
 - Check the validity of the theorem: $P[|E_{in}(g)-E_{out}(g)|> \epsilon]\leq 2 M e^{-2\epsilon ^2N}$, that assures us that $E_{in}(g) \approx E_{out}(g)$ so we can use $E_{in}(g)$ as a proxy for $E_{out}(g)$.

In [None]:
def draw_scatter_plot (X, y):
    import numpy as np
    from matplotlib import pyplot as plt
    X=np.asarray(X)
    y=np.asarray(y)
    plt.figure(figsize=(10,10))
    plt.axis('equal')  #<-- set the axes to the same scale
    plt.scatter(X[y==-1, 0], X[y==-1,1], c='red') 
    plt.scatter(X[y==1, 0], X[y==1,1], c='blue') 
    plt.title ("Data Points")
    plt.xlabel (r"$x_1$")
    plt.ylabel (r"$x_2$")
    plt.grid(True)
    plt.legend(['Negative Class', 'Positive Class'],loc='upper left')

    ax = plt.gca()
    return ax

In [None]:
def draw_vector (origin,w, ax):
    # u = origin
    # w = vector
    u = np.array(origin)
    w = np.array(w)
    v = w+u
    ax.annotate('',xy=v,xytext=u,arrowprops={"width":0.8,"headwidth":5,'headlength':7, 'color':'black'})
    ax.plot (v[0], v[1], "ow")
    
    return None

In [None]:
def draw_linear (w, b, ax, style=None):
    xlim = ax.get_xlim()
    ylim = ax.get_ylim()

    if style == None:
        style = "-k"

    w = np.array(w)
    x_axis = np.linspace(xlim[0], xlim[1],100)
    y_axis = (-b - w[0]*x_axis) / w[1]
    y_lst = []
    x_lst = []
    
    for i,y in enumerate(y_axis):
        if ylim[0] <= y <= ylim[1]:
            y_lst.append(y)
            x_lst.append(x_axis[i])
        if len (x_lst) > 0:
            ax.plot(x_lst,y_lst, style)
        
        #return x_lst[-1], y_lst[-1]
    return None, None

### 2. Standardization 

The preprocessing of data in ML, usually involves standardizing the data. That is, perform a linear transformation so that the center (mean value) is zero and the standard deviation is 1.

The Universe must be standardized as well as the sample (resulting from sampling() function that takes the universe as paramater), even if this has already been done with the Universe, since the mean and standard deviation of the sample may not be the same values of the Universe.

In [None]:
def standardization(X):
    '''
        Given a np.array X returns X_std: mean = 0, 
        std = 1 (not inplace - pure function)
    '''
    import numpy as np
    X = np.asarray(X)
    
    #X_std = np.empty(X.shape)
    #X_std[:,0] = (X[:,0]-np.mean(X[:,0]))/np.std(X[:,0])
    #X_std[:,1] = (X[:,1]-np.mean(X[:,1]))/np.std(X[:,1])
    
    return  (X - X.mean(axis=0))/ X.std(axis=0)

    return X_std

In [None]:
#Testing Standarization
import numpy as np

X = np.asarray([[0,0],[1,0],
 [3,1],[3,2],[4,4]])
X1=X[:,0]
X2=X[:,1]
X_std = standardization(X)

In [None]:
from matplotlib import pyplot as plt

plt.figure(figsize=(10,10))
plt.axis('equal')  #<-- set the axes to the same scale
plt.scatter(X[:,0],X[:,1])
plt.title ("Data Points")
plt.xlabel (r"$x_1$")
plt.ylabel (r"$x_2$")
plt.grid(True)


In [None]:
plt.figure(figsize=(10,10))
plt.axis('equal')  #<-- set the axes to the same scale
plt.scatter(X_std[:,0],X_std[:,1])
plt.title ("Data Points")
plt.xlabel (r"$x_1$")
plt.ylabel (r"$x_2$")
plt.grid(True)


### 3. Error calculation (calc_error)

This funciton will be useful to calculate $E_{in}$ and also $E_{out}$.
The error must be normalized by the number of elements (be it the Universe or the sample sizes).

In [None]:
def calc_error(Y,Y_hat):
    '''
        Given Y (labels) and Y (predicts) returns normalized error

        Inputs:
        Y: np.array or list
        Y_hat: idem
    '''
    import numpy as np

    # Type fitting    
    Y=np.asarray(Y)
    Y_hat=np.asarray(Y_hat)
        
    #error = np.abs(Y_hat[np.abs(Y-Y_hat)>0])                            # error<-masked array of Y_hat
    #norm_error = (np.sum(np.abs(np.abs(Y-Y_hat)>0)))/len(Y)             # normalized error
 
    return (np.sum(np.abs(np.abs(Y-Y_hat)>0)))/len(Y) 

In [None]:
a = np.zeros(5)
b = np.ones(5)
Y = [1,1,1,-1,-1] 
Y_hat = [-1,1,-1,1,-1]

In [None]:
calc_error(a,a)

In [None]:
calc_error(Y,Y_hat)

### 4. Sampling

Sampling will take a N-size sample from the Universe, defined by de X array and its labels stored in Y. 

In [None]:
def sampling(N,X,Y,random_state=42):
    '''
        Given the arguments:
          - N: #of samples to be taken from (X,Y) : int,
        
          (X,Y): Universe
          - X: points in R² : np.array and
          - Y: labels for X: np.array {1,0}
        
        Returns:
          - sample_N: sample of N elements of (X,Y) : np.array 
    '''
    from numpy.random import RandomState
    
    # Type fitting
    X = np.asarray(X)
    
    j,k = X.shape
    
    if not j==len(Y):
        raise TypeError("X and Y must have the same number of lines") 
        
    # Defining the N-size random index of (X,Y)
    #rand_index = rs.randint(0, high=len(Y), size=N, dtype=int)
    rand_index = np.random.randint(0, high=len(Y), size=N, dtype='l')
    
     # Type fitting
    Y = np.asarray(Y)
    
    # Sample array with N lines and k+1 columns
    # X_sample = np.empty((N,k))
    # Y_sample = np.empty((N,1))
    
    # for i in range(0,N,1):
    #    X_sample[i,:] = X[rand_index[i],:]
    #    Y_sample[i,0] = Y[rand_index[i]]
    
    # Standarization of the sample
    # X_sample = standardization(X_sample)
  
    return (X[rand_index,:], Y[rand_index])

In [None]:
# Testing smapling()

X = np.asarray([[0,0],[1,0],
[3,1],[3,2],[4,4]])

X_sample,y_sample=sampling(3,X,Y,42)


### 5. Diagonals

Function of Diagonals: 45º straight lines (angular coefficient +1 and -1, varying 'bias' forwards and backwards - 'bias' step (b passed as a parameter) defined by the range $[-M/4,M/4]$)

We know that:
- $X_0 * w[0] + X1 * w[1] +bias = 0$ and
- $w = [1,1]$, represents the case of the line with negative slope and
- $w = [1,-1]$, the case of a line with a positive slope

The following order must be used:
bias from $-(M/4)*b$ to $(M/4)*b$ (exclusive).

The line with negative slope (coef == -1), vector $w = [1,1]$ (perpendicular to the straight line), and bias is calculated first and then the line with positive slope,vector $w = [1,-1]$, and the same bias.

In [147]:
def diagonais(X,M,b): # valor:2.5
  '''
    Função Diagonais: retas 45º (coeficiente angular +1 e -1  variando bias 
    um tanto para frente e um tanto para trás - passo do bias (b passado por parâmetro) 
    definido pelo intervalo [-M//4,M//4)

    Sabendo que: 
      x0 * w[0] + x1 * w[1] + bias = 0 e que
      w = [1,1] no caso da reta com inclinação negativa e
      w = [1,-1] no caso da reta com inclinação positiva

    A seguinte ordem deve ser utilizada:
      bias partindo de -(M//4) * b até (M//4) * b (exclusive)
      A reta com inclinação negativa (coef == -1), vetor w = [1,1] (perpendicular a reta), e bias é calculda primeiro 
      e a na sequência reta com inclinação positiva, vetor w = [1,-1], e o mesmo bias.
      Conforme mostrado nos plots!

	parâmetros:
		X: np.array
		M: número de hipóteses do universo (número inteiro) - espera-se um múltiplo de 4
	Retorna 
		predict: np.array de np.array de y_hat, um y_hat para cada hipótese (reta), deve ter tamanho M
   '''
  import numpy as np
  X = np.asarray(X)
  
  j,k = X.shape

  # Initial bias value
  bias = -(M/4)*b

  predicts = np.empty((M,j), dtype = 'l')
  i=0       # predicts index
    
    
  while bias<(M/4)*b:
    
      # Negative slope
      w = [1,1]
      predicts[i,:] = np.sign(X[:,0]*w[0] + X[:,1]*w[1] + bias)
      i+=1
      
      # Positive slope
      w = [1,-1]
      predicts[i,:] = np.sign(X[:,0]*w[0] + X[:,1]*w[1] + bias)
      i+=1
      
      bias = bias+1
 
  predicts[predicts==0]=1
  
  return predicts

In [None]:
# Testing 1 diagonals

M=4
b=1

predicts=diagonais(X_std,4,1)


In [None]:
b = -(M/4)*b

for i in range(0,M):
    #i=1
    print("bias:\n",b)
    ax = draw_scatter_plot (X_std, predicts[i,:])

    
    if (i%2==0):
        w = [1, 1]
        draw_linear (w, b, ax)
        
    else:
        w = [1, -1]        
        draw_linear (w, b, ax)
        b+=1
    

    # Pick up a point on the decision boundary
    u = np.array([0.,0.])
    u[0] = 0
    u[1] = 0
    #u[1] = (-b - w[0]*u[0]) / w[1]
    draw_vector (u, w, ax) # vector w at a point in the hyperplane
    
    

### 6. Euclidean Distance (euclidean_dist)

In [None]:
def euclidean_dist(p,q): # valor:0.5
  '''
    Given two points (np.array) returns the euclidean distance between them
  '''
  import numpy as np
  
  # Type fitting
  p=np.asarray(p)
  q=np.asarray(q)
   
 # dist = (np.sum(np.power((p-q),2)))**(1/2)
  dist = np.power(np.sum(np.power((p-q),2)),1/2)

  return dist

In [None]:
euclidean_dist
P = [6.0, 3.0]
Q = [3.0, 7.0]
print(euclidean_dist(P, Q))

### 7. Egocentric 

In [None]:
def egocentric(X,C,r): # valor:2.0
  '''
    Given a dataset X (np.array), C (np.array) are the points that will be used as centers, and a radius r: 
      For each point in C, Creates a circumference c, each center works as an hypothesis, and classify points inside c as +1
      otherwise -1.
      Returns all predicts (an list for each point (used as center) )
  '''
  # Type fitting
  X = np.asarray(X)
  C = np.asarray(C)
  r = np.float(np.abs(r))
  
  j,k = X.shape
  cj,ck = C.shape
  
  # Number of rows = # of hypothesis = # of points in C array
  # Number of columns in 'predicts' = # labels for 'X' (each point has 1 label {+1,-1})
   
  predicts = np.empty((cj,j), dtype = 'l')
  
    
  for i in np.arange(0,cj,1):
      for p in  np.arange(0,j,1):
          dist = euclidean_dist(C[i],X[p])
          predicts[i,p] = np.abs(dist<r)
    
    
  
  predicts = np.asarray(predicts)
  predicts[predicts==0]=-1    
  
  return predicts

In [None]:
r=1.1478385915666818

In [None]:
X = np.asarray([[0,0],[1,0],[3,1],[3,2],[4,4]])
Y = np.asarray([-1,-1,1,1,1])
N = 4

X_std = standardization(X)
X_sampled_std,Y_sampled = sampling(N,X_std,Y,random_state=42)

In [None]:
egocentric(X_std,X_sampled_std,1)

### 8. Frequency Calculation (calc_freq)

In [148]:
def calc_freq(N,H_set,eps,X,Y,M=100,b=0.05,r=1,random_state = 42): # valor:3.0
  '''
  Given N # of samples(integer), H_set name of the hypotheses set 
  (string <diagonais> or <egocentric> error will be returned otherwise)
  eps: epsilon (abs(error_in - error_out) desired), X from the Universe data (np.array - complete dataset),
  Y is all label from theentire Universe(np.array), M # of hypotheses used if <diagonais> is chosen, 
  B: is the bias used when <diagonais> is chosen, r radius of the circumference if <egocentric> is chosen, 
  random_state to set the seed

  Returns:
    bound: theoretical bound for Pr[abs(error_in - error_out) > eps]
    probs: approximated probability of Pr[abs(error_in - error_out) <= eps] by the frequency 
      (# of occurancies (abs(error_in - error_out) <= eps) / # of hipotheses)
  '''
  import re 
  import numpy as np

  # Type fitting    
  Y = np.asarray(Y)
  X = np.asarray(X)
    
  # Sampling
  X_std = standardization(X)
  X_sampled,Y_sampled = sampling(N,X,Y,random_state)
  X_sampled_std = standardization(X_sampled)

  Y_sampled[Y_sampled==0]=-1
  
  # Generating the hypothesis set
  if re.match('diagonais', H_set, flags=re.IGNORECASE):
      Y_hat = diagonais(X_sampled_std,M,b)
      H_set_size = M
      Y_test = diagonais(X_std,M,b)
        
  elif re.match('egocentric', H_set, flags=re.IGNORECASE):
      Y_hat = egocentric(X_sampled_std,X_sampled_std,r)
      H_set_size = N
      Y_test = egocentric(X_std,X_sampled_std,r)
  
    
  # Error arrays 
  error_in = np.zeros((1,H_set_size), dtype=np.float64)
  error_out = np.zeros((1,H_set_size), dtype=np.float64)
    
  # Sacanning the entire hypothesis set to calculate the errors
  for i in np.arange(0,H_set_size,1): 
      error_in[:,i] = calc_error(Y_sampled, Y_hat[i,:])
      error_out[:,i] = calc_error(Y, Y_test[i,:])
    
    
  # Theoretical bound for P[|E_in-E_out|>eps]
  bound = 2*H_set_size*np.exp(-2*(eps**2)*N)
    
  # Array of diferences between E_in and E_out for each hypothesis
  difference = np.abs(error_out-error_in)
  
  print(np.abs(difference[difference<=eps]))
  
  difference[difference<=eps] = np.ma.masked
    
  #  Pr[abs(error_in - error_out) <= eps]
  probs = np.sum(np.ma.count(difference))/H_set_size
 
    
  return (bound,probs)


In [149]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification

In [150]:
seed = 42
X,Y = make_classification(n_samples = 3000, 
                          n_classes = 2,
                          n_features = 2, 
                          n_redundant = 0,
                          n_informative = 2, 
                          n_clusters_per_class = 1,
                          n_repeated = 0, hypercube = True,
                          weights = [0.3,],
                          random_state = seed)
eps = 0.1 
N = 700
M = 500
b = 1

Y[Y==0]=-1

In [151]:
bound,probs = calc_freq(N,"diagonais",eps,X,Y,M,b)
print("(bound,probs):",(bound,probs))

500
(bound,probs): (0.0008315287191035649, 1.0)


In [152]:
bound,probs = calc_freq(N,"egocentric",eps,X,Y,b)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  r = np.float(np.abs(r))


700


In [122]:
print("(bound,probs):",(bound,probs))

(bound,probs): (0.0011641402067449908, 1.0)


In [106]:
X_sampled,Y_sampled = sampling(N,X,Y)
X_std = standardization(X_sampled)
predicts = diagonais(X_std,M,b)

In [107]:
print(X_std.shape)
print(Y_std.shape)
print(predicts.shape)

(700, 2)


NameError: name 'Y_std' is not defined

In [None]:
for i in range(0,M):
    #i=1
    print("bias:\n",b)
    ax = draw_scatter_plot (X_std, predicts[i,:])

    
    if (i%2==0):
        w = [1, 1]
        draw_linear (w, b, ax)
        
    else:
        w = [1, -1]        
        draw_linear (w, b, ax)
        b+=1
    

    # Pick up a point on the decision boundary
    u = np.array([0.,0.])
    u[0] = 0
    u[1] = 0
    #u[1] = (-b - w[0]*u[0]) / w[1]
    draw_vector (u, w, ax) # vector w at a point in the hyperplane
    

In [None]:
ax = draw_scatter_plot (X_std,Y_sampled)