# Tarea 1: Regresión
- Martínez Ostoa Néstor Iván
- Aprendizaje de Máquina

1) Descargar el conjunto de datos de: https://archive.ics.uci.edu/ml/datasets/Airfoil+Self-Noise

2) Desarrollar en python regresión polinomial con regularización

    2.1) Con solución analítica

    2.2) Con solución a través de gradiente descendente
        
3) Seleccionar de forma aleatoria el 20% de los datos para el conjunto de validación
   
4) Con el 80% restante de los datos. Utilizar 5-fold crossvalidation en la regresión con  solución analítica y con gradiente descendente para encontrar un buen valor de delta (coeficiente de la regularización)


5) Probar el modelo que resulte mejor de acuerdo a la metodología minmax en el conjunto de validación y reportar el error encontrado

In [1]:
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler

In [2]:
def plot_y(x, y, title, mode='lines'):
    fig = go.Figure()
    fig.add_trace(go.Scatter(x=x, y=y, mode=mode))
    fig.update_layout(title=dict(text=title))
    fig.show()

def plot_two_ys(y, y_predict, title):
    fig = go.Figure()
    fig.add_trace(go.Scatter(x=np.arange(y.shape[0]),y=y,mode='lines',marker_color='red',name='y'))
    fig.add_trace(go.Scatter(x=np.arange(y.shape[0]),y=y_predict,mode='lines',marker_color='blue',name='y_pred'))
    fig.update_layout(title=dict(text=title))
    fig.show()

In [3]:
df = pd.read_csv('airfoil_self_noise.dat', sep='\t', names=['Frequency', 'Angle of attack', 'Chord length', 'Free-stream velocity', 'Suction side displacement thickness', 'Scaled sound pressure level'])
print(df.shape)
df.head()

(1503, 6)


Unnamed: 0,Frequency,Angle of attack,Chord length,Free-stream velocity,Suction side displacement thickness,Scaled sound pressure level
0,800,0.0,0.3048,71.3,0.002663,126.201
1,1000,0.0,0.3048,71.3,0.002663,125.201
2,1250,0.0,0.3048,71.3,0.002663,125.951
3,1600,0.0,0.3048,71.3,0.002663,127.591
4,2000,0.0,0.3048,71.3,0.002663,127.461


In [4]:
df.describe()

Unnamed: 0,Frequency,Angle of attack,Chord length,Free-stream velocity,Suction side displacement thickness,Scaled sound pressure level
count,1503.0,1503.0,1503.0,1503.0,1503.0,1503.0
mean,2886.380572,6.782302,0.136548,50.860745,0.01114,124.835943
std,3152.573137,5.918128,0.093541,15.572784,0.01315,6.898657
min,200.0,0.0,0.0254,31.7,0.000401,103.38
25%,800.0,2.0,0.0508,39.6,0.002535,120.191
50%,1600.0,5.4,0.1016,39.6,0.004957,125.721
75%,4000.0,9.9,0.2286,71.3,0.015576,129.9955
max,20000.0,22.2,0.3048,71.3,0.058411,140.987


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1503 entries, 0 to 1502
Data columns (total 6 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   Frequency                            1503 non-null   int64  
 1   Angle of attack                      1503 non-null   float64
 2   Chord length                         1503 non-null   float64
 3   Free-stream velocity                 1503 non-null   float64
 4   Suction side displacement thickness  1503 non-null   float64
 5   Scaled sound pressure level          1503 non-null   float64
dtypes: float64(5), int64(1)
memory usage: 70.6 KB


In [6]:
x = np.arange(df.shape[0])
y = df.iloc[:, -1]
plot_y(x, y, title='Scaled sound pressure level')

## 2. Regresión polinomial con regularización

Función costo:

$J(\theta) = (y-X\theta)^T(y-X\theta) + \delta^2\theta^T\theta$


Gradiente: 

$\nabla J(\theta) = -2X^Ty + 2\theta(X^TX + \delta^2)$

In [7]:
def cost(X, y, theta, delta):
    return ((y-X@theta).T@(y-X@theta) + theta**2*theta.T@theta)/X.shape[0]

def gradient(X, y, theta, delta):
    X = X.values
    y = y.values
    return -2*X.T@(y - (X@theta)) + 2*(delta**2)*theta

def gradient_descent(X, y, theta, delta, alpha, min_e=0.001, n_iter=1000):
    theta_k = theta
    for i in range(n_iter):
        step_size = alpha*gradient(X, y, theta_k, delta)
        theta_k = theta_k - step_size
    return theta_k

def analytical_theta(X, y, delta):
    return np.linalg.inv(X.T@X + delta**2)@X.T@y

def analytical_report(X, y, delta):
    theta_analytical = analytical_theta(X, y, delta)
    y_pred = X.values@theta_analytical
    title = f'R2 score with analytical theta: {np.round(r2_score(y, y_pred),6)}'
    plot_two_ys(y, y_pred, title)

## 3. Train test split

En este punto dividimos en ```train``` y ```test``` y aunado a esto exploramos con las siguientes opciones:
- Estandarización de los datos
- Regresión polinomial

In [8]:
from sklearn.model_selection import train_test_split

In [9]:
def add_intercept_ones(df):
    df.insert(0, 'Intercept', np.repeat(1, df.shape[0]))
    return df

def generate_polynomial_data(df, degrees, cols):
    """
    Adds the 'cols' columns elevated to some degree (X^2, X^3, etc) to the original
    dataframe

    cols: list
        indicates the colum indices to include in the final dataset
    """
    original_cols = df.columns
    new_cols = [f'X{idx}' for idx in range(df.shape[1])]
    df.rename(columns=dict(zip(original_cols, new_cols)), inplace=True)
    for d in degrees:
        deg_data = df.iloc[:, cols].values**d
        temp_df = pd.DataFrame(data=deg_data, columns=[f'X{idx}^{d}' for idx in range(len(cols))])
        df = pd.concat([df, temp_df], axis=1)
    return df 

def standarize_data(data):
    scaler = StandardScaler()
    return scaler.fit_transform(data)


### Obtención del grado polinomial

In [10]:
degrees = [2]
print("--------GRADO POLINOMIAL--------")
print("   Grado del polinomio:", max(degrees))
print("--------------------------------")

--------GRADO POLINOMIAL--------
   Grado del polinomio: 2
--------------------------------


In [11]:
def generate_data(df):
    X = df.iloc[:, :-1]
    X = generate_polynomial_data(X, degrees=degrees, cols=np.arange(0, X.shape[1]))
    X = standarize_data(X)
    X = add_intercept_ones(pd.DataFrame(X))
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
    return (X_train, X_test, y_train, y_test)

In [12]:
X_train, X_test, y_train, y_test = generate_data(df)
print(X.shape)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

NameError: name 'X' is not defined

## 4. $k$-cross validation + minmax

In [None]:
from sklearn.model_selection import KFold

In [None]:
def powermean(y, y_pred, p=2):
    n = y.shape[0]
    return ((1/n)*(np.sum((y-y_pred)**p)))**(1/p)

def custom_min(errors, deltas):
    min_error = 1e5
    min_delta = None
    for delta, error in zip(deltas, errors):
        if error < min_error:
            min_error = error
            min_delta = delta
    return min_error, min_delta

### Deltas

In [None]:
deltas = np.arange(0, 1, 1/5)
print("---------DELTAS A EVALUAR---------")
print("       Deltas a evaluar\n    ", deltas)
print("-----------------------------------")

---------DELTAS A EVALUAR---------
       Deltas a evaluar
     [0.  0.2 0.4 0.6 0.8]
-----------------------------------


### 4.1 Método analítico

In [None]:
X_train, X_test, y_train, y_test = generate_data(df)

kf = KFold(n_splits=5)
max_errors = []
i = 0
for train_index, test_index in kf.split(X_train):
    # K-fold splitting
    X_train_k, X_test_k = X_train.iloc[train_index, :], X_train.iloc[test_index, :]
    y_train_k, y_test_k = y_train.iloc[train_index], y_train.iloc[test_index]

    # Theta train and theta test
    theta_train = analytical_theta(X_train_k, y_train_k, deltas[i])
    theta_test = analytical_theta(X_test_k, y_test_k, deltas[i])

    # Train and test errors
    train_error = powermean(y_train_k, X_train_k.values@theta_train)
    test_error = powermean(y_test_k, X_test_k.values@theta_test)

    # Max error between train and test error
    max_errors.append(max(train_error, test_error))
    i += 1

# Min error
min_error, best_analytical_delta = custom_min(max_errors, deltas)
print(f'Min error: {min_error}\nBest delta for analytical method: {best_analytical_delta}')

Min error: 4.692576230163578
Best delta for analytical method: 0.4


In [None]:
theta_analytical = analytical_theta(X_test, y_test, delta=best_analytical_delta)
y_pred = X_test.values@theta_analytical
analytical_error = powermean(y_test, y_pred)
print("---------------MÉTODO ANALÍTICO---------------")
print('Error: ', analytical_error)
print("Grado del polinomio: ", max(degrees))
print("Delta: ", best_analytical_delta)
print("----------------------------------------------")

---------------MÉTODO ANALÍTICO---------------
Error:  4.242704241840365
Grado del polinomio:  2
Delta:  0.4
----------------------------------------------


### 4.2 Gradiente Descendente

In [None]:
X_train, X_test, y_train, y_test = generate_data(df)

kf = KFold(n_splits=5)
alpha = 0.0002
max_errors = []
i = 0
for train_index, test_index in kf.split(X_train):
    # K-fold splitting
    X_train_k, X_test_k = X_train.iloc[train_index, :], X_train.iloc[test_index, :]
    y_train_k, y_test_k = y_train.iloc[train_index], y_train.iloc[test_index]

    # Theta train and theta test
    # def gradient_descent(X, y, theta, delta, alpha, min_e=0.001, n_iter=1000):
    initial_theta = np.random.randn(X_train_k.shape[1])
    theta_train = gradient_descent(X_train_k, y_train_k, initial_theta, deltas[i], alpha, n_iter=100)
    theta_test = gradient_descent(X_test_k, y_test_k, initial_theta, deltas[i], alpha, n_iter=100)

    # Train and test errors
    train_error = powermean(y_train_k, X_train_k.values@theta_train)
    test_error = powermean(y_test_k, X_test_k.values@theta_test)

    # Max error between train and test error
    max_errors.append(max(train_error, test_error))
    i += 1

# Min error
min_error, best_gs_delta = custom_min(max_errors, deltas)
print(f'Min error: {min_error}\nBest delta for gradient descent: {best_gs_delta}')

Min error: 4.768945355404381
Best delta for gradient descent: 0.4


In [None]:
initial_theta = np.random.randn(X_train_k.shape[1])
theta_gs = gradient_descent(X_test, y_test, initial_theta, best_gs_delta, alpha, n_iter=100)
y_pred = X_test.values@theta_gs
analytical_error = powermean(y_test, y_pred)
print("---------------GRADIENTE DESCENDENTE---------------")
print('Error: ', analytical_error)
print("Grado del polinomio: ", max(degrees))
print("Delta: ", best_gs_delta)
print("----------------------------------------------")

---------------GRADIENTE DESCENDENTE---------------
Error:  4.416097393316273
Grado del polinomio:  2
Delta:  0.4
----------------------------------------------
