# Laboratorio 3: Regresión Lineal

*   Marco Ferraro | B82957

In [1]:
import pandas as pd
import numpy as np
import random
from sklearn.model_selection import train_test_split

df = pd.read_csv('fish_perch.csv')
df

Unnamed: 0,Weight,Length1,Length2,Length3,Height,Width
0,5.9,7.5,8.4,8.8,2.112,1.408
1,32.0,12.5,13.7,14.7,3.528,1.9992
2,40.0,13.8,15.0,16.0,3.824,2.432
3,51.5,15.0,16.2,17.2,4.5924,2.6316
4,70.0,15.7,17.4,18.5,4.588,2.9415
5,100.0,16.2,18.0,19.2,5.2224,3.3216
6,78.0,16.8,18.7,19.4,5.1992,3.1234
7,80.0,17.2,19.0,20.2,5.6358,3.0502
8,85.0,17.8,19.6,20.8,5.1376,3.0368
9,85.0,18.2,20.0,21.0,5.082,2.772


In [2]:
df_y = df['Weight'].copy()
df_x = df.drop(columns=['Weight'])
df_y

0        5.9
1       32.0
2       40.0
3       51.5
4       70.0
5      100.0
6       78.0
7       80.0
8       85.0
9       85.0
10     110.0
11     115.0
12     125.0
13     130.0
14     120.0
15     120.0
16     130.0
17     135.0
18     110.0
19     130.0
20     150.0
21     145.0
22     150.0
23     170.0
24     225.0
25     145.0
26     188.0
27     180.0
28     197.0
29     218.0
30     300.0
31     260.0
32     265.0
33     250.0
34     250.0
35     300.0
36     320.0
37     514.0
38     556.0
39     840.0
40     685.0
41     700.0
42     700.0
43     690.0
44     900.0
45     650.0
46     820.0
47     850.0
48     900.0
49    1015.0
50     820.0
51    1100.0
52    1000.0
53    1100.0
54    1000.0
55    1000.0
Name: Weight, dtype: float64

# 1. Función `MSE(y_true, y_predict)` 
Recibe dos objetos pd.Series que
contienen los valores reales de un conjunto de datos y los valores estimados por un modelo. Calcule y retorne el error cuadrático medio de dicha predicción.

In [3]:
def MSE(y_true, y_predict, c=[], regularization='none', lbd=0):
  sum = 0.0
  n = len(y_true)
  add_on = 0.0

  if regularization == 'lasso' or regularization == 'l1':
    sum_l1 = 0.0
    for i in range(len(c)):
      sum_l1 += abs(c[i])
    add_on = lbd * sum_l1

  elif regularization == 'ridge' or regularization == 'l2': 
    sum_l2 = 0.0
    for i in range(len(c)):
      sum_l2 += c[i]**2
    add_on = lbd * sum_l2

  for i in range(n):
    sum += ((y_predict[i] - y_true[i])**2)
  return float(sum / n) + add_on

# 2. Función `score(y_true, y_predict)`

Recibe dos objetos pd.Series que
contienen los valores reales de un conjunto de datos y los valores estimados por un
modelo. Calcule y retorne el coeficiente de determinación (R^2) de dicha predicción

In [4]:
def score(y_true, y_predict):

  y_true.to_numpy()
  y_predict.to_numpy()

  mean = y_true.mean()
  first_sum = sum(((y_true - y_predict)**2))
  second_sum = sum(((y_true - mean)**2))

  return float((second_sum - first_sum) / second_sum)

In [5]:
class LinearRegression:
  def __init__(self):
    self.c_vector = []
    self.errors = []
    self.past_dc = 0.0

  def update_c(self, x, y, c, learning_rate, momentum=0):

    a = (np.matmul(x, c) - y).T
    b = np.matmul(a, x).T
    dc = (2.0 / len(y)) * b

    new_c = c - (learning_rate * (dc + (momentum * self.past_dc))) 
    self.past_dc = dc

    return new_c

  def predict(self, x, add_bias=False):
    if (add_bias):
       x.insert(0, "bias", 1.0, True)
       x = x.to_numpy()
    predict_y = []
    for i in range(len(x)):
      predict_y.append(np.dot(x[i], self.c_vector)[0])
    
    return pd.Series(predict_y)

  def get_error_history(self):
    return self.errors

  def fit(self, x, y, max_epochs=1e5, threshold=1e-3, learning_rate=1e-5, momentum=0, decay=0, error='mse', regularization='none', lbd=0):
    
    ready = False
    error = 0
    iteration = 0
    self.errors = []
    
    x.insert(0, "bias", 1.0, True)
    x = x.to_numpy()
    y = np.array([y.to_numpy()]).T
    self.c_vector = np.array([random.sample(range(len(x[0])), len(x[0]))]).T

    while not ready and iteration < max_epochs:
      self.c_vector = self.update_c(x, y, self.c_vector, learning_rate, momentum=momentum)
      new_error = MSE(y, np.matmul(x, self.c_vector), c=self.c_vector, regularization=regularization, lbd=lbd)
      if abs(error - new_error) < threshold:
        ready = True
      error = new_error
      self.errors.append(error)
      learning_rate = learning_rate / (1 + decay)

      iteration += 1

# 3 Funcionamiento del algoritmo 
Utilice el set de datos proveído para probar el funcionamiento de su algoritmo. Recuerde que el error debe reducirse en cada iteración del algoritmo (o llegar a un “zig-zag” producto de una tasa de aprendizaje muy elevada).

In [6]:
lr = LinearRegression()
preds = lr.fit(df_x, df_y)

errors = lr.get_error_history()
reduced_errors = []
jumps = 2150

for i in range(0, len(errors), jumps): reduced_errors.append(errors[i])

reduced_errors

[126962.19681659035,
 46489.85138762738,
 45460.06570627436,
 44465.1558930999,
 43503.86625700528,
 42574.98700212322,
 41677.35254387599,
 40809.839886881105,
 39971.367062432226,
 39160.89162336704,
 38377.40919421352,
 37619.95207458383,
 36887.587893860014,
 36179.41831528674,
 35494.57778765642,
 34832.2323428381,
 34191.57843746596,
 33571.84183716545,
 32972.276541753854,
 32392.16374991055,
 31830.81086186687,
 31287.550518718588,
 30761.73967701612,
 30252.758717336004,
 29760.010585585947,
 29282.919965840127,
 28820.932483547575,
 28373.513937996668,
 27940.149562961964,
 27520.343314497626,
 27113.617184880528,
 26719.510541741827,
 26337.57949146285,
 25967.39626594257,
 25608.54863187942,
 25260.639321739403,
 24923.285485614666,
 24596.11816320475,
 24278.781775181385,
 23970.9336332246,
 23672.24346804535,
 23382.392974732244,
 23101.075374787626,
 22827.994994239085,
 22562.86685723601,
 22305.416294563,
 22055.378566521296]

# 4 Split de datos 
Luego utilice el método train_test_split de la biblioteca sklearn.model_selection para separar un conjunto de datos en un conjunto de datos de entrenamiento y otro de prueba, utilice de semilla del split el número 21 (el método permite el parámetro opcional trandom_state para sembrar la aleatoriedad).



> **Nota**: Para esta sección se hará el cálculo del r2 con scikit learn, ya que el modelo implementado genera `nan` en la primera resta




In [7]:
from sklearn.metrics import r2_score


X_fish_train, X_fish_test, y_fish_train, y_fish_test = train_test_split(df_x, df_y, test_size=0.3, random_state=21)

lr = LinearRegression()
lr.fit(X_fish_train, y_fish_train)

pred = lr.predict(X_fish_test, add_bias=True)

r2 = r2_score(y_fish_test, pred)
r2

0.798438051881311

In [8]:
from sklearn.metrics import r2_score

X_fish_train, X_fish_test, y_fish_train, y_fish_test = train_test_split(df_x, df_y, test_size=0.3, random_state=21)

lr = LinearRegression()
lr.fit(X_fish_train, y_fish_train, learning_rate=1e-6, decay=1e-3, regularization='l1', momentum=0.5, lbd=0.5)

pred = lr.predict(X_fish_test, add_bias=True)

r2 = r2_score(y_fish_test, pred)
r2

0.5111402628171239

In [9]:
from sklearn.metrics import r2_score

X_fish_train, X_fish_test, y_fish_train, y_fish_test = train_test_split(df_x, df_y, test_size=0.3, random_state=21)

lr = LinearRegression()
lr.fit(X_fish_train, y_fish_train, learning_rate=1e-6, decay=1e-3, regularization='l2', momentum=0.5, lbd=0.5)

pred = lr.predict(X_fish_test, add_bias=True)

r2 = r2_score(y_fish_test, pred)
r2

0.49445357549938684

In [10]:
from sklearn.metrics import r2_score

X_fish_train, X_fish_test, y_fish_train, y_fish_test = train_test_split(df_x, df_y, test_size=0.3, random_state=21)

lr = LinearRegression()
lr.fit(X_fish_train, y_fish_train, learning_rate=1e-4, decay=1e-5, regularization='l1', momentum=0.7, lbd=0.7)

pred = lr.predict(X_fish_test, add_bias=True)

r2 = r2_score(y_fish_test, pred)
r2

0.8883128103547795

In [11]:
from sklearn.metrics import r2_score

X_fish_train, X_fish_test, y_fish_train, y_fish_test = train_test_split(df_x, df_y, test_size=0.3, random_state=21)

lr = LinearRegression()
lr.fit(X_fish_train, y_fish_train, learning_rate=1e-4, decay=1e-5, regularization='l2', momentum=0.7, lbd=0.7)

pred = lr.predict(X_fish_test, add_bias=True)

r2 = r2_score(y_fish_test, pred)
r2

0.7232117088627135

## 4.1 ¿Cuál fue la combinación de parámetros que le proveyó el mejor resultado?


> Viendo los resultados de las 4 pruebas que se lograron hacer, la combinación de parametros que brindó mejor resultado fue: 
```
learning_rate=1e-4, decay=1e-5, regularization='l1', momentum=0.7, lbd=0.7
```

> Esto dió como resultado un `r2 score` de `0.8883647017635607`





## 4.2 ¿Qué pasa si utiliza esa misma combinación pero cambia la semilla del train_test_split?  
Pruebe con varias semillas

In [12]:
from sklearn.metrics import r2_score


X_fish_train, X_fish_test, y_fish_train, y_fish_test = train_test_split(df_x, df_y, test_size=0.3, random_state=53)

lr = LinearRegression()
lr.fit(X_fish_train, y_fish_train, learning_rate=1e-4, decay=1e-5, regularization='l1', momentum=0.7, lbd=0.7)

pred = lr.predict(X_fish_test, add_bias=True)

r2 = r2_score(y_fish_test, pred)
r2

0.9356869264610277

In [13]:
from sklearn.metrics import r2_score


X_fish_train, X_fish_test, y_fish_train, y_fish_test = train_test_split(df_x, df_y, test_size=0.3, random_state=74)

lr = LinearRegression()
lr.fit(X_fish_train, y_fish_train, learning_rate=1e-4, decay=1e-5, regularization='l1', momentum=0.7, lbd=0.7)

pred = lr.predict(X_fish_test, add_bias=True)

r2 = r2_score(y_fish_test, pred)
r2

0.9020686499545606

## 4.3 Si pasa algo inusual: ¿Por qué cree que pasa esto?


> Analizando los resultado, vemos que se generan `r2 score` mejores que los de las pruebas pasadas. Esto puede ser por la naturaleza de los datos, ya que al variar el set de entrenamiento el modelo puede que genere un mejor `fit` que los anteriores. 

