# Learning Objectives

- Efficiently compute all the pairwise distances among different observations
- Know how to use the weights with the distances  
- Test the locally weighted regressors
- Learn how to improve their performance with boosting methods, such as gradient boosting

In [12]:
# We will not use anything that we don't know. Graphical libraries are fine, though.
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
import matplotlib.pyplot as plt
import matplotlib as mpl
mpl.rcParams['figure.dpi'] = 120

import numpy as np
import pandas as pd
from sklearn import linear_model
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from scipy.spatial import Delaunay
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.metrics import mean_squared_error as mse
from scipy import linalg
from scipy.interpolate import interp1d, LinearNDInterpolator, NearestNDInterpolator
from sklearn.decomposition import PCA

from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted

In [13]:
# Gaussian Kernel
def Gaussian(x):
  """Truncated standard-normal kernel.

  Evaluates the standard normal density exp(-x**2/2)/sqrt(2*pi) element-wise
  and sets the result to 0 wherever |x| > 4.
  """
  scale = 1/(np.sqrt(2*np.pi))
  density = scale*np.exp(-1/2*x**2)
  outside_support = np.abs(x) > 4
  return np.where(outside_support, 0, density)

In [14]:
# Tricubic Kernel (this is the correct vectorized version)
def tricubic(x):
  """Tricubic kernel, evaluated element-wise.

  Returns (1 - |x|**3)**3 where |x| <= 1 and 0 where |x| > 1.
  """
  magnitude = np.abs(x)
  inside_value = (1 - magnitude**3)**3
  return np.where(magnitude > 1, 0, inside_value)

In [15]:
# Epanechnikov Kernel
def Epanechnikov(x):
  """Epanechnikov kernel, evaluated element-wise.

  Returns 3/4 * (1 - |x|**2) where |x| <= 1 and 0 where |x| > 1.
  """
  magnitude = np.abs(x)
  inside_value = 3/4*(1 - magnitude**2)
  return np.where(magnitude > 1, 0, inside_value)

In [16]:
# Quartic Kernel
def Quartic(x):
  """Quartic (biweight) kernel, evaluated element-wise.

  Returns 15/16 * (1 - |x|**2)**2 where |x| <= 1 and 0 where |x| > 1.
  """
  magnitude = np.abs(x)
  inside_value = 15/16*(1 - magnitude**2)**2
  return np.where(magnitude > 1, 0, inside_value)

In [24]:
# Real-data applications will be the cars and concrete datasets; this cell
# loads the cars data from a path relative to the working directory.
data = pd.read_csv('01intro/cars.csv')
# x keeps every column except 'MPG' (still a DataFrame).
x = data.drop(columns=['MPG'])
# NOTE(review): bare expression that is not the last statement of the cell,
# so presumably nothing is displayed here -- confirm whether it is needed.
x
# y is the 'MPG' column converted to a NumPy array.
y = data['MPG'].values

# Compute all pairwise distances efficiently

This means we build a general solution: given two matrices, we want to compute all the pairwise distances between the rows of matrix 1 and the rows of matrix 2.

In [27]:
# Two 1-D samples of 5 uniform random draws each:
# u is drawn from [2, 5) and v from [1, 2).
u = np.array(np.random.uniform(2,5,5))
v = np.array(np.random.uniform(1,2,5))

In [29]:
v.shape # 1-D shape (5,): there is no separate row axis, which may create a problem

(5,)

In [47]:
# Redefine u and v as 2-D arrays with 5 columns each:
# u has 10 rows drawn from [2, 5), v has 20 rows drawn from [1, 2).
u = np.array(np.random.uniform(2,5,(10,5)))
v = np.random.uniform(1,2,(20,5))
# Display the shape of u to confirm it is (rows, columns).
u.shape

(10, 5)

In [37]:
# let's show a simple code for defining a distance function
def dist(u,v):
    """Euclidean distances between every row of ``v`` and every row of ``u``.

    Parameters
    ----------
    u, v : array-like
        Collections of observations, one observation per row. A 1-D input is
        treated as a single observation (one row). Lists and DataFrames are
        accepted because both inputs are converted to float NumPy arrays.

    Returns
    -------
    numpy.ndarray
        Array ``D`` of shape ``(rows of v, rows of u)`` where ``D[j, i]`` is
        the distance between ``v[j]`` and ``u[i]``.

    Raises
    ------
    ValueError
        If ``u`` and ``v`` do not have the same number of columns.
    """
    # Promote BOTH inputs to 2-D; previously only v was handled, so a 1-D u
    # failed on the axis=1 reduction below.
    u = np.atleast_2d(np.asarray(u, dtype=float))
    v = np.atleast_2d(np.asarray(v, dtype=float))

    # Refuse mismatched feature counts explicitly; otherwise a single-column
    # input would broadcast silently and produce meaningless distances.
    if u.shape[1] != v.shape[1]:
        raise ValueError(
            "u and v must have the same number of columns, got "
            f"{u.shape[1]} and {v.shape[1]}"
        )

    # Pre-allocating keeps the (rows of v, rows of u) shape even when v is empty.
    D = np.empty((v.shape[0], u.shape[0]))

    # We want all the pairwise combinations of u and v. Broadcasting each row
    # of v against all of u avoids a second Python loop while keeping the
    # temporary arrays no larger than u itself.
    for j, rowj in enumerate(v):
        D[j] = np.sqrt(np.sum((u-rowj)**2,axis=1))
    return D

In [48]:
# Pairwise distances: one row of D per row of v, one column per row of u.
D = dist(u,v)

In [50]:
# Shape is (number of rows in v, number of rows in u).
D.shape

(20, 10)

In [52]:
# Display the full matrix of pairwise distances.
D

array([[4.63248702, 4.1699141 , 5.52608987, 4.37344276, 3.12930538,
        4.94745523, 4.05004022, 5.08583698, 4.79554223, 4.77980935],
       [5.09765899, 4.53054111, 5.71386963, 4.93864576, 3.42593976,
        5.28518473, 4.43588842, 5.29311717, 5.16565758, 5.23386493],
       [5.18333559, 4.28870189, 5.65908747, 4.93588931, 3.22401487,
        5.33573561, 4.52719372, 5.0474132 , 4.91587181, 5.04875515],
       [4.91322688, 4.14351438, 5.3677738 , 4.68731969, 3.07252387,
        5.04253374, 4.25426228, 4.82601106, 4.7754398 , 4.8673791 ],
       [5.48030155, 4.87819933, 6.26892062, 5.14199875, 3.6766575 ,
        5.80970359, 4.80066829, 5.55859633, 5.48056815, 5.29658865],
       [5.10171964, 4.24472274, 5.67483402, 4.82549325, 3.01522623,
        5.35255147, 4.39804027, 4.94134631, 4.83515905, 4.82462075],
       [5.39433053, 4.23044565, 6.02715596, 5.11203023, 3.51218954,
        5.56344086, 4.89905093, 5.49562578, 4.88006834, 5.33465673],
       [4.90597849, 3.96493363, 5.5403212

In [None]:
# NOTE(review): no `Tricubic` accepting a `tau` argument is defined in the
# visible part of this notebook (only lowercase `tricubic(x)` is) -- confirm
# it is defined elsewhere before running this cell.
Tricubic(D,tau=0.5)