<a href="https://colab.research.google.com/github/log-ghj/automatic-model-selection/blob/main/ridge_CV.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [18]:
import numpy as np
import pandas as pd
from sklearn.linear_model import Ridge
import scipy
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import PolynomialFeatures

In [19]:
#Read the data
df = pd.read_csv('https://raw.githubusercontent.com/ageron/handson-ml/master/datasets/housing/housing.csv').dropna()

#Convert number of rooms into per person values
df["total_rooms_pp"] = df["total_rooms"]/df["population"]
df["total_bedrooms_pp"] = df["total_bedrooms"]/df["population"]
#Average hosuehold size
df["household_size"] = df["population"]/df["households"]
#Drop some variables
df=df.drop(["total_rooms", "total_bedrooms", "households"], axis=1)

#Make the categorical variable into a set of dummies
xx = pd.get_dummies(df.ocean_proximity)
df = pd.concat([df, xx], axis=1, sort=False)
del df["ocean_proximity"]

In [20]:
#Cluster geolocations - determine the optimal number of clusters
from sklearn import preprocessing, cluster
X = df[["latitude","longitude"]]
max_k = 10
## iterations
distortions = [] 
for i in range(1, max_k+1):
    if len(X) >= i:
       model = cluster.KMeans(n_clusters=i, init='k-means++', max_iter=300, n_init=10, random_state=0)
       model.fit(X)
       distortions.append(model.inertia_)
## best k: the lowest derivative
k = [i*100 for i in np.diff(distortions,2)].index(min([i*100 for i 
     in np.diff(distortions,2)]))

In [21]:
#Cluster geolocations - create clusters
import scipy.cluster
k = 6
model = cluster.KMeans(n_clusters=k, init='k-means++')
X = df[["latitude","longitude"]]
# clustering
df_X = X.copy()
df_X["cluster"] = model.fit_predict(X)
# find real centroids
closest, distances = scipy.cluster.vq.vq(model.cluster_centers_, df_X.drop("cluster", axis=1).values)
df_X["centroids"] = 0
for i in closest:
    df_X["centroids"].iloc[i] = 1
# add clustering info to the original dataset
df[["cluster"]] = df_X[["cluster"]]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


In [22]:
from sklearn.base import BaseEstimator 
from sklearn.base import RegressorMixin

In [23]:
class self_coded_ridge(BaseEstimator, RegressorMixin):
    def __init__(self, λ=1, **model_hyper_parameters):
        """
        """
        super().__init__()
        self.λ = λ
        self.X_train = None
        # fitted parameters, initialized to None
        self.params = None

    def set_params(self, **params):
        """
        """
        if not params:
            # Simple optimization to gain speed (inspect is slow)
            return self
        valid_params = self.get_params(deep=True)

        nested_params = defaultdict(dict)  # grouped by prefix
        for key, value in params.items():
            key, delim, sub_key = key.partition('__')
            if key not in valid_params:
                raise ValueError('Invalid parameter %s for estimator %s. '
                                 'Check the list of available parameters '
                                 'with `estimator.get_params().keys()`.' %
                                 (key, self))

            if delim:
                nested_params[key][sub_key] = value
            else:
                setattr(self, key, value)
                valid_params[key] = value

        for key, sub_params in nested_params.items():
            valid_params[key].set_params(**sub_params)

        return self

    def fit(self, X_train, y_train):
        """
        """
        X_train, y_train, λ = np.asarray(X_train), np.asarray(y_train), self.λ 
        self.X_train = X_train
        XX = X_train.T@X_train
        β = np.linalg.inv(XX+λ*np.eye(N=len(XX)))@X_train.T@y_train
        self.params = β
        return self.params

    def predict(self, X_test):
        β = self.params
        return X_test@β

    def get_params(self, deep = False):
        return {'λ':self.λ}

In [24]:
# Define a function that does CV on a grid to find optimal value of the hyperparameter
def ridge_self_coded(grid,X,y):
  coefs = []
  ics = []

  for a in grid:
    ridge = self_coded_ridge(a)
    ridge.fit(X, y)
    coefs.append(ridge.params)
    k=10 #k-fold CV
    scores = cross_val_score(ridge, X, y, cv=k, scoring='neg_mean_squared_error')
    RMSE = np.sqrt(sum(abs(scores))/k)
    ics.append(RMSE)
  opt = ics.index(min(ics))
  return coefs[opt], grid[opt], ics[opt]

In [25]:
# Create a similar function for the pre-existing sklearn ridge for comparison
def ridge_sklearn(lambdas, X, y):
  ridge = Ridge(fit_intercept=False, normalize=False) # do not use sklearn's normalization as it divides by L2 and not std.
  coefs = []
  ics = []

  for a in lambdas:
    ridge.set_params(alpha = a)
    ridge.fit(X, y)
    coefs.append(ridge.coef_)
    k=10 #k-fold CV
    scores = cross_val_score(ridge, X, y, cv=k, scoring='neg_mean_squared_error')
    RMSE = np.sqrt(sum(abs(scores))/k)
    ics.append(RMSE)
  opt = ics.index(min(ics))
  return coefs[opt], lambdas[opt], ics[opt]

In [29]:
# create grid of lambdas
# doing what is called naive implementation in the slides -> max value of grid chosen
# efficient way is through SVD -> left to do
grid = 10**np.linspace(4,-2,100)*0.5

# split l.h.s, and r.h.s.
X = df.drop(['median_house_value'], axis=1)
y = df.median_house_value

# add polynomials
poly = PolynomialFeatures(2, include_bias=False)
X = poly.fit_transform(X)

#delete polynomials resulting in 0
X = np.delete(X,X.mean(axis=0)==0,axis=1)

# normalize data
X = (X-X.mean(axis=0))/X.std(axis=0)
y = y-y.mean()

In [30]:
X.shape

(20433, 109)

In [31]:
result_self_coded = ridge_self_coded(grid, X, y)
result_self_coded

(array([-2.13860617e+04, -1.59264425e+04, -4.59055066e+03, -4.90780267e+03,
         1.91874117e+04,  4.73261512e+03,  4.97391006e+03,  1.73191239e+03,
        -2.95457898e+02,  2.76463998e+03,  8.39484512e+02, -3.69211681e+03,
         1.92658516e+01, -3.27139157e+03,  2.08141266e+04,  8.87514157e+03,
         4.31663675e+03,  4.12319570e+03, -1.99675489e+04, -4.12663619e+03,
        -4.57609627e+03, -1.76004685e+03,  1.39558148e+02, -1.87158892e+03,
        -8.35477430e+02,  3.25835012e+03, -1.76078288e+03,  1.49330981e+03,
        -1.67017568e+04, -1.16351397e+04, -4.43925998e+03,  1.44554491e+04,
        -1.90419723e+02,  2.77527631e+02,  1.07265737e+03, -2.09408785e+03,
         7.42055774e+02,  8.36120267e+02, -5.06506086e+03,  4.20223171e+03,
         1.73511665e+03,  5.16679166e+03,  8.49320117e+03,  7.19937486e+03,
         5.51126642e+03,  2.55550857e+04, -2.21778391e+03, -2.52293466e+03,
        -1.00700649e+04, -6.55163565e+02,  9.92251562e+03, -7.95537611e+02,
        -1.5

In [32]:
result_sklearn = ridge_sklearn(grid, X, y)
result_sklearn

(array([-2.13860617e+04, -1.59264425e+04, -4.59055066e+03, -4.90780267e+03,
         1.91874117e+04,  4.73261512e+03,  4.97391006e+03,  1.73191239e+03,
        -2.95457898e+02,  2.76463998e+03,  8.39484512e+02, -3.69211681e+03,
         1.92658516e+01, -3.27139157e+03,  2.08141266e+04,  8.87514157e+03,
         4.31663675e+03,  4.12319570e+03, -1.99675489e+04, -4.12663619e+03,
        -4.57609627e+03, -1.76004685e+03,  1.39558148e+02, -1.87158892e+03,
        -8.35477430e+02,  3.25835012e+03, -1.76078288e+03,  1.49330981e+03,
        -1.67017568e+04, -1.16351397e+04, -4.43925998e+03,  1.44554491e+04,
        -1.90419723e+02,  2.77527631e+02,  1.07265737e+03, -2.09408785e+03,
         7.42055774e+02,  8.36120267e+02, -5.06506086e+03,  4.20223171e+03,
         1.73511665e+03,  5.16679166e+03,  8.49320117e+03,  7.19937486e+03,
         5.51126642e+03,  2.55550857e+04, -2.21778391e+03, -2.52293466e+03,
        -1.00700649e+04, -6.55163565e+02,  9.92251562e+03, -7.95537611e+02,
        -1.5

In [33]:
# instead of the above arbitrarily defined grid, find a grid with the help of svd
# from ols to ridge D**-1 is multiplied by (D**2+λ)**(-1) D**2 (1)
# df of ols are 120 (with poly) -> find λ s.t. (1) = ~120/~0
import scipy
u,s,v = scipy.linalg.svd(X)

In [34]:
sum(s**2/(s**2))

109.0

In [62]:
x1 = 1e-10
x2 = 1_000_000

In [63]:
print(sum(s**2/(s**2+x1)),sum(s**2/(s**2+x2)))

87.99999995154401 1.8619259615164092


In [64]:
# define a new grid with above valuees
new_grid = np.linspace(x1,x2,100)
new_grid = 10**np.linspace(-10,6,100)

In [65]:
new_grid = np.append(new_grid,result_self_coded[1])

In [66]:
ridge_self_coded(new_grid, X, y)

(array([-2.13860617e+04, -1.59264425e+04, -4.59055066e+03, -4.90780267e+03,
         1.91874117e+04,  4.73261512e+03,  4.97391006e+03,  1.73191239e+03,
        -2.95457898e+02,  2.76463998e+03,  8.39484512e+02, -3.69211681e+03,
         1.92658516e+01, -3.27139157e+03,  2.08141266e+04,  8.87514157e+03,
         4.31663675e+03,  4.12319570e+03, -1.99675489e+04, -4.12663619e+03,
        -4.57609627e+03, -1.76004685e+03,  1.39558148e+02, -1.87158892e+03,
        -8.35477430e+02,  3.25835012e+03, -1.76078288e+03,  1.49330981e+03,
        -1.67017568e+04, -1.16351397e+04, -4.43925998e+03,  1.44554491e+04,
        -1.90419723e+02,  2.77527631e+02,  1.07265737e+03, -2.09408785e+03,
         7.42055774e+02,  8.36120267e+02, -5.06506086e+03,  4.20223171e+03,
         1.73511665e+03,  5.16679166e+03,  8.49320117e+03,  7.19937486e+03,
         5.51126642e+03,  2.55550857e+04, -2.21778391e+03, -2.52293466e+03,
        -1.00700649e+04, -6.55163565e+02,  9.92251562e+03, -7.95537611e+02,
        -1.5

In [67]:
ridge_sklearn(new_grid, X, y)

  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T


(array([-2.13860617e+04, -1.59264425e+04, -4.59055066e+03, -4.90780267e+03,
         1.91874117e+04,  4.73261512e+03,  4.97391006e+03,  1.73191239e+03,
        -2.95457898e+02,  2.76463998e+03,  8.39484512e+02, -3.69211681e+03,
         1.92658516e+01, -3.27139157e+03,  2.08141266e+04,  8.87514157e+03,
         4.31663675e+03,  4.12319570e+03, -1.99675489e+04, -4.12663619e+03,
        -4.57609627e+03, -1.76004685e+03,  1.39558148e+02, -1.87158892e+03,
        -8.35477430e+02,  3.25835012e+03, -1.76078288e+03,  1.49330981e+03,
        -1.67017568e+04, -1.16351397e+04, -4.43925998e+03,  1.44554491e+04,
        -1.90419723e+02,  2.77527631e+02,  1.07265737e+03, -2.09408785e+03,
         7.42055774e+02,  8.36120267e+02, -5.06506086e+03,  4.20223171e+03,
         1.73511665e+03,  5.16679166e+03,  8.49320117e+03,  7.19937486e+03,
         5.51126642e+03,  2.55550857e+04, -2.21778391e+03, -2.52293466e+03,
        -1.00700649e+04, -6.55163565e+02,  9.92251562e+03, -7.95537611e+02,
        -1.5