## KFold CV

In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv('diabetes.csv')
del df['Outcome']

In [3]:
# Let's predict BMI
features = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 
            'Insulin', 'DiabetesPedigreeFunction']
X = df.loc[:, features].values
y = df.loc[:, ['BMI']].values

In [4]:
# Model: Linear Regression
class LinReg:
    """
    This implementation uses the Newton's Method for optimization.
    """
    def __init__(self, num_iters=5, tolerance = 1e-10, epsilon = 10e-8):
        self.num_iters = num_iters
        self.tolerance = tolerance
        self.epsilon = epsilon # subtracted to make hessian invertible
        
    def add_ones(self, X):
        return np.concatenate((np.ones((len(X),1)), X), axis = 1)
    
    def cost(self, X, y_true):
        return np.mean((X@self.theta - y_true)**2)
    
    def fit(self, X, y):
        X = X.copy()
        X = self.add_ones(X)
        y = y.reshape(-1, 1)
        
        self.theta = np.zeros((len(X[0]), 1))
        hessian_inv = np.linalg.inv(X.T@X + self.epsilon*np.eye(X.T.shape[0]))
        current_iter = 1
        norm = 1
        while (norm >= self.tolerance and current_iter < self.num_iters):
            old_theta = self.theta.copy()
            grad = (X.T@X)@self.theta - X.T@y
            #grad = (1/len(X)) * np.sum((X@self.theta - y)*X, axis=0)
            grad= grad.reshape(-1, 1)
            self.theta = self.theta - hessian_inv@grad
            
            #print(f'cost for {current_iter} iteration : {self.cost(X, y)}')
            norm = np.linalg.norm(old_theta - self.theta)
            current_iter += 1
            
        return self.theta
    
    def evaluate(self, X, y):
        X = self.add_ones(X)
        return self.cost(X, y)
    
    def predict(self, X):
        X = self.add_ones(X)
        
        return X@self.theta

In [5]:
class KFoldCrossVal:
    """
    Performs k-fold cross validation on each combination of hyperparameter set
    
    Input
    ............
    X : Features (m, n)
    y : target (m, 1)
    hyperparameter_Set : Dictionary of hyperparameters for k-fold
    num_of_folds: Number of folds, k; default:10
    verbose: Checks whether to print parameters on every iteration; Boolean; Default: False
    """
    def __init__(self, hyperparameter_set, num_of_folds=10, verbose=True):
        self.hyperparameter_set = hyperparameter_set
        self.k = num_of_folds
        self.verbose = verbose
        
    def shuffle_data(self, X, y):
        shuffle_arr = np.random.permutation(len(X))
        X_shuffled = X[shuffle_arr]
        y_shuffled = y[shuffle_arr].reshape(-1, 1)
        
        return X_shuffled, y_shuffled
    
    def get_kfold_arr_index(self, subset_size, last_index):
        array_indexes = [0]
        for fold_no in range(self.k):
            if fold_no != (self.k-1):
                array_indexes.append((fold_no+1)*subset_size)
            elif fold_no == (self.k - 1): # To accomodate examples not part of the 
                array_indexes.append(last_index) #for last index
        return array_indexes
    
    def get_split_data_fold(self, X, y, array_indexes, fold_no):
        start = array_indexes[fold_no]
        end = array_indexes[fold_no+1]
        X_val = X[start: end]
        y_val = y[start: end]
        
        X_train = np.delete(X, [start,end], axis=0)
        y_train = np.delete(y, [start,end]).reshape(-1,1)
        
        return X_train, y_train, X_val, y_val
    
    def get_hyperparameter_sets(self, hyperparameter_dict):   
        """
        Converts the hyperparameter dictionary into all possible combinations of hyperparameters

        Return
        ..............
        Array of hyperparameter set
        """
        import itertools

        parameter_keys = hyperparameter_dict.keys()
        parameter_values = hyperparameter_dict.values()

        parameter_array = []
        for params in itertools.product(*parameter_values):
            parameter_array.append(params)

        parameter_sets = []
        for parameter_values in parameter_array:
            parameter_set = dict(zip(parameter_keys, parameter_values))
            parameter_sets.append(parameter_set)

        return parameter_sets
    
    def evaluate(self, X, y, model=LinReg):
        # Check if fold is within limit
        if self.k > len(X):
            print(f'K Fold number greater than number of examples')
            return
        
        models = self.get_hyperparameter_sets(self.hyperparameter_set)
        
        print(f'Performing {len(models) * self.k} cross validations for {len(models)} models' )
        m = len(X)
        
        generalization_mse = []
        X, y = self.shuffle_data(X, y)
        subset_size = int(m/self.k)
        
        array_indexes = self.get_kfold_arr_index(subset_size, m+1)
        
        for hyperparameters in models:
            model_ = model(**hyperparameters)
            fold_mse_arr = []
            for fold_no in range(self.k - 1):
                X_train, y_train, X_val, y_val = self.get_split_data_fold(X,
                                                                          y,
                                                                          array_indexes,
                                                                          fold_no)
                model_.fit(X_train, y_train)
                mse = model_.evaluate(X_val, y_val)
                fold_mse_arr.append(mse)
            cv_mse = np.mean(fold_mse_arr)
            
            if np.isnan(cv_mse): # nan values are taken to be smallest so had to convert 
                cv_mse = np.inf
            if self.verbose:
                print(f'{hyperparameters}, mse: {cv_mse}')
            generalization_mse.append(cv_mse)
            
        lowest_gen_mse_index = np.argmin(generalization_mse)
        lowest_mse = generalization_mse[lowest_gen_mse_index]
        best_model = models[lowest_gen_mse_index]
        
        return lowest_mse, best_model

In [6]:
hyp = {
    'tolerance': [1e-3, 1e-5, 1e-7, 1e-10],
    'epsilon': [10e-6, 10e-8, 10e-10],
    'num_iters': [1, 2, 3, 4]
}

kcv = KFoldCrossVal(hyp, 10, True)

In [7]:
kcv.evaluate(X, y)

Performing 480 cross validations for 48 models
{'tolerance': 0.001, 'epsilon': 1e-05, 'num_iters': 1}, mse: 1071.933187134503
{'tolerance': 0.001, 'epsilon': 1e-05, 'num_iters': 2}, mse: 46.22200082213093
{'tolerance': 0.001, 'epsilon': 1e-05, 'num_iters': 3}, mse: 46.22200110454723
{'tolerance': 0.001, 'epsilon': 1e-05, 'num_iters': 4}, mse: 46.22200110454723
{'tolerance': 0.001, 'epsilon': 1e-07, 'num_iters': 1}, mse: 1071.933187134503
{'tolerance': 0.001, 'epsilon': 1e-07, 'num_iters': 2}, mse: 46.222001101723
{'tolerance': 0.001, 'epsilon': 1e-07, 'num_iters': 3}, mse: 46.22200110454736
{'tolerance': 0.001, 'epsilon': 1e-07, 'num_iters': 4}, mse: 46.22200110454736
{'tolerance': 0.001, 'epsilon': 1e-09, 'num_iters': 1}, mse: 1071.933187134503
{'tolerance': 0.001, 'epsilon': 1e-09, 'num_iters': 2}, mse: 46.22200110451907
{'tolerance': 0.001, 'epsilon': 1e-09, 'num_iters': 3}, mse: 46.222001104547296
{'tolerance': 0.001, 'epsilon': 1e-09, 'num_iters': 4}, mse: 46.222001104547296
{'tol

(46.22200082213093, {'tolerance': 0.001, 'epsilon': 1e-05, 'num_iters': 2})

## Backward Selection

In [8]:
# Assuming this is our chosen best hyperparameters
hyperparams = {
    'num_iters': [4],
    'tolerance': [1e-8],
    'epsilon': [10e-10]
}

X_fs = df.loc[:, features]
y_fs = df.loc[:, ['BMI']]

In [9]:
class SelectFeatureBackward:
    """
    input
    ................
    num_of_features : Total features to return; default: 'all'
    """
    def __init__(self, max_features):
            self.max_features = max_features 
    
    def get_best_feat_subset(self, sub_feature_map):
        """
        input
        ..........
        sub_feature_map : dictionary of features and their mse
        output
        ..........
        best_feature_set: array of features
        """
        sorted_feat = sorted(sub_feature_map.items(), key=lambda x: x[1])
        return sorted_feat[0]
        
    def evaluate(self, X, y):
        """
        input
        .....
        X : Input features, dtype: DataFrame
        y : target, dtype: DataFrame or Series
        
        output
        ......
        Best Feature Set
        """
        feature_map = {}
        #while feature_set:
        features = X.columns.to_list()
        feature_set = features
        
        while (len(feature_set)-1 >= self.max_features):
            sub_feature_map = {}
            for feature in feature_set:
                sub_feature_set = list(set(feature_set) - set([feature]))
                cv = KFoldCrossVal(hyperparams, verbose=False)
                mse, _ = cv.evaluate(X.loc[:, sub_feature_set].values, y.values)
                sub_feature_map[tuple(sub_feature_set)] = mse # Stores the feature sets for each n feature combinations
                feature_map[tuple(sub_feature_set)] = mse # Stores the global feature sets
            old_feature_set = feature_set[:]
            feature_set = self.get_best_feat_subset(sub_feature_map)[0]
        #print(pd.DataFrame(feature_map.items(), columns=['feature set', 'mse']))
        return self.get_best_feat_subset(feature_map), feature_map

In [10]:
fs = SelectFeatureBackward(max_features=2)

In [11]:
best_feature, feature_map = fs.evaluate(X_fs, y_fs)

Performing 10 cross validations for 1 models
Performing 10 cross validations for 1 models
Performing 10 cross validations for 1 models
Performing 10 cross validations for 1 models
Performing 10 cross validations for 1 models
Performing 10 cross validations for 1 models
Performing 10 cross validations for 1 models
Performing 10 cross validations for 1 models
Performing 10 cross validations for 1 models
Performing 10 cross validations for 1 models
Performing 10 cross validations for 1 models
Performing 10 cross validations for 1 models
Performing 10 cross validations for 1 models
Performing 10 cross validations for 1 models
Performing 10 cross validations for 1 models
Performing 10 cross validations for 1 models
Performing 10 cross validations for 1 models
Performing 10 cross validations for 1 models


In [12]:
best_feature

(('Insulin',
  'Glucose',
  'SkinThickness',
  'BloodPressure',
  'DiabetesPedigreeFunction'),
 46.0881399346017)

In [13]:
pd.DataFrame(feature_map.items(), columns=['feature set', 'mse'])

Unnamed: 0,feature set,mse
0,"(Insulin, Glucose, SkinThickness, BloodPressur...",46.08814
1,"(Insulin, SkinThickness, BloodPressure, Pregna...",48.898433
2,"(Insulin, Glucose, SkinThickness, Pregnancies,...",49.178491
3,"(Insulin, Glucose, BloodPressure, Pregnancies,...",54.064552
4,"(Glucose, SkinThickness, BloodPressure, Pregna...",48.129593
5,"(Insulin, Glucose, SkinThickness, BloodPressur...",46.629694
6,"(Glucose, SkinThickness, BloodPressure, Diabet...",47.304539
7,"(SkinThickness, BloodPressure, DiabetesPedigre...",48.517829
8,"(Glucose, BloodPressure, DiabetesPedigreeFunct...",54.631479
9,"(Glucose, SkinThickness, DiabetesPedigreeFunct...",49.27492
