# PSS 3, Episode 2, Exploring the Lasso Regression Path...
I decided to replicate the entire code from the popular Lasso Regression Notebook, with some modifications to make it easier for me to understand...
From here I will continue building up to increase the model performance...



All Credits To:
https://www.kaggle.com/code/tilii7/modeling-stroke-dataset-with-lasso-regression


# Loading Model Libraries...

In [None]:
%%time
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
%%time
from pathlib import Path # Import OS path libraries
from sklearn.preprocessing import LabelEncoder # Encode things
from sklearn.neighbors import KNeighborsRegressor # Import KNN Regressor

from category_encoders.leave_one_out import LeaveOneOutEncoder # Import categorical encoder
from sklearn.preprocessing import StandardScaler # Import standar scaler
from sklearn.linear_model import LassoCV # Import Lasso regressor
from sklearn.model_selection import RepeatedKFold # Repeated Kfold CV strategy 
from sklearn.metrics import mean_squared_error, roc_auc_score # Importing metrics

from scipy import special # Will be used to move distributions to 0-1 range
from sklearn.model_selection import KFold, StratifiedKFold

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

# Setting Notebook Configuration...

In [None]:
%%time
# I like to disable my Notebook Warnings.
# NOTE(review): the 'ignore' action is installed without a category filter, so
# every warning is silenced -- including deprecation warnings raised by the
# modelling libraries used further down.
import warnings
warnings.filterwarnings('ignore')

In [None]:
%%time
# Notebook Configuration...

# Amount of data we want to load into the Model...
# NOTE(review): DATA_ROWS is not referenced by any cell visible in this notebook.
DATA_ROWS = None
# Dataframe, the amount of rows and cols to visualize...
# (consumed by the pandas display options set right after this cell)
NROWS = 50
NCOLS = 15
# Main data location path...
# NOTE(review): BASE_PATH is a placeholder and is not referenced by any visible cell.
BASE_PATH = '...'

# Random seed used by the GBDT parameter dicts and the stratified k-fold splitters.
SEED = 228

In [None]:
%%time
# Configure notebook display settings: floats are rendered with a thousands
# separator and 3 decimal places, and table output is capped at
# NCOLS columns / NROWS rows so tables look nicer.
pd.options.display.float_format = '{:,.3f}'.format
pd.set_option('display.max_columns', NCOLS) 
pd.set_option('display.max_rows', NROWS)

# Reading The Datasets...

In [None]:
%%time
# Locate the competition folder and load each of its CSV files into a DataFrame...
input_path = Path('/kaggle/input/playground-series-s3e2')

trn_df, tst_df, submission = (
    pd.read_csv(input_path / csv_name)
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

# The original stroke dataset is stored in a separate Kaggle input folder.
original = pd.read_csv('/kaggle/input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv')

In [None]:
%%time
# Lower-case the 'Residence_type' column name in all three frames so that
# column naming is consistent everywhere.
column_fix = {'Residence_type': 'residence_type'}
trn_df, tst_df, original = (
    frame.rename(columns = column_fix) for frame in (trn_df, tst_df, original)
)

# Filling NaNs

In [None]:
%%time
# Filling Missing Using KNN...
# Missing 'bmi' values of the original dataset are imputed with a KNN regressor
# that is trained on the rows where 'bmi' is known.

numeric_feat = ['age', 'avg_glucose_level', 'bmi']
categ_feat = [feat for feat in original.columns if feat not in numeric_feat and feat not in ['stroke', 'id']]

# Work on a copy so the label-encoded categories never reach `original`.
tmp = original.copy()

for col in categ_feat:
    encoder = LabelEncoder()
    tmp[col] = encoder.fit_transform(tmp[col])

# Explicit copies: both frames are row subsets of `tmp`, and the second one is
# written to below. Writing into a filtered slice would go through pandas'
# SettingWithCopy path.
orig_trn = tmp[tmp['bmi'].notnull()].copy()
orig_tst = tmp[tmp['bmi'].isnull()].copy()


selected_cols = ['age', 'ever_married', 'work_type', 'smoking_status', 'avg_glucose_level', 'hypertension', 'heart_disease']
knn_model = KNeighborsRegressor(n_neighbors = 128, metric = 'minkowski', n_jobs = -1)
knn_model.fit(orig_trn[selected_cols], orig_trn['bmi'])
# (The former kneighbors() call was dropped: its distances / indices were never
# used, so it only cost an extra neighbour search over the training rows.)


# Predict 'bmi' only for the rows where it is missing.
knn_preds = knn_model.predict(orig_tst[selected_cols])
orig_tst['bmi'] = knn_preds

tmp = pd.concat([orig_trn, orig_tst])
# Index-aligned assignment: only 'bmi' is copied back, so the categorical
# columns of `original` keep their raw (un-encoded) values.
original['bmi'] = tmp['bmi']

# Merging Everything Under 1 Dataset...

In [None]:
%%time
# Flag the data source (1 = competition data, 0 = original dataset) and
# discard the 'id' column from every frame.
trn_df = trn_df.assign(generated = 1).drop(columns = 'id')
tst_df = tst_df.assign(generated = 1).drop(columns = 'id')

# Only the stroke == 1 rows of the original dataset are kept.
original = original.assign(generated = 0).drop(columns = 'id')
original = original[original['stroke'] == 1]

# Merge both datasets...
#trn_df = pd.concat([trn_df, original], ignore_index = True)

# Separating Features...

In [None]:
%%time 
# Target column plus the numeric / categorical feature lists.
TARGET = 'stroke'
numeric_feat = ['age', 'avg_glucose_level', 'bmi']
# Every column that is not numeric, not the target and not 'id' is treated as categorical.
categ_feat = [col for col in trn_df.columns if col not in (*numeric_feat, 'stroke', 'id')]
# Numeric features first, categorical features after them.
features = [*numeric_feat, *categ_feat]

In [None]:
%%time 
# Print a list of the features...
features

In [None]:
%%time 
# Creates a backup of the final pre-processed data...
# This will be used for future training of GBDT...
# NOTE(review): the copies are taken before encoding / scaling, but the cell
# that would restore them later in this notebook is commented out.
train_df = trn_df.copy()
test_df = tst_df.copy()

# Pre-Processing Datasets for Training and Inference...

In [None]:
%%time
# Leave-one-out encoding of the categorical features (not label encoding)...
# The encoder is fitted on the training frame together with the target and is
# then applied, already fitted, to the test frame.
categ_encoder = LeaveOneOutEncoder(cols = categ_feat, random_state = 2022, sigma = 0.05, verbose = 1)
trn_df[features] = categ_encoder.fit_transform(trn_df[features], trn_df[TARGET])
tst_df[features] = categ_encoder.transform(tst_df[features])

In [None]:
# Standardizing features...
# The scaler is fitted on the training frame only; the same fitted
# transformation is then applied to the train and the test frame.
scaler = StandardScaler().fit(trn_df[features])
for frame in (trn_df, tst_df):
    frame[features] = scaler.transform(frame[features])

# Lasso Regression...

In [None]:
%%time
# https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RepeatedKFold.html
# Materialise the repeated k-fold splits (10 folds x 10 repeats) as a list of
# (train_idx, valid_idx) pairs so they can be handed to LassoCV through `cv`.
# The notebook-wide SEED is used instead of repeating the literal 228.
rkf_grid = list(RepeatedKFold(n_splits=10, n_repeats=10, random_state=SEED).split(trn_df[features], trn_df[TARGET]))

In [None]:
%%time
# Defining model parameters...
# NOTE: the former 'normalize': False entry is intentionally omitted. The
# `normalize` argument was removed from scikit-learn's linear models in
# version 1.2, so passing it makes LassoCV(**lasso_params) raise a TypeError.
# False was the effective default on older releases, so dropping the key does
# not change the fit there (the features are standardized beforehand anyway).
lasso_params = {'precompute':'auto',
                'fit_intercept':True,
                'max_iter':1000,
                'verbose':False,
                'eps':1e-04,
                'cv':rkf_grid,      # pre-computed repeated k-fold splits
                'n_alphas':1000,
                'n_jobs':-1,
               }

In [None]:
%%time
# Instantiate LassoCV and fit it on the full training frame...
X_full, y_full = trn_df[features], trn_df[TARGET]
model = LassoCV(**lasso_params)
model.fit(X_full, y_full)

# Report the selected alpha, the intercept and the estimator's own score()
# evaluated on the same data it was trained on.
print(" Best alpha value: %.10f" % model.alpha_)
print(" Intercept: %.10f" % model.intercept_)
print(" Lasso CV score: %.10f" % model.score(X_full, y_full))

In [None]:
%%time
# Calculate the model RMSE and AUC performance in the train dataset
# (no cross validation: the model is scored on the data it was fitted on).
# The training predictions are computed once and shared by both metrics
# instead of running model.predict twice on the same frame.
trn_preds = model.predict(trn_df[features])
RMSE_nocv = np.sqrt(mean_squared_error(trn_df[TARGET], trn_preds))
AUC_nocv = roc_auc_score(trn_df[TARGET], trn_preds)

print("\n Non Cross-Validated Lasso CV RMSE: %.6f" % RMSE_nocv)
print(" Non Cross-Validated AUC: %.6f" % AUC_nocv)
print('\n')

In [None]:
# Predict the test frame with the model fitted above, store the raw model
# output in the submission frame and preview its first rows.
y_preds = model.predict(tst_df[features])
submission['stroke'] = y_preds
submission.head()

# Lasso Regression, Using Repeated Kfold Strategy...

In [None]:
%%time
# Defining model parameters...
# No 'cv' entry here: the outer repeated k-fold loop drives validation and
# each LassoCV instance falls back to its own default internal CV.
# NOTE: the former 'normalize': False entry is intentionally omitted. The
# `normalize` argument was removed from scikit-learn's linear models in
# version 1.2, so passing it makes LassoCV(**lasso_params) raise a TypeError.
# False was the effective default on older releases, so behaviour is unchanged.
lasso_params = {'precompute'   :'auto',
                'fit_intercept':True,
                'max_iter'     :1000,
                'verbose'      :False,
                'eps'          :1e-04,
                'n_alphas'     :1000,
                'n_jobs'       :-1,
               }

In [None]:
%%time 
# Defining a more complex Cross Validation loop...
# https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RepeatedKFold.html
# Every one of the N_SPLITS * N_REPEATS folds fits a fresh LassoCV, scores it on
# the held-out rows and contributes to (a) the averaged out-of-fold predictions
# and (b) the averaged test predictions.

# Define repeated kfold parameters...
N_SPLITS  = 10
N_REPEATS = 10
SEED = 228

# Define placeholder arrays for cv predictions...
array_len = trn_df.shape[0]
multifold_val_pred = np.zeros(array_len)       # running sum of validation predictions per row
count_pred = np.zeros(array_len)               # number of times each row was in a validation fold
multifold_val_targets = np.zeros(array_len)    # true target of each row
multifold_rmse = 0
multifold_auc = 0
predictions = np.zeros(tst_df.shape[0])        # running sum of test predictions over all folds

# Defining a CV strategy...
rkf_grid = list(RepeatedKFold(n_splits = N_SPLITS, n_repeats = N_REPEATS, random_state = SEED).split(trn_df[features], trn_df[TARGET]))
# ----------------------------------

# Initialize cross validation loop...
for idx, (trn_idx, val_idx) in enumerate(rkf_grid):
    
    # Creating subsets for train and validation...
    print("\n Fold %02d" % (idx + 1))
    print('.' * 10)
    # The splitter yields POSITIONAL indices, so rows are selected with .iloc;
    # .loc would only be correct while the frame keeps a default 0..n-1 index.
    X_trn, X_val = trn_df.iloc[trn_idx][features], trn_df.iloc[val_idx][features]
    y_trn, y_val = trn_df.iloc[trn_idx][TARGET], trn_df.iloc[val_idx][TARGET]
    
    # Training the ML model...
    model = LassoCV(**lasso_params)
    model.fit(X_trn, y_trn)
    
    # Calculating model predictions on validation set...
    y_val_pred = model.predict(X_val)
    val_rmse = np.sqrt(mean_squared_error(y_val, y_val_pred))
    val_auc = roc_auc_score(y_val, y_val_pred)
    
    # Evaluating model performance....
    print(" Fold %02d RMSE: %.6f" % ((idx + 1), val_rmse))
    print(" Fold %02d AUC : %.6f" % ((idx + 1), val_auc))
    
    # Generating predictions on the test dataset...
    y_tst_pred = model.predict(tst_df[features])
    predictions += y_tst_pred
    
    # Store all the predictions in the validation set...
    multifold_val_pred[val_idx] += y_val_pred
    count_pred[val_idx] += 1
    # Plain assignment (not +=): every row is validated once per repeat, so
    # accumulating would store the target multiplied by N_REPEATS.
    multifold_val_targets[val_idx] = y_val.to_numpy()
    
    # Accumulates RMSE and AUC...
    multifold_rmse += val_rmse
    multifold_auc += val_auc
    
    # --------------------------
    
# Takes the cumulative RMSE and AUC and calculates the average values...
multifold_rmse = multifold_rmse / (N_SPLITS * N_REPEATS)
multifold_auc = multifold_auc / (N_SPLITS * N_REPEATS)

# Calculates the average of the predictions across the repeated folds...
avg_val_pred = multifold_val_pred / count_pred
y_true = trn_df[TARGET]

# Calculate the oof metric for the ML model...
oof_rmse = np.sqrt(mean_squared_error(y_true, avg_val_pred))
oof_auc = roc_auc_score(y_true, avg_val_pred)

# Calculate the average and oof scores for the ML model...
print("\n Average RMSE: %.6f" % multifold_rmse)
print(" Average AUC : %.6f" % multifold_auc)
print('.' * 10, '\n')

print(" Out of fold RMSE: %.6f" % oof_rmse)
print(" Out of fold AUC : %.6f" % oof_auc)
print('')

# Average the accumulated test predictions over all folds...
predictions = predictions / (N_SPLITS * N_REPEATS) 


# --------------------------------

In [None]:
%%time
# Mapping all the out-of-fold predictions into the (0, 1) range with the
# logistic sigmoid (scipy.special.expit)...
# NOTE(review): expit is strictly increasing, so the ordering of the
# predictions is preserved; only their scale changes.
oof_result = pd.DataFrame(y_true, columns=['stroke'])
oof_result['prediction'] = special.expit(avg_val_pred)
# Show 5 randomly drawn rows as a sanity check.
oof_result.sample(5)

In [None]:
%%time
# Squash the fold-averaged test predictions through the logistic sigmoid,
# store them in the submission frame and write the Lasso-only submission file...
lasso_preds = special.expit(predictions)
submission = submission.assign(stroke = lasso_preds)
submission.to_csv('lasso_submission.csv', index = False)

In [None]:
%%time
# Review the predicted values...
submission

# More Preprocessing Related to GBDT Models...

In [None]:
# Restore the datasets before the label encoding and scaling...
# trn_df = train_df.copy()
# tst_df = test_df.copy()

In [None]:
def normalizing_predictions(model_predictions):
    """Min-max rescale predictions so the smallest maps to 0 and the largest to 1.

    Args:
        model_predictions: Array-like of numeric predictions (for example a
            NumPy array or a pandas Series).

    Returns:
        The rescaled predictions. When every prediction is identical the
        range is zero; float zeros are returned in that case instead of the
        NaNs a 0 / 0 division would produce.
    """
    lowest = np.min(model_predictions)
    span = np.max(model_predictions) - lowest
    shifted = model_predictions - lowest
    if span == 0:
        # Constant input: every shifted entry is already zero; the
        # multiplication only forces a float result of the same shape.
        return shifted * 0.0
    return shifted / span

In [None]:
def create_onehot(trn_df, tst_df, list_of_var = None):
    """One-hot encode the given columns consistently across train and test.

    Both frames are stacked before encoding so that they end up with the same
    dummy columns, then split apart again.

    Args:
        trn_df: Training DataFrame. It is not modified.
        tst_df: Test DataFrame. It is not modified.
        list_of_var: Names of the columns to one-hot encode. When omitted,
            the notebook-level ``categ_feat`` list is used (looked up at call
            time).

    Returns:
        A ``(train, test)`` tuple of new DataFrames with the selected columns
        replaced by their dummy columns. Columns that exist in only one of
        the inputs appear in both outputs and hold NaN where they were absent.
    """
    if list_of_var is None:
        # Fall back to the notebook-level categorical feature list.
        list_of_var = categ_feat

    # .assign returns new frames, so the caller's DataFrames never receive
    # the temporary 'is_train' marker column.
    df = pd.concat([trn_df.assign(is_train = 1), tst_df.assign(is_train = 0)])
    # Encode the columns that were asked for (previously the argument was
    # ignored and the global categ_feat was always used).
    df = pd.get_dummies(df, columns = list(list_of_var))

    trn_out = df[df['is_train'] == 1].drop(['is_train'], axis = 1)
    tst_out = df[df['is_train'] == 0].drop(['is_train'], axis = 1)

    return trn_out, tst_out

# Skipping the one-hot encoded variables already pre-processed
# trn_df, tst_df = create_onehot(trn_df, tst_df, list_of_var = categ_feat)

In [None]:
%%time
# Preprocessing the Information for Training.
TARGET = 'stroke'
# Every column except the target and the 'generated_0' / 'generated_1' names is
# used as a feature. (Those two names look like one-hot outputs; the one-hot
# call in this notebook is commented out.)
features = [col for col in trn_df.columns if col not in (TARGET, 'generated_0', 'generated_1')]

In [None]:
%%time
# Show a list of the features...
features

# XGBoost Model...

In [None]:
%%time
# Hyper-parameters handed to XGBClassifier in the training loop.
# 'n_estimators' is a generous upper bound on the number of boosting rounds;
# the training loop sets up early stopping against a validation fold.
xgb_params = {'n_estimators'     : 16384,
              'min_child_weight' : 96,
              'max_depth'        : 8,
              'learning_rate'    : 0.01,
              'subsample'        : 0.95,
              'colsample_bytree' : 0.95,
              'reg_lambda'       : 1.50,
              'reg_alpha'        : 1.50,
              'gamma'            : 1.50,
              'max_bin'          : 512,
              'random_state'     : SEED,
              'objective'        : 'binary:logistic',
              'tree_method'      : 'hist',
              'eval_metric'      : 'auc'
             }

In [None]:
%%time
# Train an XGBoost Model...
# Create empty lists to store the per-fold AUC scores and test predictions...

score_list   = []
predictions  = [] 

# Define kfolds for training purposes...
kf = StratifiedKFold(n_splits = 10, random_state = SEED, shuffle = True)
# Initialize the training loop...
for fold, (trn_idx, val_idx) in enumerate(kf.split(trn_df[features], trn_df[TARGET])):
    print(f'Training Fold {fold} ...')
    X_train, X_valid = trn_df.iloc[trn_idx][features], trn_df.iloc[val_idx][features]
    y_train, y_valid = trn_df.iloc[trn_idx][TARGET], trn_df.iloc[val_idx][TARGET]
    
    # Create and Train an XGBoost
    # early_stopping_rounds is configured on the estimator: passing it to
    # fit() was deprecated in XGBoost 1.6 and removed in 2.0.
    model = XGBClassifier(**xgb_params, early_stopping_rounds = 512)
    model.fit(X_train, y_train, eval_set = [(X_valid, y_valid)], verbose = 0)
    
    # Predict on DataFrames (not bare .values) so the inputs carry the same
    # column names the model was fitted with.
    y_valid_pred = model.predict_proba(X_valid)[:,1]
    score = roc_auc_score(y_valid, y_valid_pred)
    
    score_list.append(score)
    print(f"Fold {fold}, AUC = {score:.4f}")
    print((''))
    
    tst_pred = model.predict_proba(tst_df[features])[:,1]
    predictions.append(tst_pred)

# NOTE: the value printed below is the mean of the per-fold AUCs.
print(f'OOF AUC: {np.mean(score_list):.4f}')
print('.........')

In [None]:
%%time
# Average the per-fold test predictions, min-max rescale them and write the
# result to the submission file...
xgboost_preds = normalizing_predictions(np.mean(predictions, axis = 0))
submission['stroke'] = xgboost_preds
submission.to_csv('submission.csv', index = False)

In [None]:
%%time
submission.head()

# LGBM Model...

In [None]:
%%time
# Hyper-parameters handed to LGBMClassifier in the training loop.
# 'num_iterations' is a generous upper bound on the number of boosting rounds;
# the training loop sets up early stopping against a validation fold.
# NOTE(review): 'device' is 'cpu', so the two gpu_* ids are presumably unused -- confirm.
lgb_params = {'num_iterations'   : 16384,
              'max_depth'        : 9,
              'learning_rate'    : 0.01,
              'min_child_samples': 36, 
              'num_leaves'       : 128, 
              'colsample_bytree' : 0.80, 
              'subsample'        : 0.90, 
              'subsample_freq'   : 5, 
              'reg_lambda'       : 28,
              'seed'             : SEED,
              'objective'        : 'binary',
              'boosting_type'    : 'gbdt',
              'device'           : 'cpu', 
              'gpu_platform_id'  : 0,
              'gpu_device_id'    : 0,
              'n_jobs'           : -1,
              'metric'           : 'auc',
              'verbose'          : -1,
             }

In [None]:
%%time
# Train an LGBM Model...
# Create empty lists to store the per-fold AUC scores and test predictions...
from lightgbm import early_stopping, log_evaluation

score_list   = []
predictions  = [] 

# Define kfolds for training purposes...
kf = StratifiedKFold(n_splits = 10, random_state = SEED, shuffle = True)
# Initialize the training loop...
for fold, (trn_idx, val_idx) in enumerate(kf.split(trn_df[features], trn_df[TARGET])):
    print(f'Training Fold {fold} ...')
    X_train, X_valid = trn_df.iloc[trn_idx][features], trn_df.iloc[val_idx][features]
    y_train, y_valid = trn_df.iloc[trn_idx][TARGET], trn_df.iloc[val_idx][TARGET]
    
    # Create and Train an LGBM Model
    # Early stopping and log silencing go through callbacks: the
    # early_stopping_rounds / verbose keyword arguments of fit() were
    # deprecated in LightGBM 3.3 and removed in 4.0.
    model = LGBMClassifier(**lgb_params)
    model.fit(X_train, y_train, eval_set = [(X_valid, y_valid)],
              callbacks = [early_stopping(stopping_rounds = 512, verbose = False),
                           log_evaluation(period = 0)])
    
    # Predict on DataFrames (not bare .values) so the inputs carry the same
    # column names the model was fitted with.
    y_valid_pred = model.predict_proba(X_valid)[:,1]
    score = roc_auc_score(y_valid, y_valid_pred)
    
    score_list.append(score)
    print(f"Fold {fold}, AUC = {score:.4f}")
    print((''))
    
    tst_pred = model.predict_proba(tst_df[features])[:,1]
    predictions.append(tst_pred)

# NOTE: the value printed below is the mean of the per-fold AUCs.
print(f'OOF AUC: {np.mean(score_list):.4f}')
print('.........')

In [None]:
%%time
# Stack the per-fold test predictions, average them fold-wise, min-max rescale
# the result and write it to the submission file...
fold_matrix = np.stack(predictions)
lgbm_preds = normalizing_predictions(fold_matrix.mean(axis=0))
submission['stroke'] = lgbm_preds
submission.to_csv('submission.csv', index = False)

In [None]:
%%time
submission.head()

# CatBoost Model...

In [None]:
# Hyper-parameters handed to CatBoostClassifier in the training loop.
# 'use_best_model' is enabled and the training loop supplies a validation
# fold through eval_set together with early stopping.
# NOTE(review): 'min_data_in_leaf' is combined with the 'SymmetricTree' grow
# policy -- confirm in the CatBoost docs that it has an effect for that policy.
cb_params = {'num_boost_round': 10000,
             'depth': 3,
             'learning_rate': 0.01,
             'rsm': 0.5,
             'subsample': 0.931,
             'l2_leaf_reg': 69,
             'min_data_in_leaf': 20,
             'random_strength': 0.175,
             'random_seed': SEED,
             'use_best_model': True,
             'task_type': 'CPU',
             'bootstrap_type': 'Bernoulli',
             'grow_policy': 'SymmetricTree',
             'loss_function': 'Logloss',
             'eval_metric': 'AUC'
            }

In [None]:
%%time
# Train a CatBoost Model...
# Per-fold validation AUCs and per-fold test predictions are collected here...

score_list, predictions = [], []

# Stratified 10-fold split, shuffled with the notebook seed...
kf = StratifiedKFold(n_splits = 10, random_state = SEED, shuffle = True)
fold_splits = kf.split(trn_df[features], trn_df[TARGET])

# One model per fold...
for fold, (trn_idx, val_idx) in enumerate(fold_splits):
    print(f'Training Fold {fold} ...')

    # Slice the rows of this fold first, then pick features / target.
    trn_part, val_part = trn_df.iloc[trn_idx], trn_df.iloc[val_idx]
    X_train, y_train = trn_part[features], trn_part[TARGET]
    X_valid, y_valid = val_part[features], val_part[TARGET]

    # Fit with the validation fold as eval set and early stopping enabled
    model = CatBoostClassifier(**cb_params)
    model.fit(X_train, y_train, eval_set = [(X_valid, y_valid)], early_stopping_rounds = 512, verbose = 0)

    # The second predict_proba column is used as the score for the AUC
    y_valid_pred = model.predict_proba(X_valid.values)[:,1]
    score = roc_auc_score(y_valid, y_valid_pred)
    score_list.append(score)

    print(f"Fold {fold}, AUC = {score:.4f}")
    print((''))

    # Keep this fold's test predictions for later averaging
    tst_pred = model.predict_proba(tst_df[features].values)[:,1]
    predictions.append(tst_pred)

# The value printed below is the mean of the per-fold AUCs
print(f'OOF AUC: {np.mean(score_list):.4f}')
print('.........')

In [None]:
%%time
# Average the per-fold test predictions, min-max rescale them and write the
# result to the submission file...
catboost_preds = normalizing_predictions(np.asarray(predictions).mean(axis=0))
submission['stroke'] = catboost_preds
submission.to_csv('submission.csv', index = False)

In [None]:
%%time
submission.head()

In [None]:
# Blend weights of the four models (they add up to 1.0).
xgboost = 0.20
lgbm = 0.20
catboost = 0.20
lasso = 0.40

# Weighted sum of the per-model test predictions, written to its own file.
# NOTE(review): the three GBDT prediction vectors were min-max rescaled by
# normalizing_predictions, while lasso_preds comes from special.expit -- the
# components may therefore live on different scales, which would change the
# effective weight of each model in the blend. Confirm this is intended.
blended_predictions = (xgboost * xgboost_preds) + lgbm * lgbm_preds + catboost * catboost_preds + lasso * lasso_preds
submission['stroke'] = blended_predictions
submission.to_csv('blended_submission.csv', index = False)

In [None]:
%%time
submission.head()