# Lasso Regression - Breast Cancer Prediction

In [1]:
# built-in
import warnings

# third-party
import numpy as np
import pandas as pd
import sklearn.datasets
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedKFold
import matplotlib.pyplot as plt

In [2]:
# environment setup
warnings.filterwarnings("ignore")

Loading data from sklearn datasets library:

In [3]:
breast_cancer_dataset = sklearn.datasets.load_breast_cancer()

In [4]:
# splitting vectors
X = pd.DataFrame(
    breast_cancer_dataset.data,
    columns=breast_cancer_dataset.feature_names
)
Y = pd.Series(breast_cancer_dataset.target, name="target")

We have 30 descriptive features.

Some will contribute more to the results of the model, others less or not at all.

In [5]:
data = pd.concat([X, Y], axis=1)
print("Features: ", X.columns)

Features:  Index(['mean radius', 'mean texture', 'mean perimeter', 'mean area',
       'mean smoothness', 'mean compactness', 'mean concavity',
       'mean concave points', 'mean symmetry', 'mean fractal dimension',
       'radius error', 'texture error', 'perimeter error', 'area error',
       'smoothness error', 'compactness error', 'concavity error',
       'concave points error', 'symmetry error', 'fractal dimension error',
       'worst radius', 'worst texture', 'worst perimeter', 'worst area',
       'worst smoothness', 'worst compactness', 'worst concavity',
       'worst concave points', 'worst symmetry', 'worst fractal dimension'],
      dtype='object')


Lasso is a regularized linear regression that shrink some coefficients by adding a L1 penalty.

This results in an automatic feature selection, removing features that do not contribute much to the prediction task.

In [6]:
# splitting data
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=123)

In [7]:
# defining the model and the evaluation method
lasso_model = LogisticRegression(penalty="l1", solver="liblinear", max_iter=1000)
cross_validation = RepeatedKFold(n_splits=10, n_repeats=3, random_state=321)

## Hyperparameter Tuning

In [8]:
# defining the Grid Search Cross Validation and passing the search parameters
grid = {"C": np.arange(0.01, 1, 0.01)}
search = GridSearchCV(
    lasso_model,
    grid,
    scoring="neg_mean_absolute_error",
    cv=cross_validation,
    n_jobs=-1
)

In [9]:
# tunning the Lasso Regression model hiperparameters with Grid Search Cross Validation
results = search.fit(x_train, y_train)
lasso_model.fit(x_train, y_train)

In [10]:
print("MAE: %s" % round(results.best_score_, 5))
print("Config: %s" % results.best_params_)

MAE: -0.0513
Config: {'C': 0.87}


## Feature selection

As we can see below, some of the coefficients are now zero, because they don't contribute to the predictions.

Keeping these features increases generalism but reduces the model accuracy.

In [11]:
feature_importance = dict(zip(data.columns, lasso_model.coef_[0]))
feature_importance

{'mean radius': 4.676032476733902,
 'mean texture': 0.13861617423350192,
 'mean perimeter': -0.31260077498069366,
 'mean area': -0.015187357927418205,
 'mean smoothness': 0.0,
 'mean compactness': 0.0,
 'mean concavity': 0.0,
 'mean concave points': 0.0,
 'mean symmetry': 0.0,
 'mean fractal dimension': 0.0,
 'radius error': 0.0,
 'texture error': 1.7703818706914807,
 'perimeter error': 0.0,
 'area error': -0.0941036525398894,
 'smoothness error': 0.0,
 'compactness error': 0.0,
 'concavity error': 0.0,
 'concave points error': 0.0,
 'symmetry error': 0.0,
 'fractal dimension error': 0.0,
 'worst radius': 0.0,
 'worst texture': -0.3944082053909299,
 'worst perimeter': -0.054189009992269076,
 'worst area': -0.014895905720181025,
 'worst smoothness': 0.0,
 'worst compactness': 0.0,
 'worst concavity': -3.388136304067478,
 'worst concave points': 0.0,
 'worst symmetry': 0.0,
 'worst fractal dimension': 0.0}

In [12]:
x_train_selected = x_train[[k for k, v in feature_importance.items() if v != 0]]
x_test_selected = x_test[[k for k, v in feature_importance.items() if v != 0]]

## Fitting the model with the best hyperparameter and the selected features

In [13]:
# training the model
lasso_model = LogisticRegression(
    C=results.best_params_["C"],
    penalty="l1",
    solver="liblinear",
    max_iter=1000
)

In [14]:
lasso_model.fit(x_train_selected, y_train)

## Making predictions and checking the model score

In [15]:
predicted_y = lasso_model.predict(x_test_selected)
predicted_y

array([1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0,
       1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1,
       0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0,
       1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1,
       0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 0])

In [16]:
print("score:", lasso_model.score(x_test_selected, y_test))

score: 0.9824561403508771
