In [1]:
from ucimlrepo import fetch_ucirepo 
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.utils import shuffle
import featurize as ft
import numpy as np

np.random.seed(8888)

## Download data

In [2]:
# fetch dataset 
glass_identification = fetch_ucirepo(id=42) 
  
# data (as pandas dataframes) 
X = glass_identification.data.features 
y = glass_identification.data.targets["Type_of_glass"]

X.head()

Unnamed: 0,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe
0,1.52101,13.64,4.49,1.1,71.78,0.06,8.75,0.0,0.0
1,1.51761,13.89,3.6,1.36,72.73,0.48,7.83,0.0,0.0
2,1.51618,13.53,3.55,1.54,72.99,0.39,7.78,0.0,0.0
3,1.51766,13.21,3.69,1.29,72.61,0.57,8.22,0.0,0.0
4,1.51742,13.27,3.62,1.24,73.08,0.55,8.07,0.0,0.0


In [3]:
y.head()

0    1
1    1
2    1
3    1
4    1
Name: Type_of_glass, dtype: int64

## Baseline model

In [4]:
def cost_function(X, y):
    model = LinearRegression()
    scores = cross_val_score(model, X, y, cv=3, scoring="neg_mean_absolute_error")
    return scores.mean()

## Featurize

In [5]:
features = ft.featurize(
    X,
    y,
    selection_cost_func=cost_function,
    selection_bigger_is_better=True,
    n_jobs=-1,
    generate_parsimony_coefficient=0.01,
)

Pruning feature space...: 100%|██████████| 10/10 [00:00<00:00, 462.15it/s]
Creating new features...:  63%|██████▎   | 19/30 [00:05<00:03,  3.37it/s]
Optimising feature selection...:  35%|███▌      | 35/100 [00:02<00:05, 12.62it/s]


In [6]:
original = cost_function(X, y)
original

-1.857612121491668

In [7]:
new = cost_function(features, y)
new

-1.6212323841018106

In [8]:
print(f"Old: {original}, New: {new}, Improvement: {round((1 - (new / original))* 100, 1)}%")

Old: -1.857612121491668, New: -1.6212323841018106, Improvement: 12.7%
