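# Abalone Dataset Example

This notebook uses `featurize` to engineer features for predicting the number of rings of an abalone, and compares the cross-validated error of a linear regression on the original features against the engineered ones.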

In [1]:
from ucimlrepo import fetch_ucirepo 
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.utils import shuffle
import featurize as ft

## Download and prepare data

In [2]:
# Fetch the dataset from the UCI ML repository (id=1).
abalone = fetch_ucirepo(id=1)

# Data (as pandas objects): the raw feature frame and the "Rings" target.
features_raw = abalone.data.features
y = abalone.data.targets["Rings"]

# One-hot encode "Sex" into integer indicator columns M / F / I.
# X is built as a NEW frame via drop/assign rather than by writing columns into
# (and deleting from) the frame owned by `abalone`. Mutating that frame in
# place is what triggers pandas' SettingWithCopyWarning, and it also silently
# changes `abalone.data.features` for any later cell that reads it.
X = features_raw.drop(columns="Sex").assign(
    **{label: (features_raw["Sex"] == label).astype(int) for label in "MFI"}
)

X.head()

Unnamed: 0,Length,Diameter,Height,Whole_weight,Shucked_weight,Viscera_weight,Shell_weight,M,F,I
0,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,1,0,0
1,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,1,0,0
2,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,0,1,0
3,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,1,0,0
4,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,0,0,1


In [3]:
y.head()

0    15
1     7
2     9
3    10
4     7
Name: Rings, dtype: int64

## Cost Function

In [4]:
def cost_function(X, y):
    """Score a feature matrix by cross-validated linear-regression error.

    ``X`` and ``y`` are shuffled together with a fixed seed (so repeated calls
    on the same inputs are comparable), then a ``LinearRegression`` is
    evaluated with 3-fold cross-validation.

    Returns:
        The mean of the per-fold ``neg_mean_absolute_error`` scores, i.e. the
        negated mean absolute error. Larger (closer to zero) is better.
    """
    shuffled_X, shuffled_y = shuffle(X, y, random_state=8888)
    fold_scores = cross_val_score(
        LinearRegression(),
        shuffled_X,
        shuffled_y,
        scoring="neg_mean_absolute_error",
        cv=3,
    )
    return fold_scores.mean()

## Featurize

In [5]:
# Build the engineered feature set from the prepared inputs with `featurize`.
# NOTE(review): the keyword meanings below are inferred from their names;
# confirm against the featurize documentation.
features = ft.featurize(
    X,
    y, 
    # presumably the number of new features to generate — TODO confirm
    generate_num_features=25,
    # The same scorer is used further down to compare X against `features`.
    selection_cost_func=cost_function, 
    # cost_function returns the mean *negated* MAE, so a larger score is better.
    selection_bigger_is_better=True, 
    # presumably "use all available cores" (joblib-style convention) — TODO confirm
    n_jobs=-1, 
)

Pruning feature space...: 100%|██████████| 25/25 [00:00<00:00, 98.77it/s] 
Creating new features...:  70%|███████   | 21/30 [00:08<00:03,  2.45it/s]
Optimising feature selection...:  42%|████▏     | 42/100 [00:07<00:09,  5.99it/s]


In [6]:
# Baseline: score the prepared feature matrix X with cost_function
# (mean negated MAE of a cross-validated linear regression).
original = cost_function(X, y)
original

-1.587144974165801

In [7]:
# Score the frame returned by ft.featurize with the same cost function,
# so the result is directly comparable to the baseline.
new = cost_function(features, y)
new

-1.5540908090070833

In [8]:
# Both scores are negated MAEs from cost_function, so new / original is the
# ratio of the two MAEs and 1 - ratio is the fractional reduction in error
# (reported as a percentage rounded to one decimal place).
print(f"Old: {original}, New: {new}, Improvement: {round((1 - (new / original))* 100, 1)}%")

Old: -1.587144974165801, New: -1.5540908090070833, Improvement: 2.1%
