# Cars Dataset Example

In [1]:
from ucimlrepo import fetch_ucirepo 
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.utils import shuffle
import featurize as ft

In [2]:
# Fetch the dataset (UCI repository id 9) and keep only fully populated rows
auto_mpg = fetch_ucirepo(id=9)

raw_features = auto_mpg.data.features
raw_targets = auto_mpg.data.targets

# Per-row count of missing feature values; a row is kept only when the count is zero
rows_with_nulls = raw_features.isnull().sum(axis=1)
is_complete = rows_with_nulls.eq(0)

# Apply the same mask to features and target so they stay aligned, then renumber from 0
X = raw_features.loc[is_complete].reset_index(drop=True)
y = raw_targets.loc[is_complete, "mpg"].reset_index(drop=True)

X.head()

Unnamed: 0,displacement,cylinders,horsepower,weight,acceleration,model_year,origin
0,307.0,8,130.0,3504,12.0,70,1
1,350.0,8,165.0,3693,11.5,70,1
2,318.0,8,150.0,3436,11.0,70,1
3,304.0,8,150.0,3433,12.0,70,1
4,302.0,8,140.0,3449,10.5,70,1


In [3]:
# Preview the first few values of the target ("mpg") series
y.head()

0    18.0
1    15.0
2    18.0
3    16.0
4    17.0
Name: mpg, dtype: float64

In [4]:
def cost_function(X, y):
    """Score a feature set with a cross-validated linear regression.

    The rows of ``X`` and ``y`` are shuffled together with a fixed seed, so
    repeated calls on the same inputs use the same row order. A
    ``LinearRegression`` is then evaluated with 3-fold cross-validation.

    Parameters
    ----------
    X : feature table to evaluate.
    y : target values, row-aligned with ``X``.

    Returns
    -------
    The mean of the per-fold ``"neg_mean_absolute_error"`` scores. The score
    is a negated error, so values closer to zero indicate a better fit.
    """
    shuffled_X, shuffled_y = shuffle(X, y, random_state=8888)
    fold_scores = cross_val_score(
        LinearRegression(),
        shuffled_X,
        shuffled_y,
        cv=3,
        scoring="neg_mean_absolute_error",
    )
    return fold_scores.mean()

In [5]:
# Run featurize on the cleaned data, passing cost_function as the selection cost function.
# selection_bigger_is_better=True matches cost_function, which returns a negated MAE
# (a larger, i.e. closer-to-zero, value means a smaller error).
# NOTE(review): n_jobs=-1 presumably requests all available cores — confirm against featurize's docs.
features = ft.featurize(
    X, y, selection_cost_func=cost_function, selection_bigger_is_better=True, n_jobs=-1
)

Pruning feature space...: 100%|██████████| 10/10 [00:00<00:00, 244.35it/s]
Creating new features...:  37%|███▋      | 11/30 [00:04<00:08,  2.31it/s]
Optimising feature selection...:  24%|██▍       | 24/100 [00:02<00:07,  9.64it/s]


In [6]:
# Baseline: score the original, unmodified feature columns
original = cost_function(X, y)
original

-2.6123923799312774

In [7]:
# Score the feature set returned by featurize with the same cost function and target
new = cost_function(features, y)
new

-2.0985448903304533

In [8]:
# Both scores are negated MAEs, so new / original is the ratio of the error magnitudes;
# 1 - ratio is therefore the relative reduction in error (assumes original is non-zero).
print(f"Old: {original}, New: {new}, Improvement: {round((1 - (new / original))* 100, 1)}%")

Old: -2.6123923799312774, New: -2.0985448903304533, Improvement: 19.7%
