In [29]:
import featurize as ft
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

## Download the dataset

In [4]:
# Location of the whitespace-delimited Boston housing file.
data_url = "http://lib.stat.cmu.edu/datasets/boston"
# Raw string: `\s+` is a regex for pandas, not a Python string escape
# (a plain "\s+" triggers an invalid-escape-sequence warning).
# skiprows=22 drops the first 22 lines of the file (presumably a descriptive
# header -- confirm against the file).
raw_df = pd.read_csv(data_url, sep=r"\s+", skiprows=22, header=None)
# Rows are consumed in pairs: every even row contributes all of its columns
# and the following odd row contributes its first two columns as features...
data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
# ...while the third column of each odd row is the regression target.
target = raw_df.values[1::2, 2]

data = pd.DataFrame(data)
target = pd.Series(target)
# Column labels become strings ("0", "1", ...) instead of integers.
data.columns = [str(x) for x in data.columns]

## Baseline model

In [43]:
# Baseline: 5-fold cross-validated MAE of a plain linear regression on the raw features.
N_SPLITS = 5
# NOTE: despite the variable name this is an ordinary shuffled KFold, not a stratified splitter.
strat_kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=8888)
scores = np.empty(N_SPLITS)  # one mean-absolute-error value per fold

for idx, (train_idx, test_idx) in enumerate(strat_kf.split(data, target)):
    # The splitter yields positional indices, so use .iloc for BOTH the frame and
    # the target; `target[train_idx]` would be a label lookup that only happens
    # to work while `target` keeps its default 0..n-1 index.
    X_train, X_test = data.iloc[train_idx], data.iloc[test_idx]
    y_train, y_test = target.iloc[train_idx], target.iloc[test_idx]

    clf = LinearRegression()
    clf.fit(X_train, y_train)

    preds = clf.predict(X_test)
    loss = mean_absolute_error(y_test, preds)
    scores[idx] = loss

print(f"mean score: {scores.mean():.5f}")

mean score: 3.44424


## Featurize Data

In [50]:
# Keyword options forwarded unchanged to ft.featurize.
# NOTE(review): the complete `target` is handed to featurize here, before the
# K-fold split in the next cell; if it is used for feature selection the CV
# score computed on `feats` may be optimistic -- confirm.
featurize_options = dict(
    problem_type="regression",
    feature_depth=1,
    mrmr_k=50,
    swarm_particles=50,
    swarm_iters=100,
)

feats = ft.featurize(data, target, **featurize_options)

INFO:featurize.logging:Checking arguments to featurize function are in within acceptable bounds
INFO:featurize.logging:Inferring initial dataframe schema
INFO:featurize.logging:Featurizing dataframe at depth 1
INFO:featurize.logging:Adding numerical features
INFO:featurize.logging:Removed 0 zero variance columns
INFO:featurize.logging:Adding combination features
INFO:featurize.logging:Removed 27 zero variance columns
INFO:featurize.logging:Infering schema
INFO:featurize.logging:Selecting features using MRMR algorithm
INFO:featurize.logging:Initializing MaxRelevanceMinRedundancy class
INFO:featurize.logging:Fitting and transforming the data using the selected features
INFO:featurize.logging:Fitting mrmr algorithm to the data
INFO:featurize.logging:Setting mrmr k to 50
Pruning feature space...:   0%|          | 0/50 [00:00<?, ?it/s]INFO:featurize.logging:Starting feature pruning with 13662 features
Pruning feature space...: 100%|██████████| 50/50 [00:26<00:00,  1.84it/s]INFO:featurize.lo

In [51]:
# Same 5-fold linear-regression evaluation as the baseline, run on the `feats` frame.
N_SPLITS = 5
# NOTE: despite the variable name this is an ordinary shuffled KFold, not a stratified splitter.
strat_kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=8888)
scores = np.empty(N_SPLITS)  # one mean-absolute-error value per fold

for idx, (train_idx, test_idx) in enumerate(strat_kf.split(feats, target)):
    # The splitter yields positional indices, so use .iloc for BOTH the frame and
    # the target; `target[train_idx]` would be a label lookup that only happens
    # to work while `target` keeps its default 0..n-1 index.
    X_train, X_test = feats.iloc[train_idx], feats.iloc[test_idx]
    y_train, y_test = target.iloc[train_idx], target.iloc[test_idx]

    clf = LinearRegression()
    clf.fit(X_train, y_train)

    preds = clf.predict(X_test)
    loss = mean_absolute_error(y_test, preds)
    scores[idx] = loss

print(f"mean score: {scores.mean():.5f}")

mean score: 2.87762


In [52]:
import featuretools as ftl

In [69]:
# Transform primitives applied to the numeric columns by deep feature synthesis.
numeric_primitives = ["add_numeric", "multiply_numeric", "subtract_numeric"]

# Register the raw feature frame as the only dataframe of a new EntitySet.
# NOTE(review): `data` has no column called "index" (its columns are "0", "1", ...),
# so featuretools presumably creates that index column itself -- confirm, and
# consider passing make_index=True to make the intent explicit.
es = ftl.EntitySet(id="boston")
es = es.add_dataframe(dataframe_name="data", dataframe=data, index="index")

# Depth-1 synthesis over the single dataframe; returns the feature matrix and
# the matching feature definitions.
feature_matrix, feature_defs = ftl.dfs(
    entityset=es,
    target_dataframe_name="data",
    verbose=True,
    max_depth=1,
    trans_primitives=numeric_primitives,
)

feature_matrix



Built 247 features
Elapsed: 00:00 | Progress: 100%|██████████


Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,5 - 6,5 - 7,5 - 8,5 - 9,6 - 7,6 - 8,6 - 9,7 - 8,7 - 9,8 - 9
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.0900,1.0,296.0,...,-58.625,2.4850,5.575,-289.425,61.1100,64.2,-230.8,3.0900,-291.9100,-295.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,...,-72.479,1.4539,4.421,-235.579,73.9329,76.9,-163.1,2.9671,-237.0329,-240.0
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,...,-53.915,2.2179,5.185,-234.815,56.1329,59.1,-180.9,2.9671,-237.0329,-240.0
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,...,-38.802,0.9358,3.998,-215.002,39.7378,42.8,-176.2,3.0622,-215.9378,-219.0
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,...,-47.053,1.0848,4.147,-214.853,48.1378,51.2,-167.8,3.0622,-215.9378,-219.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0.0,0.573,6.593,69.1,2.4786,1.0,273.0,...,-62.507,4.1144,5.593,-266.407,66.6214,68.1,-203.9,1.4786,-270.5214,-272.0
502,0.04527,0.0,11.93,0.0,0.573,6.120,76.7,2.2875,1.0,273.0,...,-70.580,3.8325,5.120,-266.880,74.4125,75.7,-196.3,1.2875,-270.7125,-272.0
503,0.06076,0.0,11.93,0.0,0.573,6.976,91.0,2.1675,1.0,273.0,...,-84.024,4.8085,5.976,-266.024,88.8325,90.0,-182.0,1.1675,-270.8325,-272.0
504,0.10959,0.0,11.93,0.0,0.573,6.794,89.3,2.3889,1.0,273.0,...,-82.506,4.4051,5.794,-266.206,86.9111,88.3,-183.7,1.3889,-270.6111,-272.0


In [70]:
# Same 5-fold linear-regression evaluation as the baseline, run on the featuretools matrix.
N_SPLITS = 5
# NOTE: despite the variable name this is an ordinary shuffled KFold, not a stratified splitter.
strat_kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=8888)
scores = np.empty(N_SPLITS)  # one mean-absolute-error value per fold

for idx, (train_idx, test_idx) in enumerate(strat_kf.split(feature_matrix, target)):
    # The splitter yields positional indices, so use .iloc for BOTH the frame and
    # the target; `target[train_idx]` would be a label lookup that only happens
    # to work while `target` keeps its default 0..n-1 index.
    X_train, X_test = feature_matrix.iloc[train_idx], feature_matrix.iloc[test_idx]
    y_train, y_test = target.iloc[train_idx], target.iloc[test_idx]

    clf = LinearRegression()
    clf.fit(X_train, y_train)

    preds = clf.predict(X_test)
    loss = mean_absolute_error(y_test, preds)
    scores[idx] = loss

print(f"mean score: {scores.mean():.5f}")

mean score: 2.52231
