In [21]:
import featurize as ft
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error
import numpy as np
from sklearn.linear_model import LinearRegression

In [22]:
data_url = "http://lib.stat.cmu.edu/datasets/boston"
raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
target = raw_df.values[1::2, 2]

In [23]:
data = pd.DataFrame(data)
target = pd.Series(target)
data.columns = [str(x) for x in data.columns]

In [24]:
N_SPLITS = 5
strat_kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=8888)
scores = np.empty(N_SPLITS)

for idx, (train_idx, test_idx) in enumerate(strat_kf.split(data, target)):
        X_train, X_test = data.iloc[train_idx], data.iloc[test_idx]
        y_train, y_test = target[train_idx], target[test_idx]

        cb_clf = LinearRegression()
        cb_clf.fit(X_train,y_train)

        preds = cb_clf.predict(X_test)
        loss = mean_absolute_error(y_test, preds)
        scores[idx] = loss

print(f"mean score: {scores.mean():.5f}")

mean score: 3.44424


In [25]:
feats = ft.featurize(data)
feats.shape

(506, 16873)

In [29]:
N_SPLITS = 5
strat_kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=8888)
scores = np.empty(N_SPLITS)

for idx, (train_idx, test_idx) in enumerate(strat_kf.split(feats, target)):
        X_train, X_test = feats.iloc[train_idx], feats.iloc[test_idx]
        y_train, y_test = target[train_idx], target[test_idx]

        cb_clf = LinearRegression()
        cb_clf.fit(X_train,y_train)

        preds = cb_clf.predict(X_test)
        loss = mean_absolute_error(y_test, preds)
        scores[idx] = loss
        
print(f"mean score: {scores.mean():.5f}")

mean score: 3.72886


In [26]:
feat_names = ft.selection.mrmr(feats, target, 100)

In [28]:
N_SPLITS = 5
strat_kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=8888)
scores = np.empty(N_SPLITS)

for idx, (train_idx, test_idx) in enumerate(strat_kf.split(feats, target)):
        X_train, X_test = feats[feat_names].iloc[train_idx], feats[feat_names].iloc[test_idx]
        y_train, y_test = target[train_idx], target[test_idx]

        cb_clf = LinearRegression()
        cb_clf.fit(X_train,y_train)

        preds = cb_clf.predict(X_test)
        loss = mean_absolute_error(y_test, preds)
        scores[idx] = loss
        
print(f"mean score: {scores.mean():.5f}")

mean score: 2.84811


In [7]:
def cost_function(x0, X, y):
    N_SPLITS = 5
    strat_kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=8888)
    scores = np.empty(N_SPLITS)

    for idx, (train_idx, test_idx) in enumerate(strat_kf.split(data, target)):
            X_train, X_test = X[x0].iloc[train_idx], X[x0].iloc[test_idx]
            y_train, y_test = y[train_idx], y[test_idx]

            cb_clf = LinearRegression()
            cb_clf.fit(X_train,y_train)

            preds = cb_clf.predict(X_test)
            loss = mean_absolute_error(y_test, preds)
            scores[idx] = loss

    return scores.mean()

In [8]:

x0 = np.random.randint(2, size=len(feats.columns))
ft.selection.pso(cost_function, feats, target, x0, [0, 1], 10, 10)

# def pso(costFunc, X, y, x0, bounds, num_particles, maxiter, verbose=False):

KeyError: "None of [Index([                                   ('3', 1),\n                                          ('6', 1),\n                                          ('7', 1),\n                                          ('9', 1),\n                                         ('10', 1),\n                                         ('12', 1),\n                                     ('ABS(0)', 1),\n                                  ('COSINE(0)', 1),\n                              ('PERCENTILE(0)', 1),\n                                    ('SINE(0)', 1),\n       ...\n             ('SUBTRACT(SINE(12), SQUARE(12))', 1),\n                    ('PLUS(SINE(12), TAN(12))', 1),\n         ('SUBTRACT(SINE(12), RECIPROCAL(12))', 1),\n             ('SUBTRACT(SQRT(12), SQUARE(12))', 1),\n                 ('PLUS(SQRT(12), SQUARE(12))', 1),\n                ('SUBTRACT(SQRT(12), TAN(12))', 1),\n                    ('PLUS(SQRT(12), TAN(12))', 1),\n       ('SUBTRACT(SQUARE(12), RECIPROCAL(12))', 1),\n           ('PLUS(SQUARE(12), RECIPROCAL(12))', 1),\n              ('PLUS(TAN(12), RECIPROCAL(12))', 1)],\n      dtype='object', length=8402)] are in the [columns]"

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import Lasso

In [None]:
pipeline = Pipeline([
                     ('scaler',StandardScaler()),
                     ('model',Lasso())
])

search = GridSearchCV(pipeline,
                      {'model__alpha':np.arange(0.1,1,0.1)},
                      cv = 5, scoring="neg_mean_squared_error",verbose=1,n_jobs=-1)

search.fit(feats,target)

In [None]:
search.best_params_

In [None]:
coefficients = search.best_estimator_.named_steps['model'].coef_
importance = np.abs(coefficients)

feature_cols = feats.columns[importance > 0]

feature_cols

In [None]:
len(feature_cols)

In [None]:
data.shape

In [None]:
feats[feature_cols]

In [None]:
ft.transformations.combinations.transfomers

In [None]:
import pandas as pd
from sklearn.feature_selection import f_regression


f_stat = pd.Series(f_regression(data, target)[0], index = data.columns)
corr = data.corr().abs().clip(.00001) 

In [None]:
selected = []
not_selected = list(data.columns)
K = 3

for i in range(K):
    
    score = f_stat.loc[not_selected] / corr.loc[not_selected, selected].mean(axis=1).fillna(.00001)
    
    best = score.index[score.argmax()]
    selected.append(best)
    not_selected.remove(best)

selected    

In [None]:
selected


In [None]:
i = 1