In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.feature_selection import VarianceThreshold
from sklearn.metrics import mean_absolute_error, r2_score

From GoDataDriven Advanced Data Science with Python course

# Custom transformer

In [120]:
food_df = pd.read_csv("../data/food_recipes.csv")
food_df.head()

Unnamed: 0,title,rating,calories,protein,fat,sodium,3-ingredient_recipes,advance_prep_required,alcoholic,almond,...,whole_wheat,wine,winter,wok,yellow_squash,yogurt,zucchini,snack,turkey,ingr_count
0,"Lentil, Apple, and Turkey Wrap",2.5,426.0,30.0,7.0,559.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,8.0
1,Boudin Blanc Terrine with Red Onion Confit,4.375,403.0,18.0,23.0,1439.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0
2,Potato and Fennel Soup Hodge,3.75,165.0,6.0,7.0,165.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0
3,Spinach Noodle Casserole,3.125,547.0,20.0,32.0,452.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0
4,The Best Blts,4.375,948.0,19.0,79.0,1042.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0


In [121]:
y = food_df.dropna()['calories'].reset_index(drop = True)
X = food_df.dropna().drop(['calories','title','rating'], axis = 'columns').reset_index(drop = True)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=111)

### The Assigment

The goal is to only use those features that satisfy the `f_regression` test. The code that did this for you previously is available. However, keep in mind that this code is only compatible with Pandas. While an individual transformer can be compatible with Pandas, a transformer used in a pipeline will receive the output of a previous transformer, i.e. a numpy array.


* Create a feature selection transformer based on the `f_regression` test. 
* Make sure the transformer is compatible with `numpy` arrays, not only Pandas DataFrames.
* Let the threshold (0.05 originally) be a tunable hyperparameter.


In [116]:
from sklearn.base import BaseEstimator, TransformerMixin

class PValuesThreshold(BaseEstimator, TransformerMixin):
    """Some explanation
    """
    def __init__(self, threshold=0.05):
        self.threshold = threshold
        
    def fit(self, X, y):
        _, p_values = f_regression(X, y)
        self.p_values_ = p_values
        return self
    
    def transform(self, X):
        mask = self.p_values_ > self.threshold
        X_transformed = X.loc[:, mask]
        return X_transformed

In [117]:
transformer = PValuesThreshold()

print(f"Number of features before transformation: {X.shape[1]}")

transformer.fit(X, y)
X_transformed = transformer.transform(X)

print(f"Number of features after transformation: {X_transformed.shape[1]}")

Number of features before transformation: 442
Number of features after transformation: 193


In [118]:
transformer = PValuesThreshold()

print(f"Number of features before transformation: {X.shape[1]}")
X_transformed = transformer.fit_transform(X, y)
print(f"Number of features after transformation: {X_transformed.shape[1]}")

Number of features before transformation: 442
Number of features after transformation: 193


In [119]:
model = KNeighborsRegressor()

pipeline = Pipeline(steps=[
    ('threshold', PValuesThreshold(0.05)),
    ('scaler', MinMaxScaler()),
    ('model', model)
])

pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

# Calculate scores. 
r2 = r2_score(y_test, y_pred).round(3)
MAE = mean_absolute_error(y_test, y_pred).round(3)

# Report. 
print(f'r2 score: {r2}')
print(f'mea: {MAE}')

r2 score: -0.125
mea: 236.631
