In [1]:
import pandas as pd
import numpy as np

from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import RFE
from sklearn.metrics import mean_squared_error, r2_score, explained_variance_score

from regprepare import get_auto_mpg, train_val_test

In [2]:
# Load the dataset through the project helper imported from regprepare.
df = get_auto_mpg()

# Split

In [3]:
# Three-way split through the project helper.
# NOTE(review): 'mpg' holds continuous values (e.g. 16.5 in the preview further down) —
# confirm train_val_test bins it before stratifying; raw continuous values rarely
# form usable strata.
train, val, test = train_val_test(df,stratify='mpg')

# Scale

In [4]:
# Min-max scale the continuous predictors. The scaler is fit on the training
# split only, so the learned ranges come from training data alone.
scale_cols = ['displ', 'horsepower', 'weight', 'acc']
mms = MinMaxScaler()
# Take an explicit copy first: `train` comes out of the split helper and may be
# flagged as a slice of `df`, in which case the column assignment below would
# trigger a chained-assignment warning or write to an ambiguous target.
train = train.copy()
train[scale_cols] = mms.fit_transform(train[scale_cols])
# NOTE(review): val/test are not scaled in the visible cells; apply
# mms.transform(...) (not fit_transform) to them before any evaluation.
train.head(1)

Unnamed: 0,mpg,cylinders,displ,horsepower,weight,acc,model_year,origin,name
212,16.5,8,0.727273,0.725275,0.784519,0.244048,76,1,"""cadillac seville"""


# Isolate Target

In [5]:
# Features: the four numeric columns that were min-max scaled in place on `train`.
# Target: mpg.
numeric_features = ['displ', 'horsepower', 'weight', 'acc']
X_train_scaled = train.loc[:, numeric_features]
y_train = train.loc[:, 'mpg']

In [6]:
# Sanity check: column names, dtypes and non-null counts of the feature matrix.
X_train_scaled.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 235 entries, 212 to 103
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   displ       235 non-null    float64
 1   horsepower  235 non-null    float64
 2   weight      235 non-null    float64
 3   acc         235 non-null    float64
dtypes: float64(4)
memory usage: 9.2 KB


# SelectKBest

In [7]:
# Signature reminder: SelectKBest(score_func, k=num_features_to_return)

In [8]:
# Univariate selector: score each feature with the F-test for regression and keep the top two.
f_selector = SelectKBest(score_func=f_regression, k=2)

In [9]:
# Compute the per-feature scores against the target on the training split.
f_selector.fit(X=X_train_scaled, y=y_train)

In [10]:
# Boolean mask aligned with X_train_scaled's columns: True where the selector kept the feature.
f_select_mask = f_selector.get_support()

In [11]:
# Column order of the frame the selector was fit on; the support mask follows this order.
X_train_scaled.columns

Index(['displ', 'horsepower', 'weight', 'acc'], dtype='object')

In [12]:
# First five rows of the scaled features, for comparison with the selector's transformed output.
X_train_scaled.head()

Unnamed: 0,displ,horsepower,weight,acc
212,0.727273,0.725275,0.784519,0.244048
346,0.07013,0.104396,0.128154,0.583333
325,0.051948,0.0,0.133825,0.815476
90,0.932468,0.824176,0.946697,0.208333
246,0.020779,0.021978,0.105472,0.678571


In [14]:
# Same support mask as f_select_mask, fetched again under the name used for the column lookup.
feature_mask = f_selector.get_support()

In [15]:
# Names of the selected features: apply the boolean mask straight to the column Index.
f_feature = X_train_scaled.columns[feature_mask].tolist()

In [16]:
# Reduce the features to the k=2 selected columns and preview the first five rows
# (in the saved output these values match the displ and weight columns).
f_selector.transform(X_train_scaled)[:5]

array([[0.72727273, 0.78451942],
       [0.07012987, 0.12815424],
       [0.05194805, 0.13382478],
       [0.93246753, 0.94669691],
       [0.02077922, 0.10547207]])

# RFE

In [None]:
# Signature reminder: RFE(estimator, n_features_to_select=num_features_to_return)

In [None]:
# Re-inspect the training frame's columns before building the RFE feature matrix.
train.head(1)

In [None]:
# RFE design matrix: everything in `train` except the target (mpg) and the
# model_year / name columns.
X_train = train.drop(['mpg', 'model_year', 'name'], axis='columns')
X_train.head()

In [None]:
# One-hot encode the categorical predictors for the linear model.
# The matrix is rebuilt from `train` rather than from `X_train` itself so that
# this cell is idempotent: encoding X_train in place consumes the 'cylinders'
# and 'origin' columns, and a second run of the cell would raise a KeyError.
X_train = pd.get_dummies(
    train.drop(columns=['mpg', 'model_year', 'name']),
    columns=['cylinders', 'origin'],
)
X_train.head()

In [None]:
# Ordinary least-squares estimator that is handed to RFE below.
lm = LinearRegression()

In [None]:
# Recursive feature elimination wrapped around the linear model, keeping seven features.
rfe = RFE(estimator=lm, n_features_to_select=7)

In [None]:
# Run the elimination on the encoded training matrix against the mpg target.
rfe.fit(X=X_train, y=y_train)

In [None]:
# Parallel sequences: each column name and the rank RFE assigned to it
# (rank 1 marks a selected feature).
columns = list(X_train.columns)
ranks = rfe.ranking_

In [None]:
# Tabulate the rank next to the feature name it belongs to.
features_ranks = pd.DataFrame(dict(ranking=ranks, feature=columns))

In [None]:
# Show the features ordered from best rank to worst.
features_ranks.sort_values(by='ranking')