In [1]:
import pandas as pd
import numpy as np 

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import f_regression, SelectKBest, RFE 
from sklearn.linear_model import LinearRegression

from regprepare import get_auto_mpg, train_val_test

import warnings
warnings.filterwarnings('ignore')

In [2]:
df = get_auto_mpg()
df.head()

Unnamed: 0,mpg,cylinders,displ,horsepower,weight,acc,model_year,origin,name
0,18.0,8,307.0,130.0,3504.0,12.0,70,1,"""chevrolet chevelle malibu"""
1,15.0,8,350.0,165.0,3693.0,11.5,70,1,"""buick skylark 320"""
2,18.0,8,318.0,150.0,3436.0,11.0,70,1,"""plymouth satellite"""
3,16.0,8,304.0,150.0,3433.0,12.0,70,1,"""amc rebel sst"""
4,17.0,8,302.0,140.0,3449.0,10.5,70,1,"""ford torino"""


In [3]:
train, validate, test = train_val_test(df)
train.shape, validate.shape, test.shape

((235, 9), (78, 9), (79, 9))

In [4]:
mms = MinMaxScaler()

train[['displ', 'horsepower', 'weight', 'acc']] = mms.fit_transform(train[['displ', 'horsepower', 'weight', 'acc']])

train.head()

Unnamed: 0,mpg,cylinders,displ,horsepower,weight,acc,model_year,origin,name
212,16.5,8,0.727273,0.725275,0.784519,0.244048,76,1,"""cadillac seville"""
346,32.3,4,0.07013,0.104396,0.128154,0.583333,81,3,"""subaru"""
325,44.3,4,0.051948,0.0,0.133825,0.815476,80,2,"""vw rabbit c (diesel)"""
90,12.0,8,0.932468,0.824176,0.946697,0.208333,73,1,"""mercury marquis brougham"""
246,32.8,4,0.020779,0.021978,0.105472,0.678571,78,3,"""mazda glc deluxe"""


## Select K Best

Uses statistial tests to compare each feature with the target variable to determine which features have the strongest relationship with the target.

Select K Best (stats test, k = num features to return)

Fit to data

get_support() to generate mask for columns names

In [5]:
X_train_scaled = train[['displ', 'horsepower', 'weight', 'acc']]
y_train = train['mpg']

In [6]:
f_selector = SelectKBest(f_regression, k = 2)

In [7]:
f_selector.fit(X_train_scaled, y_train)

In [8]:
f_select_mask = f_selector.get_support()
f_select_mask

array([ True, False,  True, False])

In [9]:
X_train_scaled.columns.to_list() 

['displ', 'horsepower', 'weight', 'acc']

In [10]:
X_train_scaled.columns[f_select_mask]

Index(['displ', 'weight'], dtype='object')

In [11]:
X_train_scaled.iloc[:,f_select_mask]

Unnamed: 0,displ,weight
212,0.727273,0.784519
346,0.070130,0.128154
325,0.051948,0.133825
90,0.932468,0.946697
246,0.020779,0.105472
...,...,...
72,0.607792,0.646158
107,0.420779,0.333428
272,0.210390,0.352141
352,0.072727,0.217465


In [12]:
f_selector.transform(X_train_scaled)

array([[0.72727273, 0.78451942],
       [0.07012987, 0.12815424],
       [0.05194805, 0.13382478],
       [0.93246753, 0.94669691],
       [0.02077922, 0.10547207],
       [0.12987013, 0.24836972],
       [0.6025974 , 0.45137511],
       [0.64415584, 0.74397505],
       [0.85714286, 0.79642756],
       [0.72727273, 0.58151403],
       [0.4025974 , 0.42755883],
       [0.03376623, 0.11057556],
       [0.18441558, 0.4471222 ],
       [0.46753247, 0.47207258],
       [0.46753247, 0.54068614],
       [0.72727273, 0.80606748],
       [0.13246753, 0.17607031],
       [0.72727273, 0.73603629],
       [0.05194805, 0.14516586],
       [0.22337662, 0.33853133],
       [0.72727273, 0.81910972],
       [0.16883117, 0.24865325],
       [0.16883117, 0.25857669],
       [0.4025974 , 0.50127587],
       [0.21038961, 0.30195634],
       [0.21038961, 0.3790757 ],
       [0.42077922, 0.36518287],
       [0.11948052, 0.17947264],
       [0.72727273, 0.7235611 ],
       [0.72727273, 0.67337681],
       [0.

## RFE 

Recursive feature elimination

Initialize a machine learning model

RFE(mode, n_feature_to_select = num features to return)

Fit to data

.support_to get mask

.ranking_to get ranking of features

In [13]:
train.head()

Unnamed: 0,mpg,cylinders,displ,horsepower,weight,acc,model_year,origin,name
212,16.5,8,0.727273,0.725275,0.784519,0.244048,76,1,"""cadillac seville"""
346,32.3,4,0.07013,0.104396,0.128154,0.583333,81,3,"""subaru"""
325,44.3,4,0.051948,0.0,0.133825,0.815476,80,2,"""vw rabbit c (diesel)"""
90,12.0,8,0.932468,0.824176,0.946697,0.208333,73,1,"""mercury marquis brougham"""
246,32.8,4,0.020779,0.021978,0.105472,0.678571,78,3,"""mazda glc deluxe"""


In [14]:
X_train = train.drop(columns = ['mpg', 'model_year','name',])
X_train

Unnamed: 0,cylinders,displ,horsepower,weight,acc,origin
212,8,0.727273,0.725275,0.784519,0.244048,1
346,4,0.070130,0.104396,0.128154,0.583333,3
325,4,0.051948,0.000000,0.133825,0.815476,2
90,8,0.932468,0.824176,0.946697,0.208333,1
246,4,0.020779,0.021978,0.105472,0.678571,3
...,...,...,...,...,...,...
72,8,0.607792,0.560440,0.646158,0.267857,1
107,6,0.420779,0.285714,0.333428,0.416667,1
272,4,0.210390,0.203297,0.352141,0.571429,1
352,4,0.072727,0.093407,0.217465,0.755952,1


In [None]:
X_train = pd.get_dummies(X_train, columns = ['cylinders', 'origin'])

In [19]:
len(X_train.columns)

12

In [36]:
lm = LinearRegression()

rfe = RFE(lm, n_features_to_select = 7)

In [37]:
rfe.fit(X_train, y_train)

In [38]:
ranks = rfe.ranking_

columns = X_train.columns.tolist()

In [39]:
feature_ranks = pd.DataFrame({'ranking': ranks,
                             'feature': columns})

In [46]:
feature_ranks.sort_values('ranking').reset_index().drop(columns = ('index'))

Unnamed: 0,ranking,feature
0,1,displ
1,1,horsepower
2,1,weight
3,1,cylinders_3
4,1,cylinders_4
5,1,cylinders_5
6,1,origin_3
7,2,cylinders_6
8,3,origin_2
9,4,origin_1
