In [30]:
from sklearn.feature_selection import SelectKBest, f_regression, RFE
from sklearn.linear_model import LinearRegression

import sys
import os
home_directory_path = os.path.expanduser('~')
sys.path.append(home_directory_path +'/utils')
from prepare_utils import split_data

import pandas as pd
import seaborn as sns

1. Load the `tips` dataset.

    1. Create a column named `price_per_person`. This should be the total bill divided by the party size.


In [3]:
# Pull seaborn's bundled `tips` data and derive the per-person cost in one
# chained step: total bill divided by party size.
tips_df = sns.load_dataset('tips').assign(
    price_per_person=lambda frame: frame['total_bill'] / frame['size']
)
tips_df

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person
0,16.99,1.01,Female,No,Sun,Dinner,2,8.495000
1,10.34,1.66,Male,No,Sun,Dinner,3,3.446667
2,21.01,3.50,Male,No,Sun,Dinner,3,7.003333
3,23.68,3.31,Male,No,Sun,Dinner,2,11.840000
4,24.59,3.61,Female,No,Sun,Dinner,4,6.147500
...,...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3,9.676667
240,27.18,2.00,Female,Yes,Sat,Dinner,2,13.590000
241,22.67,2.00,Male,Yes,Sat,Dinner,2,11.335000
242,17.82,1.75,Male,No,Sat,Dinner,2,8.910000


2. Before using any of the methods discussed in the lesson, which features do you think would be most important for predicting the tip amount?
    - I think time and size will predict tip amount most.
3. Use select k best to select the top 2 features for predicting tip amount. What are they?

In [5]:
# One-hot encode the categorical columns (dropping the first level of each so
# the indicators are not redundant), then set them beside the numeric columns.
dummies = pd.get_dummies(
    data=tips_df[['sex', 'smoker', 'day', 'time']],
    drop_first=True,
)
df = pd.concat(
    [tips_df[['total_bill', 'tip', 'size', 'price_per_person']], dummies],
    axis=1,
)


In [10]:
# Partition the encoded frame into train / validate / test subsets, with
# random_state pinned so the split is repeatable.
# NOTE(review): split_data comes from prepare_utils; whether test_size and
# validate_size are fractions of the whole frame or of the remainder is not
# visible here — confirm against the helper.
train, validate, test = split_data(df, test_size=.15, validate_size=.15,
                                   random_state=125)

In [19]:
# `tip` is the prediction target; every remaining column is a candidate predictor.
y_train = train['tip']
X_train = train.drop(columns='tip')

In [20]:
# Univariate selector: score every feature with f_regression and keep the top 2.
kbest = SelectKBest(score_func=f_regression, k=2)

In [21]:
# Fit on the training split only, so validate/test play no part in feature selection.
kbest.fit(X_train, y_train)

In [26]:
# Per-feature score ('s') and p-value ('p') from the fitted selector,
# labelled by column name.
pd.DataFrame(
    {'s': kbest.scores_, 'p': kbest.pvalues_},
    index=kbest.feature_names_in_,
)

Unnamed: 0,s,p
total_bill,141.086432,5.1905089999999996e-24
size,47.565645,1.031522e-10
price_per_person,21.608368,6.732926e-06
sex_Female,1.248297,0.2654733
smoker_No,0.096568,0.756373
day_Fri,0.344614,0.5579663
day_Sat,0.005063,0.9433608
day_Sun,2.198614,0.1400084
time_Dinner,2.175948,0.1420548


In [29]:
# Pair each column name with its kept/dropped flag from the selector's support mask.
list(zip(kbest.feature_names_in_, kbest.get_support()))

[('total_bill', True),
 ('size', True),
 ('price_per_person', False),
 ('sex_Female', False),
 ('smoker_No', False),
 ('day_Fri', False),
 ('day_Sat', False),
 ('day_Sun', False),
 ('time_Dinner', False)]

4. Use recursive feature elimination to select the top 2 features for tip amount. What are they?

In [31]:
# RFE ranks features through an estimator; use a plain linear regression
# and ask it to retain 2 features.
ols = LinearRegression()
rfe = RFE(estimator=ols, n_features_to_select=2)

In [32]:
# Fit the recursive eliminator on the training split.
rfe.fit(X_train, y_train)

In [36]:
# Elimination rank per feature ('ranking') next to the kept/dropped
# mask ('s'), labelled by column name.
pd.DataFrame(
    {'ranking': rfe.ranking_, 's': rfe.support_},
    index=rfe.feature_names_in_,
)

Unnamed: 0,ranking,s
total_bill,3,False
size,1,True
price_per_person,4,False
sex_Female,6,False
smoker_No,1,True
day_Fri,5,False
day_Sat,2,False
day_Sun,7,False
time_Dinner,8,False


In [37]:
# Pair each column name with its kept/dropped flag from the fitted RFE.
list(zip(rfe.feature_names_in_, rfe.get_support()))

[('total_bill', False),
 ('size', True),
 ('price_per_person', False),
 ('sex_Female', False),
 ('smoker_No', True),
 ('day_Fri', False),
 ('day_Sat', False),
 ('day_Sun', False),
 ('time_Dinner', False)]

5. Why do you think select k best and recursive feature elimination might give different answers for the top features? Does this change as you change the number of features you are selecting?

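- RFE and SelectKBest use different approaches. SelectKBest scores each feature on its own against the target, whereas RFE fits a model on all of the features together and repeatedly drops the weakest one, so it takes feature interdependencies into account. For example, RFE may see that `size` and `total_bill` are interdependent, so it won't pick both the way SelectKBest does.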

2. Write a function named `select_kbest` that takes in the predictors (X), the target (y), and the number of features to select (k) and returns the names of the top k selected features based on the `SelectKBest` class. Test your function with the tips dataset. You should see the same results as when you did the process manually.

In [39]:
def select_kbest(X, y, k):
    """Flag the k best predictors of y according to SelectKBest with f_regression.

    Parameters
    ----------
    X : predictors. A DataFrame (its column names become the labels) or a
        plain 2-D array (columns are labelled ``x0``, ``x1``, ...).
    y : target values, one per row of X.
    k : number of features to keep.

    Returns
    -------
    list of (feature_name, selected) tuples, one per column of X in column
    order; ``selected`` is True for the k features that were kept.
    """
    selector = SelectKBest(score_func=f_regression, k=k)
    selector.fit(X, y)

    # feature_names_in_ is only set when X carries string column names; fall
    # back to positional labels so array input does not raise AttributeError.
    names = getattr(selector, 'feature_names_in_', None)
    if names is None:
        names = [f'x{i}' for i in range(selector.n_features_in_)]

    return list(zip(names, selector.get_support()))

In [50]:
# Same inputs as the manual SelectKBest run: top 2 features on the training split.
select_kbest(X_train, y_train, 2)

[('total_bill', True),
 ('size', True),
 ('price_per_person', False),
 ('sex_Female', False),
 ('smoker_No', False),
 ('day_Fri', False),
 ('day_Sat', False),
 ('day_Sun', False),
 ('time_Dinner', False)]

3. Write a function named `rfe` that takes in the predictors, the target, and the number of features to select. It should return the top k features based on the `RFE` class. Test your function with the tips dataset. You should see the same results as when you did the process manually.

In [51]:
def rfe(X, y, k):
    """Flag the k predictors retained by recursive feature elimination.

    A LinearRegression model is the estimator RFE uses to rank the features.

    Parameters
    ----------
    X : predictors. A DataFrame (its column names become the labels) or a
        plain 2-D array (columns are labelled ``x0``, ``x1``, ...).
    y : target values, one per row of X.
    k : number of features to keep.

    Returns
    -------
    list of (feature_name, selected) tuples, one per column of X in column
    order; ``selected`` is True for the k features that were kept.

    Notes
    -----
    NOTE(review): this ``def`` rebinds the notebook-level name ``rfe``, which
    an earlier cell assigns to an RFE instance. Re-running that earlier cell
    afterwards makes ``rfe(...)`` calls fail until this cell is run again.
    """
    # The local is not called `rfe`, so the name never means two things here.
    selector = RFE(LinearRegression(), n_features_to_select=k)
    selector.fit(X, y)

    # feature_names_in_ is only set when X carries string column names; fall
    # back to positional labels so array input does not raise AttributeError.
    names = getattr(selector, 'feature_names_in_', None)
    if names is None:
        names = [f'x{i}' for i in range(selector.n_features_in_)]

    return list(zip(names, selector.get_support()))

In [52]:
# Same inputs as the manual RFE run: top 2 features on the training split.
rfe(X_train, y_train, 2)

[('total_bill', False),
 ('size', True),
 ('price_per_person', False),
 ('sex_Female', False),
 ('smoker_No', True),
 ('day_Fri', False),
 ('day_Sat', False),
 ('day_Sun', False),
 ('time_Dinner', False)]

4. Load the `swiss` dataset and use all the other features to predict Fertility. Find the top 3 features using both select k best and recursive feature elimination (use the functions you just built to help you out).

In [55]:
# NOTE(review): the other imports live in the notebook's first cell; consider
# moving this one there so all dependencies are visible up front.
from pydataset import data

# Load the `swiss` dataset and preview its first rows.
swiss = data('swiss')
swiss.head()

Unnamed: 0,Fertility,Agriculture,Examination,Education,Catholic,Infant.Mortality
Courtelary,80.2,17.0,15,12,9.96,22.2
Delemont,83.1,45.1,6,9,84.84,22.2
Franches-Mnt,92.5,39.7,5,5,93.4,20.2
Moutier,85.8,36.5,12,7,33.77,20.3
Neuveville,76.9,43.5,17,15,5.16,20.6


In [58]:
# Top 3 predictors of Fertility by univariate score, with every other column
# as a candidate.
select_kbest(
    X=swiss.drop(columns='Fertility'),
    y=swiss['Fertility'],
    k=3,
)

[('Agriculture', False),
 ('Examination', True),
 ('Education', True),
 ('Catholic', True),
 ('Infant.Mortality', False)]

In [59]:
# Top 3 predictors of Fertility by recursive elimination, with every other
# column as a candidate.
rfe(
    X=swiss.drop(columns='Fertility'),
    y=swiss['Fertility'],
    k=3,
)

[('Agriculture', False),
 ('Examination', True),
 ('Education', True),
 ('Catholic', False),
 ('Infant.Mortality', True)]