In [1]:
import numpy as np
import pandas as pd

from sklearn.feature_selection import SelectKBest, f_regression, RFE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LinearRegression

Load the tips dataset.

In [2]:
from pydataset import data

In [3]:
# Load the 'tips' dataset through pydataset's data() helper and preview the first rows.
tips = data('tips')
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
1,16.99,1.01,Female,No,Sun,Dinner,2
2,10.34,1.66,Male,No,Sun,Dinner,3
3,21.01,3.5,Male,No,Sun,Dinner,3
4,23.68,3.31,Male,No,Sun,Dinner,2
5,24.59,3.61,Female,No,Sun,Dinner,4


Create a column named tip_percentage. This should be the tip amount divided by the total bill.

In [4]:
# Tip expressed as a fraction of the total bill (0.15 means a 15% tip).
tips['tip_percentage'] = tips['tip'].div(tips['total_bill'])

In [5]:
# Confirm that the tip_percentage column was added.
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_percentage
1,16.99,1.01,Female,No,Sun,Dinner,2,0.059447
2,10.34,1.66,Male,No,Sun,Dinner,3,0.160542
3,21.01,3.5,Male,No,Sun,Dinner,3,0.166587
4,23.68,3.31,Male,No,Sun,Dinner,2,0.13978
5,24.59,3.61,Female,No,Sun,Dinner,4,0.146808


Create a column named price_per_person. This should be the total bill divided by the party size.

In [6]:
# Bill split evenly across the party. Bracket access is required for 'size'
# because attribute access (tips.size) resolves to DataFrame.size, not the column.
tips['price_per_person'] = tips['total_bill'].div(tips['size'])

In [7]:
# Confirm that the price_per_person column was added.
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_percentage,price_per_person
1,16.99,1.01,Female,No,Sun,Dinner,2,0.059447,8.495
2,10.34,1.66,Male,No,Sun,Dinner,3,0.160542,3.446667
3,21.01,3.5,Male,No,Sun,Dinner,3,0.166587,7.003333
4,23.68,3.31,Male,No,Sun,Dinner,2,0.13978,11.84
5,24.59,3.61,Female,No,Sun,Dinner,4,0.146808,6.1475


Before using any of the methods discussed in the lesson, which features do you think would be most important for predicting the tip amount? The tip percentage?

total_bill for the tip amount

total_bill and tip for tip_percentage

Use all the other numeric features to predict tip amount. Use select k best and recursive feature elimination to select the top 2 features. What are they?

In [8]:
# Target: the tip amount. Predictors: every remaining numeric column.
# NOTE(review): tip_percentage is computed as tip / total_bill, so keeping it in X
# leaks the target into the predictors; the exercise asks for it, but keep this
# in mind when interpreting the selected features.
y = tips['tip']
X = tips.drop(columns=['sex', 'smoker', 'time', 'day', 'tip'])
X.head()

Unnamed: 0,total_bill,size,tip_percentage,price_per_person
1,16.99,2,0.059447,8.495
2,10.34,3,0.160542,3.446667
3,21.01,3,0.166587,7.003333
4,23.68,2,0.13978,11.84
5,24.59,4,0.146808,6.1475


In [9]:
# Univariate selector: score each feature with f_regression and keep the best two.
f_selector = SelectKBest(score_func=f_regression, k=2)

In [10]:
# Score every column of X against the tip amount.
f_selector.fit(X, y)

SelectKBest(k=2, score_func=<function f_regression at 0x1214cc8c0>)

In [11]:
# Reduce X to the two columns the fitted selector kept.
X2 = f_selector.transform(X)

In [12]:
# Wrap the transformed values in a DataFrame for display. No column labels are
# passed, so the columns show up as positional 0 and 1.
X2 = pd.DataFrame(X2)
X2.head()

Unnamed: 0,0,1
0,16.99,2.0
1,10.34,3.0
2,21.01,3.0
3,23.68,2.0
4,24.59,4.0


In [13]:
# Use the selector's support mask over X's columns to recover the kept feature names.
f_support = f_selector.get_support()
f_feature = list(X.columns[f_support])
f_feature

['total_bill', 'size']

> KBest recommends: total bill and party size

In [14]:
# Recursive feature elimination with a linear model, keeping two features.
lm = LinearRegression()
# Pass n_features_to_select by keyword: current scikit-learn releases make every
# RFE argument after the estimator keyword-only, so RFE(lm, 2) raises a TypeError.
rfe = RFE(lm, n_features_to_select=2)
X_rfe = rfe.fit_transform(X, y)

In [15]:
# Wrap the RFE-reduced values in a DataFrame for display. No column labels are
# passed, so the columns show up as positional 0 and 1.
X_rfe = pd.DataFrame(X_rfe)
X_rfe.head()

Unnamed: 0,0,1
0,16.99,0.059447
1,10.34,0.160542
2,21.01,0.166587
3,23.68,0.13978
4,24.59,0.146808


In [16]:
# support_ is the fitted RFE's boolean mask; use it to name the surviving columns of X.
mask = rfe.support_
rfe_features = list(X.columns[mask])
rfe_features

['total_bill', 'tip_percentage']

> RFE recommends: total bill and tip %

Use all the other numeric features to predict tip percentage. Use select k best and recursive feature elimination to select the top 2 features. What are they?

In [17]:
# Target: tip_percentage. Predictors: every remaining numeric column.
# NOTE(review): tip_percentage is computed as tip / total_bill, so leaving `tip`
# in X3 gives the model a direct component of the target.
y3 = tips['tip_percentage']
X3 = tips.drop(columns=['sex', 'smoker', 'time', 'day', 'tip_percentage'])
X3.head()

Unnamed: 0,total_bill,tip,size,price_per_person
1,16.99,1.01,2,8.495
2,10.34,1.66,3,3.446667
3,21.01,3.5,3,7.003333
4,23.68,3.31,2,11.84
5,24.59,3.61,4,6.1475


In [18]:
# Fit and reduce in one step: keep the two best-scoring predictors of tip_percentage.
X4 = SelectKBest(score_func=f_regression, k=2).fit_transform(X3, y3)

In [19]:
# Wrap the transformed values in a DataFrame for display. No column labels are
# passed, so the columns show up as positional 0 and 1.
X4 = pd.DataFrame(X4)
X4.head()

Unnamed: 0,0,1
0,16.99,1.01
1,10.34,1.66
2,21.01,3.5
3,23.68,3.31
4,24.59,3.61


In [20]:
# Fit a selector on the tip_percentage problem (X3, y3) before reading its mask.
# The existing `f_selector` was fitted on (X, y) for the tip target, so its
# support mask describes a different feature matrix and must not be reused here.
f_selector_pct = SelectKBest(f_regression, k=2).fit(X3, y3)
f_support = f_selector_pct.get_support()
f_feature = X3.loc[:, f_support].columns.tolist()
f_feature

['total_bill', 'tip']

> KBest recommends:  total_bill and tip

In [21]:
# Recursive feature elimination for the tip_percentage target, keeping two features.
lm = LinearRegression()
# Pass n_features_to_select by keyword: current scikit-learn releases make every
# RFE argument after the estimator keyword-only, so RFE(lm, 2) raises a TypeError.
rfe = RFE(lm, n_features_to_select=2)
X2_rfe = rfe.fit_transform(X3, y3)

In [22]:
# Name the columns of X3 that the fitted RFE kept.
rfe2_features = list(X3.columns[rfe.support_])
rfe2_features

['tip', 'size']

Why do you think select k best and recursive feature elimination might give different answers for the top features? Does this change as you change the number of features you are selecting?

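> Each method ranks features with a different process. SelectKBest scores every feature on its own against the target (here with `f_regression`) and keeps the k highest scores, while RFE repeatedly fits a model on the remaining features and eliminates the weakest one. To see whether the answers change with the number of features, I would need to compare the selections from each method for different values of k and check which feature gets added or eliminated.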

Write a function named select_kbest that takes in the predictors (X), the target (y), and the number of features to select (k) and returns the names of the top k selected features based on the SelectKBest class. Test your function with the tips dataset. You should see the same results as when you did the process manually.

In [23]:
def select_kbest(X, y, k):
    """Return the names of the k best features according to SelectKBest.

    Each column of X is scored against y with the univariate ``f_regression``
    test and the k highest-scoring columns are kept.

    Parameters
    ----------
    X : pandas.DataFrame
        Predictor columns; the column labels are what gets returned.
    y : array-like
        Target values, one per row of X.
    k : int
        Number of features to select.

    Returns
    -------
    list
        Labels of the selected columns, in the order they appear in X.
    """
    selector = SelectKBest(f_regression, k=k).fit(X, y)
    # get_support() is a boolean mask aligned with X's columns, so the names can
    # be recovered without materialising the transformed array.
    return X.columns[selector.get_support()].tolist()

In [24]:
# Should reproduce the manual SelectKBest result for the tip target.
select_kbest(X=X, y=y, k=2)


['total_bill', 'size']

Write a function named rfe that takes in the predictors, the target, and the number of features to select. It should return the top k features based on the RFE class. Test your function with the tips dataset. You should see the same results as when you did the process manually.

In [25]:
def rfe(X, y, k):
    """Return the names of the top k features chosen by recursive feature elimination.

    An RFE selector wrapping a LinearRegression estimator is fitted on (X, y)
    and the columns it keeps are reported.

    Parameters
    ----------
    X : pandas.DataFrame
        Predictor columns; the column labels are what gets returned.
    y : array-like
        Target values, one per row of X.
    k : int
        Number of features to keep.

    Returns
    -------
    list
        Labels of the selected columns, in the order they appear in X.
    """
    # n_features_to_select is passed by keyword because current scikit-learn
    # releases reject it as a positional argument. The local is called
    # `selector` so that it does not shadow this function's own name.
    selector = RFE(LinearRegression(), n_features_to_select=k).fit(X, y)
    return X.columns[selector.support_].tolist()

In [26]:
# Rebuild the tip-amount problem (same predictors and target as the manual run)
# so the rfe() helper can be checked against it.
y = tips['tip']
X = tips.drop(columns=['sex', 'smoker', 'time', 'day', 'tip'])
X.head()

Unnamed: 0,total_bill,size,tip_percentage,price_per_person
1,16.99,2,0.059447,8.495
2,10.34,3,0.160542,3.446667
3,21.01,3,0.166587,7.003333
4,23.68,2,0.13978,11.84
5,24.59,4,0.146808,6.1475


In [27]:
# Should reproduce the manual RFE result for the tip target.
rfe(X=X, y=y, k=2)

['total_bill', 'tip_percentage']

Load the swiss dataset and use all the other features to predict Fertility. Find the top 3 features using both select k best and recursive feature elimination (use the functions you just built to help you out).

In [28]:
# Load the 'swiss' dataset from pydataset and peek at a single row to see the columns.
swiss = data('swiss')
swiss.head(n=1)

Unnamed: 0,Fertility,Agriculture,Examination,Education,Catholic,Infant.Mortality
Courtelary,80.2,17.0,15,12,9.96,22.2


In [29]:
# Fertility is the target; every other swiss column is a predictor.
y = swiss['Fertility']
X = swiss.drop(columns=['Fertility'])

In [30]:
# Top 3 predictors of Fertility according to SelectKBest.
select_kbest(X=X,y=y,k=3)

['Examination', 'Education', 'Catholic']

In [31]:
# Top 3 predictors of Fertility according to recursive feature elimination.
rfe(X=X, y=y, k=3)

['Examination', 'Education', 'Infant.Mortality']