In [1]:
import numpy as np
import pandas as pd
import wrangle
from pydataset import data
from sklearn.feature_selection import SelectKBest, RFE, f_regression, SequentialFeatureSelector
from sklearn.linear_model import LinearRegression

#### 1. Load the tips dataset.

In [2]:
tips = data('tips')
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
1,16.99,1.01,Female,No,Sun,Dinner,2
2,10.34,1.66,Male,No,Sun,Dinner,3
3,21.01,3.5,Male,No,Sun,Dinner,3
4,23.68,3.31,Male,No,Sun,Dinner,2
5,24.59,3.61,Female,No,Sun,Dinner,4


In [3]:
tips['sex'] = tips.sex.map({'Male': 1, 'Female': 0})
tips['smoker'] = tips.smoker.map({'Yes': 1, 'No': 0})
tips['day'] = tips.day.map({'Sun': 1, 'Mon': 2, 'Tue': 3, 'Wed': 4, 'Thur': 5, 'Fri': 6, 'Sat': 7})
tips['time'] = tips.time.map({'Lunch':0, 'Dinner':1})

In [4]:
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
1,16.99,1.01,0,0,1,1,2
2,10.34,1.66,1,0,1,1,3
3,21.01,3.5,1,0,1,1,3
4,23.68,3.31,1,0,1,1,2
5,24.59,3.61,0,0,1,1,4


In [5]:
tips.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 244 entries, 1 to 244
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   total_bill  244 non-null    float64
 1   tip         244 non-null    float64
 2   sex         244 non-null    int64  
 3   smoker      244 non-null    int64  
 4   day         244 non-null    int64  
 5   time        244 non-null    int64  
 6   size        244 non-null    int64  
dtypes: float64(2), int64(5)
memory usage: 15.2 KB


##### a. Create a column named price_per_person. This should be the total bill divided by the party size.

In [6]:
tips['price_per_person'] = tips.total_bill/tips.size

##### b. Before using any of the methods discussed in the lesson, which features do you think would be most important for predicting the tip amount?

In [7]:
# total bill, and party size

##### c. Use select k best to select the top 2 features for predicting tip amount. What are they?

In [8]:
train, val, test = wrangle.split_data(tips)

In [9]:
X_train, y_train = train.drop(columns=('tip')), train.tip
X_val, y_val = val.drop(columns=('tip')), val.tip
X_test, y_test = test.drop(columns=('tip')), test.tip

In [10]:
kbest = SelectKBest(f_regression, k=1)
kbest.fit(X_train, y_train)

SelectKBest(k=1, score_func=<function f_regression at 0x14d862d30>)

In [11]:
X_train.columns[kbest.get_support()].tolist()

['total_bill']

##### d. Use recursive feature elimination to select the top 2 features for tip amount. What are they?

In [12]:
model = LinearRegression()

In [13]:
rfe = RFE(model, n_features_to_select=1)
rfe.fit(X_train, y_train)

RFE(estimator=LinearRegression(), n_features_to_select=1)

In [14]:
pd.DataFrame({'rfe_ranking' : rfe.ranking_}, index=X_train.columns)

Unnamed: 0,rfe_ranking
total_bill,4
sex,3
smoker,6
day,5
time,2
size,1
price_per_person,7


In [15]:
# time and size were the best for RFE

##### e. Why do you think select k best and recursive feature elimination might give different answers for the top features? Does this change as you change the number of features you are selecting?

In [16]:
# because rfe is using the model, it weighs the features differently than kbest does. From 1 to 4, they still do not agree on the best features.

#### 2. Write a function named select_kbest that takes in the predictors (X), the target (y), and the number of features to select (k) and returns the names of the top k selected features based on the SelectKBest class. Test your function with the tips dataset. You should see the same results as when you did the process manually.

In [17]:
def select_kbest(X_train, y_train, k_features):
    '''
    This function takes in X_train, y_train, and the number of features
    to select and returns the names of the selected features using SelectKBest
    from sklearn. 
    '''
    kbest = SelectKBest(f_regression, k=k_features)
    kbest.fit(X_train, y_train)
    
    print(X_train.columns[kbest.get_support()].tolist())

In [18]:
select_kbest(X_train, y_train, 2)

['total_bill', 'price_per_person']


#### 3. Write a function named rfe that takes in the predictors, the target, and the number of features to select. It should return the top k features based on the RFE class. Test your function with the tips dataset. You should see the same results as when you did the process manually.

In [19]:
def select_rfe(X_train, y_train, k_features):
    '''
    This function takes in X_train, y_train, and the number of features
    to select and returns the names of the selected features using Recursive
    Feature Elimination from sklearn. 
    '''
    model = LinearRegression()
    rfe = RFE(model, n_features_to_select=k_features)
    rfe.fit(X_train, y_train)
    
    print(X_train.columns[rfe.support_].tolist())

In [20]:
select_rfe(X_train, y_train, 2)

['time', 'size']


#### 4. Load the swiss dataset and use all the other features to predict Fertility. Find the top 3 features using both select k best and recursive feature elimination (use the functions you just built to help you out).

In [21]:
swiss = data('swiss')
swiss.head()

Unnamed: 0,Fertility,Agriculture,Examination,Education,Catholic,Infant.Mortality
Courtelary,80.2,17.0,15,12,9.96,22.2
Delemont,83.1,45.1,6,9,84.84,22.2
Franches-Mnt,92.5,39.7,5,5,93.4,20.2
Moutier,85.8,36.5,12,7,33.77,20.3
Neuveville,76.9,43.5,17,15,5.16,20.6


In [22]:
train, val, test = wrangle.split_data(swiss)

In [23]:
X_train, y_train = train.drop(columns=('Fertility')), train.Fertility
X_val, y_val = val.drop(columns=('Fertility')), val.Fertility
X_test, y_test = test.drop(columns=('Fertility')), test.Fertility

In [24]:
select_kbest(X_train, y_train, 3)

['Examination', 'Catholic', 'Infant.Mortality']


In [25]:
select_rfe(X_train, y_train, 3)

['Examination', 'Education', 'Infant.Mortality']
