## _Feature engineering exercises_

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.feature_selection import SelectKBest, RFE, f_regression, SequentialFeatureSelector
from sklearn.linear_model import LinearRegression

import wrangle as wr

1. **Load the tips dataset.**

a) Create a column named ```price_per_person```. This should be the ```total bill``` divided by the party ```size```.

In [2]:
# get the train data set from tips data set
tips, _, _ = wr.split_df(sns.load_dataset('tips'))

In [3]:
tips.shape

(136, 7)

In [4]:
tips.head(1)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
73,25.28,5.0,Female,Yes,Sat,Dinner,2


In [56]:
# create price_per_person
tips['price_per_person'] = round(tips.total_bill / tips.size, 2)

In [14]:
# create dummies for categorical variables

In [5]:
tips.sex.unique()

['Female', 'Male']
Categories (2, object): ['Male', 'Female']

In [7]:
tips['sex'] = tips.sex.map({'Male':0, 'Female':1})

In [9]:
tips.smoker.unique()

['Yes', 'No']
Categories (2, object): ['Yes', 'No']

In [10]:
tips.day.unique()

['Sat', 'Sun', 'Thur', 'Fri']
Categories (4, object): ['Thur', 'Fri', 'Sat', 'Sun']

In [11]:
tips.time.unique()

['Dinner', 'Lunch']
Categories (2, object): ['Lunch', 'Dinner']

In [12]:
tips['smoker'] = tips.smoker.map({'Yes':1, 'No':0})
tips['time'] = tips.time.map({'Dinner' :1, 'Lunch':0})

In [13]:
tips.head(1)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
73,25.28,5.0,1,1,Sat,1,2


In [26]:
tips['fri'] = np.where(tips.day == 'Fri', 1, 0)

In [28]:
tips['sat'] = np.where(tips.day == 'Sat', 1, 0)
tips['sun'] = np.where(tips.day == 'Sun', 1, 0)

In [50]:
tips['thur'] = np.where(tips.day == 'Thur', 1, 0)

In [57]:
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,fri,sat,sun,thur,price_per_person
73,25.28,5.0,1,1,Sat,1,2,0,1,0,0,0.02
236,12.6,1.0,0,1,Sat,1,2,0,1,0,0,0.01
12,15.42,1.57,0,0,Sun,1,2,0,0,1,0,0.01
57,26.41,1.5,1,0,Sat,1,2,0,1,0,0,0.02
137,14.15,2.0,1,0,Thur,0,2,0,0,0,1,0.01


In [34]:
# check if the numbers are ok
tips.day.value_counts(), tips.fri.value_counts(), tips.sat.value_counts(), tips.sun.value_counts()

(Sat     50
 Thur    43
 Sun     34
 Fri      9
 Name: day, dtype: int64,
 0    127
 1      9
 Name: fri, dtype: int64,
 0    86
 1    50
 Name: sat, dtype: int64,
 0    102
 1     34
 Name: sun, dtype: int64)

b) Before using any of the methods discussed in the lesson, which features do you think would be most important for predicting the tip amount?

```total_bill``` should be the best predictor. I guess the time of the day will be the 2nd one

In [35]:
#get the target variable
y = tips.tip

In [58]:
X = tips.drop(columns=['tip', 'day'])

In [59]:
X.head(1)

Unnamed: 0,total_bill,sex,smoker,time,size,fri,sat,sun,thur,price_per_person
73,25.28,1,1,1,2,0,1,0,0,0.02


c) Use ```select k best``` to select the top 2 features for predicting ```tip``` amount. What are they?

In [70]:
#SelectKBest(f_regression, k=2).fit_transform(X, y)
kbest = SelectKBest(f_regression, k=2)
kbest.fit(X, y)
X.columns[kbest.get_support()].tolist()

['total_bill', 'price_per_person']

In [61]:
# Best features from kbest -> total_bill, size
# after adding price_per_person -> total_bill and price_per_person

d) Use ```recursive feature elimination``` to select the top 2 features for ```tip amount```. What are they?

In [69]:
model = LinearRegression()
rfe = RFE(model, n_features_to_select=2)
rfe.fit(X, y)
X.columns[rfe.get_support()].tolist()

['sun', 'price_per_person']

In [48]:
# RFE picks 'size' and 'fri' (Friday) as the best estimators
# after adding price_per_person -> sun and price_per_person

e) Why do you think ```select k best``` and ```recursive feature elimination``` might give different answers for the top features? Does this change as you change the number of features you are selecting?

This to algorithms work differently. Select K best looks for strongest correlations, and RFE runs the model over and over again with different features and selects the best performing one

In [65]:
kbest3 = SelectKBest(f_regression, k=3)
kbest3.fit(X, y)

rfe3 = RFE(model, n_features_to_select=3)
rfe3.fit(X, y)

print('Nuber of features = 3')
print(f'K-best features: {X.columns[kbest3.get_support()].tolist()}')
print(f'RFE features: {X.columns[rfe3.get_support()].tolist()}')

Nuber of features = 3
K-best features: ['total_bill', 'size', 'price_per_person']
RFE features: ['time', 'sun', 'price_per_person']


In [66]:
kbest4 = SelectKBest(f_regression, k=4)
kbest4.fit(X, y)

rfe4 = RFE(model, n_features_to_select=4)
rfe4.fit(X, y)

print('Nuber of features = 4')
print(f'K-best features: {X.columns[kbest4.get_support()].tolist()}')
print(f'RFE features: {X.columns[rfe4.get_support()].tolist()}')

Nuber of features = 4
K-best features: ['total_bill', 'size', 'sun', 'price_per_person']
RFE features: ['total_bill', 'time', 'sun', 'price_per_person']


In [76]:
pd.DataFrame(
{
    'rfe_2': rfe.ranking_
},index = X.columns) 

Unnamed: 0,rfe_2
total_bill,3
sex,8
smoker,5
time,2
size,4
fri,7
sat,9
sun,1
thur,6
price_per_person,1


In [74]:
pd.DataFrame(
{
    'rfe_3': rfe3.ranking_
},index = X.columns)

Unnamed: 0,rfe_3
total_bill,2
sex,7
smoker,4
time,1
size,3
fri,6
sat,8
sun,1
thur,5
price_per_person,1


In [77]:
pd.DataFrame(
{
    'rfe_4': rfe4.ranking_
},index = X.columns)

Unnamed: 0,rfe_4
total_bill,1
sex,6
smoker,3
time,1
size,2
fri,5
sat,7
sun,1
thur,4
price_per_person,1


2. **Write a function named ```select_kbest``` that takes in the predictors (```X```), the target (```y```), and the number of features to select (```k```) and returns the names of the top k selected features based on the ```SelectKBest class```. Test your function with the ```tips``` dataset. You should see the same results as when you did the process manually.**

In [82]:
def select_kbest(X, y, k):
    '''
    the function accepts the X_train data set, y_train array and k-number of features to select
    runs the SelectKBest algorithm and returns the list of features to be selected for the modeling
    !KBest doesn't depend on the model
    '''
    kbest = SelectKBest(f_regression, k=k)
    kbest.fit(X, y)
    return X.columns[kbest.get_support()].tolist()

In [83]:
select_kbest(X, y, 3)

['total_bill', 'size', 'price_per_person']

3. **Write a function named ```rfe``` that takes in the predictors, the target, and the number of features to select. It should return the top k features based on the ```RFE class```. Test your function with the ```tips``` dataset. You should see the same results as when you did the process manually.**

In [85]:
def rfe(X, y, k):
    '''
    The function accepts the X_train data set, y_train array and k-number of features to select
    runs the RFE algorithm and returns the list of features to be selected for the modeling
    !RFE depends on the model.
    This function uses Linear regression
    '''
    model = LinearRegression()
    rfe = RFE(model, n_features_to_select=k)
    rfe.fit(X, y)
    return X.columns[rfe.get_support()].tolist()

In [87]:
rfe(X, y, 3)

['time', 'sun', 'price_per_person']

In [86]:
def rfe_model(X, y, model, k):
    '''
    The function accepts the X_train data set, y_train array,
    model (created with hyperparameters) and k-number of features to select
    runs the RFE algorithm and returns the list of features to be selected for the modeling
    '''
    rfe = RFE(model, n_features_to_select=k)
    rfe.fit(X, y)
    return X.columns[rfe.get_support()].tolist()

In [88]:
rfe_model(X, y, model, 3)

['time', 'sun', 'price_per_person']

4. **Load the ```swiss``` dataset and use all the other features to predict Fertility. Find the top 3 features using both ```select k best``` and ```recursive feature elimination``` (use the functions you just built to help you out).**

In [17]:
from pydataset import data

In [89]:
swiss = data('swiss')

In [90]:
swiss.head(1)

Unnamed: 0,Fertility,Agriculture,Examination,Education,Catholic,Infant.Mortality
Courtelary,80.2,17.0,15,12,9.96,22.2


In [91]:
swiss, _, _ = wr.split_df(swiss)

In [92]:
y_swiss = swiss.Fertility

In [94]:
X_swiss = swiss.drop(columns='Fertility')

In [95]:
X_swiss.head(1)

Unnamed: 0,Agriculture,Examination,Education,Catholic,Infant.Mortality
Le Locle,16.7,22,13,11.22,18.9


In [96]:
select_kbest(X_swiss, y_swiss, 3)

['Examination', 'Education', 'Catholic']

In [97]:
rfe(X_swiss, y_swiss, 3)

['Agriculture', 'Education', 'Infant.Mortality']