# Feature Engineering Exercises

Do your work for this exercise in a jupyter notebook named feature_engineering within the regression-exercises repo. Add, commit, and push your work.

In [59]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from scipy import stats

from pydataset import data
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import SelectKBest, f_regression, RFE, mutual_info_regression
from sklearn.linear_model import LinearRegression

import wrangle as wg

## 1. Load the tips dataset.

    a. Create a column named price_per_person. This should be the total bill divided by the party size.
    
    b. Before using any of the methods discussed in the lesson, which features do you think would be most important for predicting the tip amount?
    
    c. Use Select K Best to select the top 2 features for predicting tip amount. What are they?
    
    d. Use Recursive Feature Elimination to select the top 2 features for tip amount. What are they?
    
    e. Why do you think Select K Best and Recursive Feature Elimination might give different answers for the top features? Does this change as you change the number of features you are selecting?

In [7]:
df = data('tips')
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
1,16.99,1.01,Female,No,Sun,Dinner,2
2,10.34,1.66,Male,No,Sun,Dinner,3
3,21.01,3.5,Male,No,Sun,Dinner,3
4,23.68,3.31,Male,No,Sun,Dinner,2
5,24.59,3.61,Female,No,Sun,Dinner,4


In [14]:
df['price_per_person'] = df.total_bill/df['size']
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person
1,16.99,1.01,Female,No,Sun,Dinner,2,8.495
2,10.34,1.66,Male,No,Sun,Dinner,3,3.446667
3,21.01,3.5,Male,No,Sun,Dinner,3,7.003333
4,23.68,3.31,Male,No,Sun,Dinner,2,11.84
5,24.59,3.61,Female,No,Sun,Dinner,4,6.1475


In [None]:
#b. I think the most important feature will be total_bill and then size.

In [17]:
#c. Use Select K Best to select the top 2 features for predicting tip amount. What are they?

train, val, test = wg.train_val_test(df)

In [26]:
to_scale = ['total_bill', 'size', 'price_per_person']

In [20]:
df.columns

Index(['total_bill', 'tip', 'sex', 'smoker', 'day', 'time', 'size',
       'price_per_person'],
      dtype='object')

In [18]:
train.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person
190,23.1,4.0,Male,Yes,Sun,Dinner,3,7.7
146,8.35,1.5,Female,No,Thur,Lunch,2,4.175
75,14.73,2.2,Female,No,Sat,Dinner,2,7.365
235,15.53,3.0,Male,Yes,Sat,Dinner,2,7.765
199,13.0,2.0,Female,Yes,Thur,Lunch,2,6.5


In [27]:
train, val, test = wg.scale(df, scaled_cols=to_scale)

In [30]:
train.select_dtypes(object).columns

Index(['sex', 'smoker', 'day', 'time'], dtype='object')

In [28]:
train.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person
190,0.419564,4.0,Male,Yes,Sun,Dinner,0.4,0.277299
146,0.110599,1.5,Female,No,Thur,Lunch,0.2,0.074713
75,0.24424,2.2,Female,No,Sat,Dinner,0.2,0.258046
235,0.260997,3.0,Male,Yes,Sat,Dinner,0.2,0.281034
199,0.208002,2.0,Female,Yes,Thur,Lunch,0.2,0.208333


In [37]:
df.nunique()

total_bill          229
tip                 123
sex                   2
smoker                2
day                   4
time                  2
size                  6
price_per_person    235
dtype: int64

In [38]:
train = pd.get_dummies(train, columns=['sex', 'smoker', 'time'], drop_first=True)

In [39]:
train.head()

Unnamed: 0,total_bill,tip,day,size,price_per_person,sex_Male,smoker_Yes,time_Lunch
190,0.419564,4.0,Sun,0.4,0.277299,1,1,0
146,0.110599,1.5,Thur,0.2,0.074713,0,0,1
75,0.24424,2.2,Sat,0.2,0.258046,0,0,0
235,0.260997,3.0,Sat,0.2,0.281034,1,1,0
199,0.208002,2.0,Thur,0.2,0.208333,0,1,1


In [40]:
train = pd.get_dummies(train, columns=['day'], drop_first=False)

In [41]:
train.head()

Unnamed: 0,total_bill,tip,size,price_per_person,sex_Male,smoker_Yes,time_Lunch,day_Fri,day_Sat,day_Sun,day_Thur
190,0.419564,4.0,0.4,0.277299,1,1,0,0,0,1,0
146,0.110599,1.5,0.2,0.074713,0,0,1,0,0,0,1
75,0.24424,2.2,0.2,0.258046,0,0,0,0,1,0,0
235,0.260997,3.0,0.2,0.281034,1,1,0,0,1,0,0
199,0.208002,2.0,0.2,0.208333,0,1,1,0,0,0,1


In [42]:
val = pd.get_dummies(val, columns=['sex', 'smoker', 'time'], drop_first=True)
val = pd.get_dummies(val, columns=['day'], drop_first=False)

In [43]:
X = train.drop(columns = ['tip'])
y = train.tip

In [44]:
skb = SelectKBest(f_regression, k=2)

skb.fit(X, y)

In [46]:
skb_mask = skb.get_support()
X.columns[skb_mask]

Index(['total_bill', 'size'], dtype='object')

In [57]:
X_train = train.drop(columns = ['tip'])
y_train = train.tip

In [51]:
skb = SelectKBest(mutual_info_regression, k=2)

skb.fit(X_train, y_train)
skb_mask = skb.get_support()
X_train.columns[skb_mask]

Index(['total_bill', 'size'], dtype='object')

In [64]:
stats.pearsonr(X['size'], X['price_per_person'])

PearsonRResult(statistic=-0.2253230063798753, pvalue=0.0031341543777673297)

In [65]:
stats.pearsonr(X['price_per_person'], X['total_bill'])

PearsonRResult(statistic=0.6706478352456997, pvalue=1.4618050047883702e-23)

In [54]:
X_train.head(3)

Unnamed: 0,total_bill,size,price_per_person,sex_Male,smoker_Yes,time_Lunch,day_Fri,day_Sat,day_Sun,day_Thur
190,0.419564,0.4,0.277299,1,1,0,0,0,1,0
146,0.110599,0.2,0.074713,0,0,1,0,0,0,1
75,0.24424,0.2,0.258046,0,0,0,0,1,0,0


In [56]:
lm = LinearRegression()

rfe = RFE(lm, n_features_to_select=2)

rfe.fit(X,y)

rfe_mask = rfe.get_support()

X.columns[rfe_mask]

Index(['total_bill', 'price_per_person'], dtype='object')

In [67]:
for n in range(1,6):
    lm = LinearRegression()

    rfe = RFE(lm, n_features_to_select=n)

    rfe.fit(X,y)

    rfe_mask = rfe.get_support()

    print(list(X.columns[rfe_mask]))

['total_bill']
['total_bill', 'price_per_person']
['total_bill', 'price_per_person', 'time_Lunch']
['total_bill', 'price_per_person', 'time_Lunch', 'day_Sat']
['total_bill', 'price_per_person', 'smoker_Yes', 'time_Lunch', 'day_Sat']


In [69]:
for n in range(1,6):
    skb = SelectKBest(mutual_info_regression, k=n)
    skb.fit(X_train, y_train)
    skb_mask = skb.get_support()
    print(list(X_train.columns[skb_mask]))

['total_bill']
['total_bill', 'price_per_person']
['total_bill', 'size', 'price_per_person']
['total_bill', 'size', 'price_per_person', 'sex_Male']
['total_bill', 'size', 'price_per_person', 'smoker_Yes', 'day_Sat']


## 2. Write a function named select_kbest that takes in the predictors (X), the target (y), and the number of features to select (k) and returns the names of the top k selected features based on the SelectKBest class. Test your function with the tips dataset. You should see the same results as when you did the process manually.

In [79]:
def select_kbest(number, x, y):
    skb = SelectKBest(mutual_info_regression, k=number)
    skb.fit(x, y)
    skb_mask = skb.get_support()
    print(f' {number}  {list(x.columns[skb_mask])}')

In [82]:
select_kbest(2, X_train, y_train)

 2  ['total_bill', 'price_per_person']


## 3. Write a function named rfe that takes in the predictors, the target, and the number of features to select. It should return the top n features based on the RFE class. Test your function with the tips dataset. You should see the same results as when you did the process manually.

In [84]:
def rfe_(number, x, y):
    lm = LinearRegression()

    rfe = RFE(lm, n_features_to_select=number)

    rfe.fit(x,y)

    rfe_mask = rfe.get_support()

    print(list(x.columns[rfe_mask]))

In [85]:
rfe_(3, X_train, y_train)

['total_bill', 'price_per_person', 'time_Lunch']


## 4. Load the swiss dataset and use all the other features to predict Fertility. Find the top 3 features using both Select K Best and Recursive Feature Elimination (use the functions you just built to help you out).

In [87]:
sw = data('swiss')

In [90]:
sc = ['Agriculture', 'Examination', 'Education', 'Catholic',
       'Infant.Mortality']
train, val, test = wg.scale(sw, scaled_cols=sc)

In [92]:
train.head()

Unnamed: 0,Fertility,Agriculture,Examination,Education,Catholic,Infant.Mortality
Glane,92.4,0.786305,0.323529,0.134615,0.973862,1.0
V. De Geneve,35.0,0.0,1.0,1.0,0.411952,0.510638
La Vallee,54.3,0.165289,0.823529,0.365385,0.0,0.0
Payerne,74.2,0.671783,0.323529,0.134615,0.03157,0.921986
Lavaux,65.1,0.847698,0.470588,0.153846,0.007073,0.652482


In [93]:
X = sw.drop(columns = ['Fertility'])
y = sw.Fertility

In [96]:
select_kbest(3, X, y)

 3  ['Examination', 'Education', 'Catholic']


In [95]:
rfe_(3, X, y)

['Examination', 'Education', 'Infant.Mortality']
