# Feature Engineering Exercises

In [1]:
import pandas as pd
import numpy as np
import pydataset

from sklearn.feature_selection import SelectKBest, f_regression, RFE
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder

df = pydataset.data('tips')

1. Load the tips dataset

In [2]:
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
1,16.99,1.01,Female,No,Sun,Dinner,2
2,10.34,1.66,Male,No,Sun,Dinner,3
3,21.01,3.5,Male,No,Sun,Dinner,3
4,23.68,3.31,Male,No,Sun,Dinner,2
5,24.59,3.61,Female,No,Sun,Dinner,4


a. Create the column named tip_percentage. This should be the tip amount divided by the total bill.

In [3]:
df['tip_percentage'] = df.tip/df.total_bill
df.tip_percentage

1      0.059447
2      0.160542
3      0.166587
4      0.139780
5      0.146808
         ...   
240    0.203927
241    0.073584
242    0.088222
243    0.098204
244    0.159744
Name: tip_percentage, Length: 244, dtype: float64

b. Create the column named price_per_person. This should be the total bill divided by the party size. 

In [4]:
df['party_size'] = df['size']
df['price_per_person'] = df.total_bill/df.party_size
df.price_per_person

1       8.495000
2       3.446667
3       7.003333
4      11.840000
5       6.147500
         ...    
240     9.676667
241    13.590000
242    11.335000
243     8.910000
244     9.390000
Name: price_per_person, Length: 244, dtype: float64

c. Before using any of the methods discussed in the lesson, which features do you think would be most important for predicting the tip amount? The tip percentage?

hypothesis = total_bill is the most important in predicting the tip amount

d. Use select k best and recursive feature elimination to select the top 2 features for predicting tip amount. What are they?

In [5]:
df = df.drop(columns=['size'])
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,tip_percentage,party_size,price_per_person
1,16.99,1.01,Female,No,Sun,Dinner,0.059447,2,8.495
2,10.34,1.66,Male,No,Sun,Dinner,0.160542,3,3.446667
3,21.01,3.5,Male,No,Sun,Dinner,0.166587,3,7.003333
4,23.68,3.31,Male,No,Sun,Dinner,0.13978,2,11.84
5,24.59,3.61,Female,No,Sun,Dinner,0.146808,4,6.1475


In [6]:
df.day.value_counts()

Sat     87
Sun     76
Thur    62
Fri     19
Name: day, dtype: int64

In [7]:
dummy_df = pd.get_dummies(df[['sex', 'day', 'time', 'smoker']])
dummy_df

Unnamed: 0,sex_Female,sex_Male,day_Fri,day_Sat,day_Sun,day_Thur,time_Dinner,time_Lunch,smoker_No,smoker_Yes
1,1,0,0,0,1,0,1,0,1,0
2,0,1,0,0,1,0,1,0,1,0
3,0,1,0,0,1,0,1,0,1,0
4,0,1,0,0,1,0,1,0,1,0
5,1,0,0,0,1,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...
240,0,1,0,1,0,0,1,0,1,0
241,1,0,0,1,0,0,1,0,0,1
242,0,1,0,1,0,0,1,0,0,1
243,0,1,0,1,0,0,1,0,1,0


In [8]:
df = pd.concat([df, dummy_df], axis=1)
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,tip_percentage,party_size,price_per_person,sex_Female,sex_Male,day_Fri,day_Sat,day_Sun,day_Thur,time_Dinner,time_Lunch,smoker_No,smoker_Yes
1,16.99,1.01,Female,No,Sun,Dinner,0.059447,2,8.495,1,0,0,0,1,0,1,0,1,0
2,10.34,1.66,Male,No,Sun,Dinner,0.160542,3,3.446667,0,1,0,0,1,0,1,0,1,0
3,21.01,3.5,Male,No,Sun,Dinner,0.166587,3,7.003333,0,1,0,0,1,0,1,0,1,0
4,23.68,3.31,Male,No,Sun,Dinner,0.13978,2,11.84,0,1,0,0,1,0,1,0,1,0
5,24.59,3.61,Female,No,Sun,Dinner,0.146808,4,6.1475,1,0,0,0,1,0,1,0,1,0


In [9]:
df = df.drop(columns=['sex', 'smoker', 'day', 'time', 'sex_Female', 'time_Lunch', 'smoker_No'])

In [10]:
df.head()

Unnamed: 0,total_bill,tip,tip_percentage,party_size,price_per_person,sex_Male,day_Fri,day_Sat,day_Sun,day_Thur,time_Dinner,smoker_Yes
1,16.99,1.01,0.059447,2,8.495,0,0,0,1,0,1,0
2,10.34,1.66,0.160542,3,3.446667,1,0,0,1,0,1,0
3,21.01,3.5,0.166587,3,7.003333,1,0,0,1,0,1,0
4,23.68,3.31,0.13978,2,11.84,1,0,0,1,0,1,0
5,24.59,3.61,0.146808,4,6.1475,0,0,0,1,0,1,0


In [11]:
X = df.drop(columns='tip')
y = df.tip

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=123)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [12]:
kbest = SelectKBest(f_regression, k=3)
kbest.fit(X_train_scaled, y_train)

X_train.columns[kbest.get_support()]

Index(['total_bill', 'tip_percentage', 'party_size'], dtype='object')

In [13]:
pd.Series(X_train.columns, index=kbest.scores_).sort_index(0)

0.000154            smoker_Yes
0.000155               day_Sat
0.691348              day_Thur
1.313965              sex_Male
1.669530           time_Dinner
1.826904               day_Fri
2.352264               day_Sun
26.729545     price_per_person
29.605028       tip_percentage
65.274770           party_size
172.913650          total_bill
dtype: object

# Trying a different method...

In [14]:
tips = pydataset.data('tips')
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
1,16.99,1.01,Female,No,Sun,Dinner,2
2,10.34,1.66,Male,No,Sun,Dinner,3
3,21.01,3.5,Male,No,Sun,Dinner,3
4,23.68,3.31,Male,No,Sun,Dinner,2
5,24.59,3.61,Female,No,Sun,Dinner,4


In [15]:
for column in tips.columns:
    if tips[column].dtype == np.number:
        continue
    tips[column] = LabelEncoder().fit_transform(tips[column])

  if tips[column].dtype == np.number:


In [16]:
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
1,16.99,1.01,0,0,2,0,1
2,10.34,1.66,1,0,2,0,2
3,21.01,3.5,1,0,2,0,2
4,23.68,3.31,1,0,2,0,1
5,24.59,3.61,0,0,2,0,3


In [17]:
tips['tip_percentage'] = round((tips.tip/tips.total_bill), 2)
tips['party_size'] = tips['size']
tips['price_per_person'] = round((tips.total_bill/tips.party_size),2)
tips = tips.drop(columns=['size'])
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,tip_percentage,party_size,price_per_person
1,16.99,1.01,0,0,2,0,0.06,1,16.99
2,10.34,1.66,1,0,2,0,0.16,2,5.17
3,21.01,3.5,1,0,2,0,0.17,2,10.5
4,23.68,3.31,1,0,2,0,0.14,1,23.68
5,24.59,3.61,0,0,2,0,0.15,3,8.2


In [28]:
pd.set_option('display.max_rows', None)
tips.shape

(244, 9)

In [30]:
tips.drop(tips[tips.party_size == 0].index, inplace=True)
tips.shape

(240, 9)

In [34]:
X = tips.drop(columns='tip')
y = tips.tip

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=123)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

kbest = SelectKBest(f_regression, k=2)
kbest.fit(X_train_scaled, y_train)

X_train.columns[kbest.get_support()]

Index(['total_bill', 'party_size'], dtype='object')

In [35]:
pd.Series(X_train.columns, index=kbest.scores_).sort_index(0)

0.277158                   sex
0.333015                smoker
1.003499                   day
1.947235      price_per_person
3.231747                  time
30.686967       tip_percentage
56.086245           party_size
142.847450          total_bill
dtype: object

In [33]:
# from last test: Index(['total_bill', 'tip_percentage', 'party_size'], dtype='object')

## K Best top 2 features  total_bill, party_size

# Recursive

In [42]:
rfe = RFE(estimator=LinearRegression(), n_features_to_select=2)
rfe.fit(X_train_scaled, y_train)
rfe.get_support()

array([ True, False, False, False, False,  True, False, False])

In [43]:
X_train.columns[rfe.get_support()]

Index(['total_bill', 'tip_percentage'], dtype='object')

In [44]:
pd.Series(X_train.columns, index=rfe.ranking_).sort_index(0)

1          total_bill
1      tip_percentage
2              smoker
3                 day
4                time
5          party_size
6    price_per_person
7                 sex
dtype: object

## Recursive top 2 features total_bill, tip_percentage

e. Use select k best and recursive feature elimination to select the top 2 features for predicting tip percentage. What are they? 

In [45]:
X = tips.drop(columns='tip_percentage')
y = tips.tip_percentage

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=123)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

kbest = SelectKBest(f_regression, k=2)
kbest.fit(X_train_scaled, y_train)

pd.Series(X_train.columns, index=kbest.scores_).sort_index(0)

0.057831               smoker
0.336342                  day
0.394301                 time
0.666187                  sex
3.210614           party_size
12.092630    price_per_person
22.918413          total_bill
30.686967                 tip
dtype: object

In [46]:
X_train.columns[kbest.get_support()]

Index(['total_bill', 'tip'], dtype='object')

## KBest top 2 total_bill, tip

In [48]:
X = tips.drop(columns='tip_percentage')
y = tips.tip_percentage

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=123)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


rfe = RFE(estimator=LinearRegression(), n_features_to_select=2)
rfe.fit(X_train_scaled, y_train)
rfe.get_support()

X_train.columns[rfe.get_support()]

Index(['total_bill', 'tip'], dtype='object')

## Recursive top 2 total_bill, tip

f. Why do you think select k best and recursive feature elimination might give different answers for the top features? Does this change as you change the number of features your are selecting?

Different ways of calculating top features. Yes

2. Write a function named select_kbest that takes in the predictors (X), the target (y), and the number of features to select (k) and returns the names of the top k selected features based on the SelectKBest class. Test your function with the tips dataset. You should see the same results as when you did the process manually.

In [53]:
def select_kbest(X, y, k):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=123)
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)


    kbest = SelectKBest(f_regression, k=k)
    kbest.fit(X_train_scaled, y_train)

    return X_train.columns[kbest.get_support()]

In [54]:
X = tips.drop(columns='tip_percentage')
y = tips.tip_percentage
k = 2

In [55]:
select_kbest(X, y, k)

Index(['total_bill', 'tip'], dtype='object')

3. Write a function named rfe that takes in the predictors, the target, and the number of features to select. It should return the top k features based on the RFE class. Test your function with the tips dataset. You should see the same results as when you did the process manually.



In [57]:
def rfe(X, y, k):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=123)
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)


    rfe = RFE(estimator=LinearRegression(), n_features_to_select=k)
    rfe.fit(X_train_scaled, y_train)
    rfe.get_support()

    return X_train.columns[rfe.get_support()]

In [58]:
rfe(X, y, k)

Index(['total_bill', 'tip'], dtype='object')

4. Load the swiss dataset and use all the other features to predict Fertility. Find the top 3 features using both select k best and recursive feature elimination (use the functions you just built to help you out)

In [59]:
df = pydataset.data('swiss')
df.head()

Unnamed: 0,Fertility,Agriculture,Examination,Education,Catholic,Infant.Mortality
Courtelary,80.2,17.0,15,12,9.96,22.2
Delemont,83.1,45.1,6,9,84.84,22.2
Franches-Mnt,92.5,39.7,5,5,93.4,20.2
Moutier,85.8,36.5,12,7,33.77,20.3
Neuveville,76.9,43.5,17,15,5.16,20.6


In [60]:
X = df.drop(columns='Fertility')
y = df.Fertility
k = 3

In [61]:
select_kbest(X, y, k)

Index(['Examination', 'Education', 'Catholic'], dtype='object')

In [62]:
rfe(X, y, k)

Index(['Agriculture', 'Education', 'Catholic'], dtype='object')