# Feature Engineering Exercises

In [1]:
import pandas as pd
import numpy as np
import pydataset

from sklearn.feature_selection import SelectKBest, f_regression, RFE
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder

df = pydataset.data('tips')

1. Load the tips dataset

In [2]:
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
1,16.99,1.01,Female,No,Sun,Dinner,2
2,10.34,1.66,Male,No,Sun,Dinner,3
3,21.01,3.5,Male,No,Sun,Dinner,3
4,23.68,3.31,Male,No,Sun,Dinner,2
5,24.59,3.61,Female,No,Sun,Dinner,4


a. Create the column named tip_percentage. This should be the tip amount divided by the total bill.

In [3]:
df['tip_percentage'] = df.tip/df.total_bill
df.tip_percentage

1      0.059447
2      0.160542
3      0.166587
4      0.139780
5      0.146808
         ...   
240    0.203927
241    0.073584
242    0.088222
243    0.098204
244    0.159744
Name: tip_percentage, Length: 244, dtype: float64

b. Create the column named price_per_person. This should be the total bill divided by the party size. 

In [4]:
df['party_size'] = df['size']
df['price_per_person'] = df.total_bill/df.party_size
df.price_per_person

1       8.495000
2       3.446667
3       7.003333
4      11.840000
5       6.147500
         ...    
240     9.676667
241    13.590000
242    11.335000
243     8.910000
244     9.390000
Name: price_per_person, Length: 244, dtype: float64

c. Before using any of the methods discussed in the lesson, which features do you think would be most important for predicting the tip amount? The tip percentage?

hypothesis = total_bill is the most important in predicting the tip amount

d. Use select k best and recursive feature elimination to select the top 2 features for predicting tip amount. What are they?

In [5]:
df = df.drop(columns=['size'])
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,tip_percentage,party_size,price_per_person
1,16.99,1.01,Female,No,Sun,Dinner,0.059447,2,8.495
2,10.34,1.66,Male,No,Sun,Dinner,0.160542,3,3.446667
3,21.01,3.5,Male,No,Sun,Dinner,0.166587,3,7.003333
4,23.68,3.31,Male,No,Sun,Dinner,0.13978,2,11.84
5,24.59,3.61,Female,No,Sun,Dinner,0.146808,4,6.1475


In [6]:
df.day.value_counts()

Sat     87
Sun     76
Thur    62
Fri     19
Name: day, dtype: int64

In [7]:
dummy_df = pd.get_dummies(df[['sex', 'day', 'time', 'smoker']])
dummy_df

Unnamed: 0,sex_Female,sex_Male,day_Fri,day_Sat,day_Sun,day_Thur,time_Dinner,time_Lunch,smoker_No,smoker_Yes
1,1,0,0,0,1,0,1,0,1,0
2,0,1,0,0,1,0,1,0,1,0
3,0,1,0,0,1,0,1,0,1,0
4,0,1,0,0,1,0,1,0,1,0
5,1,0,0,0,1,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...
240,0,1,0,1,0,0,1,0,1,0
241,1,0,0,1,0,0,1,0,0,1
242,0,1,0,1,0,0,1,0,0,1
243,0,1,0,1,0,0,1,0,1,0


In [8]:
df = pd.concat([df, dummy_df], axis=1)
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,tip_percentage,party_size,price_per_person,sex_Female,sex_Male,day_Fri,day_Sat,day_Sun,day_Thur,time_Dinner,time_Lunch,smoker_No,smoker_Yes
1,16.99,1.01,Female,No,Sun,Dinner,0.059447,2,8.495,1,0,0,0,1,0,1,0,1,0
2,10.34,1.66,Male,No,Sun,Dinner,0.160542,3,3.446667,0,1,0,0,1,0,1,0,1,0
3,21.01,3.5,Male,No,Sun,Dinner,0.166587,3,7.003333,0,1,0,0,1,0,1,0,1,0
4,23.68,3.31,Male,No,Sun,Dinner,0.13978,2,11.84,0,1,0,0,1,0,1,0,1,0
5,24.59,3.61,Female,No,Sun,Dinner,0.146808,4,6.1475,1,0,0,0,1,0,1,0,1,0


In [9]:
df = df.drop(columns=['sex', 'smoker', 'day', 'time', 'sex_Female', 'time_Lunch', 'smoker_No'])

In [10]:
df.head()

Unnamed: 0,total_bill,tip,tip_percentage,party_size,price_per_person,sex_Male,day_Fri,day_Sat,day_Sun,day_Thur,time_Dinner,smoker_Yes
1,16.99,1.01,0.059447,2,8.495,0,0,0,1,0,1,0
2,10.34,1.66,0.160542,3,3.446667,1,0,0,1,0,1,0
3,21.01,3.5,0.166587,3,7.003333,1,0,0,1,0,1,0
4,23.68,3.31,0.13978,2,11.84,1,0,0,1,0,1,0
5,24.59,3.61,0.146808,4,6.1475,0,0,0,1,0,1,0


In [11]:
X = df.drop(columns='tip')
y = df.tip

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=123)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [12]:
kbest = SelectKBest(f_regression, k=3)
kbest.fit(X_train_scaled, y_train)

X_train.columns[kbest.get_support()]

Index(['total_bill', 'tip_percentage', 'party_size'], dtype='object')

In [13]:
pd.Series(X_train.columns, index=kbest.scores_).sort_index(0)

0.000154            smoker_Yes
0.000155               day_Sat
0.691348              day_Thur
1.313965              sex_Male
1.669530           time_Dinner
1.826904               day_Fri
2.352264               day_Sun
26.729545     price_per_person
29.605028       tip_percentage
65.274770           party_size
172.913650          total_bill
dtype: object

In [14]:
tips = pydataset.data('tips')
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
1,16.99,1.01,Female,No,Sun,Dinner,2
2,10.34,1.66,Male,No,Sun,Dinner,3
3,21.01,3.5,Male,No,Sun,Dinner,3
4,23.68,3.31,Male,No,Sun,Dinner,2
5,24.59,3.61,Female,No,Sun,Dinner,4


In [15]:
for column in tips.columns:
    if tips[column].dtype == np.number:
        continue
    tips[column] = LabelEncoder().fit_transform(tips[column])

  if tips[column].dtype == np.number:


In [16]:
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
1,16.99,1.01,0,0,2,0,1
2,10.34,1.66,1,0,2,0,2
3,21.01,3.5,1,0,2,0,2
4,23.68,3.31,1,0,2,0,1
5,24.59,3.61,0,0,2,0,3


In [17]:
tips['tip_percentage'] = round((tips.tip/tips.total_bill), 2)
tips['party_size'] = tips['size']
tips['price_per_person'] = round((tips.total_bill/tips.party_size),2)
tips = tips.drop(columns=['size'])
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,tip_percentage,party_size,price_per_person
1,16.99,1.01,0,0,2,0,0.06,1,16.99
2,10.34,1.66,1,0,2,0,0.16,2,5.17
3,21.01,3.5,1,0,2,0,0.17,2,10.5
4,23.68,3.31,1,0,2,0,0.14,1,23.68
5,24.59,3.61,0,0,2,0,0.15,3,8.2


In [18]:
X = tips.drop(columns='tip')
y = tips.tip

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=123)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

kbest = SelectKBest(f_regression, k=3)
kbest.fit(X_train_scaled, y_train)

X_train.columns[kbest.get_support()]

ValueError: Input contains infinity or a value too large for dtype('float64').

In [None]:
pd.Series(X_train.columns, index=kbest.scores_).sort_index(0)