In [1]:
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")

# pydatasets
from pydataset import data 

# importing libraries/modules, and functions
import pandas as pd
import numpy as np
import scipy.stats as stats

# visualization libraries
import matplotlib as mlp
mlp.rcParams['figure.dpi'] = 200
from matplotlib import pyplot as plt
import seaborn as sns
sns.set()

# math module for square root
from math import sqrt

# created zillow library and functions
from wrangle import get_zillow_dataset, clean_zillow_dataset, zillow_outliers, train_validate_test_split

# sklearn library for data science
from sklearn.feature_selection import SelectKBest, RFE, f_regression
from sklearn.linear_model import LinearRegression

# just in case :) 
import env
from env import user, password, host, get_connection

----
### ``Feature Engineering Module Exercises:``

1. Load the tips dataset.

   * Create a column named price_per_person. This should be the total bill divided by the party size.
  
     

   





2. Write a function named rfe that takes in the predictors, the target, and the number of features to select. It should return the top k features based on the RFE class. Test your function with the tips dataset. You should see the same results as when you did the process manually.

3. Load the swiss dataset and use all the other features to predict Fertility. Find the top 3 features using both select k best and recursive feature elimination (use the functions you just built to help you out).

In [2]:
# load the "tips" dataset via pydataset's data() and preview the first rows

df = data("tips")
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
1,16.99,1.01,Female,No,Sun,Dinner,2
2,10.34,1.66,Male,No,Sun,Dinner,3
3,21.01,3.5,Male,No,Sun,Dinner,3
4,23.68,3.31,Male,No,Sun,Dinner,2
5,24.59,3.61,Female,No,Sun,Dinner,4


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 244 entries, 1 to 244
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   total_bill  244 non-null    float64
 1   tip         244 non-null    float64
 2   sex         244 non-null    object 
 3   smoker      244 non-null    object 
 4   day         244 non-null    object 
 5   time        244 non-null    object 
 6   size        244 non-null    int64  
dtypes: float64(2), int64(1), object(4)
memory usage: 15.2+ KB


In [4]:
# price per person: the total bill split evenly across the party,
# rounded to two decimal places
df["price_per_person"] = df["total_bill"].div(df["size"]).round(2)

In [5]:
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person
1,16.99,1.01,Female,No,Sun,Dinner,2,8.49
2,10.34,1.66,Male,No,Sun,Dinner,3,3.45
3,21.01,3.5,Male,No,Sun,Dinner,3,7.0
4,23.68,3.31,Male,No,Sun,Dinner,2,11.84
5,24.59,3.61,Female,No,Sun,Dinner,4,6.15


----
### ``2a. Before using any of the methods discussed in the lesson, which features do you think would be most important for predicting the tip amount?``

In [6]:
# Before using any of the methods discussed in the lesson, 
# which features do you think would be most important for predicting the tip amount?

# i think:
# total bill, day, time, and size of the party will have the greatest significance in tip amount (target)
# caveat: party size and total bill may be interdependent (multicollinearity)

In [7]:
# one-hot encode the four categorical columns, then replace the original
# text columns with their indicator columns (numeric columns stay in front)

dummy_df = pd.get_dummies(df[["sex", "smoker", "day", "time"]])
df = pd.concat(
    [df.drop(columns=["sex", "smoker", "day", "time"]), dummy_df],
    axis=1,
)

In [8]:
# checking the new df
df.head() # checks out!

Unnamed: 0,total_bill,tip,size,price_per_person,sex_Female,sex_Male,smoker_No,smoker_Yes,day_Fri,day_Sat,day_Sun,day_Thur,time_Dinner,time_Lunch
1,16.99,1.01,2,8.49,1,0,1,0,0,0,1,0,1,0
2,10.34,1.66,3,3.45,0,1,1,0,0,0,1,0,1,0
3,21.01,3.5,3,7.0,0,1,1,0,0,0,1,0,1,0
4,23.68,3.31,2,11.84,0,1,1,0,0,0,1,0,1,0
5,24.59,3.61,4,6.15,1,0,1,0,0,0,1,0,1,0


In [9]:
# split the encoded frame into train / validate / test subsets using the
# project's wrangle helper
# NOTE(review): split proportions and any random seed live in wrangle.py,
# which is not shown here -- confirm the split is reproducible
train, validate, test = train_validate_test_split(df)

# report the (rows, columns) shape of each subset
print(f'trains shape: {train.shape}')
print(f'validate shape: {validate.shape}')
print(f'test shape: {test.shape}')

trains shape: (136, 14)
validate shape: (59, 14)
test shape: (49, 14)


In [10]:
# separate the target (y = tip) from the predictors (X = everything else)
# in each of the three subsets

y_train = train["tip"]
X_train = train.drop(columns="tip")

y_validate = validate["tip"]
X_validate = validate.drop(columns="tip")

y_test = test["tip"]
X_test = test.drop(columns="tip")

----
### ``2b. Use select k best to select the top 2 features for predicting tip amount. What are they?``

* ``Total Bill & Party Size``

In [11]:
# 2b. SelectKBest: which 2 features best predict tip amount?

# score every predictor against y with the f_regression test and keep the
# 2 highest-scoring ones; fit() returns the fitted selector itself
f_selector = SelectKBest(score_func=f_regression, k=2).fit(X_train, y_train)

# boolean array: True for each column that was selected
feature_mask = f_selector.get_support()

In [12]:
# names of the top k (2) features: apply the boolean mask to the column index
f_feature = X_train.columns[feature_mask].tolist()
f_feature

['total_bill', 'size']

----
### ``2c. Use recursive feature elimination to select the top 2 features for tip amount. What are they?``

In [13]:
# 2c. Recursive feature elimination: which 2 features does it keep
# for predicting tip amount?

# the estimator that RFE will fit
lm = LinearRegression()

# RFE wraps the estimator; "n_features_to_select" is the number of features
# to whittle the model down to
rfe = RFE(estimator=lm, n_features_to_select=2)

In [14]:
# run the elimination on the training split
rfe.fit(X_train, y_train)

# boolean array: True for each column RFE kept
feature_mask = rfe.support_

# names of the kept columns
# (my guess beforehand was "total_bill" & "size")
rfe_features = X_train.columns[feature_mask].tolist()
rfe_features

['size', 'day_Thur']

----
### ``2d. Why do you think select k best and recursive feature elimination might give different answers for the top features? Does this change as you change the number of features you are selecting?``

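``Notes: select k best scores each feature on its own against the target, whereas recursive feature elimination starts with all independent variables in the model together, drops the feature with the lowest machine learning importance, and repeats until the requested number remain. Because importance is judged inside the combined model rather than one feature at a time, RFE kept "day_Thur" and "size" together as the "strongest" two (2) feature combination.``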

* Yes, for n_features_to_select > 1 the selection changes "relative" to the number requested as you increase or decrease it: with 2 features RFE keeps size and day_Thur, and with 4 it adds price_per_person and sex_Female.

In [27]:
# repeat RFE, this time keeping the top 4 features for tip amount

# the estimator that RFE will fit
lm = LinearRegression()

# RFE wraps the estimator; "n_features_to_select" is the number of features
# to whittle the model down to
rfe = RFE(estimator=lm, n_features_to_select=4)

In [28]:
# run the elimination on the training split
rfe.fit(X_train, y_train)

# boolean array: True for each column RFE kept
feature_mask = rfe.support_

# names of the four kept columns
rfe_features = X_train.columns[feature_mask].tolist()
rfe_features

['size', 'price_per_person', 'sex_Female', 'day_Thur']

In [29]:
# show every feature next to the rank RFE assigned it

# feature names, in the same column order RFE saw them
variable_names = list(X_train.columns)

# "ranking_" is an attribute of the fitted RFE object (one rank per column)
variable_ranks = rfe.ranking_

# pair names with ranks in a DataFrame for clean viewing
rfe_ranks_df = pd.DataFrame({'Feature': variable_names, 'Ranking': variable_ranks})

# display sorted by rank
rfe_ranks_df.sort_values(by='Ranking')

Unnamed: 0,Feature,Ranking
1,size,1
2,price_per_person,1
3,sex_Female,1
10,day_Thur,1
8,day_Sat,2
0,total_bill,3
4,sex_Male,4
7,day_Fri,5
5,smoker_No,6
9,day_Sun,7


----
### ``3. Write a function named select_kbest that takes in the predictors (X), the target (y), and the number of features to select (k) and returns the names of the top k selected features based on the SelectKBest class. ``

Test your function with the tips dataset. You should see the same results as when you did the process manually.

In [None]:



def select_kbest(X_train, y_train, top_num_of_features):
    # using Select-K-Best to select the top number of features for predicting y variable 

    # parameters: f_regression stats test, all features
    f_selector = SelectKBest(f_regression, k = 2)

    # find the top 2 Independent Variables (X's) correlated with y
    f_selector.fit(X_train, y_train)

    # boolean mask of whether the column was selected or not
    feature_mask = f_selector.get_support()