In [108]:
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")

# pydatasets
from pydataset import data 

# importing libraries/modules, and functions
import pandas as pd
import numpy as np
import scipy.stats as stats

# visualization libraries
import matplotlib as mlp
mlp.rcParams['figure.dpi'] = 200
from matplotlib import pyplot as plt
import seaborn as sns
sns.set()

# math module for square root
from math import sqrt

# created zillow library and functions
from wrangle import get_zillow_dataset, clean_zillow_dataset, zillow_outliers, train_validate_test_split

# sklearn library for data science
from sklearn.feature_selection import SelectKBest, RFE, f_regression
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# just in case :) 
import env
from env import user, password, host, get_connection

----
### ``Feature Engineering Module Exercises:``

In [109]:
# loading the "tips" dataaset

df = data("tips")
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
1,16.99,1.01,Female,No,Sun,Dinner,2
2,10.34,1.66,Male,No,Sun,Dinner,3
3,21.01,3.5,Male,No,Sun,Dinner,3
4,23.68,3.31,Male,No,Sun,Dinner,2
5,24.59,3.61,Female,No,Sun,Dinner,4


In [110]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 244 entries, 1 to 244
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   total_bill  244 non-null    float64
 1   tip         244 non-null    float64
 2   sex         244 non-null    object 
 3   smoker      244 non-null    object 
 4   day         244 non-null    object 
 5   time        244 non-null    object 
 6   size        244 non-null    int64  
dtypes: float64(2), int64(1), object(4)
memory usage: 15.2+ KB


In [111]:
# Creating a price per person column = total bill / size of the party

df["price_per_person"] = (df["total_bill"] / df["size"]).round(2)

In [112]:
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person
1,16.99,1.01,Female,No,Sun,Dinner,2,8.49
2,10.34,1.66,Male,No,Sun,Dinner,3,3.45
3,21.01,3.5,Male,No,Sun,Dinner,3,7.0
4,23.68,3.31,Male,No,Sun,Dinner,2,11.84
5,24.59,3.61,Female,No,Sun,Dinner,4,6.15


----
### ``2a. Before using any of the methods discussed in the lesson, which features do you think would be most important for predicting the tip amount?``

In [113]:
# Before using any of the methods discussed in the lesson, 
# which features do you think would be most important for predicting the tip amount?

# i think:
# total bill, day, time, and size of the party will have the greatest significance in tip amount (target)
# caveat: party size, and total bill may have an interpendency / multicolinearity

In [114]:
# generating dummy variables for categorical columns/features
# dropping redundant columns after concat/merger with dummy variables

dummy_df = pd.get_dummies(df[["sex", "smoker", "day", "time"]], dtype = "bool")
df = pd.concat([df, dummy_df], axis = 1).drop(columns = ["sex", "smoker", "day", "time"])

In [115]:
# checking the new df
df.head() # checks out!

Unnamed: 0,total_bill,tip,size,price_per_person,sex_Female,sex_Male,smoker_No,smoker_Yes,day_Fri,day_Sat,day_Sun,day_Thur,time_Dinner,time_Lunch
1,16.99,1.01,2,8.49,True,False,True,False,False,False,True,False,True,False
2,10.34,1.66,3,3.45,False,True,True,False,False,False,True,False,True,False
3,21.01,3.5,3,7.0,False,True,True,False,False,False,True,False,True,False
4,23.68,3.31,2,11.84,False,True,True,False,False,False,True,False,True,False
5,24.59,3.61,4,6.15,True,False,True,False,False,False,True,False,True,False


In [116]:
cont_cols = df.select_dtypes(include = "number").columns
cont_cols

Index(['total_bill', 'tip', 'size', 'price_per_person'], dtype='object')

In [117]:
# let's now scale the continuous features to not influence the model by "throwing" too many dummy variables
from sklearn.preprocessing import RobustScaler

scaler = RobustScaler()
df[cont_cols] = scaler.fit_transform(df[cont_cols])
df.head()

Unnamed: 0,total_bill,tip,size,price_per_person,sex_Female,sex_Male,smoker_No,smoker_Yes,day_Fri,day_Sat,day_Sun,day_Thur,time_Dinner,time_Lunch
1,-0.074675,-1.2096,0.0,0.344011,True,False,True,False,False,False,True,False,True,False
2,-0.691558,-0.7936,1.0,-1.059889,False,True,True,False,False,False,True,False,True,False
3,0.298237,0.384,1.0,-0.071031,False,True,True,False,False,False,True,False,True,False
4,0.545918,0.2624,0.0,1.277159,False,True,True,False,False,False,True,False,True,False
5,0.630334,0.4544,2.0,-0.307799,True,False,True,False,False,False,True,False,True,False


In [118]:
# splitting the dataset
train, validate, test = train_validate_test_split(df)

train shape: (136, 14)
validate shape: (59, 14)
test shape: (49, 14)


In [119]:
# setting independent/dependent variables in ea. sub-dataset

X_train = train.drop(columns = cont_cols)
y_train = train.tip

X_validate = validate.drop(columns = cont_cols)
y_validate = validate.tip

X_test = test.drop(columns = cont_cols)
y_test = test.tip

----
### ``2b. Use select k best to select the top 2 features for predicting tip amount. What are they?``

* ``day_Sun & time_lunch``

In [120]:
# Use select k best to select the top 2 features for predicting tip amount. 
# What are they?

# parameters: f_regression stats test, all features
f_selector = SelectKBest(f_regression, k = 2)

# find the top 2 Independent Variables (X's) correlated with y
f_selector.fit(X_train, y_train)

# boolean mask of whether the column was selected or not
feature_mask = f_selector.get_support()

In [121]:
# get list of top (2) K features. 
f_feature = X_train.iloc[:,feature_mask].columns.tolist()
f_feature

['day_Sun', 'time_Lunch']

----
### ``2c. Use recursive feature elimination to select the top 2 features for tip amount. What are they?``

In [122]:
# Use recursive feature elimination to select the top 
# 2 features for tip amount. What are they?

# initialize the ML algorithm
lm = LinearRegression()

# create the rfe object, indicating the ML object (lm) and the number of features I want to end up with
# where "n_features_to_select" refers to number of top features to "whittle down" the machine learning algorith to

rfe = RFE(lm, n_features_to_select = 2)

In [123]:
# fit the data using RFE
rfe.fit(X_train,y_train) 

# get the mask of the columns selected
feature_mask = rfe.support_

# get list of the column names. 
rfe_features = X_train.iloc[:,feature_mask].columns.tolist()

# which i presume will return "total_bill" & "size"
rfe_features

['day_Sun', 'time_Lunch']

----
### ``2d. Why do you think select k best and recursive feature elimination might give different answers for the top features? Does this change as you change the number of features your are selecting?``

``Notes: since the recursize feature engineering function begins by pairing all independent variables with one another, and then drops a feature by level of machine learning importance this subsequently paired "day_Thursday" and "size" together as the "strongest" two (2) feature combination.``

* If n_features_to_select > 1, then this has a "relative" change as you increase or decrease the number of features to select.

In [124]:
# top 4 features for tip amount

# initialize the ML algorithm
lm = LinearRegression()

# create the rfe object, indicating the ML object (lm) and the number of features I want to end up with
# where "n_features_to_select" refers to number of top features to "whittle down" the machine learning algorith to

rfe = RFE(lm, n_features_to_select = 4)

In [125]:
# fit the data using RFE
rfe.fit(X_train,y_train) 

# get the mask of the columns selected
feature_mask = rfe.support_

# get list of the column names. 
rfe_features = X_train.iloc[:,feature_mask].columns.tolist()

# which i presume will return "total_bill" & "size"
rfe_features

['sex_Male', 'day_Fri', 'day_Sun', 'time_Lunch']

In [126]:
# view list of columns and their ranking

# get the ranks using "rfe.ranking" method
variable_ranks = rfe.ranking_

# get the variable names
variable_names = X_train.columns.tolist()

# combine ranks and names into a df for clean viewing
rfe_ranks_df = pd.DataFrame({'Feature': variable_names, 'Ranking': variable_ranks})

# sort the df by rank
rfe_ranks_df.sort_values('Ranking')

Unnamed: 0,Feature,Ranking
1,sex_Male,1
4,day_Fri,1
6,day_Sun,1
9,time_Lunch,1
8,time_Dinner,2
2,smoker_No,3
0,sex_Female,4
3,smoker_Yes,5
5,day_Sat,6
7,day_Thur,7


----
### ``3. Write a function named select_kbest that takes in the predictors (X), the target (y), and the number of features to select (k) and returns the names of the top k selected features based on the SelectKBest class. ``

Test your function with the tips dataset. You should see the same results as when you did the process manually.

In [127]:
# creating the Kbest function for future use:

def select_kbest(X_train, y_train, number_of_top_features):
    # using Select-K-Best to select the top number of features for predicting y variable 
    # parameters: f_regression stats test, all features
    f_selector = SelectKBest(f_regression, k = number_of_top_features)

    # find the top number of independent variables (X's) correlated with y
    f_selector.fit(X_train, y_train)

    # boolean mask of whether the column was selected or not
    feature_mask = f_selector.get_support()

    # get list of top (2) K features. 
    f_feature = X_train.iloc[:,feature_mask].columns.tolist()
    
    return f_feature

----
### ``4. Write a function named rfe that takes in the predictors, the target, and the number of features to select.``

* It should return the top k features based on the RFE class. Test your function with the tips dataset. You should see the same results as when you did the process manually.

In [128]:
# creating the recursive feature engineering function for future use:

def recursive_feature_eng(X_train, y_train, number_of_top_features):

    # initialize the ML algorithm
    lm = LinearRegression()

    rfe = RFE(lm, n_features_to_select = number_of_top_features)

    # fit the data using RFE
    rfe.fit(X_train,y_train) 

    # get the mask of the columns selected
    feature_mask = rfe.support_

    # get list of the column names. 
    rfe_features = X_train.iloc[:,feature_mask].columns.tolist()
    
    # returning the list of features
    print(f'The top {number_of_top_features} are: {rfe_features}') 

    # view list of columns and their ranking
    # get the ranks using "rfe.ranking" method
    variable_ranks = rfe.ranking_

    # get the variable names
    variable_names = X_train.columns.tolist()

    # combine ranks and names into a df for clean viewing
    rfe_ranks_df = pd.DataFrame({'Feature': variable_names, 'Ranking': variable_ranks})

    # sort the df by rank
    return rfe_ranks_df.sort_values('Ranking')

In [129]:
recursive_feature_eng(X_train, y_train, 4)

The top 4 are: ['sex_Male', 'day_Fri', 'day_Sun', 'time_Lunch']


Unnamed: 0,Feature,Ranking
1,sex_Male,1
4,day_Fri,1
6,day_Sun,1
9,time_Lunch,1
8,time_Dinner,2
2,smoker_No,3
0,sex_Female,4
3,smoker_Yes,5
5,day_Sat,6
7,day_Thur,7


----
### ``5. Load the swiss dataset and use all the other features to predict Fertility. Find the top 3 features using both select k best and recursive feature elimination (use the functions you just built to help you out).``

In [130]:
# loading swiss dataset from pydataset

swiss_df = data("swiss")
swiss_df.head()

Unnamed: 0,Fertility,Agriculture,Examination,Education,Catholic,Infant.Mortality
Courtelary,80.2,17.0,15,12,9.96,22.2
Delemont,83.1,45.1,6,9,84.84,22.2
Franches-Mnt,92.5,39.7,5,5,93.4,20.2
Moutier,85.8,36.5,12,7,33.77,20.3
Neuveville,76.9,43.5,17,15,5.16,20.6


In [131]:
swiss_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 47 entries, Courtelary to Rive Gauche
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Fertility         47 non-null     float64
 1   Agriculture       47 non-null     float64
 2   Examination       47 non-null     int64  
 3   Education         47 non-null     int64  
 4   Catholic          47 non-null     float64
 5   Infant.Mortality  47 non-null     float64
dtypes: float64(4), int64(2)
memory usage: 2.6+ KB


In [132]:
# now let's clean the column names
from skimpy import clean_columns

swiss_df = clean_columns(swiss_df, case = "snake")
swiss_df.head() # checks out!

Unnamed: 0,fertility,agriculture,examination,education,catholic,infant_mortality
Courtelary,80.2,17.0,15,12,9.96,22.2
Delemont,83.1,45.1,6,9,84.84,22.2
Franches-Mnt,92.5,39.7,5,5,93.4,20.2
Moutier,85.8,36.5,12,7,33.77,20.3
Neuveville,76.9,43.5,17,15,5.16,20.6


In [133]:
swiss_df.shape
print(swiss_df.columns)

Index(['fertility', 'agriculture', 'examination', 'education', 'catholic',
       'infant_mortality'],
      dtype='object')


In [134]:
# let's scale the data for better comparison 

scaler = RobustScaler()
swiss_df = pd.DataFrame(scaler.fit_transform(swiss_df), columns = swiss_df.columns)

In [140]:
swiss_df.head()

Unnamed: 0,fertility,agriculture,examination,education,catholic,infant_mortality
0,0.712727,-1.168504,-0.1,0.666667,-0.05891,0.619718
1,0.923636,-0.283465,-1.0,0.166667,0.792676,0.619718
2,1.607273,-0.453543,-1.1,-0.5,0.890026,0.056338
3,1.12,-0.554331,-0.4,-0.166667,0.211873,0.084507
4,0.472727,-0.333858,0.1,1.166667,-0.113499,0.169014


In [137]:
# let's split the data

train_validate, test = train_test_split(
    swiss_df, test_size=0.25, random_state=123)
    
train, validate = train_test_split(
        train_validate,
        test_size=0.4,
        random_state=123)

print(f'train shape: {train.shape}')
print(f'validate shape: {validate.shape}')
print(f'test shape: {test.shape}')

train shape: (21, 6)
validate shape: (14, 6)
test shape: (12, 6)


In [138]:
# second split

X_train = train.drop(columns=["fertility"])
y_train = train.fertility

X_validate = validate.drop(columns=["fertility"])
y_validate = validate.fertility

X_test = test.drop(columns=["fertility"])
y_test = test.fertility

In [146]:
# comparing kbest and recursive feature engineering functions using personally created funcitons
# kbest 

from wrangle import select_kbest, recursive_feature_eng

print(f'Select top (3) Kbest from swiss dataset: {select_kbest(X_train, y_train, 3)}')
print()
recursive_feature_eng(X_train, y_train, 3)

Select top (3) Kbest from swiss dataset: ['examination', 'education', 'catholic']

The top 3 are: ['agriculture', 'examination', 'education']


Unnamed: 0,Feature,Ranking
0,agriculture,1
1,examination,1
2,education,1
3,catholic,2
4,infant_mortality,3
