In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, r2_score, explained_variance_score
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import f_regression, RFE, SelectKBest
import sklearn.preprocessing

from math import sqrt
import seaborn as sns
import warnings
from pydataset import data

from env import host, user, password
import wrangle
import feature_engineering as fe
warnings.filterwarnings('ignore')

1. Load the tips dataset.

    a. Create a column named tip_percentage. This should be the tip amount divided by the total bill.

    b. Create a column named price_per_person. This should be the total bill divided by the party size.


In [2]:
# Load the `tips` sample dataset through pydataset's `data` loader.
df = data('tips')

In [3]:
# Cast the party-size column to int.
# Bracket indexing is used on purpose: `size` is also a built-in DataFrame
# property (the element count), so attribute-style `df.size = ...` is ambiguous
# and only reaches the column through pandas' attribute-fallback logic.
df['size'] = df['size'].astype(int)

In [4]:
# Tip as a fraction of the total bill (e.g. 0.15 means a 15% tip).
df['tip_percentage'] = df['tip'].div(df['total_bill'])

In [5]:
# Total bill divided by party size, rounded to two decimals.
df['price_per_person'] = df['total_bill'].div(df['size']).round(2)

In [6]:
# Preview the first rows to confirm both derived columns are present.
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_percentage,price_per_person
1,16.99,1.01,Female,No,Sun,Dinner,2,0.059447,8.49
2,10.34,1.66,Male,No,Sun,Dinner,3,0.160542,3.45
3,21.01,3.5,Male,No,Sun,Dinner,3,0.166587,7.0
4,23.68,3.31,Male,No,Sun,Dinner,2,0.13978,11.84
5,24.59,3.61,Female,No,Sun,Dinner,4,0.146808,6.15


c. Before using any of the methods discussed in the lesson, which features do you think would be most important for predicting the tip amount? The tip percentage?

**Tip amount: total_bill, size, time. Tip percentage: total_bill, size, time.**

d. Use all the other numeric features to predict **tip amount.** Use select k best and recursive feature elimination to select the top 2 features. What are they?

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

import env

import warnings
warnings.filterwarnings("ignore")

In [8]:
# Target column for part (d): the tip amount.
target = 'tip'
# Target column for part (e): the tip as a fraction of the bill.
target2 = 'tip_percentage'

In [9]:
# Build predictor/target sets for train, validate and test with `tip` as the target.
# NOTE(review): split proportions, shuffling and seeding are decided inside
# feature_engineering.train_validate_test, which is not shown here — confirm there.
X_train, y_train, X_validate, y_validate, X_test, y_test = fe.train_validate_test(df, target)

In [10]:
# Column-name lists consumed by the scaling step in the following cell.
# NOTE(review): both helpers live in feature_engineering (not shown). Going by
# their names, object_cols holds the non-numeric columns of df and numeric_cols
# the numeric predictor columns of X_train — confirm against the module.
object_cols = fe.get_object_cols(df)
numeric_cols = fe.get_numeric_X_cols(X_train, object_cols)

In [11]:
# Scale the numeric predictor columns of all three splits.
# NOTE(review): presumably min-max scaling fitted on X_train only — the helper
# is defined in feature_engineering (not shown); verify it does not fit on
# validate/test data.
X_train_scaled, X_validate_scaled, X_test_scaled = fe.min_max_scale(X_train, X_validate, X_test, numeric_cols)

In [12]:
# List the predictor columns that the feature selectors below will choose from.
X_train_scaled.columns.tolist()

['total_bill', 'size', 'tip_percentage', 'price_per_person']

In [13]:
# Univariate selection: score every predictor against the target with the
# f_regression test and keep the k=2 best-scoring ones (the exercise asks for
# the top 2), fitting on the scaled training predictors and training target.
f_selector = SelectKBest(score_func=f_regression, k=2)
f_selector.fit(X_train_scaled, y_train)

# Boolean mask over the columns: True where the column was kept.
feature_mask = f_selector.get_support()

# Names of the kept columns.
f_feature = X_train_scaled.columns[feature_mask].tolist()

In [14]:
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE

# Estimator that RFE refits repeatedly while discarding the weakest feature.
lm = LinearRegression()

# Keep 2 features. n_features_to_select is passed by keyword: scikit-learn
# deprecated passing it positionally in 0.23 and raises a TypeError from 1.0 on
# (the warning was hidden here by warnings.filterwarnings('ignore')).
rfe = RFE(lm, n_features_to_select=2)

# Fit on the scaled training predictors and the training target.
rfe.fit(X_train_scaled, y_train)

# Boolean mask over the columns: True where the column survived elimination.
feature_mask = rfe.support_

# Names of the surviving columns.
rfe_feature = X_train_scaled.columns[feature_mask].tolist()

In [15]:
# Show the SelectKBest picks and the RFE picks side by side for the `tip` target.
f_feature, rfe_feature

(['total_bill', 'size'], ['total_bill', 'tip_percentage'])

d. SelectKBest = **total_bill and size** and RFE = **total_bill and tip_percentage**

e. Use all the other numeric features to predict **tip percentage**. Use select k best and recursive feature elimination to select the top 2 features. What are they?


In [16]:
# Same helper call as for `tip`, now with `tip_percentage` as the target.
# NOTE(review): how the split is made is decided inside
# feature_engineering.train_validate_test (not shown) — confirm there.
X_train2, y_train2, X_validate2, y_validate2, X_test2, y_test2 = fe.train_validate_test(df, target2)

In [17]:
# Column-name lists for the tip_percentage run, consumed by the scaling step
# in the following cell.
# NOTE(review): helper semantics are defined in feature_engineering (not shown).
object_cols2 = fe.get_object_cols(df)
numeric_cols2 = fe.get_numeric_X_cols(X_train2, object_cols2)

In [18]:
# Scale the numeric predictor columns of the tip_percentage splits.
# NOTE(review): presumably min-max scaling fitted on X_train2 only — verify in
# feature_engineering.min_max_scale (not shown).
X_train_scaled2, X_validate_scaled2, X_test_scaled2 = fe.min_max_scale(X_train2, X_validate2, X_test2, numeric_cols2)

In [19]:
# List the predictor columns available when tip_percentage is the target.
X_train_scaled2.columns.tolist()

['total_bill', 'tip', 'size', 'price_per_person']

In [20]:
# Univariate selection for the tip_percentage target: score every predictor
# with the f_regression test and keep the k=2 best-scoring ones (the exercise
# asks for the top 2), fitting on the scaled training data.
f_selector2 = SelectKBest(score_func=f_regression, k=2)
f_selector2.fit(X_train_scaled2, y_train2)

# Boolean mask over the columns: True where the column was kept.
feature_mask2 = f_selector2.get_support()

# Names of the kept columns.
f_feature2 = X_train_scaled2.columns[feature_mask2].tolist()

In [21]:
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE

# Estimator that RFE refits repeatedly while discarding the weakest feature.
lm = LinearRegression()

# Keep 2 features. n_features_to_select is passed by keyword: scikit-learn
# deprecated passing it positionally in 0.23 and raises a TypeError from 1.0 on
# (the warning was hidden here by warnings.filterwarnings('ignore')).
rfe2 = RFE(lm, n_features_to_select=2)

# Fit on the scaled training predictors and the tip_percentage training target.
rfe2.fit(X_train_scaled2, y_train2)

# Boolean mask over the columns: True where the column survived elimination.
feature_mask2 = rfe2.support_

# Names of the surviving columns.
rfe_feature2 = X_train_scaled2.columns[feature_mask2].tolist()

In [22]:
# Show the SelectKBest picks and the RFE picks side by side for `tip_percentage`.
f_feature2, rfe_feature2

(['tip', 'price_per_person'], ['total_bill', 'tip'])

e. SelectKBest = **tip and price_per_person** and RFE = **total_bill and tip**

f. Why do you think select k best and recursive feature elimination might give different answers for the top features? Does this change as you change the number of features you are selecting?


2. Write a function named select_kbest that takes in the predictors (X), the target (y), and the number of features to select (k) and returns the names of the top k selected features based on the SelectKBest class. Test your function with the tips dataset. You should see the same results as when you did the process manually.

In [23]:
# Top 2 features for `tip` from the reusable helper; the result should match
# the manual SelectKBest run earlier in the notebook.
# NOTE(review): the helper is called with (df, target, k) while the exercise
# text describes (X, y, k) — confirm the signature in feature_engineering.
fe.select_kbest(df, target, 2)

['total_bill', 'size']

In [24]:
# Top 2 features for `tip` from the reusable RFE helper; the result should
# match the manual RFE run earlier in the notebook.
fe.rfe(df, target, 2)

['total_bill', 'tip_percentage']

In [25]:
# Top 2 features for `tip_percentage` from the reusable SelectKBest helper.
fe.select_kbest(df, target2, 2)

['tip', 'price_per_person']

In [26]:
# Top 2 features for `tip_percentage` from the reusable RFE helper.
fe.rfe(df, target2, 2)

['total_bill', 'tip']

# Exercise 4

Load the swiss dataset and use all the other features to predict Fertility. Find the top 3 features using both select k best and recursive feature elimination (use the functions you just built to help you out).



In [27]:
# Load the `swiss` sample dataset through pydataset's `data` loader.
# NOTE(review): this rebinds `df`, which held the tips data above; re-running
# any earlier tips cell after this one will operate on the wrong frame.
df = data('swiss')

In [28]:
# Preview the first rows of the swiss data.
df.head()

Unnamed: 0,Fertility,Agriculture,Examination,Education,Catholic,Infant.Mortality
Courtelary,80.2,17.0,15,12,9.96,22.2
Delemont,83.1,45.1,6,9,84.84,22.2
Franches-Mnt,92.5,39.7,5,5,93.4,20.2
Moutier,85.8,36.5,12,7,33.77,20.3
Neuveville,76.9,43.5,17,15,5.16,20.6


In [29]:
# Check column dtypes and non-null counts before selecting features.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 47 entries, Courtelary to Rive Gauche
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Fertility         47 non-null     float64
 1   Agriculture       47 non-null     float64
 2   Examination       47 non-null     int64  
 3   Education         47 non-null     int64  
 4   Catholic          47 non-null     float64
 5   Infant.Mortality  47 non-null     float64
dtypes: float64(4), int64(2)
memory usage: 2.6+ KB


In [30]:
# Target column for the swiss exercise.
# NOTE(review): this rebinds `target`, which was 'tip' for the tips cells above.
target = 'Fertility'

In [31]:
# Top 3 features for Fertility according to the SelectKBest helper.
fe.select_kbest(df, target, 3)

['Examination', 'Catholic', 'Infant.Mortality']

In [32]:
# Top 3 features for Fertility according to the RFE helper.
fe.rfe(df, target, 3)

['Agriculture', 'Examination', 'Infant.Mortality']