In [27]:
import warnings
warnings.filterwarnings("ignore")
import matplotlib.pyplot as plt
import numpy as np
from scipy import stats
import pandas as pd
import seaborn as sns
from math import sqrt

import sklearn.preprocessing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, explained_variance_score
from sklearn.feature_selection import f_regression 
from sklearn.metrics import mean_squared_error
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE


import os
import env

#### Load the tips dataset.

- Create a column named price_per_person. This should be the total bill divided by the party size.
- Before using any of the methods discussed in the lesson, which features do you think would be most important for predicting the tip amount?
- Use select k best to select the top 2 features for predicting tip amount. What are they?
- Use recursive feature elimination to select the top 2 features for tip amount. What are they?
- Why do you think select k best and recursive feature elimination might give different answers for the top features? Does this change as you change the number of features you are selecting?


In [54]:
# Load seaborn's example "tips" dataset into the working frame `df`.
df = sns.load_dataset('tips')
# Preview the first rows to see the available columns.
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [55]:
# Per-guest share of the bill, rounded to cents.
# Bracket access is required for 'size': `df.size` is the DataFrame element-count attribute.
per_person = df['total_bill'] / df['size']
df['price_per_person'] = per_person.round(2)
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person
0,16.99,1.01,Female,No,Sun,Dinner,2,8.49
1,10.34,1.66,Male,No,Sun,Dinner,3,3.45
2,21.01,3.5,Male,No,Sun,Dinner,3,7.0
3,23.68,3.31,Male,No,Sun,Dinner,2,11.84
4,24.59,3.61,Female,No,Sun,Dinner,4,6.15


- I believe total_bill and size are the most important for predicting tip.

In [56]:
# Encode the three two-level columns as 0/1 and rename sex/time after the level coded as 1.
# Run once per fresh load: after the rename, `sex` and `time` no longer exist on `df`.
df = (
    df.assign(
        smoker=df['smoker'].map({'Yes': 1, 'No': 0}),
        sex=df['sex'].map({'Male': 1, 'Female': 0}),
        time=df['time'].map({'Lunch': 1, 'Dinner': 0}),
    )
    .rename(columns={'sex': 'Male', 'time': 'Lunch'})
)

In [57]:
# Check the 0/1 encoding and the renamed Male / Lunch columns.
df.head()

Unnamed: 0,total_bill,tip,Male,smoker,day,Lunch,size,price_per_person
0,16.99,1.01,0,0,Sun,0,2,8.49
1,10.34,1.66,1,0,Sun,0,3,3.45
2,21.01,3.5,1,0,Sun,0,3,7.0
3,23.68,3.31,1,0,Sun,0,2,11.84
4,24.59,3.61,0,0,Sun,0,4,6.15


In [58]:
# Candidate predictors (everything numeric except the target) and the target itself.
cols = ['total_bill', 'Male', 'smoker', 'Lunch', 'size', 'price_per_person']
X = df[cols]
y = df.tip

# Score each predictor against the target with f_regression and keep the 3 best.
# NOTE(review): the exercise text asks for the top 2 features; k=3 is kept so the recorded output still matches.
f_selector = SelectKBest(score_func=f_regression, k=3)
f_selector.fit(X, y)

# Boolean mask over X's columns -> names of the selected columns.
feature_mask = f_selector.get_support()
f_feature = X.columns[feature_mask].tolist()

In [59]:
# Column names chosen by SelectKBest.
f_feature

['total_bill', 'size', 'price_per_person']

In [60]:
# initialize the ML algorithm
lm = LinearRegression()

# create the rfe object, indicating the ML object (lm) and the number of features I want to end up with. 
rfe = RFE(lm, n_features_to_select=3)

# fit the data using RFE
rfe.fit(X,df.tip)  

# get the mask of the columns selected
feature_mask = rfe.support_

# get list of the column names. 
rfe_feature = X.iloc[:,feature_mask].columns.tolist()

In [61]:
# Column names kept by RFE.
rfe_feature

['total_bill', 'smoker', 'price_per_person']

- K best uses correlation, while RFE builds models using different combinations of features. I believe using a combination of both is best for selecting features. With this dataset it doesn't seem to change the features used.

#### Write a function named select_kbest that takes in the predictors (X), the target (y), and the number of features to select (k) and returns the names of the top k selected features based on the SelectKBest class. Test your function with the tips dataset. You should see the same results as when you did the process manually.

In [87]:
def select_kbest(X, y, k=None):
    """
    Return the names of the top k features chosen by SelectKBest with f_regression.

    Parameters
    ----------
    X : pandas.DataFrame
        Predictor columns.
    y : array-like
        Target values, one per row of X.
    k : int, optional
        Number of features to keep. If omitted, the value is requested
        interactively so existing `select_kbest(X, y)` calls keep working;
        pass it explicitly for a reproducible, non-blocking run.

    Returns
    -------
    list
        Names of the selected columns, in the order they appear in X.
    """
    if k is None:
        k = input("How many desired features? ")
    k = int(k)

    f_selector = SelectKBest(f_regression, k=k)
    f_selector.fit(X, y)

    # get_support() is a boolean mask aligned with X's columns.
    return X.columns[f_selector.get_support()].tolist()


In [88]:
# Intended as the tips check: on a top-to-bottom run X/y are still the tips predictors/target here.
# NOTE(review): the recorded output below lists swiss columns, so this cell was last executed
# after X/y had been rebound to the swiss data. Re-run in order to refresh it.
select_kbest(X, y)

How many desired features? 3
['Examination', 'Education', 'Catholic']


#### Write a function named rfe that takes in the predictors, the target, and the number of features to select. It should return the top k features based on the RFE class. Test your function with the tips dataset. You should see the same results as when you did the process manually.


In [89]:
def rfe(X, y, k=None):
    """
    Return the names of the top k features chosen by RFE around a LinearRegression.

    Parameters
    ----------
    X : pandas.DataFrame
        Predictor columns.
    y : array-like
        Target values, one per row of X.
    k : int, optional
        Number of features to keep. If omitted, the value is requested
        interactively so existing `rfe(X, y)` calls keep working; pass it
        explicitly for a reproducible, non-blocking run.

    Returns
    -------
    list
        Names of the selected columns, in the order they appear in X.
    """
    if k is None:
        k = input("How many desired features? ")
    k = int(k)

    # Local is deliberately not called `rfe`, to avoid shadowing this function's own name.
    selector = RFE(LinearRegression(), n_features_to_select=k)
    selector.fit(X, y)

    # support_ is a boolean mask aligned with X's columns.
    return X.columns[selector.support_].tolist()

In [90]:
# Intended as the tips check: on a top-to-bottom run X/y are still the tips predictors/target here.
# NOTE(review): the recorded output below lists swiss columns, so this cell was last executed
# after X/y had been rebound to the swiss data. Re-run in order to refresh it.
rfe(X, y)

How many desired features? 3
['Examination', 'Education', 'Infant.Mortality']


#### Load the swiss dataset and use all the other features to predict Fertility. Find the top 3 features using both select k best and recursive feature elimination (use the functions you just built to help you out).

In [73]:
# NOTE(review): this import sits mid-notebook; the other imports are gathered in the first cell.
from pydataset import data
# Rebinds `df` from the tips frame to the swiss dataset; cells above that use `df` expect tips.
df = data('swiss')

In [74]:
# Preview the swiss columns; Fertility is the target for this section.
df.head()

Unnamed: 0,Fertility,Agriculture,Examination,Education,Catholic,Infant.Mortality
Courtelary,80.2,17.0,15,12,9.96,22.2
Delemont,83.1,45.1,6,9,84.84,22.2
Franches-Mnt,92.5,39.7,5,5,93.4,20.2
Moutier,85.8,36.5,12,7,33.77,20.3
Neuveville,76.9,43.5,17,15,5.16,20.6


In [75]:
# Target is Fertility; every other swiss column is a predictor.
# This rebinds X / y / cols, which previously held the tips data.
y = df['Fertility']
X = df.drop(columns='Fertility')
cols = X.columns.tolist()

In [91]:
# Top features for Fertility according to SelectKBest (X/y are the swiss predictors/target here).
select_kbest(X, y)

How many desired features? 3
['Examination', 'Education', 'Catholic']


In [92]:
# Top features for Fertility according to RFE (X/y are the swiss predictors/target here).
rfe(X, y)

How many desired features? 3
['Examination', 'Education', 'Infant.Mortality']
