## 1. Load the tips dataset.

In [14]:
import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectKBest, RFE, f_regression, SequentialFeatureSelector
from sklearn.model_selection import train_test_split
from pydataset import data
from sklearn.linear_model import LinearRegression

In [2]:
# Load the tips dataset from pydataset; everything in section 1 works off this frame
df = data('tips')

In [3]:
# Check column dtypes: the object columns are categorical and need encoding before modeling
df.dtypes

total_bill    float64
tip           float64
sex            object
smoker         object
day            object
time           object
size            int64
dtype: object

### a. Create a column named price_per_person. This should be the total bill divided by the party size.

In [4]:
# Per-person cost: total bill split evenly across the party size
df['price_per_person'] = df['total_bill'].div(df['size'])

In [5]:
# One-hot encode the categorical columns; drop_first removes one level per
# category so the remaining dummies are not redundant with each other.
dummy_df = pd.get_dummies(
    df[['sex', 'smoker', 'day', 'time']],
    drop_first=True,
    dummy_na=False,
)
# NOTE: this rebinds df, so re-running the cell appends the dummy columns again.
df = pd.concat([df, dummy_df], axis='columns')

In [6]:
# Confirm price_per_person and the dummy columns were appended
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person,sex_Male,smoker_Yes,day_Sat,day_Sun,day_Thur,time_Lunch
1,16.99,1.01,Female,No,Sun,Dinner,2,8.495,0,0,0,1,0,0
2,10.34,1.66,Male,No,Sun,Dinner,3,3.446667,1,0,0,1,0,0
3,21.01,3.5,Male,No,Sun,Dinner,3,7.003333,1,0,0,1,0,0
4,23.68,3.31,Male,No,Sun,Dinner,2,11.84,1,0,0,1,0,0
5,24.59,3.61,Female,No,Sun,Dinner,4,6.1475,0,0,0,1,0,0


In [7]:
# 80/20 split with a fixed seed so the selection results are reproducible
train, test = train_test_split(df, train_size=0.8, random_state=123)

In [8]:
# Single definition of the predictor columns so the train and test frames
# are always built from exactly the same feature set.
tips_features = [
    'total_bill', 'size', 'sex_Male', 'smoker_Yes', 'day_Sat',
    'day_Sun', 'day_Thur', 'time_Lunch', 'price_per_person',
]
tips_target = 'tip'

X_train = train[tips_features]
y_train = train[tips_target]

X_test = test[tips_features]
y_test = test[tips_target]

### b. Before using any of the methods discussed in the lesson, which features do you think would be most important for predicting the tip amount?

In [9]:
# Initial guess, before running any selector: total_bill and price_per_person.

### c. Use select k best to select the top 2 features for predicting tip amount. What are they?

In [11]:
# Render floats with two decimals and thousands separators in displayed frames
pd.set_option('display.float_format', '{:,.2f}'.format)

In [9]:
# Create the selector (keeps the 2 features with the best f_regression score),
# then fit it on the training split only.
kbest = SelectKBest(score_func=f_regression, k=2)
kbest.fit(X_train, y_train)

SelectKBest(k=2, score_func=<function f_regression at 0x7ff8a102d940>)

In [12]:
# Tabulate the per-feature p-values and F scores, labelled by column name
kbest_results = pd.DataFrame(
    {'p': kbest.pvalues_, 'f': kbest.scores_},
    index=X_train.columns,
)
kbest_results

Unnamed: 0,p,f
total_bill,0.0,172.91
size,0.0,65.27
sex_Male,0.25,1.31
smoker_Yes,0.99,0.0
day_Sat,0.99,0.0
day_Sun,0.13,2.35
day_Thur,0.41,0.69
time_Lunch,0.2,1.67
price_per_person,0.0,26.73


In [13]:
# get_support() is a boolean mask over the columns; use it to recover the selected names
X_train.columns[kbest.get_support()]

Index(['total_bill', 'size'], dtype='object')

In [None]:
# SelectKBest top 2 features: 'total_bill', 'size'

##### changing features to 3

In [19]:
# Same selector as before, now keeping the 3 best-scoring features
kbest2 = SelectKBest(score_func=f_regression, k=3)
kbest2.fit(X_train, y_train)

SelectKBest(k=3, score_func=<function f_regression at 0x7ff8a102d940>)

In [20]:
# Per-feature p-values and F scores from the k=3 selector
kbest2_results = pd.DataFrame(
    {'p': kbest2.pvalues_, 'f': kbest2.scores_},
    index=X_train.columns,
)
kbest2_results

Unnamed: 0,p,f
total_bill,0.0,172.91
size,0.0,65.27
sex_Male,0.25,1.31
smoker_Yes,0.99,0.0
day_Sat,0.99,0.0
day_Sun,0.13,2.35
day_Thur,0.41,0.69
time_Lunch,0.2,1.67
price_per_person,0.0,26.73


In [21]:
# Names of the 3 features kept by the k=3 selector
X_train.columns[kbest2.get_support()]

Index(['total_bill', 'size', 'price_per_person'], dtype='object')

In [22]:
# SelectKBest top 3 features: 'total_bill', 'size', 'price_per_person'

### d. Use recursive feature elimination to select the top 2 features for tip amount. What are they?

## RFE

- Recursive Feature Elimination
- Progressively eliminate features based on importance to the model
- Requires a model with either a `.coef_` or `.feature_importances_` property
- After fitting: `.ranking_`, `.get_support()`, and `.transform()`

In [15]:
# RFE needs an estimator exposing .coef_ or .feature_importances_;
# a plain linear regression provides .coef_.
model = LinearRegression()
rfe = RFE(estimator=model, n_features_to_select=2)
rfe.fit(X_train, y_train)

RFE(estimator=LinearRegression(), n_features_to_select=2)

In [16]:
# Show each feature's RFE ranking; the selected features carry rank 1
pd.DataFrame({'rfe_ranking': rfe.ranking_}, index=X_train.columns)

Unnamed: 0,rfe_ranking
total_bill,2
size,1
sex_Male,5
smoker_Yes,8
day_Sat,1
day_Sun,6
day_Thur,4
time_Lunch,7
price_per_person,3


In [17]:
# Names of the 2 features RFE kept
X_train.columns[rfe.get_support()]

Index(['size', 'day_Sat'], dtype='object')

In [18]:
# Wrap the output of rfe.transform in a DataFrame that keeps the original
# row index and the names of the selected columns.
X_train_transformed = pd.DataFrame(
    data=rfe.transform(X_train),
    columns=X_train.columns[rfe.get_support()],
    index=X_train.index,
)
X_train_transformed.head()

Unnamed: 0,size,day_Sat
24,4.0,1.0
191,2.0,0.0
210,2.0,1.0
11,2.0,0.0
197,2.0,0.0


### e. Why do you think select k best and recursive feature elimination might give different answers for the top features? Does this change as you change the number of features you are selecting?

##### changing RFE n = 3

In [23]:
# Fresh estimator for the 3-feature run. Pass model2 itself to RFE so this
# cell does not depend on the `model` variable created in an earlier cell.
model2 = LinearRegression()
rfe2 = RFE(model2, n_features_to_select=3)
rfe2.fit(X_train, y_train)

RFE(estimator=LinearRegression(), n_features_to_select=3)

In [24]:
# Rankings from the 3-feature RFE run; the selected features carry rank 1
pd.DataFrame({'rfe_ranking': rfe2.ranking_}, index=X_train.columns)

Unnamed: 0,rfe_ranking
total_bill,1
size,1
sex_Male,4
smoker_Yes,7
day_Sat,1
day_Sun,5
day_Thur,3
time_Lunch,6
price_per_person,2


In [25]:
# Names of the 3 features RFE kept
X_train.columns[rfe2.get_support()]

Index(['total_bill', 'size', 'day_Sat'], dtype='object')

In [26]:
# Same labelled view as before, now built from the 3-feature selector.
# This rebinds X_train_transformed, replacing the 2-feature version.
X_train_transformed = pd.DataFrame(
    data=rfe2.transform(X_train),
    columns=X_train.columns[rfe2.get_support()],
    index=X_train.index,
)
X_train_transformed.head()

Unnamed: 0,total_bill,size,day_Sat
24,39.42,4.0,1.0
191,15.69,2.0,0.0
210,12.76,2.0,1.0
11,10.27,2.0,0.0
197,10.34,2.0,0.0


In [None]:
# RFE top 3 features: 'total_bill', 'size', 'day_Sat'

## 2. Write a function named `select_kbest` that takes in the predictors (X), the target (y), and the number of features to select (k) and returns the names of the top k selected features based on the SelectKBest class. Test your function with the tips dataset. You should see the same results as when you did the process manually.

In [35]:
def select_kbest(X,y,k):
    """
    Return the names of the k columns of X chosen by SelectKBest using the
    f_regression scoring function.

    X: predictors; must expose .columns, which supplies the returned names
    y: target values
    k: number of features to keep

    Returns the entries of X.columns flagged by the fitted selector's
    support mask.
    """
    selector = SelectKBest(score_func=f_regression, k=k)
    selector.fit(X, y)
    support_mask = selector.get_support()
    return X.columns[support_mask]

## 3. Write a function named rfe that takes in the predictors, the target, and the number of features to select. It should return the top k features based on the RFE class. Test your function with the tips dataset. You should see the same results as when you did the process manually.

In [36]:
def rfe(X,y,n):
    """
    Return the names of the n columns of X kept by recursive feature
    elimination driven by a LinearRegression estimator.

    X: predictors; must expose .columns, which supplies the returned names
    y: target values
    n: number of features to select

    Returns the entries of X.columns flagged by the fitted RFE support mask.
    """
    # Named `selector` rather than `rfe` so the local does not shadow this function
    selector = RFE(estimator=LinearRegression(), n_features_to_select=n)
    selector.fit(X, y)
    return X.columns[selector.get_support()]

## 4. Load the swiss dataset and use all the other features to predict Fertility. Find the top 3 features using both select k best and recursive feature elimination (use the functions you just built to help you out).

In [27]:
# Load the swiss dataset from pydataset; Fertility is the target for this section
swiss = data('swiss')

In [31]:
# Peek at the first rows to see the available columns
swiss.head()

Unnamed: 0,Fertility,Agriculture,Examination,Education,Catholic,Infant.Mortality
Courtelary,80.2,17.0,15,12,9.96,22.2
Delemont,83.1,45.1,6,9,84.84,22.2
Franches-Mnt,92.5,39.7,5,5,93.4,20.2
Moutier,85.8,36.5,12,7,33.77,20.3
Neuveville,76.9,43.5,17,15,5.16,20.6


In [29]:
# Row/column count of the swiss frame
swiss.shape

(47, 6)

In [30]:
# 80/20 split with a fixed seed. This rebinds train/test, replacing the tips split.
train, test = train_test_split(swiss, train_size=0.8, random_state=123)

In [33]:
# Single definition of the predictor columns (everything except Fertility)
# so the train and test frames are always built from the same feature set.
swiss_features = ['Agriculture', 'Examination', 'Education', 'Catholic', 'Infant.Mortality']
swiss_target = 'Fertility'

X_train = train[swiss_features]
y_train = train[swiss_target]

X_test = test[swiss_features]
y_test = test[swiss_target]

In [37]:
# NOTE(review): the exercise prompt asks for the top 3 features, but k=2 is passed here — confirm intent
select_kbest(X_train, y_train, 2)

Index(['Examination', 'Education'], dtype='object')

In [39]:
# RFE restricted to 2 features, for a like-for-like comparison with the k=2 SelectKBest call
rfe(X_train, y_train,2)

Index(['Examination', 'Infant.Mortality'], dtype='object')

In [44]:
# Top 3 features by RFE, as the exercise asks
rfe(X_train, y_train, 3)

Index(['Examination', 'Education', 'Infant.Mortality'], dtype='object')