In [1]:
import pandas as pd
import numpy as np
from pydataset import data

from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE
import wrangle

import warnings
warnings.filterwarnings("ignore")

### 1. Load the tips dataset.

In [2]:
# Fetch the 'tips' dataset through pydataset's data() helper.
df = data('tips')

In [3]:
# Preview the first rows to confirm which columns were loaded.
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
1,16.99,1.01,Female,No,Sun,Dinner,2
2,10.34,1.66,Male,No,Sun,Dinner,3
3,21.01,3.5,Male,No,Sun,Dinner,3
4,23.68,3.31,Male,No,Sun,Dinner,2
5,24.59,3.61,Female,No,Sun,Dinner,4


### a. Create a column named tip_percentage. This should be the tip amount divided by the total bill.


In [4]:
# Tip as a fraction of the total bill, kept to 3 decimal places.
df['tip_percentage'] = df['tip'].div(df['total_bill']).round(3)

### b. Create a column named price_per_person. This should be the total bill divided by the party size.


In [5]:
# Total bill split evenly across the party, kept to 3 decimal places.
df['price_per_person'] = df['total_bill'].div(df['size']).round(3)

In [6]:
# Check that tip_percentage and price_per_person were added as new columns.
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_percentage,price_per_person
1,16.99,1.01,Female,No,Sun,Dinner,2,0.059,8.495
2,10.34,1.66,Male,No,Sun,Dinner,3,0.161,3.447
3,21.01,3.5,Male,No,Sun,Dinner,3,0.167,7.003
4,23.68,3.31,Male,No,Sun,Dinner,2,0.14,11.84
5,24.59,3.61,Female,No,Sun,Dinner,4,0.147,6.148


### c. Before using any of the methods discussed in the lesson, which features do you think would be most important for predicting the tip amount? The tip percentage?

- `total_bill`
- `price_per_person`
- `size`

### d. Use all the other numeric features to predict tip amount. Use select k best and recursive feature elimination to select the top 2 features. What are they?

In [7]:
# split df
# NOTE(review): split_data lives in the local `wrangle` module, which is not shown
# here; confirm it seeds its shuffle so the three splits are reproducible.

train, validate, test = wrangle.split_data(df)

In [8]:
# Inspect dtypes and non-null counts of the training split.
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 136 entries, 19 to 167
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   total_bill        136 non-null    float64
 1   tip               136 non-null    float64
 2   sex               136 non-null    object 
 3   smoker            136 non-null    object 
 4   day               136 non-null    object 
 5   time              136 non-null    object 
 6   size              136 non-null    int64  
 7   tip_percentage    136 non-null    float64
 8   price_per_person  136 non-null    float64
dtypes: float64(4), int64(1), object(4)
memory usage: 10.6+ KB


In [9]:
# Names of the columns whose dtype is not `object`.
# NOTE(review): num_cols is only displayed; the feature frames built later use
# hard-coded drop lists instead of this index.
num_cols = train.columns[train.dtypes != object]

In [10]:
# Display the non-object column names.
num_cols

Index(['total_bill', 'tip', 'size', 'tip_percentage', 'price_per_person'], dtype='object')

In [11]:
# Predictors for the tip model: drop the target and the categorical columns
# from each split in one pass.
# NOTE(review): tip_percentage stays in as a predictor even though it was
# computed from tip (tip / total_bill) — confirm this is intended.
excluded_cols = ['tip', 'sex', 'smoker', 'day', 'time']
X_train, X_validate, X_test = (
    part.drop(columns=excluded_cols) for part in (train, validate, test)
)

# Target is kept as a one-column DataFrame.
y_train = train[['tip']]

In [12]:
# scale the features
# NOTE(review): Min_Max_scaler is defined in the local `wrangle` module (not shown);
# presumably it fits on X_train only and transforms all three splits — confirm.
scaler, X_train_scaled, X_validate_scaled, X_test_scaled = wrangle.Min_Max_scaler(X_train, X_validate, X_test)

In [13]:
# Confirm which feature columns survived into the scaled training frame.
X_train_scaled.columns.tolist()

['total_bill', 'size', 'tip_percentage', 'price_per_person']

#### Using Select K Best to select top 2 features

In [14]:
# Fit SelectKBest with the f_regression score function, keeping k=2 features.
f_selector = SelectKBest(score_func=f_regression, k=2)
f_selector.fit(X_train_scaled, y_train)

SelectKBest(k=2, score_func=<function f_regression at 0x7f8407d5c940>)

In [15]:
# get_support() is used as a mask over the columns to recover the selected names.
mask = f_selector.get_support()
X_train_scaled.columns[mask]

Index(['total_bill', 'size'], dtype='object')

#### Using RFE to select top 2 features

In [16]:
# Fit a plain linear regression on all scaled features and show its coefficients.
model = LinearRegression().fit(X_train_scaled, y_train)
model.coef_

array([[ 6.07945914,  0.74051986,  8.42809522, -0.18176765]])

In [17]:
# Recursive feature elimination around a LinearRegression, keeping 2 features.
lm = LinearRegression()
rfe = RFE(estimator=lm, n_features_to_select=2)
rfe.fit(X_train_scaled, y_train)

RFE(estimator=LinearRegression(), n_features_to_select=2)

In [18]:
# Names of the columns RFE kept (support_ is used as a column mask).
X_train_scaled.columns[rfe.support_]

Index(['total_bill', 'tip_percentage'], dtype='object')

In [19]:
# Pair each column with its RFE ranking_ value and list them in ascending order.
pd.Series(dict(zip(X_train_scaled.columns, rfe.ranking_))).sort_values()

total_bill          1
tip_percentage      1
size                2
price_per_person    3
dtype: int64

### e. Use all the other numeric features to predict tip percentage. Use select k best and recursive feature elimination to select the top 2 features. What are they?

In [20]:
# Look at the training split again before building the tip_percentage feature set.
train.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_percentage,price_per_person
19,16.97,3.5,Female,No,Sun,Dinner,3,0.206,5.657
173,7.25,5.15,Male,Yes,Sun,Dinner,2,0.71,3.625
119,12.43,1.8,Female,No,Thur,Lunch,2,0.145,6.215
29,21.7,4.3,Male,No,Sat,Dinner,2,0.198,10.85
238,32.83,1.17,Male,Yes,Sat,Dinner,2,0.036,16.415


In [21]:
# Predictors for the tip_percentage model: drop the target and the categorical
# columns from each split in one pass.
# NOTE(review): `tip` stays in as a predictor even though tip_percentage was
# computed from it (tip / total_bill) — confirm this is intended.
excluded_cols2 = ['tip_percentage', 'sex', 'smoker', 'day', 'time']
X_train2, X_validate2, X_test2 = (
    part.drop(columns=excluded_cols2) for part in (train, validate, test)
)

# Target is kept as a one-column DataFrame.
y_train2 = train[['tip_percentage']]

In [22]:
# scale the features
# NOTE(review): same opaque wrangle.Min_Max_scaler helper as for the tip model;
# presumably fit on X_train2 only — confirm.
scaler2, X_train_scaled2, X_validate_scaled2, X_test_scaled2 = wrangle.Min_Max_scaler(X_train2, X_validate2, X_test2)

In [23]:
# Confirm which feature columns are in the scaled frame for the tip_percentage model.
X_train_scaled2.columns.tolist()

['total_bill', 'tip', 'size', 'price_per_person']

#### Using Select K Best to select top 2 features

In [24]:
# Fit SelectKBest (f_regression, k=2) against tip_percentage.
# Rebinds f_selector, replacing the selector fitted for the tip model.
f_selector = SelectKBest(score_func=f_regression, k=2)
f_selector.fit(X_train_scaled2, y_train2)

SelectKBest(k=2, score_func=<function f_regression at 0x7f8407d5c940>)

In [25]:
# Use the selector's support mask to recover the selected column names.
mask = f_selector.get_support()
X_train_scaled2.columns[mask]

Index(['tip', 'price_per_person'], dtype='object')

#### Using RFE to select top 2 features

In [26]:
# Fit a plain linear regression on all scaled features and show its coefficients.
model = LinearRegression().fit(X_train_scaled2, y_train2)
model.coef_

array([[-0.29028085,  0.42290477, -0.08073965, -0.0676497 ]])

In [27]:
# Recursive feature elimination around a LinearRegression, keeping 2 features,
# this time with tip_percentage as the target.
lm = LinearRegression()
rfe = RFE(estimator=lm, n_features_to_select=2)
rfe.fit(X_train_scaled2, y_train2)

RFE(estimator=LinearRegression(), n_features_to_select=2)

In [28]:
# Names of the columns RFE kept (support_ is used as a column mask).
X_train_scaled2.columns[rfe.support_]

Index(['total_bill', 'tip'], dtype='object')

In [29]:
# Pair each column with its RFE ranking_ value and list them in ascending order.
pd.Series(dict(zip(X_train_scaled2.columns, rfe.ranking_))).sort_values()

total_bill          1
tip                 1
size                2
price_per_person    3
dtype: int64

### f. Why do you think select k best and recursive feature elimination might give different answers for the top features? Does this change as you change the number of features you are selecting?

## 2. Write a function named select_kbest that takes in the predictors (X), the target (y), and the number of features to select (k) and returns the names of the top k selected features based on the SelectKBest class. Test your function with the tips dataset. You should see the same results as when you did the process manually.

In [50]:
def select_kbest(X, y, k):
    """Return the names of the top ``k`` features chosen by SelectKBest.

    Features are scored with ``f_regression``.

    Parameters
    ----------
    X : pandas.DataFrame
        Predictor columns; the returned names are taken from ``X.columns``.
    y : array-like
        Target values, aligned row-for-row with ``X``.
    k : int
        Number of features to keep.

    Returns
    -------
    pandas.Index
        Names of the selected columns, in the order they appear in ``X``.
    """
    # Fit on the arguments rather than on notebook globals, so the helper
    # works for whatever dataset the caller passes in.
    # `k` is passed by keyword because current scikit-learn releases reject it
    # as a positional argument.
    f_selector = SelectKBest(score_func=f_regression, k=k)
    f_selector.fit(X, y)

    # get_support() yields a boolean mask over the columns of X.
    mask = f_selector.get_support()
    return X.columns[mask]

In [51]:
# Candidate feature columns that will be handed to select_kbest below.
X_train_scaled.columns.tolist()

['total_bill', 'size', 'tip_percentage', 'price_per_person']

In [52]:
# Should reproduce the manual SelectKBest result for the tip model (total_bill, size).
select_kbest(X_train_scaled, y_train, 2)

Index(['total_bill', 'size'], dtype='object')

## 3. Write a function named rfe that takes in the predictors, the target, and the number of features to select. It should return the top k features based on the RFE class. Test your function with the tips dataset. You should see the same results as when you did the process manually.

In [48]:
def rfe(X, y, n):
    """Return the names of the top ``n`` features chosen by RFE.

    Recursive feature elimination is run around a ``LinearRegression`` estimator.

    Parameters
    ----------
    X : pandas.DataFrame
        Predictor columns; the returned names are taken from ``X.columns``.
    y : array-like
        Target values, aligned row-for-row with ``X``.
    n : int
        Number of features to keep.

    Returns
    -------
    pandas.Index
        Names of the selected columns, in the order they appear in ``X``.
    """
    lm = LinearRegression()

    # The selector gets its own name so it does not shadow this function, and
    # n_features_to_select is passed by keyword because current scikit-learn
    # releases reject it as a positional argument.
    selector = RFE(estimator=lm, n_features_to_select=n)

    # Fit on the arguments rather than on notebook globals, so the helper
    # works for whatever dataset the caller passes in.
    selector.fit(X, y)

    # support_ is used as a boolean mask over the columns of X.
    return X.columns[selector.support_]

In [49]:
# Should reproduce the manual RFE result for the tip model (total_bill, tip_percentage).
rfe(X_train_scaled, y_train, 2)

Index(['total_bill', 'tip_percentage'], dtype='object')

## 4. Load the swiss dataset and use all the other features to predict Fertility. Find the top 3 features using both select k best and recursive feature elimination (use the functions you just built to help you out).

In [35]:
# Fetch the 'swiss' dataset through pydataset's data() helper.
swiss = data('swiss')

In [36]:
# Preview the first rows to see the available columns.
swiss.head()

Unnamed: 0,Fertility,Agriculture,Examination,Education,Catholic,Infant.Mortality
Courtelary,80.2,17.0,15,12,9.96,22.2
Delemont,83.1,45.1,6,9,84.84,22.2
Franches-Mnt,92.5,39.7,5,5,93.4,20.2
Moutier,85.8,36.5,12,7,33.77,20.3
Neuveville,76.9,43.5,17,15,5.16,20.6


In [37]:
# split df
# Rebinds train/validate/test, so the tips splits from earlier cells are no
# longer reachable under these names once this cell has run.

train, validate, test = wrangle.split_data(swiss)

In [38]:
# Preview the swiss training split.
train.head()

Unnamed: 0,Fertility,Agriculture,Examination,Education,Catholic,Infant.Mortality
Rolle,60.5,60.8,16,10,7.72,16.3
Lavaux,65.1,73.0,19,9,2.84,20.0
Nyone,56.6,50.9,22,12,15.14,16.7
Conthey,75.5,85.9,3,2,99.71,15.1
Yverdon,65.4,49.5,15,8,6.1,22.5


In [39]:
# Predictors are every column except the target, Fertility.
# NOTE(review): unlike the tips sections, these features are not passed through
# wrangle.Min_Max_scaler before feature selection — confirm that is intended.
X_train3 = train.drop(columns = ['Fertility'])

# Target is kept as a one-column DataFrame.
y_train3 = train[['Fertility']]

#### Using Select K Best to select top 3 features

In [40]:
# Manual SelectKBest run (f_regression, k=3) on the swiss training features.
f_selector = SelectKBest(score_func=f_regression, k=3)
f_selector.fit(X_train3, y_train3)
# Use the selector's support mask to recover the selected column names.
mask = f_selector.get_support()
X_train3.columns[mask]

Index(['Examination', 'Catholic', 'Infant.Mortality'], dtype='object')

In [41]:
# Same selection through the select_kbest helper; the result should match the
# manual SelectKBest cell for the swiss data.
select_kbest(X_train3, y_train3, 3)

Index(['Examination', 'Catholic', 'Infant.Mortality'], dtype='object')

#### Using RFE to select top 3 features

In [42]:
# Manual RFE run (LinearRegression, 3 features) on the swiss training features.
# The selector gets its own name: binding it to `rfe` would replace the rfe()
# helper function defined earlier, and the following rfe(...) call would then
# fail on a fresh top-to-bottom run.
lm = LinearRegression()
rfe_selector = RFE(estimator=lm, n_features_to_select=3)
rfe_selector.fit(X_train3, y_train3)
X_train3.columns[rfe_selector.support_]

Index(['Agriculture', 'Examination', 'Infant.Mortality'], dtype='object')

In [47]:
# Same selection through the rfe helper function; the result should match the
# manual RFE cell for the swiss data.
rfe(X_train3, y_train3, 3)

Index(['Agriculture', 'Examination', 'Infant.Mortality'], dtype='object')