In [1]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
from pydataset import data
import math

from sklearn.model_selection import train_test_split
from scipy.stats import pearsonr, spearmanr
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.metrics import explained_variance_score
from sklearn.feature_selection import RFE
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler

1. Load the tips dataset.

- a. Create a column named tip_percentage. This should be the tip amount divided by the total bill.
- b. Create a column named price_per_person. This should be the total bill divided by the party size.
- c. Before using any of the methods discussed in the lesson, which features do you think would be most important for predicting the tip amount? The tip percentage?
- d. Use all the other numeric features to predict tip amount. Use select k best and recursive feature elimination to select the top 2 features. What are they?
- e. Use all the other numeric features to predict tip percentage. Use select k best and recursive feature elimination to select the top 2 features. What are they?
- f. Why do you think select k best and recursive feature elimination might give different answers for the top features? Does this change as you change the number of features you are selecting?

2. Write a function named select_kbest that takes in the predictors (X), the target (y), and the number of features to select (k) and returns the names of the top k selected features based on the SelectKBest class. Test your function with the tips dataset. You should see the same results as when you did the process manually.

<br>

3. Write a function named rfe that takes in the predictors, the target, and the number of features to select. It should return the top k features based on the RFE class. Test your function with the tips dataset. You should see the same results as when you did the process manually.

<br>

4. Load the swiss dataset and use all the other features to predict Fertility. Find the top 3 features using both select k best and recursive feature elimination (use the functions you just built to help you out).


_____________________________________

### #1 Load the tips dataset.

In [2]:
# Load the 'tips' dataset through pydataset's data() loader.
df= data('tips')

In [3]:
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
1,16.99,1.01,Female,No,Sun,Dinner,2
2,10.34,1.66,Male,No,Sun,Dinner,3
3,21.01,3.5,Male,No,Sun,Dinner,3
4,23.68,3.31,Male,No,Sun,Dinner,2
5,24.59,3.61,Female,No,Sun,Dinner,4


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 244 entries, 1 to 244
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   total_bill  244 non-null    float64
 1   tip         244 non-null    float64
 2   sex         244 non-null    object 
 3   smoker      244 non-null    object 
 4   day         244 non-null    object 
 5   time        244 non-null    object 
 6   size        244 non-null    int64  
dtypes: float64(2), int64(1), object(4)
memory usage: 15.2+ KB


#### #1a. Create a column named tip_percentage. This should be the tip amount divided by the total bill.

In [5]:
# Tip relative to the bill, stored as a fraction (e.g. 0.15 means 15%), not as a 0-100 percentage.
df['tip_percentage'] = df['tip'] / df['total_bill']

In [6]:
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_percentage
1,16.99,1.01,Female,No,Sun,Dinner,2,0.059447
2,10.34,1.66,Male,No,Sun,Dinner,3,0.160542
3,21.01,3.5,Male,No,Sun,Dinner,3,0.166587
4,23.68,3.31,Male,No,Sun,Dinner,2,0.13978
5,24.59,3.61,Female,No,Sun,Dinner,4,0.146808


#### #1b. Create a column named price_per_person. This should be the total bill divided by the party size.

In [7]:
# Average spend per diner: the total bill divided by the party size.
df['price_per_person'] = df['total_bill'] / df['size'] 

In [8]:
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_percentage,price_per_person
1,16.99,1.01,Female,No,Sun,Dinner,2,0.059447,8.495
2,10.34,1.66,Male,No,Sun,Dinner,3,0.160542,3.446667
3,21.01,3.5,Male,No,Sun,Dinner,3,0.166587,7.003333
4,23.68,3.31,Male,No,Sun,Dinner,2,0.13978,11.84
5,24.59,3.61,Female,No,Sun,Dinner,4,0.146808,6.1475


________________________

**SPLIT DATA**

In [9]:
# Two-stage split: hold out 20% of the rows as the test set, then divide the
# remaining 80% into train (70%) and validate (30%).
# random_state is fixed so the same partitions come back on every run.
train, test = train_test_split(df, random_state=123, train_size=0.8)
train, validate = train_test_split(train, random_state=123, train_size=0.7)

In [10]:
#check shape of each dataset
train.shape, validate.shape, test.shape

((136, 9), (59, 9), (49, 9))

In [11]:
# Predictors for the tip-amount model: drop the target ('tip') and the four
# categorical columns, which leaves total_bill, size, tip_percentage and
# price_per_person.
# NOTE: tip_percentage is computed as tip / total_bill, so keeping it here
# leaks the target into the predictors.
X_train = train.drop(['tip', 'smoker', 'day', 'sex', 'time'], axis=1)

# Target: the tip amount.
y_train = train.loc[:, 'tip']


In [12]:
X_train.head()

Unnamed: 0,total_bill,size,tip_percentage,price_per_person
19,16.97,3,0.206246,5.656667
173,7.25,2,0.710345,3.625
119,12.43,2,0.144811,6.215
29,21.7,2,0.198157,10.85
238,32.83,2,0.035638,16.415


__________

**SCALE DATA**

In [13]:
#assign to variable #need scaler for each independent variable
#create it
#scaler_tip = MinMaxScaler()

In [14]:
#fit it
#scaler_tip.fit(train[['tip']])

In [15]:
#use it
#make a new column within train
#use 'transform' instead of 'predict'
#train['tip_minmax'] = scaler_tip.transform(train[['tip']])

_____________

#### #1c. Before using any of the methods discussed in the lesson, which features do you think would be most important for predicting the tip amount? The tip percentage?

In [16]:
# I believe total_bill feature would be most important in predicting tip/tip percentage

In [17]:
# Pairwise Pearson correlations between the numeric columns only.
# The string columns (sex, smoker, day, time) are excluded explicitly:
# pandas >= 2.0 no longer drops them silently in corr() and raises instead.
df.select_dtypes(include='number').corr()

Unnamed: 0,total_bill,tip,size,tip_percentage,price_per_person
total_bill,1.0,0.675734,0.598315,-0.338624,0.647497
tip,0.675734,1.0,0.489299,0.34237,0.347393
size,0.598315,0.489299,1.0,-0.14286,-0.175412
tip_percentage,-0.338624,0.34237,-0.14286,1.0,-0.314156
price_per_person,0.647497,0.347393,-0.175412,-0.314156,1.0


____________________________

#### #1d. Use all the other numeric features to predict tip amount. Use select k best and recursive feature elimination to select the top 2 features. What are they?

**RFE**

In [18]:
lm = LinearRegression()

In [19]:
rfe = RFE(estimator=lm, n_features_to_select=2)

In [20]:
rfe.fit(X_train, y_train)

RFE(estimator=LinearRegression(), n_features_to_select=2)

In [21]:
feature_mask_rank = rfe.ranking_
feature_mask_rank

array([2, 1, 1, 3])

In [22]:
# Label each RFE rank with its column name; rank 1 marks a selected feature.
pd.Series(rfe.ranking_, index=X_train.columns).sort_values()

size                1
tip_percentage      1
total_bill          2
price_per_person    3
dtype: int64

**SelectKBest**

In [23]:
#create the model
f_selector = SelectKBest(score_func=f_regression, k=2)

In [24]:
#fit the model
f_selector.fit(X_train, y_train)

SelectKBest(k=2, score_func=<function f_regression at 0x7fc01b436d30>)

In [25]:
mask = f_selector.get_support()
X_train.columns[mask]

Index(['total_bill', 'size'], dtype='object')

In [26]:
# Keep only the columns SelectKBest chose, then fit a linear model on them.
X_train_kbest = f_selector.transform(X=X_train)
model = LinearRegression().fit(X=X_train_kbest, y=y_train)

_____________

#### #1e. Use all the other numeric features to predict tip percentage. Use select k best and recursive feature elimination to select the top 2 features. What are they?

In [27]:
# Predictors for the tip-percentage model: every numeric column except the
# target itself (total_bill, size, tip, price_per_person).
# NOTE: tip_percentage was computed as tip / total_bill, so tip and total_bill
# together determine the target exactly.
X_train2 = train.loc[:, ['total_bill', 'size', 'tip', 'price_per_person']]

# Target: tip as a fraction of the total bill.
y_train2 = train.loc[:, 'tip_percentage']


In [28]:
# Univariate selection for the tip-percentage target: keep the two columns
# with the highest f_regression score.
f_selector = SelectKBest(k=2, score_func=f_regression)
# Score every predictor column against the target.
f_selector.fit(X=X_train2, y=y_train2)

SelectKBest(k=2, score_func=<function f_regression at 0x7fc01b436d30>)

In [29]:
mask = f_selector.get_support()
X_train2.columns[mask]

Index(['tip', 'price_per_person'], dtype='object')

In [30]:
# Show, per column, whether SelectKBest kept it (True) or dropped it (False).
pd.Series(mask, index=X_train2.columns).sort_values()

total_bill          False
size                False
tip                  True
price_per_person     True
dtype: bool

**RFE**

In [31]:
lm = LinearRegression()

In [32]:
rfe = RFE(estimator=lm, n_features_to_select=2)

In [33]:
rfe.fit(X_train2, y_train2)

RFE(estimator=LinearRegression(), n_features_to_select=2)

In [34]:
feature_mask_rank = rfe.ranking_
feature_mask_rank

array([1, 2, 1, 3])

In [35]:
# Label each RFE rank with its column name; rank 1 marks a selected feature.
pd.Series(rfe.ranking_, index=X_train2.columns).sort_values()

total_bill          1
tip                 1
size                2
price_per_person    3
dtype: int64

#### #1f.  Why do you think select k best and recursive feature elimination might give different answers for the top features? Does this change as you change the number of features you are selecting?

In [36]:
# Repeat RFE on the tip-amount data, this time keeping a single feature.
lm = LinearRegression()
rfe = RFE(n_features_to_select=1, estimator=lm)
rfe.fit(X=X_train, y=y_train)

RFE(estimator=LinearRegression(), n_features_to_select=1)

In [37]:
feature_mask_rank = rfe.ranking_
feature_mask_rank

array([3, 2, 1, 4])

In [38]:
# With one feature selected, the ranks give a full ordering of the columns.
pd.Series(rfe.ranking_, index=X_train.columns).sort_values()

tip_percentage      1
size                2
total_bill          3
price_per_person    4
dtype: int64

In [39]:
# Repeat SelectKBest on the tip-amount data, keeping only the single best column.
f_selector = SelectKBest(k=1, score_func=f_regression)
# Score every predictor column against the target.
f_selector.fit(X=X_train, y=y_train)

SelectKBest(k=1, score_func=<function f_regression at 0x7fc01b436d30>)

In [40]:
mask = f_selector.get_support()
X_train.columns[mask]

Index(['total_bill'], dtype='object')

_________

<hr style="border:2px solid black"> </hr>

### #2. Write a function named select_kbest that takes in the predictors (X), the target (y), and the number of features to select (k) and returns the names of the top k selected features based on the SelectKBest class. Test your function with the tips dataset. You should see the same results as when you did the process manually.

In [43]:
def select_kbest(X, y, k):
    """Return the names of the k columns of X that SelectKBest keeps.

    Every column of X is scored against y with f_regression and the k
    best-scoring columns are selected.

    Parameters
    ----------
    X : pandas.DataFrame
        Predictor columns; the returned names are taken from X.columns.
    y : array-like
        Target values, one per row of X.
    k : int
        Number of features to keep (forwarded to SelectKBest as ``k``).

    Returns
    -------
    pandas.Index
        Names of the selected columns, in their original column order.
    """
    # Pass k by keyword: scikit-learn >= 1.0 makes it keyword-only, so the
    # positional form SelectKBest(f_regression, k) raises a TypeError there.
    selector = SelectKBest(score_func=f_regression, k=k)
    selector.fit(X, y)

    # get_support() is a boolean mask aligned with X's columns.
    return X.columns[selector.get_support()]

In [44]:
select_kbest(X_train, y_train, 2)

Index(['total_bill', 'size'], dtype='object')

<hr style="border:2px solid black"> </hr>

### #3 Write a function named rfe that takes in the predictors, the target, and the number of features to select. It should return the top k features based on the RFE class. Test your function with the tips dataset. You should see the same results as when you did the process manually.

In [41]:
def rfe(X, y, n):
    """Return the names of the n columns of X kept by recursive feature elimination.

    An RFE selector wrapped around a LinearRegression estimator is fitted
    on X and y, and the columns it marks as supported are returned.

    Parameters
    ----------
    X : pandas.DataFrame
        Predictor columns; the returned names are taken from X.columns.
    y : array-like
        Target values, one per row of X.
    n : int
        Number of features to keep (forwarded as ``n_features_to_select``).

    Returns
    -------
    pandas.Index
        Names of the selected columns, in their original column order.
    """
    # Pass n_features_to_select by keyword: scikit-learn >= 1.0 makes it
    # keyword-only, so the positional form RFE(lm, n) raises a TypeError there.
    # The local is named `selector` so it does not shadow this function's name.
    selector = RFE(estimator=LinearRegression(), n_features_to_select=n)
    selector.fit(X, y)

    # support_ is a boolean mask aligned with X's columns.
    return X.columns[selector.support_]

In [42]:
rfe(X_train, y_train, 2)

Index(['size', 'tip_percentage'], dtype='object')

<hr style="border:2px solid black"> </hr>

### #4 Load the swiss dataset and use all the other features to predict Fertility. Find the top 3 features using both select k best and recursive feature elimination (use the functions you just built to help you out).

In [45]:
import wrangle
import evaluate

In [46]:
df_swiss= data('swiss')

In [47]:
df_swiss.head()

Unnamed: 0,Fertility,Agriculture,Examination,Education,Catholic,Infant.Mortality
Courtelary,80.2,17.0,15,12,9.96,22.2
Delemont,83.1,45.1,6,9,84.84,22.2
Franches-Mnt,92.5,39.7,5,5,93.4,20.2
Moutier,85.8,36.5,12,7,33.77,20.3
Neuveville,76.9,43.5,17,15,5.16,20.6


In [48]:
df_swiss.info()

<class 'pandas.core.frame.DataFrame'>
Index: 47 entries, Courtelary to Rive Gauche
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Fertility         47 non-null     float64
 1   Agriculture       47 non-null     float64
 2   Examination       47 non-null     int64  
 3   Education         47 non-null     int64  
 4   Catholic          47 non-null     float64
 5   Infant.Mortality  47 non-null     float64
dtypes: float64(4), int64(2)
memory usage: 2.6+ KB


In [49]:
# Split the swiss data with the project's wrangle.split_data helper.
# NOTE: this rebinds train/validate/test, which held the tips splits until now.
train, validate, test= wrangle.split_data(df_swiss)

train -> (25, 6)
validate -> (12, 6)
test -> (10, 6)


In [50]:
# Predictors for the swiss model: every column except the target.
# NOTE: X_train / y_train are rebound here; they previously held the tips data.
X_train = train.drop(['Fertility'], axis=1)

# Target: Fertility.
y_train = train.loc[:, 'Fertility']

In [51]:
X_train.head()

Unnamed: 0,Agriculture,Examination,Education,Catholic,Infant.Mortality
Rolle,60.8,16,10,7.72,16.3
Lavaux,73.0,19,9,2.84,20.0
Nyone,50.9,22,12,15.14,16.7
Conthey,85.9,3,2,99.71,15.1
Yverdon,49.5,15,8,6.1,22.5


In [52]:
y_train.head()

Rolle      60.5
Lavaux     65.1
Nyone      56.6
Conthey    75.5
Yverdon    65.4
Name: Fertility, dtype: float64

In [53]:
#select K best model
select_kbest(X_train, y_train,3)

Index(['Examination', 'Catholic', 'Infant.Mortality'], dtype='object')

In [54]:
#RFE model
rfe(X_train, y_train, 3)

Index(['Agriculture', 'Examination', 'Infant.Mortality'], dtype='object')