In [1]:
import pandas as pd
import numpy as np 

import seaborn as sns
import matplotlib.pyplot as plt

from pydataset import data
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import f_regression, SelectKBest, RFE 
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split


import warnings
warnings.filterwarnings('ignore')

## 1. Load the tips dataset.

 **a. Create a column named price_per_person. This should be the total bill divided by the party size.<br>**

In [2]:
tips = sns.load_dataset('tips')

tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [3]:
# Average spend per diner: the bill divided by the number of people in the party.
tips = tips.assign(price_per_person=tips['total_bill'] / tips['size'])

tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person
0,16.99,1.01,Female,No,Sun,Dinner,2,8.495
1,10.34,1.66,Male,No,Sun,Dinner,3,3.446667
2,21.01,3.5,Male,No,Sun,Dinner,3,7.003333
3,23.68,3.31,Male,No,Sun,Dinner,2,11.84
4,24.59,3.61,Female,No,Sun,Dinner,4,6.1475


### **b. Before using any of the methods discussed in the lesson, which features do you think would be most important for predicting the tip amount?<br>**
 

In [4]:
# Correlate only the numeric columns: `tips` also holds categorical columns
# (sex, smoker, day, time), and recent pandas versions raise instead of
# silently dropping them when corr() is called on the whole frame.
tips.select_dtypes(include='number').corr()

Unnamed: 0,total_bill,tip,size,price_per_person
total_bill,1.0,0.675734,0.598315,0.647497
tip,0.675734,1.0,0.489299,0.347393
size,0.598315,0.489299,1.0,-0.175412
price_per_person,0.647497,0.347393,-0.175412,1.0


*I would say total_bill will be the most important for predicting the tip amount, since it has the highest correlation with tip (about 0.68) in the table above.*

In [5]:
# 60% of the rows go to train; the remaining 40% is held out in val_test.
train, val_test = train_test_split(tips, train_size=0.6, random_state=22)
    
# Split the held-out rows in half to get the validation and test sets.
val, test = train_test_split(val_test, train_size=0.5, random_state=22)

# Row/column counts of each split.
train.shape, val.shape, test.shape

((146, 8), (49, 8), (49, 8))

In [6]:
mms = MinMaxScaler()

# `train` was produced by train_test_split from `tips`; take an explicit copy
# before assigning columns so pandas does not emit SettingWithCopyWarning
# (which the global warnings filter at the top of the notebook would hide).
train = train.copy()

# Numeric predictors to rescale to [0, 1]; the scaler is fit on train only.
scale_cols = ['total_bill', 'size', 'price_per_person']
train[scale_cols] = mms.fit_transform(train[scale_cols])

train.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person
173,0.635882,3.18,Male,Yes,Sun,Dinner,0.2,0.961326
24,0.370084,3.18,Male,No,Sat,Dinner,0.2,0.518232
241,0.433053,2.0,Male,Yes,Sat,Dinner,0.2,0.623204
66,0.295625,2.47,Female,No,Sat,Dinner,0.2,0.394107
164,0.319046,3.0,Female,Yes,Sun,Dinner,0.2,0.433149


### **c. Use select k best to select the top 2 features for predicting tip amount. What are they?<br>**
 

In [7]:
X_train_scaled = train[['total_bill', 'size', 'price_per_person']]
y_train = train['tip']

In [8]:
f_selector = SelectKBest(f_regression, k = 2)

In [9]:
f_selector.fit(X_train_scaled, y_train)

In [10]:
f_select_mask = f_selector.get_support()
f_select_mask

array([ True,  True, False])

In [11]:
X_train_scaled.columns[f_select_mask].tolist()

['total_bill', 'size']

In [12]:
X_train_scaled.iloc[:,f_select_mask]

Unnamed: 0,total_bill,size
173,0.635882,0.2
24,0.370084,0.2
241,0.433053,0.2
66,0.295625,0.2
164,0.319046,0.2
107,0.489174,0.2
91,0.429076,0.2
138,0.285683,0.2
1,0.160627,0.4
39,0.623067,0.4


In [13]:
f_selector.transform(X_train_scaled)

array([[0.63588157, 0.2       ],
       [0.37008396, 0.2       ],
       [0.43305347, 0.2       ],
       [0.29562528, 0.2       ],
       [0.31904551, 0.2       ],
       [0.48917366, 0.2       ],
       [0.42907645, 0.2       ],
       [0.28568272, 0.2       ],
       [0.16062749, 0.4       ],
       [0.62306673, 0.4       ],
       [0.28457799, 0.2       ],
       [0.05921343, 0.2       ],
       [0.28060097, 0.2       ],
       [0.35373398, 0.2       ],
       [0.36986301, 0.2       ],
       [0.16062749, 0.2       ],
       [0.        , 0.        ],
       [0.25872735, 0.2       ],
       [0.21409633, 0.2       ],
       [0.32081308, 0.4       ],
       [0.32589483, 0.2       ],
       [0.46376491, 0.4       ],
       [0.46266019, 0.6       ],
       [0.39637649, 0.2       ],
       [0.1979673 , 0.2       ],
       [0.39438798, 0.2       ],
       [0.99867433, 0.6       ],
       [0.71122404, 0.6       ],
       [0.21188688, 0.2       ],
       [0.69774635, 0.6       ],
       [0.

***Using SelectKBest, we get total_bill and size as the top 2 features for predicting tip amount.***

### **d. Use recursive feature elimination to select the top 2 features for tip amount. What are they?<br>**
 

In [14]:
train.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person
173,0.635882,3.18,Male,Yes,Sun,Dinner,0.2,0.961326
24,0.370084,3.18,Male,No,Sat,Dinner,0.2,0.518232
241,0.433053,2.0,Male,Yes,Sat,Dinner,0.2,0.623204
66,0.295625,2.47,Female,No,Sat,Dinner,0.2,0.394107
164,0.319046,3.0,Female,Yes,Sun,Dinner,0.2,0.433149


In [15]:
X_train = train.drop(columns = ['tip'])
X_train.head(3)

Unnamed: 0,total_bill,sex,smoker,day,time,size,price_per_person
173,0.635882,Male,Yes,Sun,Dinner,0.2,0.961326
24,0.370084,Male,No,Sat,Dinner,0.2,0.518232
241,0.433053,Male,Yes,Sat,Dinner,0.2,0.623204


In [49]:
X_train_dum = pd.get_dummies(X_train, columns = ['sex', 'smoker', 'day', 'time'])

X_train_dum.head()

Unnamed: 0,total_bill,size,price_per_person,sex_Male,sex_Female,smoker_Yes,smoker_No,day_Thur,day_Fri,day_Sat,day_Sun,time_Lunch,time_Dinner
173,0.635882,0.2,0.961326,1,0,1,0,0,0,0,1,0,1
24,0.370084,0.2,0.518232,1,0,0,1,0,0,1,0,0,1
241,0.433053,0.2,0.623204,1,0,1,0,0,0,1,0,0,1
66,0.295625,0.2,0.394107,0,1,0,1,0,0,1,0,0,1
164,0.319046,0.2,0.433149,0,1,1,0,0,0,0,1,0,1


In [50]:
lm = LinearRegression()

rfe = RFE(lm, n_features_to_select = 2)

In [51]:
rfe.fit(X_train_dum, y_train)

In [52]:
ranks = rfe.ranking_

columns = X_train_dum.columns.tolist()

In [53]:
feature_ranks = pd.DataFrame({'ranking': ranks,
                             'feature': columns})

In [54]:
# Best-ranked features first, renumbered from 0 without keeping the old index.
feature_ranks.sort_values('ranking').reset_index(drop=True)

Unnamed: 0,ranking,feature
0,1,total_bill
1,1,price_per_person
2,2,size
3,3,day_Fri
4,4,smoker_No
5,5,smoker_Yes
6,6,time_Lunch
7,7,day_Thur
8,8,day_Sat
9,9,day_Sun


***Using recursive feature elimination, we get total_bill and price_per_person as the top 2 features for predicting tip amount.***

### **e. Why do you think select k best and recursive feature elimination might give different answers for the top features? Does this change as you change the number of features you are selecting?**

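***I think it comes from how each method evaluates features and from which features it is given. SelectKBest scores every feature on its own with a univariate F-test against the target, while recursive feature elimination fits a model on all the features together and repeatedly drops the one with the weakest coefficient, so correlated features can be ranked differently. Here the two were also run on different feature sets: SelectKBest on the three numeric columns, RFE on those plus the dummy-encoded categorical columns. And yes, the selection does change as you change the number of features you are selecting.***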

## 2. Write a function named select_kbest that takes in the predictors (X), the target (y), and the number of features to select (k) and returns the names of the top k selected features based on the SelectKBest class. Test your function with the tips dataset. You should see the same results as when you did the process manually.

In [31]:
def select_kbest(x, y, k):
    """Select the k best predictors of y using SelectKBest with f_regression.

    Parameters
    ----------
    x : pandas.DataFrame
        Predictor columns (must support ``.iloc`` column selection).
    y : array-like
        Target values, aligned row-for-row with ``x``.
    k : int
        Number of features to keep.

    Returns
    -------
    pandas.DataFrame
        The columns of ``x`` chosen by the selector, in their original order.
        The selected feature names are available as ``.columns.tolist()``
        on the returned frame.
    """
    f_selector = SelectKBest(f_regression, k=k)
    f_selector.fit(x, y)

    # Boolean mask over x's columns: True where the feature was kept.
    f_select_mask = f_selector.get_support()

    return x.iloc[:, f_select_mask]

In [32]:
select_kbest(X_train_scaled, y_train, 2)

Unnamed: 0,total_bill,size
173,0.635882,0.2
24,0.370084,0.2
241,0.433053,0.2
66,0.295625,0.2
164,0.319046,0.2
107,0.489174,0.2
91,0.429076,0.2
138,0.285683,0.2
1,0.160627,0.4
39,0.623067,0.4


## 3. Write a function named rfe that takes in the predictors, the target, and the number of features to select. It should return the top k features based on the RFE class. Test your function with the tips dataset. You should see the same results as when you did the process manually.

In [63]:
def rfe(x, y, k):
    """Rank the columns of x for predicting y via recursive feature elimination.

    A LinearRegression estimator is wrapped in RFE configured to keep ``k``
    features, then fit on ``x`` and ``y``.

    Parameters
    ----------
    x : pandas.DataFrame
        Predictor columns; column names are used as the feature labels.
    y : array-like
        Target values, aligned row-for-row with ``x``.
    k : int
        Number of features RFE should select.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``x`` with columns ``ranking`` and ``feature``,
        sorted by ascending ranking with a fresh 0-based index. Per the
        scikit-learn RFE documentation, the selected features carry rank 1.
    """
    selector = RFE(LinearRegression(), n_features_to_select=k)
    selector.fit(x, y)

    ranking_table = pd.DataFrame({
        'ranking': selector.ranking_,
        'feature': x.columns.tolist(),
    })

    return ranking_table.sort_values('ranking').reset_index(drop=True)

In [64]:
rfe(X_train_dum, y_train, 2)

Unnamed: 0,ranking,feature
0,1,total_bill
1,1,price_per_person
2,2,size
3,3,day_Fri
4,4,smoker_No
5,5,smoker_Yes
6,6,time_Lunch
7,7,day_Thur
8,8,day_Sat
9,9,day_Sun


## 4. Load the swiss dataset and use all the other features to predict Fertility. Find the top 3 features using both select k best and recursive feature elimination (use the functions you just built to help you out).

In [55]:
swiss = data('swiss')
swiss.head()

Unnamed: 0,Fertility,Agriculture,Examination,Education,Catholic,Infant.Mortality
Courtelary,80.2,17.0,15,12,9.96,22.2
Delemont,83.1,45.1,6,9,84.84,22.2
Franches-Mnt,92.5,39.7,5,5,93.4,20.2
Moutier,85.8,36.5,12,7,33.77,20.3
Neuveville,76.9,43.5,17,15,5.16,20.6


In [56]:
swiss.info()

<class 'pandas.core.frame.DataFrame'>
Index: 47 entries, Courtelary to Rive Gauche
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Fertility         47 non-null     float64
 1   Agriculture       47 non-null     float64
 2   Examination       47 non-null     int64  
 3   Education         47 non-null     int64  
 4   Catholic          47 non-null     float64
 5   Infant.Mortality  47 non-null     float64
dtypes: float64(4), int64(2)
memory usage: 2.6+ KB


In [57]:
# NOTE: this rebinds train / val_test / val / test, which held the tips splits
# earlier in the notebook; cells below this point see the swiss data.
# 60% of the rows go to train; the remaining 40% is held out in val_test.
train, val_test = train_test_split(swiss, train_size=0.6, random_state=22)
    
# Split the held-out rows in half to get the validation and test sets.
val, test = train_test_split(val_test, train_size=0.5, random_state=22)

# Row/column counts of each split.
train.shape, val.shape, test.shape

((28, 6), (9, 6), (10, 6))

In [58]:
mms = MinMaxScaler()

# `train` was produced by train_test_split from `swiss`; take an explicit copy
# before assigning columns so pandas does not emit SettingWithCopyWarning
# (which the global warnings filter at the top of the notebook would hide).
train = train.copy()

# Every predictor of Fertility is rescaled to [0, 1]; the scaler is fit on train only.
swiss_scale_cols = ['Agriculture', 'Examination', 'Education', 'Catholic', 'Infant.Mortality']
train[swiss_scale_cols] = mms.fit_transform(train[swiss_scale_cols])

train.head()

Unnamed: 0,Fertility,Agriculture,Examination,Education,Catholic,Infant.Mortality
Glane,92.4,0.752542,0.323529,0.134615,0.970976,0.892405
Paysd'enhaut,72.0,0.703955,0.088235,0.038462,0.00419,0.455696
Nyone,56.6,0.561582,0.558824,0.211538,0.132754,0.373418
Lavaux,65.1,0.811299,0.470588,0.153846,0.007052,0.582278
Broye,83.8,0.779661,0.382353,0.115385,0.926929,0.810127


In [40]:
X_train_scaled_swiss = train[['Agriculture', 'Examination', 'Education', 'Catholic', 'Infant.Mortality']]
y_train_swiss = train['Fertility']

In [59]:
select_kbest(X_train_scaled_swiss, y_train_swiss, 3)

Unnamed: 0,Examination,Education,Catholic
Glane,0.323529,0.134615,0.970976
Paysd'enhaut,0.088235,0.038462,0.00419
Nyone,0.558824,0.211538,0.132754
Lavaux,0.470588,0.153846,0.007052
Broye,0.382353,0.115385,0.926929
Lausanne,0.676471,0.519231,0.101788
Val de Ruz,0.352941,0.115385,0.02882
Franches-Mnt,0.058824,0.076923,0.93255
Neuchatel,0.941176,0.596154,0.150945
Rolle,0.382353,0.173077,0.056924


In [65]:
rfe(X_train_scaled_swiss, y_train_swiss, 3)

Unnamed: 0,ranking,feature
0,1,Examination
1,1,Education
2,1,Infant.Mortality
3,2,Agriculture
4,3,Catholic
