In [1]:
import pandas as pd
import numpy as np
import wrangle, prepare, explore
import warnings
import pydataset as data
import seaborn as sns
warnings.filterwarnings("ignore")
from sklearn.feature_selection import SelectKBest, f_regression, chi2
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE
from sklearn.preprocessing import MinMaxScaler

## 1. Load the tips dataset

In [2]:
# Load seaborn's bundled 'tips' example dataset.
tips_df = sns.load_dataset('tips')
# Bare last expression: the notebook renders the frame as this cell's output.
tips_df

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.00,Female,Yes,Sat,Dinner,2
241,22.67,2.00,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2


#### a./b. Create a column for the tip_percentage and for the price_per_person

In [3]:
# Tip as a fraction of the bill (0.15 == 15%), computed row by row.
tips_df['tip_percentage'] = tips_df['tip'].div(tips_df['total_bill'])

In [4]:
# Bill total split evenly across the party ('size' is the number of diners).
tips_df['price_per_person'] = tips_df['total_bill'].div(tips_df['size'])

In [5]:
# Re-display the frame to confirm the two engineered columns were added.
tips_df

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_percentage,price_per_person
0,16.99,1.01,Female,No,Sun,Dinner,2,0.059447,8.495000
1,10.34,1.66,Male,No,Sun,Dinner,3,0.160542,3.446667
2,21.01,3.50,Male,No,Sun,Dinner,3,0.166587,7.003333
3,23.68,3.31,Male,No,Sun,Dinner,2,0.139780,11.840000
4,24.59,3.61,Female,No,Sun,Dinner,4,0.146808,6.147500
...,...,...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3,0.203927,9.676667
240,27.18,2.00,Female,Yes,Sat,Dinner,2,0.073584,13.590000
241,22.67,2.00,Male,Yes,Sat,Dinner,2,0.088222,11.335000
242,17.82,1.75,Male,No,Sat,Dinner,2,0.098204,8.910000


#### c. Before using any of the methods discussed in the lesson, which features do you think would be most important for predicting the tip amount? The tip percentage?
> I'd assume it's `total_bill` and `size`. `tip_percentage` will of course be predictive of the tip amount, since the tip can be computed directly from the percentage and the total bill.

In [6]:
# Split the tips data into three frames using the project-local `prepare` helper.
# NOTE(review): split proportions and any random seed live in prepare.py, not here — confirm there.
train, validate, test = prepare.train_validate_test_split(tips_df)


In [7]:
# Candidate numeric predictors of `tip`; the same columns are pulled from every split.
tip_feature_cols = ['total_bill','size','price_per_person','tip_percentage']

# Predictors (X_*) and target (y_*) for each of the three splits.
X_train, y_train = train[tip_feature_cols], train['tip']
X_validate, y_validate = validate[tip_feature_cols], validate['tip']
X_test, y_test = test[tip_feature_cols], test['tip']

#### d. Use all the other numeric features to predict tip amount. Use select k best and recursive feature elimination to select the top 2 features. What are they?


In [8]:
# Score each training predictor against `tip` with f_regression and keep the top two.
X, y = X_train, y_train
f_selector = SelectKBest(f_regression, k=2).fit(X, y)

# One boolean per column of X: True where the column was selected.
feature_mask = f_selector.get_support()

# Translate the mask back into column names.
f_feature = X.columns[feature_mask].tolist()
f_feature

['total_bill', 'size']

In [9]:
# Recursive feature elimination over the four candidate predictors of `tip`.
# NOTE(review): unlike the SelectKBest cell above, this fits on the full tips_df
# rather than the train split — confirm that is intended.
X = tips_df[['total_bill','size','price_per_person','tip_percentage']]
y = tips_df['tip']

lm = LinearRegression()
# n_features_to_select is keyword-only in scikit-learn 1.0+; passing it
# positionally raises TypeError there. The selector is also kept under its own
# name so it does not clobber the `rfe` function defined later in this notebook.
rfe_selector = RFE(lm, n_features_to_select=2)
X_rfe = rfe_selector.fit_transform(X, y)
# support_ holds one boolean per column of X: True where the column survived.
mask = rfe_selector.support_
rfe_features = X.loc[:,mask].columns.tolist()
rfe_features

['total_bill', 'tip_percentage']

#### e. Use all the other numeric features to predict tip percentage. Use select k best and recursive feature elimination to select the top 2 features. What are they?

In [10]:
# Predictors of `tip_percentage`: every other numeric column. The target itself
# must not appear among the predictors, and all three splits must expose the
# same columns so a model fit on X_train can be applied to X_validate / X_test.
pct_feature_cols = ['total_bill','size','price_per_person','tip']

X_train = train[pct_feature_cols]
y_train = train['tip_percentage']
X_validate = validate[pct_feature_cols]
y_validate = validate['tip_percentage']
X_test = test[pct_feature_cols]
y_test = test['tip_percentage']

In [11]:
# Rank the candidate predictors of `tip_percentage` over the full tips frame
# and keep the two with the highest f_regression scores.
X = tips_df[['total_bill','size','price_per_person','tip']]
y = tips_df['tip_percentage']
f_selector = SelectKBest(f_regression, k=2).fit(X, y)

# True for every column of X that was selected.
feature_mask = f_selector.get_support()

# Names of the selected columns, in their original column order.
f_feature = [col for col, kept in zip(X.columns, feature_mask) if kept]
f_feature

['total_bill', 'tip']

In [12]:
# Recursive feature elimination over the candidate predictors of `tip_percentage`.
X = tips_df[['total_bill','size','price_per_person','tip']]
y = tips_df['tip_percentage']

lm = LinearRegression()
# n_features_to_select is keyword-only in scikit-learn 1.0+; passing it
# positionally raises TypeError there. The selector is also kept under its own
# name so it does not clobber the `rfe` function defined later in this notebook.
rfe_selector = RFE(lm, n_features_to_select=2)
X_rfe = rfe_selector.fit_transform(X, y)
# support_ holds one boolean per column of X: True where the column survived.
mask = rfe_selector.support_
rfe_features = X.loc[:,mask].columns.tolist()
rfe_features

['size', 'tip']

## 2. Write a function named select_kbest that takes in the predictors (X), the target (y), and the number of features to select (k) and returns the names of the top k selected features based on the SelectKBest class. Test your function with the tips dataset. You should see the same results as when you did the process manually.

In [13]:
def select_kbest(X, y, stats = f_regression, k = 2):
    '''
    Select the k best predictors of y with scikit-learn's SelectKBest.

    Parameters
    ----------
    X : DataFrame of candidate predictors (column names are what is returned).
    y : target values, one per row of X.
    stats : scoring function handed to SelectKBest (defaults to f_regression).
    k : number of features to keep.

    Returns
    -------
    list of the selected column names, in the column order of X.
    '''
    # k must be passed by keyword: it is keyword-only in scikit-learn 1.0+.
    selector = SelectKBest(stats, k=k).fit(X, y)
    # get_support() is a boolean mask aligned with X.columns.
    return X.columns[selector.get_support()].tolist()

In [14]:
# X and y still hold the tip_percentage predictors/target assigned in the RFE cell above,
# so this should agree with the manual SelectKBest result for tip_percentage.
select_kbest(X,y)

The best features are:['total_bill', 'tip']


## 3. Write a function named rfe that takes in the predictors, the target, and the number of features to select. It should return the top k features based on the RFE class. Test your function with the tips dataset. You should see the same results as when you did the process manually.

In [15]:
def rfe(X,y, k = 2, rankings = False):
    '''
    Select the top k predictors of y by recursive feature elimination (RFE)
    around a LinearRegression estimator.

    Parameters
    ----------
    X : DataFrame of candidate predictors.
    y : target values, one per row of X.
    k : number of features to keep.
    rankings : when True, return the RFE rank of every column instead of the
               selected names.

    Returns
    -------
    list of the selected column names (in the column order of X), or, when
    rankings is True, a Series mapping each column name to its RFE ranking.
    '''
    # n_features_to_select is keyword-only in scikit-learn 1.0+. The selector
    # gets its own name so it does not shadow this function.
    selector = RFE(LinearRegression(), n_features_to_select=k)
    selector.fit(X, y)
    if rankings:
        return pd.Series(dict(zip(X.columns, selector.ranking_)))
    # get_support() is a boolean mask aligned with X.columns.
    return X.columns[selector.get_support()].tolist()

In [16]:
# Same X / y as the manual RFE cell for tip_percentage; k is left at its default of 2.
rfe(X,y, rankings = False)

Best features are ['size', 'tip']


## 4. Load the swiss dataset and use all the other features to predict Fertility. Find the top 3 features using both select k best and recursive feature elimination (use the functions you just built to help you out).

In [17]:
# Pull the 'swiss' dataset from pydataset (imported as `data`).
swiss_df = data.data('swiss')
# Summary statistics, transposed so each variable is one row.
swiss_df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Fertility,47.0,70.142553,12.491697,35.0,64.7,70.4,78.45,92.5
Agriculture,47.0,50.659574,22.711218,1.2,35.9,54.1,67.65,89.7
Examination,47.0,16.489362,7.977883,3.0,12.0,16.0,22.0,37.0
Education,47.0,10.978723,9.615407,1.0,6.0,8.0,12.0,53.0
Catholic,47.0,41.14383,41.70485,2.15,5.195,15.14,93.125,100.0
Infant.Mortality,47.0,19.942553,2.912697,10.8,18.15,20.0,21.7,26.6


In [18]:
# Check dtypes and non-null counts before modelling.
swiss_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 47 entries, Courtelary to Rive Gauche
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Fertility         47 non-null     float64
 1   Agriculture       47 non-null     float64
 2   Examination       47 non-null     int64  
 3   Education         47 non-null     int64  
 4   Catholic          47 non-null     float64
 5   Infant.Mortality  47 non-null     float64
dtypes: float64(4), int64(2)
memory usage: 2.6+ KB


In [19]:
# Unpack in the order the helper's name states (train, validate, test), matching
# the tips split earlier in this notebook; the previous train/test/validate order
# bound the validate and test frames to each other's names.
train, validate, test = prepare.train_validate_test_split(swiss_df)

In [20]:
# Fertility is the target; every remaining column is a predictor.
target = 'Fertility'

X_train, y_train = train.drop(columns = target), train[target]
X_validate, y_validate = validate.drop(columns = target), validate[target]
X_test, y_test = test.drop(columns = target), test[target]

In [21]:
def minmax_scale(data_set, fit = X_train):
    '''
    Apply a MinMaxScaler to a numeric dataframe (or dataframe slice).

    Parameters
    ----------
    data_set : DataFrame to transform; every column must be numeric.
    fit : DataFrame the scaler is fitted on. The default is X_train as it
          existed when this cell was run (default arguments are evaluated once,
          at definition time), so pass it explicitly if X_train has been
          reassigned since.

    Returns
    -------
    DataFrame of scaled values with the same column names and the same row
    index as data_set.
    '''
    scaler = MinMaxScaler().fit(fit)
    # transform() returns a bare ndarray; restore the column labels and keep the
    # original row index so the scaled frame still aligns with its target
    # Series by label.
    return pd.DataFrame(
        scaler.transform(data_set),
        columns=data_set.columns,
        index=data_set.index,
    )

In [28]:
# Uses the `prepare` module's minmax_scale, not the function of the same name defined in this notebook.
# NOTE(review): presumably (data_to_scale, data_to_fit_on) like the local version — confirm in prepare.py.
X_train_scaled = prepare.minmax_scale(X_train, X_train)

In [29]:
# Short aliases for the scaled training predictors and the training target.
X, y = X_train_scaled, y_train

In [30]:
# Calls the copy of select_kbest in the project's `explore` module, not the notebook-level definition.
explore.select_kbest(X,y, k = 3)

The best features are:['Examination', 'Catholic', 'Infant.Mortality']


In [31]:
# Top 3 Fertility predictors by RFE, using the notebook-level rfe() defined above.
rfe(X,y, k = 3)

Best features are ['Agriculture', 'Examination', 'Infant.Mortality']
