In [71]:
import pandas as pd
import numpy as np
from scipy import stats
from pydataset import data

#viz and stats
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from scipy.stats import pearsonr, spearmanr

#import sklearn mean_square_error
from sklearn.metrics import mean_squared_error

#import r2_score from sklearn
from sklearn.metrics import r2_score

#my wrangle file 
import wrangle
#split data
from sklearn.model_selection import train_test_split
import evaluate
from sklearn.feature_selection import SelectKBest, RFE, f_regression, SequentialFeatureSelector

import env

## Exercises
### Do your work for this exercise in a jupyter notebook named feature_engineering within the regression-exercises repo. Add, commit, and push your work.

### 1. Load the tips dataset.

In [72]:
#get tips data
df = data('tips')

In [73]:
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
1,16.99,1.01,Female,No,Sun,Dinner,2
2,10.34,1.66,Male,No,Sun,Dinner,3
3,21.01,3.5,Male,No,Sun,Dinner,3
4,23.68,3.31,Male,No,Sun,Dinner,2
5,24.59,3.61,Female,No,Sun,Dinner,4


In [74]:
#look at shape
df.shape

(244, 7)

### a) Create a column named price_per_person. This should be the total bill divided by the party size.

In [75]:
#make dummy variables for sex, smoker, and time
df = pd.get_dummies(df, columns = ['sex','smoker','time'],drop_first=True)


In [76]:
df

Unnamed: 0,total_bill,tip,day,size,sex_Male,smoker_Yes,time_Lunch
1,16.99,1.01,Sun,2,0,0,0
2,10.34,1.66,Sun,3,1,0,0
3,21.01,3.50,Sun,3,1,0,0
4,23.68,3.31,Sun,2,1,0,0
5,24.59,3.61,Sun,4,0,0,0
...,...,...,...,...,...,...,...
240,29.03,5.92,Sat,3,1,0,0
241,27.18,2.00,Sat,2,0,1,0
242,22.67,2.00,Sat,2,1,1,0
243,17.82,1.75,Sat,2,1,0,0


In [77]:
# rearange the days & convert day to number
df.day = df['day'].astype('category')


In [78]:
df.day = df.day.cat.reorder_categories(['Thur','Fri','Sat','Sun'])
df.day

1       Sun
2       Sun
3       Sun
4       Sun
5       Sun
       ... 
240     Sat
241     Sat
242     Sat
243     Sat
244    Thur
Name: day, Length: 244, dtype: category
Categories (4, object): ['Thur', 'Fri', 'Sat', 'Sun']

In [79]:
df.day = df.day.cat.codes
df.day

1      3
2      3
3      3
4      3
5      3
      ..
240    2
241    2
242    2
243    2
244    0
Name: day, Length: 244, dtype: int8

In [80]:
#create a column named price per person
df['price_per_person'] = df.total_bill/df['size']

In [81]:
df['price_per_person'] 

1       8.495000
2       3.446667
3       7.003333
4      11.840000
5       6.147500
         ...    
240     9.676667
241    13.590000
242    11.335000
243     8.910000
244     9.390000
Name: price_per_person, Length: 244, dtype: float64

In [82]:
df

Unnamed: 0,total_bill,tip,day,size,sex_Male,smoker_Yes,time_Lunch,price_per_person
1,16.99,1.01,3,2,0,0,0,8.495000
2,10.34,1.66,3,3,1,0,0,3.446667
3,21.01,3.50,3,3,1,0,0,7.003333
4,23.68,3.31,3,2,1,0,0,11.840000
5,24.59,3.61,3,4,0,0,0,6.147500
...,...,...,...,...,...,...,...,...
240,29.03,5.92,2,3,1,0,0,9.676667
241,27.18,2.00,2,2,0,1,0,13.590000
242,22.67,2.00,2,2,1,1,0,11.335000
243,17.82,1.75,2,2,1,0,0,8.910000


In [83]:
#split data
train_validate, test = train_test_split(df, test_size=.2, random_state=123)
train, validate = train_test_split(train_validate, test_size=.3, random_state=123)

#make a list dodging the target
not_tip = df.columns.drop("tip").tolist()

#create the X and y sets
X_train,y_train = train[not_tip],train["tip"]
X_validate,y_validate = validate[not_tip],validate["tip"]
X_test,y_test = test[not_tip],test["tip"]


In [84]:
#check shape
train.shape

(136, 8)

In [85]:
#check tips info
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 136 entries, 19 to 167
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   total_bill        136 non-null    float64
 1   tip               136 non-null    float64
 2   day               136 non-null    int8   
 3   size              136 non-null    int64  
 4   sex_Male          136 non-null    uint8  
 5   smoker_Yes        136 non-null    uint8  
 6   time_Lunch        136 non-null    uint8  
 7   price_per_person  136 non-null    float64
dtypes: float64(3), int64(1), int8(1), uint8(3)
memory usage: 5.8 KB


In [86]:
#check train head
train.head(5)

Unnamed: 0,total_bill,tip,day,size,sex_Male,smoker_Yes,time_Lunch,price_per_person
19,16.97,3.5,3,3,0,0,0,5.656667
173,7.25,5.15,3,2,1,1,0,3.625
119,12.43,1.8,0,2,0,0,1,6.215
29,21.7,4.3,2,2,1,0,0,10.85
238,32.83,1.17,2,2,1,1,0,16.415


### b) Before using any of the methods discussed in the lesson, which features do you think would be most important for predicting the tip amount?


In [87]:
# total bill
# size

### c) Use select k best to select the top 2 features for predicting tip amount. What are they?

In [88]:
# make the thing
kbest = SelectKBest(f_regression, k=2)
# fit the thing
_ = kbest.fit(X_train, y_train)


In [89]:
# statistical f-value:
kbest.scores_
#p value: 
kbest.pvalues_

array([7.18647033e-20, 1.04585545e-01, 1.34164190e-12, 2.84479443e-01,
       5.57997755e-01, 1.82144884e-01, 1.31032690e-03])

In [90]:
kbest_results = pd.DataFrame(
    dict(p=kbest.pvalues_, f=kbest.scores_),
                             index = X_train.columns)

In [91]:
kbest_results

Unnamed: 0,p,f
total_bill,7.18647e-20,115.984909
day,0.1045855,2.670276
size,1.341642e-12,61.259089
sex_Male,0.2844794,1.154792
smoker_Yes,0.5579978,0.344909
time_Lunch,0.1821449,1.798647
price_per_person,0.001310327,10.777792


In [92]:
# total bill and size are the top 2 features for predicting tip amount

### d) Use recursive feature elimination to select the top 2 features for tip amount. What are they?

In [93]:
from sklearn.linear_model import LinearRegression

In [94]:
# make a model object to use in RFE process.
# The model is here to give us metrics on feature importance and model score
# allowing us to recursively reduce the number of features to reach our desired space
model = LinearRegression()

In [95]:
# make thing
rfe = RFE(model, n_features_to_select=2)
# fit thing
rfe.fit(X_train, y_train)

RFE(estimator=LinearRegression(), n_features_to_select=2)

In [96]:
rfe.ranking_

array([1, 5, 3, 1, 6, 4, 2])

In [97]:
pd.DataFrame(
{
    'rfe_ranking': rfe.ranking_
},index = X_train.columns)

Unnamed: 0,rfe_ranking
total_bill,1
day,5
size,3
sex_Male,1
smoker_Yes,6
time_Lunch,4
price_per_person,2


In [98]:
rfe.get_support()

array([ True, False, False,  True, False, False, False])

In [99]:
#mask the boolean to get the columns
X_train.columns[rfe.get_support()]

Index(['total_bill', 'sex_Male'], dtype='object')

In [100]:
X_train_transformed = pd.DataFrame(
rfe.transform(X_train),
index = X_train.index,
columns = X_train.columns[rfe.support_])

In [101]:
X_train_transformed.head()

Unnamed: 0,total_bill,sex_Male
19,16.97,0.0
173,7.25,1.0
119,12.43,0.0
29,21.7,1.0
238,32.83,1.0


In [102]:
#top 2 features for tip amount here are total bill & sex male

### e) Why do you think select k best and recursive feature elimination might give different answers for the top features? Does this change as you change the number of features you are selecting?

In [None]:
#It's because the feature selection changes as the number of features changes.

### 2. Write a function named select_kbest that takes in the predictors (X), the target (y), and the number of features to select (k) and returns the names of the top k selected features based on the SelectKBest class. Test your function with the tips dataset. You should see the same results as when you did the process manually.



In [103]:
def select_kbest(X, y, k=2):
    '''
    will take in two pandas objects:
    X: a dataframe representing numerical independent features
    y: a pandas Series representing a target variable
    k: a keyword argument defaulted to 2 for the number of ideal features we elect to select
    
    return: a list of the selected features from the SelectKBest process
    '''
    kbest = SelectKBest(f_regression, k=k)
    kbest.fit(X, y)
    mask = kbest.get_support()
    return X.columns[mask]
    

### 3. Write a function named rfe that takes in the predictors, the target, and the number of features to select. It should return the top k features based on the RFE class. Test your function with the tips dataset. You should see the same results as when you did the process manually.

In [104]:
def rfe(X, y, k=2):
    '''
    will take in two pandas objects:
    X: a dataframe representing numerical independent features
    y: a pandas Series representing a target variable
    k: a keyword argument defaulted to 2 for the number of ideal features we elect to select
    
    return: a list of the selected features from the recursive feature elimination process
    '''
    rf = RFE(LinearRegression(), n_features_to_select=k)
    rf.fit(X, y)
    mask = rf.get_support()
    return X.columns[mask]


### 4. Load the swiss dataset and use all the other features to predict Fertility. Find the top 3 features using both select k best and recursive feature elimination (use the functions you just built to help you out).

In [105]:
#acquire swiss database
df = data('swiss')

In [106]:
df

Unnamed: 0,Fertility,Agriculture,Examination,Education,Catholic,Infant.Mortality
Courtelary,80.2,17.0,15,12,9.96,22.2
Delemont,83.1,45.1,6,9,84.84,22.2
Franches-Mnt,92.5,39.7,5,5,93.4,20.2
Moutier,85.8,36.5,12,7,33.77,20.3
Neuveville,76.9,43.5,17,15,5.16,20.6
Porrentruy,76.1,35.3,9,7,90.57,26.6
Broye,83.8,70.2,16,7,92.85,23.6
Glane,92.4,67.8,14,8,97.16,24.9
Gruyere,82.4,53.3,12,7,97.67,21.0
Sarine,82.9,45.2,16,13,91.38,24.4


In [107]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 47 entries, Courtelary to Rive Gauche
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Fertility         47 non-null     float64
 1   Agriculture       47 non-null     float64
 2   Examination       47 non-null     int64  
 3   Education         47 non-null     int64  
 4   Catholic          47 non-null     float64
 5   Infant.Mortality  47 non-null     float64
dtypes: float64(4), int64(2)
memory usage: 2.6+ KB


In [112]:
#drop infant mortality to find the top 3 features to predict Fertility
X = df.drop(columns='Infant.Mortality')
y = df['Infant.Mortality']

In [113]:
rfe(X, y, k=3)

Index(['Fertility', 'Examination', 'Education'], dtype='object')

In [114]:
data('swiss', show_doc=True)

swiss

PyDataset Documentation (adopted from R Documentation. The displayed examples are in R)

## Swiss Fertility and Socioeconomic Indicators (1888) Data

### Description

Standardized fertility measure and socio-economic indicators for each of 47
French-speaking provinces of Switzerland at about 1888.

### Usage

    data(swiss)

### Format

A data frame with 47 observations on 6 variables, each of which is in percent,
i.e., in [0,100].

[,1] Fertility Ig, "common standardized fertility measure" [,2] Agriculture
[,3] Examination nation [,4] Education [,5] Catholic [,6] Infant.Mortality
live births who live less than 1 year.

All variables but 'Fert' give proportions of the population.

### Source

Project "16P5", pages 549-551 in

Mosteller, F. and Tukey, J. W. (1977) “Data Analysis and Regression: A Second
Course in Statistics”. Addison-Wesley, Reading Mass.

indicating their source as "Data used by permission of Franice van de Walle.
Office of Population Research, Princeton Univer

In [115]:
select_kbest(X, y, k=3)

Index(['Fertility', 'Examination', 'Catholic'], dtype='object')