In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import wrangle as w
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler, QuantileTransformer
import warnings
warnings.filterwarnings("ignore")
import time
import explore_regression as e
import seaborn as sns
from sklearn.feature_selection import SelectKBest, RFE, f_regression, SequentialFeatureSelector
from sklearn.linear_model import LinearRegression

Exercises

Do your work for this exercise in a jupyter notebook named feature_engineering within the regression-exercises repo. Add, commit, and push your work.

Load the tips dataset.

Create a column named price_per_person. This should be the total bill divided by the party size.
Before using any of the methods discussed in the lesson, which features do you think would be most important for predicting the tip amount?
Use Select K Best to select the top 2 features for predicting tip amount. What are they?
Use Recursive Feature Elimination to select the top 2 features for tip amount. What are they?
Why do you think Select K Best and Recursive Feature Elimination might give different answers for the top features? Does this change as you change the number of features you are selecting?
Write a function named select_kbest that takes in the predictors (X), the target (y), and the number of features to select (k) and returns the names of the top k selected features based on the SelectKBest class. Test your function with the tips dataset. You should see the same results as when you did the process manually.

Write a function named rfe that takes in the predictors, the target, and the number of features to select. It should return the top n features based on the RFE class. Test your function with the tips dataset. You should see the same results as when you did the process manually.

Load the swiss dataset and use all the other features to predict Fertility. Find the top 3 features using both Select K Best and Recursive Feature Elimination (use the functions you just built to help you out).

In [2]:
import seaborn as sns
data=sns.load_dataset("tips")
data

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.00,Female,Yes,Sat,Dinner,2
241,22.67,2.00,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2


In [3]:
df = pd.DataFrame(data)

In [4]:
df.dtypes

total_bill     float64
tip            float64
sex           category
smoker        category
day           category
time          category
size             int64
dtype: object

In [5]:
# Create the 'price_per_person' column
df['price_per_person'] = round(df['total_bill'] / df['size'],2)

In [6]:
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person
0,16.99,1.01,Female,No,Sun,Dinner,2,8.49
1,10.34,1.66,Male,No,Sun,Dinner,3,3.45
2,21.01,3.5,Male,No,Sun,Dinner,3,7.0
3,23.68,3.31,Male,No,Sun,Dinner,2,11.84
4,24.59,3.61,Female,No,Sun,Dinner,4,6.15


In [7]:
df.sex.value_counts()

sex
Male      157
Female     87
Name: count, dtype: int64

In [8]:
df = pd.get_dummies(df, columns=['time', 'sex', 'day', 'smoker']).astype(int)

In [9]:
df.head()

Unnamed: 0,total_bill,tip,size,price_per_person,time_Lunch,time_Dinner,sex_Male,sex_Female,day_Thur,day_Fri,day_Sat,day_Sun,smoker_Yes,smoker_No
0,16,1,2,8,0,1,0,1,0,0,0,1,0,1
1,10,1,3,3,0,1,1,0,0,0,0,1,0,1
2,21,3,3,7,0,1,1,0,0,0,0,1,0,1
3,23,3,2,11,0,1,1,0,0,0,0,1,0,1
4,24,3,4,6,0,1,0,1,0,0,0,1,0,1


In [10]:
def X_y_split(df, target):
    train, val, test = split_data(df)
    X_train = train.drop(columns=target)
    y_train = train[target]
    X_val = val.drop(columns=target)
    y_val = val[target]
    X_test = test.drop(columns=target)
    y_test = test[target]
    print(f'X_train --> {X_train.shape}')
    print(f'X_val --> {X_val.shape}')
    print(f'X_test --> {X_test.shape}')
    
    return X_train, y_train, X_val, y_val, X_test, y_test

In [9]:
df.smoker.value_counts()

smoker
No     151
Yes     93
Name: count, dtype: int64

In [10]:
# Define a mapping dictionary
time_mapping = {'Lunch': 0, 'Dinner': 1}
sex_mapping = {'Male': 0, 'Female': 1}
day_mapping = {'Thur': 0, 'Fri': 1, 'Sat': 2, 'Sun':3}
smoker_mapping = {'No': 0, 'Yes': 1}
# Replace 'time' column with mapped integers
df['time'] = df['time'].map(time_mapping)
df['sex'] = df['sex'].map(sex_mapping)
df['day'] = df['day'].map(day_mapping)
df['smoker'] = df['smoker'].map(smoker_mapping)

In [11]:
df

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person
0,16.99,1.01,1,0,3,1,2,8.49
1,10.34,1.66,0,0,3,1,3,3.45
2,21.01,3.50,0,0,3,1,3,7.00
3,23.68,3.31,0,0,3,1,2,11.84
4,24.59,3.61,1,0,3,1,4,6.15
...,...,...,...,...,...,...,...,...
239,29.03,5.92,0,0,2,1,3,9.68
240,27.18,2.00,1,1,2,1,2,13.59
241,22.67,2.00,0,1,2,1,2,11.34
242,17.82,1.75,0,0,2,1,2,8.91


In [12]:
train, val, test = w.splitting_data(df)

In [13]:
train.sample(5)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person
97,12.03,1.5,0,1,1,1,2,6.02
116,29.93,5.07,0,0,3,1,4,7.48
3,23.68,3.31,0,0,3,1,2,11.84
163,13.81,2.0,0,0,3,1,2,6.9
232,11.61,3.39,0,0,2,1,2,5.8


In [14]:
kbest_0 = SelectKBest(f_regression, k=2)

In [15]:
kbest_0

In [16]:
train.dtypes

total_bill           float64
tip                  float64
sex                 category
smoker              category
day                 category
time                category
size                   int64
price_per_person     float64
dtype: object

In [17]:
X_train, y_train = train.drop(columns='tip'), train.tip

In [18]:
train.time.value_counts()

time
1    109
0     37
Name: count, dtype: int64

In [19]:
# fit the object that we just created:
kbest_0.fit(X_train, y_train)

In [20]:
kbest_0.scores_

array([1.34668687e+02, 1.52013078e+00, 1.23381880e-02, 2.48080804e+00,
       2.98486244e+00, 4.55437858e+01, 2.25674325e+01])

In [21]:
kbest_0.transform(X_train)[:5]

array([[11.02,  2.  ],
       [18.35,  4.  ],
       [18.04,  2.  ],
       [16.66,  2.  ],
       [ 7.25,  2.  ]])

In [22]:
kbest_0.get_feature_names_out()

array(['total_bill', 'size'], dtype=object)

In [24]:
# make a model object for our wrapper:
model = LinearRegression()

In [40]:
rfe = RFE(model, n_features_to_select=2)

In [41]:
rfe

In [42]:
rfe.fit(X_train, y_train)

In [43]:
rfe.ranking_

array([2, 4, 1, 6, 5, 1, 3])

In [44]:
ranking = pd.DataFrame(
{
    'feature': X_train.columns.to_list(),
    'rfe_ranking': rfe.ranking_
    
})

# Sorting the DataFrame by 'rfe_ranking'
sorted_ranking_df = ranking.sort_values(by='rfe_ranking')

# Displaying the sorted DataFrame
print(sorted_ranking_df)

            feature  rfe_ranking
2            smoker            1
5              size            1
0        total_bill            2
6  price_per_person            3
1               sex            4
4              time            5
3               day            6


In [45]:
# make a new rfe object
rfe_1 = RFE(model, n_features_to_select=1)
# fit the rfe object
rfe_1.fit(X_train, y_train)
pd.DataFrame(
{
    'feature': X_train.columns.to_list(),
    'rfe_1_ranking': rfe_1.ranking_
    
})

Unnamed: 0,feature,rfe_1_ranking
0,total_bill,3
1,sex,5
2,smoker,2
3,day,7
4,time,6
5,size,1
6,price_per_person,4


In [31]:
X_train.shape

(146, 7)

Write a function named select_kbest that takes in the predictors (X), the target (y), and the number of features to select (k) and returns the names of the top k selected features based on the SelectKBest class. Test your function with the tips dataset. You should see the same results as when you did the process manually.

In [78]:
def select_kbest(X,y,k):
    kbest_0 = SelectKBest(score_func=f_regression, k=k)
    
    kbest_0.fit(X, y)
    
    best_features = kbest_0.get_feature_names_out()
    
    return best_features

In [79]:
def select_kbest(X, y, k):
    kbest_0 = SelectKBest(score_func=f_regression, k=k)
    kbest_0.fit(X, y)
    
    # Get scores and indices of the top k features
    scores = kbest_0.scores_
    top_indices = kbest_0.get_support(indices=True)
    
    # Sort scores and indices based on scores
    top_scores = scores[top_indices]
    sorted_indices = list(reversed(sorted(range(len(top_scores)), key=lambda i: top_scores[i])))
    sorted_scores = [top_scores[i] for i in sorted_indices]
    sorted_features = [X.columns[i] for i in top_indices[sorted_indices]]
    
    # Create a DataFrame with feature names and their scores and rankings
    feature_ranking = pd.DataFrame({'feature': sorted_features, 'score': sorted_scores})
    feature_ranking['kbest_ranking'] = feature_ranking['score'].rank(ascending=False).astype(int)
    
    # Return the top k features
    best_features = feature_ranking['feature'].tolist()
    return feature_ranking

In [76]:
select_kbest(X_train_swiss, y_train_swiss, 5)

Unnamed: 0,feature,score,kbest_ranking
0,Education,26.044126,1
1,Examination,20.200522,2
2,Infant.Mortality,7.546899,3
3,Catholic,5.898451,4
4,Agriculture,5.769926,5


In [49]:
select_kbest(X_train, y_train, 2)

array(['total_bill', 'size'], dtype=object)

Write a function named rfe that takes in the predictors, the target, and the number of features to select. It should return the top n features based on the RFE class. Test your function with the tips dataset. You should see the same results as when you did the process manually.

In [56]:
def rfe(X,y,n):
    model = LinearRegression()
    
    rfe = RFE(model, n_features_to_select=n)
    
    # fit the rfe object
    rfe.fit(X, y)
    
    #place the rfe rankings into a dataframe for easy to read 
    df = pd.DataFrame(
    {
    'feature': X.columns.to_list(),
    'rfe_ranking': rfe.ranking_   
    })
    
    #give us the data frame back
    return df

In [55]:
rfe(X_train, y_train, 2)

Unnamed: 0,feature,rfe_ranking
0,total_bill,2
1,sex,4
2,smoker,1
3,day,6
4,time,5
5,size,1
6,price_per_person,3


Load the swiss dataset and use all the other features to predict Fertility. Find the top 3 features using both Select K Best and Recursive Feature Elimination (use the functions you just built to help you out).

In [59]:
from pydataset import data
swiss = data("swiss")

In [60]:
swiss.head()

Unnamed: 0,Fertility,Agriculture,Examination,Education,Catholic,Infant.Mortality
Courtelary,80.2,17.0,15,12,9.96,22.2
Delemont,83.1,45.1,6,9,84.84,22.2
Franches-Mnt,92.5,39.7,5,5,93.4,20.2
Moutier,85.8,36.5,12,7,33.77,20.3
Neuveville,76.9,43.5,17,15,5.16,20.6


In [61]:
swiss.dtypes

Fertility           float64
Agriculture         float64
Examination           int64
Education             int64
Catholic            float64
Infant.Mortality    float64
dtype: object

In [63]:
train_swiss, val_swiss, test_swiss = w.splitting_data(swiss)

In [66]:
X_train_swiss, y_train_swiss = train_swiss.drop(columns='Fertility'), train_swiss.Fertility

In [69]:
select_kbest(X_train_swiss, y_train_swiss, 3)

array(['Examination', 'Education', 'Infant.Mortality'], dtype=object)

In [70]:
rfe(X_train_swiss, y_train_swiss, 3)

Unnamed: 0,feature,rfe_ranking
0,Agriculture,2
1,Examination,1
2,Education,1
3,Catholic,3
4,Infant.Mortality,1
