In [113]:
import numpy as np
import pandas as pd

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler

from sklearn.feature_selection import SelectKBest, f_regression, RFE
from pydataset import data

#import warnings
#warnings.filterwarnings("ignore")

# 1.

In [118]:
df = data('tips')

# 1 A.

In [119]:
df.insert(7, "price_per_person",(df.total_bill/df.size), True)

# 1. B.

In [120]:
df.corr()

Unnamed: 0,total_bill,tip,size,price_per_person
total_bill,1.0,0.675734,0.598315,1.0
tip,0.675734,1.0,0.489299,0.675734
size,0.598315,0.489299,1.0,0.598315
price_per_person,1.0,0.675734,0.598315,1.0


> ### Size seems most important for predicting `tip` amount

# 1. C. 

In [121]:
def train_val_test(df, target=None, stratify=None, seed=42):
    from sklearn.model_selection import train_test_split

    '''Split data into train, validate, and test subsets with 60/20/20 ratio'''
    
    train, val_test = train_test_split(df, train_size=0.6, random_state=seed)
    
    val, test = train_test_split(val_test, train_size=0.5, random_state=seed)
    
    return train, val, test

In [122]:
train, val, test = train_val_test(df,stratify='tip')

In [123]:
# Useful Variables
categorical_variables = ['sex','smoker','day','time']
continous_variables = ['total_bill','size','price_per_person']
target = ['tip']

#drop = categorical_variables+target

In [124]:
# Isolate Target
X_train_scaled = pd.DataFrame(train[continous_variables])
y_train = train[target]

In [125]:
# Scale Data
mms = MinMaxScaler()

In [126]:
X_train_scaled[continous_variables] = mms.fit_transform(X_train_scaled)
X_train_scaled.head()

Unnamed: 0,total_bill,size,price_per_person
195,0.240346,0.2,0.240346
77,0.270084,0.2,0.270084
42,0.259876,0.2,0.259876
109,0.277186,0.2,0.277186
224,0.227031,0.4,0.227031


In [127]:
f_selector = SelectKBest(f_regression, k=2)

f_selector.fit(X_train_scaled, y_train)

# boolean mask of whether the column was selected or not. 
feature_mask = f_selector.get_support()

  y = column_or_1d(y, warn=True)


In [128]:
f_feature = X_train_scaled.iloc[:,feature_mask].columns.tolist()

# Answer

In [129]:
f_feature

['total_bill', 'price_per_person']

# 1. D.

In [130]:
X_train_scaled.head()

Unnamed: 0,total_bill,size,price_per_person
195,0.240346,0.2,0.240346
77,0.270084,0.2,0.270084
42,0.259876,0.2,0.259876
109,0.277186,0.2,0.277186
224,0.227031,0.4,0.227031


In [131]:
X_train_scaled = pd.concat([X_train_scaled, train[categorical_variables]], axis=1)

In [132]:
X_train_scaled.head()

Unnamed: 0,total_bill,size,price_per_person,sex,smoker,day,time
195,0.240346,0.2,0.240346,Male,Yes,Thur,Lunch
77,0.270084,0.2,0.270084,Male,Yes,Sat,Dinner
42,0.259876,0.2,0.259876,Male,No,Sun,Dinner
109,0.277186,0.2,0.277186,Male,No,Sat,Dinner
224,0.227031,0.4,0.227031,Female,No,Fri,Lunch


In [133]:
X_train_scaled = pd.get_dummies(X_train_scaled, columns = categorical_variables)

In [134]:
X_train_scaled.head()

Unnamed: 0,total_bill,size,price_per_person,sex_Female,sex_Male,smoker_No,smoker_Yes,day_Fri,day_Sat,day_Sun,day_Thur,time_Dinner,time_Lunch
195,0.240346,0.2,0.240346,0,1,0,1,0,0,0,1,0,1
77,0.270084,0.2,0.270084,0,1,0,1,0,1,0,0,1,0
42,0.259876,0.2,0.259876,0,1,1,0,0,0,1,0,1,0
109,0.277186,0.2,0.277186,0,1,1,0,0,1,0,0,1,0
224,0.227031,0.4,0.227031,1,0,1,0,1,0,0,0,0,1


In [136]:
# initialize the ML algorithm
lm = LinearRegression()

# create the rfe object, indicating the ML object (lm) and the number of features I want to end up with. 
rfe = RFE(lm, n_features_to_select=2)

# fit the data using RFE
rfe.fit(X_train_scaled, y_train)

# get the mask of the columns selected
feature_mask = rfe.support_

# get list of the column names. 
rfe_feature = X_train_scaled.iloc[:,feature_mask].columns.tolist()
rfe_feature

['total_bill', 'smoker_No']

In [137]:
var_ranks = rfe.ranking_

# get the variable names
var_names = X_train_scaled.columns.tolist()

# combine ranks and names into a df for clean viewing
rfe_ranks_df = pd.DataFrame({'Var': var_names, 'Rank': var_ranks})

# sort the df by rank
rfe_ranks_df.sort_values('Rank').head(2)

Unnamed: 0,Var,Rank
0,total_bill,1
5,smoker_No,1


# Answer

In [138]:
rfe_ranks_df.sort_values('Rank').head(2)

Unnamed: 0,Var,Rank
0,total_bill,1
5,smoker_No,1


# 1. E.

In [None]:
# Change # of Features

# 2.

In [139]:
def select_kbest(X, y, k=2):
    f_selector = SelectKBest(f_regression, k = k)
    f_selector.fit(X, y)
    mask = f_selector.get_support()
    
    return X.columns[mask].tolist()

In [140]:
X = pd.DataFrame(train[continous_variables])
y = train[target]

select_kbest(X,y)

  y = column_or_1d(y, warn=True)


['total_bill', 'price_per_person']

# 3. 

In [141]:
def rfe(X, y, k=2):
    rfe = RFE(LinearRegression(), n_features_to_select = k)
    rfe.fit(X, y)
    feature_mask = rfe.support_
    
    return X.iloc[:,feature_mask].columns.tolist()

In [107]:
X = pd.DataFrame(train[continous_variables])
X = pd.concat([X, train[categorical_variables]], axis=1)
X = pd.get_dummies(X, columns = categorical_variables)

rfe(X,y)

Unnamed: 0,size,sex_Female
195,2,0
77,2,0
42,2,0
109,2,0
224,3,1
79,2,0
145,2,1
27,2,0
236,2,0
214,2,1


In [None]:
rfe(X,y)

# 4. Swiss

In [None]:
swiss = data('swiss')

In [None]:
swiss.head()

In [None]:
#assign everything to X except Fertility because we are using it as our y
X= swiss.drop(columns=['Fertility'])
#assign y_train 
y = swiss['Fertility']

swiss_kbest = select_kbest(X, y,3)

swis_rfe = rfe(X, y, 3)

In [None]:
swiss_kbest

In [None]:
swis_rfe