In [38]:
import numpy as np
import pandas as pd

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler

from sklearn.feature_selection import SelectKBest, f_regression, RFE
from pydataset import data

#import warnings
#warnings.filterwarnings("ignore")

# 1.

In [39]:
df = data('tips')

# 1 A.

In [41]:
df.insert(7, "price_per_person",(df.total_bill/df.size), True)

# 1. B.

> ### Size seems most important for predicting `tip` amount

# 1. C. 

In [44]:
def train_val_test(df, target=None, stratify=None, seed=42):
    from sklearn.model_selection import train_test_split

    '''Split data into train, validate, and test subsets with 60/20/20 ratio'''
    
    train, val_test = train_test_split(df, train_size=0.6, random_state=seed)
    
    val, test = train_test_split(val_test, train_size=0.5, random_state=seed)
    
    return train, val, test

In [61]:
train, val, test = train_val_test(df,stratify='tip')

In [62]:
# Useful Variables
categorical_variables = ['sex','smoker','day','time']
continous_variables = ['total_bill','size','price_per_person']
target = ['tip']

#drop = categorical_variables+target

In [73]:
# Isolate Target
X_train_scaled = pd.DataFrame(train[continous_variables])
y_train = train[target]

In [74]:
# Scale Data
mms = MinMaxScaler()

In [75]:
X_train_scaled[continous_variables] = mms.fit_transform(X_train_scaled)
X_train_scaled.head()

Unnamed: 0,total_bill,size,price_per_person
195,0.240346,0.2,0.240346
77,0.270084,0.2,0.270084
42,0.259876,0.2,0.259876
109,0.277186,0.2,0.277186
224,0.227031,0.4,0.227031


In [76]:
f_selector = SelectKBest(f_regression, k=2)

f_selector.fit(X_train_scaled, y_train)

# boolean mask of whether the column was selected or not. 
feature_mask = f_selector.get_support()

  y = column_or_1d(y, warn=True)


In [78]:
f_feature = X_train_scaled.iloc[:,feature_mask].columns.tolist()

# Answer

In [79]:
f_feature

['total_bill', 'price_per_person']

# 1. D.

In [89]:
X_train_scaled.head()

Unnamed: 0,total_bill,size,price_per_person
195,0.240346,0.2,0.240346
77,0.270084,0.2,0.270084
42,0.259876,0.2,0.259876
109,0.277186,0.2,0.277186
224,0.227031,0.4,0.227031


In [90]:
X_train_scaled = pd.concat([X_train_scaled, train[categorical_variables]], axis=1)

In [91]:
X_train_scaled.head()

Unnamed: 0,total_bill,size,price_per_person,sex,smoker,day,time
195,0.240346,0.2,0.240346,Male,Yes,Thur,Lunch
77,0.270084,0.2,0.270084,Male,Yes,Sat,Dinner
42,0.259876,0.2,0.259876,Male,No,Sun,Dinner
109,0.277186,0.2,0.277186,Male,No,Sat,Dinner
224,0.227031,0.4,0.227031,Female,No,Fri,Lunch


In [93]:
X_train_scaled = pd.get_dummies(X_train_scaled, columns = categorical_variables)

In [95]:
X_train_scaled.head()

Unnamed: 0,total_bill,size,price_per_person,sex_Female,sex_Male,smoker_No,smoker_Yes,day_Fri,day_Sat,day_Sun,day_Thur,time_Dinner,time_Lunch
195,0.240346,0.2,0.240346,0,1,0,1,0,0,0,1,0,1
77,0.270084,0.2,0.270084,0,1,0,1,0,1,0,0,1,0
42,0.259876,0.2,0.259876,0,1,1,0,0,0,1,0,1,0
109,0.277186,0.2,0.277186,0,1,1,0,0,1,0,0,1,0
224,0.227031,0.4,0.227031,1,0,1,0,1,0,0,0,0,1


In [96]:
# initialize the ML algorithm
lm = LinearRegression()

# create the rfe object, indicating the ML object (lm) and the number of features I want to end up with. 
rfe = RFE(lm, n_features_to_select=2)

# fit the data using RFE
rfe.fit(X_train, y_train)

# get the mask of the columns selected
feature_mask = rfe.support_

# get list of the column names. 
rfe_feature = X_train.iloc[:,feature_mask].columns.tolist()
rfe_feature

['total_bill', 'size']

In [98]:
var_ranks = rfe.ranking_

# get the variable names
var_names = X_train.columns.tolist()

# combine ranks and names into a df for clean viewing
rfe_ranks_df = pd.DataFrame({'Var': var_names, 'Rank': var_ranks})

# sort the df by rank
rfe_ranks_df.sort_values('Rank')

Unnamed: 0,Var,Rank
0,total_bill,1
1,size,1
2,price_per_person,2


# Answer

In [131]:
rfe_ranks_df['Var'].head(2).tolist()

['total_bill', 'size']

# 1. E.

In [None]:
# Change # of Features

# 2.

In [116]:
X = pd.DataFrame(train[continous_variables])
y = train[target]

In [119]:
def select_kbest(X, y, k=2):
    f_selector = SelectKBest(f_regression, k = k)
    f_selector.fit(X, y)
    mask = f_selector.get_support()
    
    return X.columns[mask].tolist()

In [121]:
select_kbest(X,y)

  y = column_or_1d(y, warn=True)


['total_bill', 'price_per_person']

# 3. 

In [124]:
def rfe(X, y, k=2):
    rf = RFE(LinearRegression(), n_features_to_select = k)
    rf.fit(X, y)
    mask = rf.get_support()
    
    return X.columns[mask].tolist()

In [125]:
rfe(X,y)

['total_bill', 'size']

# 4. Swiss

In [101]:
swiss = data('swiss')

In [102]:
swiss.head()

Unnamed: 0,Fertility,Agriculture,Examination,Education,Catholic,Infant.Mortality
Courtelary,80.2,17.0,15,12,9.96,22.2
Delemont,83.1,45.1,6,9,84.84,22.2
Franches-Mnt,92.5,39.7,5,5,93.4,20.2
Moutier,85.8,36.5,12,7,33.77,20.3
Neuveville,76.9,43.5,17,15,5.16,20.6


In [112]:
#assign everything to X except Fertility because we are using it as our y
X= swiss.drop(columns=['Fertility'])
#assign y_train 
y = swiss['Fertility']

swiss_kbest = select_kbest(X, y,3)

swis_rfe = rfe(X, y, 3)

In [115]:
swiss_kbest

Index(['Examination', 'Education', 'Catholic'], dtype='object')

In [114]:
swis_rfe

Index(['Examination', 'Education', 'Infant.Mortality'], dtype='object')