# Feature Selection

In [30]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

sns.set(style="ticks")

import scipy.stats as ss
from collections import Counter
import math 
from scipy import stats

In [31]:
# seaborn dataset names
print(sns.get_dataset_names())

['anscombe', 'attention', 'brain_networks', 'car_crashes', 'diamonds', 'dots', 'exercise', 'flights', 'fmri', 'gammas', 'iris', 'mpg', 'planets', 'tips', 'titanic']




  gh_list = BeautifulSoup(http)


## Dataset

In [173]:
tips = sns.load_dataset("tips")
#tips["tip"] = pd.to_numeric(tips["tip"])
tips.head(10)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
5,25.29,4.71,Male,No,Sun,Dinner,4
6,8.77,2.0,Male,No,Sun,Dinner,2
7,26.88,3.12,Male,No,Sun,Dinner,4
8,15.04,1.96,Male,No,Sun,Dinner,2
9,14.78,3.23,Male,No,Sun,Dinner,2


In [174]:
tips["total_bill_cut"] = pd.cut(tips["total_bill"],
                                np.arange(0, 60, 5),
                                include_lowest=True,
                                right=False)

In [175]:
tips.dtypes

total_bill         float64
tip                float64
sex               category
smoker            category
day               category
time              category
size                 int64
total_bill_cut    category
dtype: object

In [176]:
tips.head(10)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,total_bill_cut
0,16.99,1.01,Female,No,Sun,Dinner,2,"[15, 20)"
1,10.34,1.66,Male,No,Sun,Dinner,3,"[10, 15)"
2,21.01,3.5,Male,No,Sun,Dinner,3,"[20, 25)"
3,23.68,3.31,Male,No,Sun,Dinner,2,"[20, 25)"
4,24.59,3.61,Female,No,Sun,Dinner,4,"[20, 25)"
5,25.29,4.71,Male,No,Sun,Dinner,4,"[25, 30)"
6,8.77,2.0,Male,No,Sun,Dinner,2,"[5, 10)"
7,26.88,3.12,Male,No,Sun,Dinner,4,"[25, 30)"
8,15.04,1.96,Male,No,Sun,Dinner,2,"[15, 20)"
9,14.78,3.23,Male,No,Sun,Dinner,2,"[10, 15)"


In [177]:
tips['size'].unique()

array([2, 3, 4, 1, 6, 5])

In [178]:
len(tips)

244

### Preprocessing

In [74]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

In [76]:
# Check incomplete rows
incomplete_rows = tips[tips.isnull().any(axis=1)].head()
incomplete_rows

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,total_bill_cut


In [108]:
num_attrs = [""]
cat_attrs = list(tips.drop(["total_bill", "tip"], axis=1))
num_attrs, cat_attrs

([''], ['sex', 'smoker', 'day', 'time', 'size', 'total_bill_cut'])

In [181]:
labels = tips["tip"].astype("float64")

In [79]:
num_pipeline = Pipeline([
        ('std_scaler', StandardScaler())
    ])
cat_pipeline = num_pipeline = Pipeline([
       ("cat", OneHotEncoder(categories='auto'))
    ])

In [80]:
full_pipeline = ColumnTransformer([
        ("num", num_pipeline, num_attrs),
        ("cat", cat_pipeline, cat_attrs),
    ])
tips_preprocessed = full_pipeline.fit_transform(tips)

In [84]:
tips_preprocessed.shape

(244, 27)

In [87]:
# Confusion matrix
from sklearn.metrics import confusion_matrix

In [90]:
len(labels)

244

### Cramer

In [34]:
# https://stackoverflow.com/questions/46498455/categorical-features-correlation/46498792#46498792
def cramers_v(confusion_matrix):
    """ calculate Cramers V statistic for categorial-categorial association.
        uses correction from Bergsma and Wicher,
        Journal of the Korean Statistical Society 42 (2013): 323-328
    """
    chi2 = ss.chi2_contingency(confusion_matrix)[0]
    n = confusion_matrix.sum()
    phi2 = chi2 / n
    r, k = confusion_matrix.shape
    phi2corr = max(0, phi2 - ((k-1)*(r-1))/(n-1))
    rcorr = r - ((r-1)**2)/(n-1)
    kcorr = k - ((k-1)**2)/(n-1)
    return np.sqrt(phi2corr / min((kcorr-1), (rcorr-1)))

confusion_matrix = pd.crosstab(tips["day"], tips["time"]).values
cramers_v(confusion_matrix)
# Out[10]: 0.93866193407222209

confusion_matrix = pd.crosstab(tips["total_bill_cut"], tips["time"]).values
cramers_v(confusion_matrix)

# Out[24]: 0.16498707494988371

0.1649870749498837

In [35]:
# https://towardsdatascience.com/the-search-for-categorical-correlation-a1cf7f1888c9
def cramers_v(x, y):
    confusion_matrix = pd.crosstab(x,y)
    chi2 = ss.chi2_contingency(confaausion_matrix)[0]
    n = confusion_matrix.sum().sum()
    phi2 = chi2/n
    r,k = confusion_matrix.shape
    phi2corr = max(0, phi2-((k-1)*(r-1))/(n-1))
    rcorr = r-((r-1)**2)/(n-1)
    kcorr = k-((k-1)**2)/(n-1)
    return np.sqrt(phi2corr/min((kcorr-1),(rcorr-1)))

## Theils U

In [15]:
def theils_u(x, y):
    s_xy = conditional_entropy(x,y)
    x_counter = Counter(x)
    total_occurrences = sum(x_counter.values())
    p_x = list(map(lambda n: n/total_occurrences, x_counter.values()))
    s_x = ss.entropy(p_x)
    if s_x == 0:
        return 1
    else:
        return (s_x - s_xy) / s_x

## Correlation Ratio

In [16]:
def correlation_ratio(categories, measurements):
    fcat, _ = pd.factorize(categories)
    cat_num = np.max(fcat)+1
    y_avg_array = np.zeros(cat_num)
    n_array = np.zeros(cat_num)
    for i in range(0,cat_num):
        cat_measures = measurements[np.argwhere(fcat == i).flatten()]
        n_array[i] = len(cat_measures)
        y_avg_array[i] = np.average(cat_measures)
    y_total_avg = np.sum(np.multiply(y_avg_array,n_array))/np.sum(n_array)
    numerator = np.sum(np.multiply(n_array,np.power(np.subtract(y_avg_array,y_total_avg),2)))
    denominator = np.sum(np.power(np.subtract(measurements,y_total_avg),2))
    if numerator == 0:
        eta = 0.0
    else:
        eta = np.sqrt(numerator/denominator)
    return eta

### Preprocessing

In [109]:
# df = pd.concat([ tips[num_attrs] + pd.get_dummies(tips[cat_attrs]) ], axis=1)
df = pd.concat([ pd.get_dummies(tips[cat_attrs]) ], axis=1)
df.shape, df.columns

((244, 22),
 Index(['size', 'sex_Male', 'sex_Female', 'smoker_Yes', 'smoker_No', 'day_Thur',
        'day_Fri', 'day_Sat', 'day_Sun', 'time_Lunch', 'time_Dinner',
        'total_bill_cut_[0, 5)', 'total_bill_cut_[5, 10)',
        'total_bill_cut_[10, 15)', 'total_bill_cut_[15, 20)',
        'total_bill_cut_[20, 25)', 'total_bill_cut_[25, 30)',
        'total_bill_cut_[30, 35)', 'total_bill_cut_[35, 40)',
        'total_bill_cut_[40, 45)', 'total_bill_cut_[45, 50)',
        'total_bill_cut_[50, 55)'],
       dtype='object'))

In [113]:
df.head(2)

Unnamed: 0,size,sex_Male,sex_Female,smoker_Yes,smoker_No,day_Thur,day_Fri,day_Sat,day_Sun,time_Lunch,...,"total_bill_cut_[5, 10)","total_bill_cut_[10, 15)","total_bill_cut_[15, 20)","total_bill_cut_[20, 25)","total_bill_cut_[25, 30)","total_bill_cut_[30, 35)","total_bill_cut_[35, 40)","total_bill_cut_[40, 45)","total_bill_cut_[45, 50)","total_bill_cut_[50, 55)"
0,2,0,1,0,1,0,0,0,1,0,...,0,0,1,0,0,0,0,0,0,0
1,3,1,0,0,1,0,0,0,1,0,...,0,1,0,0,0,0,0,0,0,0


In [132]:
type(labels)

pandas.core.series.Series

In [133]:
#labels = pd.DataFrame(labels)

## Pearson Correlation

$$\frac{\sum_{i=1}^{n} (x_i - \overline{x})(y_i - \overline{y})} {\sqrt{\sum_{i=1}^{n} (x_i - \overline{x})^2(y_i - \overline{y})^2}}$$

In [92]:
# Number of features to select
num_features = 8

In [17]:
# https://www.kaggle.com/mlwhiz/feature-selection-using-football-data

In [134]:
def cor_selector(X, y,num_feats):
    cor_list = []
    feature_name = X.columns.tolist()
    # calculate the correlation with y for each feature
    for i in X.columns.tolist():
        cor = np.corrcoef(X[i], y)[0, 1]
        cor_list.append(cor)
    # replace NaN with 0
    cor_list = [0 if np.isnan(i) else i for i in cor_list]
    # feature name
    cor_feature = X.iloc[:,np.argsort(np.abs(cor_list))[-num_feats:]].columns.tolist()
    # feature selection? 0 for not select, 1 for select
    cor_support = [True if i in cor_feature else False for i in feature_name]
    return cor_support, cor_feature

In [182]:
cor_support, cor_feature = cor_selector(df, labels, num_features)

In [183]:
print(str(len(cor_feature)), 'selected features')

8 selected features


In [184]:
cor_feature

['total_bill_cut_[35, 40)',
 'total_bill_cut_[30, 35)',
 'total_bill_cut_[5, 10)',
 'total_bill_cut_[25, 30)',
 'total_bill_cut_[45, 50)',
 'total_bill_cut_[50, 55)',
 'total_bill_cut_[10, 15)',
 'size']

## Chi-Squared

In [185]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.preprocessing import MinMaxScaler, StandardScaler

In [186]:
df_norm = MinMaxScaler().fit_transform(df)

  return self.partial_fit(X, y)


In [233]:
chi_selector = SelectKBest(chi2, k=num_features)
chi_selector, df_norm.shape, labels.shape

(SelectKBest(k=8, score_func=<function chi2 at 0x1a1fb12ea0>),
 (244, 22),
 (244,))

In [242]:
chi_selector.fit(df_norm, labels.astype('int'))
#chi_selector.fit?
#SelectKBest?

SelectKBest(k=8, score_func=<function chi2 at 0x1a1fb12ea0>)

In [243]:
chi_support = chi_selector.get_support()
chi_feature = df.loc[:,chi_support].columns.tolist()
print(str(len(chi_feature)), 'selected features')

8 selected features


In [244]:
chi_feature

['total_bill_cut_[5, 10)',
 'total_bill_cut_[10, 15)',
 'total_bill_cut_[15, 20)',
 'total_bill_cut_[25, 30)',
 'total_bill_cut_[30, 35)',
 'total_bill_cut_[35, 40)',
 'total_bill_cut_[45, 50)',
 'total_bill_cut_[50, 55)']

## Recursive Feature Elimination

In [245]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
rfe_selector = RFE(estimator=LogisticRegression(), n_features_to_select=num_features, step=10, verbose=5)
rfe_selector.fit(df_norm, labels.astype("int"))

Fitting estimator with 22 features.
Fitting estimator with 12 features.




RFE(estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False),
  n_features_to_select=8, step=10, verbose=5)

In [246]:
rfe_support = rfe_selector.get_support()
rfe_feature = df.loc[:,rfe_support].columns.tolist()
print(str(len(rfe_feature)), 'selected features')

8 selected features


In [248]:
rfe_feature

['size',
 'sex_Female',
 'smoker_No',
 'time_Dinner',
 'total_bill_cut_[5, 10)',
 'total_bill_cut_[10, 15)',
 'total_bill_cut_[15, 20)',
 'total_bill_cut_[20, 25)']

## Lasso: SelectFromModel

In [250]:
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression

embeded_lr_selector = SelectFromModel(LogisticRegression(penalty="l1"), max_features=num_features)
embeded_lr_selector.fit(df_norm, labels.astype("int"))

embeded_lr_support = embeded_lr_selector.get_support()
embeded_lr_feature = df.loc[:,embeded_lr_support].columns.tolist()
print(str(len(embeded_lr_feature)), 'selected features')

8 selected features




In [251]:
embeded_lr_feature

['size',
 'smoker_No',
 'day_Sat',
 'total_bill_cut_[5, 10)',
 'total_bill_cut_[10, 15)',
 'total_bill_cut_[15, 20)',
 'total_bill_cut_[20, 25)',
 'total_bill_cut_[25, 30)']

## Tree-based: SelectFromModel

In [254]:
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier

embeded_rf_selector = SelectFromModel(RandomForestClassifier(n_estimators=100), max_features=num_features)
embeded_rf_selector.fit(df, labels.astype("int"))

embeded_rf_support = embeded_rf_selector.get_support()
embeded_rf_feature = df.loc[:,embeded_rf_support].columns.tolist()
print(str(len(embeded_rf_feature)), 'selected features')

8 selected features


In [256]:
embeded_rf_feature

['size',
 'sex_Male',
 'smoker_Yes',
 'smoker_No',
 'total_bill_cut_[5, 10)',
 'total_bill_cut_[10, 15)',
 'total_bill_cut_[15, 20)',
 'total_bill_cut_[20, 25)']

In [257]:
from sklearn.feature_selection import SelectFromModel
from lightgbm import LGBMClassifier

lgbc=LGBMClassifier(n_estimators=500, learning_rate=0.05, num_leaves=32, colsample_bytree=0.2,
            reg_alpha=3, reg_lambda=1, min_split_gain=0.01, min_child_weight=40)

embeded_lgb_selector = SelectFromModel(lgbc, max_features=num_features)
embeded_lgb_selector.fit(df, labels)

embeded_lgb_support = embeded_lgb_selector.get_support()
embeded_lgb_feature = df.loc[:,embeded_lgb_support].columns.tolist()
print(str(len(embeded_lgb_feature)), 'selected features')

ModuleNotFoundError: No module named 'lightgbm'

## Combined

In [260]:
pd.set_option('display.max_rows', None)
# put all selection together
feature_selection_df = pd.DataFrame({'Feature':df.columns, 'Pearson':cor_support, 'Chi-2':chi_support, 'RFE':rfe_support, 'Logistics':embeded_lr_support,
                                    'Random Forest':embeded_rf_support}) #, 'LightGBM':embeded_lgb_support})
# count the selected times for each feature
feature_selection_df['Total'] = np.sum(feature_selection_df, axis=1)
# display the top 100
feature_selection_df = feature_selection_df.sort_values(['Total','Feature'] , ascending=False)
feature_selection_df.index = range(1, len(feature_selection_df)+1)
feature_selection_df.head(num_features)

Unnamed: 0,Feature,Pearson,Chi-2,RFE,Logistics,Random Forest,Total
1,"total_bill_cut_[5, 10)",True,True,True,True,True,5
2,"total_bill_cut_[10, 15)",True,True,True,True,True,5
3,"total_bill_cut_[15, 20)",False,True,True,True,True,4
4,size,True,False,True,True,True,4
5,"total_bill_cut_[25, 30)",True,True,False,True,False,3
6,"total_bill_cut_[20, 25)",False,False,True,True,True,3
7,smoker_No,False,False,True,True,True,3
8,"total_bill_cut_[50, 55)",True,True,False,False,False,2
