# Feature Selection

In [30]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

sns.set(style="ticks")

import scipy.stats as ss
from collections import Counter
import math 
from scipy import stats

In [31]:
# seaborn dataset names
print(sns.get_dataset_names())

['anscombe', 'attention', 'brain_networks', 'car_crashes', 'diamonds', 'dots', 'exercise', 'flights', 'fmri', 'gammas', 'iris', 'mpg', 'planets', 'tips', 'titanic']




  gh_list = BeautifulSoup(http)


## Cramer

In [32]:
tips = sns.load_dataset("tips")
tips.head(10)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
5,25.29,4.71,Male,No,Sun,Dinner,4
6,8.77,2.0,Male,No,Sun,Dinner,2
7,26.88,3.12,Male,No,Sun,Dinner,4
8,15.04,1.96,Male,No,Sun,Dinner,2
9,14.78,3.23,Male,No,Sun,Dinner,2


In [71]:
tips["total_bill_cut"] = pd.cut(tips["total_bill"],
                                np.arange(0, 55, 5),
                                include_lowest=True,
                                right=False)
tips.head(10)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,total_bill_cut
0,16.99,1.01,Female,No,Sun,Dinner,2,"[15, 20)"
1,10.34,1.66,Male,No,Sun,Dinner,3,"[10, 15)"
2,21.01,3.5,Male,No,Sun,Dinner,3,"[20, 25)"
3,23.68,3.31,Male,No,Sun,Dinner,2,"[20, 25)"
4,24.59,3.61,Female,No,Sun,Dinner,4,"[20, 25)"
5,25.29,4.71,Male,No,Sun,Dinner,4,"[25, 30)"
6,8.77,2.0,Male,No,Sun,Dinner,2,"[5, 10)"
7,26.88,3.12,Male,No,Sun,Dinner,4,"[25, 30)"
8,15.04,1.96,Male,No,Sun,Dinner,2,"[15, 20)"
9,14.78,3.23,Male,No,Sun,Dinner,2,"[10, 15)"


In [67]:
tips['size'].unique()

array([2, 3, 4, 1, 6, 5])

### Preprocessing

In [40]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

In [68]:
# Check incomplete rows
incomplete_rows = tips[tips.isnull().any(axis=1)].head()
incomplete_rows

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,total_bill_cut
170,50.81,10.0,Male,Yes,Sat,Dinner,3,


In [49]:
num_attrs = ["size"]
cat_attrs = list(tips.drop(["total_bill", "tip", "size"], axis=1))
num_attrs, cat_attrs

(['size'], ['sex', 'smoker', 'day', 'time', 'total_bill_cut'])

In [51]:
labels = tips["tip"]

In [60]:
num_pipeline = Pipeline([
        ('std_scaler', StandardScaler())
    ])
cat_pipeline = num_pipeline = Pipeline([
       ("cat", OneHotEncoder(categories='auto'))
    ])

In [61]:
full_pipeline = ColumnTransformer([
        ("num", num_pipeline, num_attrs),
        ("cat", cat_pipeline, cat_attrs),
    ])
tips_preprocessed = full_pipeline.fit_transform(tips)

ValueError: Input contains NaN

In [34]:
# https://stackoverflow.com/questions/46498455/categorical-features-correlation/46498792#46498792
def cramers_v(confusion_matrix):
    """ calculate Cramers V statistic for categorial-categorial association.
        uses correction from Bergsma and Wicher,
        Journal of the Korean Statistical Society 42 (2013): 323-328
    """
    chi2 = ss.chi2_contingency(confusion_matrix)[0]
    n = confusion_matrix.sum()
    phi2 = chi2 / n
    r, k = confusion_matrix.shape
    phi2corr = max(0, phi2 - ((k-1)*(r-1))/(n-1))
    rcorr = r - ((r-1)**2)/(n-1)
    kcorr = k - ((k-1)**2)/(n-1)
    return np.sqrt(phi2corr / min((kcorr-1), (rcorr-1)))

confusion_matrix = pd.crosstab(tips["day"], tips["time"]).values
cramers_v(confusion_matrix)
# Out[10]: 0.93866193407222209

confusion_matrix = pd.crosstab(tips["total_bill_cut"], tips["time"]).values
cramers_v(confusion_matrix)

# Out[24]: 0.16498707494988371

0.1649870749498837

In [35]:
# https://towardsdatascience.com/the-search-for-categorical-correlation-a1cf7f1888c9
def cramers_v(x, y):
    confusion_matrix = pd.crosstab(x,y)
    chi2 = ss.chi2_contingency(confaausion_matrix)[0]
    n = confusion_matrix.sum().sum()
    phi2 = chi2/n
    r,k = confusion_matrix.shape
    phi2corr = max(0, phi2-((k-1)*(r-1))/(n-1))
    rcorr = r-((r-1)**2)/(n-1)
    kcorr = k-((k-1)**2)/(n-1)
    return np.sqrt(phi2corr/min((kcorr-1),(rcorr-1)))

## Theils U

In [15]:
def theils_u(x, y):
    s_xy = conditional_entropy(x,y)
    x_counter = Counter(x)
    total_occurrences = sum(x_counter.values())
    p_x = list(map(lambda n: n/total_occurrences, x_counter.values()))
    s_x = ss.entropy(p_x)
    if s_x == 0:
        return 1
    else:
        return (s_x - s_xy) / s_x

## Correlation Ratio

In [16]:
def correlation_ratio(categories, measurements):
    fcat, _ = pd.factorize(categories)
    cat_num = np.max(fcat)+1
    y_avg_array = np.zeros(cat_num)
    n_array = np.zeros(cat_num)
    for i in range(0,cat_num):
        cat_measures = measurements[np.argwhere(fcat == i).flatten()]
        n_array[i] = len(cat_measures)
        y_avg_array[i] = np.average(cat_measures)
    y_total_avg = np.sum(np.multiply(y_avg_array,n_array))/np.sum(n_array)
    numerator = np.sum(np.multiply(n_array,np.power(np.subtract(y_avg_array,y_total_avg),2)))
    denominator = np.sum(np.power(np.subtract(measurements,y_total_avg),2))
    if numerator == 0:
        eta = 0.0
    else:
        eta = np.sqrt(numerator/denominator)
    return eta

## Pearson Correlation

$$\frac{\sum_{i=1}^{n} (x_i - \overline{x})(y_i - \overline{y})} {\sqrt{\sum_{i=1}^{n} (x_i - \overline{x})^2(y_i - \overline{y})^2}}$$

In [17]:
# https://www.kaggle.com/mlwhiz/feature-selection-using-football-data

In [18]:
def cor_selector(X, y,num_feats):
    cor_list = []
    feature_name = X.columns.tolist()
    # calculate the correlation with y for each feature
    for i in X.columns.tolist():
        cor = np.corrcoef(X[i], y)[0, 1]
        cor_list.append(cor)
    # replace NaN with 0
    cor_list = [0 if np.isnan(i) else i for i in cor_list]
    # feature name
    cor_feature = X.iloc[:,np.argsort(np.abs(cor_list))[-num_feats:]].columns.tolist()
    # feature selection? 0 for not select, 1 for select
    cor_support = [True if i in cor_feature else False for i in feature_name]
    return cor_support, cor_feature

## Chi-Squared

In [19]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.preprocessing import MinMaxScaler
X_norm = MinMaxScaler().fit_transform(X)
chi_selector = SelectKBest(chi2, k=num_feats)
chi_selector.fit(X_norm, y)
chi_support = chi_selector.get_support()
chi_feature = X.loc[:,chi_support].columns.tolist()
print(str(len(chi_feature)), 'selected features')

NameError: name 'X' is not defined

## Recursive Feature Elimination

## Lasso: SelectFromModel

## Tree-based: SelectFromModel

## Combined