In [1]:
import os, re, pickle, functools
import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn.feature_selection import SelectKBest, mutual_info_regression, mutual_info_classif
from sklearn.metrics import r2_score

# Input and output folders, expressed relative to the parent of the working
# directory. The trailing slash is kept so file names can be appended directly.
data_dir = f'{os.pardir}/data/'
out_dir = f'{os.pardir}/output/'

## Stage 3: Feature Selection

In [8]:
# Stage 2 artefacts: the feature table and both imputed training matrices,
# plus the raw challenge outcomes.
features = pd.read_csv(f'{data_dir}features.csv')
x_train_simple = pd.read_csv(
    f'{data_dir}x_train_simpleImputed.csv',
    index_col='challengeID',
)
x_train_knn = pd.read_csv(
    f'{data_dir}x_train_knnImputed.csv',
    index_col='challengeID',
)
y = pd.read_csv(
    f'{data_dir}FFChallenge_v5/train.csv',
    index_col='challengeID',
)
# Keep only the outcome rows whose challengeID appears in the simple-imputed matrix.
y_train = y.loc[x_train_simple.index]

### 3.1 Feature Selection with Mutual Information

In [23]:
def feature_selection_mi(train_x, train_y, outcomes, mutual_info_func, range_of_k):
    '''
    Select the k highest-scoring features for every outcome and every k.

    Each outcome is scored exactly once; every requested k is then read off
    that single ranking. Besides avoiding len(range_of_k) repeated scoring
    passes, this guarantees that the selections for one outcome are nested
    (the top-50 set is contained in the top-100 set, and so on) even when
    mutual_info_func is a randomised estimator.

    Parameters
    ----------
    train_x : pandas.DataFrame
        Feature matrix. Its index must contain every index label of train_y
        for which the outcome is not missing.
    train_y : pandas.DataFrame
        One column per outcome. Rows with a missing value are dropped
        separately for each outcome.
    outcomes : iterable of str
        Column names of train_y to process.
    mutual_info_func : callable
        Called as mutual_info_func(x, y). Must return one score per column
        of x, or a tuple whose first element is that score array.
    range_of_k : iterable of int
        Numbers of features to keep. Values larger than the number of
        columns select every column; values <= 0 select none.

    Returns
    -------
    dict
        {outcome: {k: pandas.Index}} where the Index holds the names of the
        selected columns in their original column order.

    Raises
    ------
    ValueError
        If mutual_info_func does not return exactly one score per column.
    '''
    outcome_best_k_dict = {}

    for outcome in outcomes:
        print(f"Selecting best k features for {outcome}")

        # The usable rows depend only on the outcome, not on k.
        y = train_y[outcome].dropna()
        x = train_x.loc[y.index.values]

        # Score all features once for this outcome.
        scores = mutual_info_func(x, y)
        if isinstance(scores, tuple):
            # Score functions may return (scores, p-values); only scores rank.
            scores = scores[0]
        scores = np.array(scores, dtype=float)
        n_features = x.shape[1]
        if scores.shape != (n_features,):
            raise ValueError(
                f"mutual_info_func returned scores of shape {scores.shape}, "
                f"expected ({n_features},)"
            )

        # NaN scores rank last; a stable sort makes tie-breaking deterministic.
        scores[np.isnan(scores)] = np.finfo(scores.dtype).min
        ranking = np.argsort(scores, kind='mergesort')

        best_k = {}
        for k in range_of_k:
            n_keep = min(max(int(k), 0), n_features)
            mask = np.zeros(n_features, dtype=bool)
            # Slice from an explicit start so that n_keep == 0 selects nothing
            # (ranking[-0:] would select everything).
            mask[ranking[n_features - n_keep:]] = True
            best_k[k] = x.columns[mask]
        outcome_best_k_dict[outcome] = best_k

    return outcome_best_k_dict

range_of_k = [50, 100, 250, 500, 1000, 2000]

# Binary outcomes are scored with the classification estimator
outcome_best_k_simple = feature_selection_mi(
    train_x=x_train_simple,
    train_y=y_train,
    outcomes=['eviction', 'layoff', 'jobTraining'],
    mutual_info_func=mutual_info_classif,
    range_of_k=range_of_k,
)
outcome_best_k_knn = feature_selection_mi(
    train_x=x_train_knn,
    train_y=y_train,
    outcomes=['eviction', 'layoff', 'jobTraining'],
    mutual_info_func=mutual_info_classif,
    range_of_k=range_of_k,
)

# Continuous outcomes are scored with the regression estimator
outcome_best_k2_simple = feature_selection_mi(
    train_x=x_train_simple,
    train_y=y_train,
    outcomes=['materialHardship', 'gpa', 'grit'],
    mutual_info_func=mutual_info_regression,
    range_of_k=range_of_k,
)
outcome_best_k2_knn = feature_selection_mi(
    train_x=x_train_knn,
    train_y=y_train,
    outcomes=['materialHardship', 'gpa', 'grit'],
    mutual_info_func=mutual_info_regression,
    range_of_k=range_of_k,
)

# Fold the continuous-outcome selections into the per-imputation dictionaries,
# so each dictionary ends up covering all six outcomes.
outcome_best_k_simple.update(outcome_best_k2_simple)
outcome_best_k_knn.update(outcome_best_k2_knn)

# Persisting the selections is currently disabled; uncomment to write them to data_dir.
# with open(data_dir+'outcome_best_k.pkl', 'wb') as f:
#     pickle.dump(outcome_best_k_simple, f)

# with open(data_dir+'outcome_best_k_knn.pkl', 'wb') as f:
#     pickle.dump(outcome_best_k_knn, f)

Selecting best k features for eviction
Selecting best k features for layoff
Selecting best k features for jobTraining
Selecting best k features for materialHardship
Selecting best k features for gpa
Selecting best k features for grit


### 3.2 Feature Selection with LASSO

In [None]:
def feature_selection_lasso(train_x, train_y, outcomes=None, r2_target=0.5, alphas=None):
    '''
    Select features per outcome from the LASSO fit whose training R2 is
    closest to a target value.

    For every outcome a Lasso model is fitted for each candidate alpha on the
    rows where that outcome is observed. The alpha whose training R2 is
    closest to r2_target is kept, and the features with a non-zero coefficient
    in that same fitted model are reported. Coefficients are read from the
    fitted estimator itself rather than from a separately computed
    regularisation path, so the reported alpha, R2 and feature list always
    describe one and the same model.

    Parameters
    ----------
    train_x : pandas.DataFrame
        Feature matrix. Its index must contain every index label of train_y
        for which the outcome is not missing.
    train_y : pandas.DataFrame
        One column per outcome. Rows with a missing value are dropped
        separately for each outcome.
    outcomes : iterable of str, optional
        Outcome columns to process. Defaults to the six challenge outcomes
        ('gpa', 'grit', 'materialHardship', 'eviction', 'layoff',
        'jobTraining').
    r2_target : float, optional
        Training R2 the selected model should come closest to. The default
        of 0.5 was set based on literature review.
    alphas : iterable of float, optional
        Candidate regularisation strengths. Defaults to 20 log-spaced values
        between 1e-2 and 1e1. The caller's sequence is never modified.

    Returns
    -------
    dict
        {outcome: {'alpha': float, 'r2': float, 'features': numpy.ndarray}}
        where 'features' holds the selected column names.

    Raises
    ------
    ValueError
        If alphas is given but empty.
    '''
    if outcomes is None:
        outcomes = ['gpa', 'grit', 'materialHardship', 'eviction', 'layoff', 'jobTraining']
    if alphas is None:
        alphas = np.logspace(-2, 1, 20).tolist()

    # Work on a private, descending copy: the grid is shared by all outcomes
    # and must not change between them. With the strict '<' comparison below,
    # ties in |R2 - target| resolve to the largest alpha.
    alpha_grid = sorted((float(a) for a in alphas), reverse=True)
    if not alpha_grid:
        raise ValueError("alphas must contain at least one value")

    outcome_alpha_feature = {}

    for outcome in outcomes:
        print(f"\nSelecting features for {outcome}")

        # Use the train_y argument, not a notebook-level global.
        y = train_y[outcome].dropna()
        X = train_x.loc[y.index.values]

        best = None  # (gap to target, alpha, r2, non-zero coefficient mask)
        for a in alpha_grid:
            reg = linear_model.Lasso(alpha=a).fit(X, y)
            r2 = reg.score(X, y)
            gap = abs(r2 - r2_target)
            if best is None or gap < best[0]:
                best = (gap, a, r2, reg.coef_ != 0)

        _, alpha_0, r2, nonzero = best
        outcome_alpha_feature[outcome] = {
            'alpha': alpha_0,
            'r2': r2,
            'features': X.columns.values[nonzero],
        }

    return outcome_alpha_feature

# LASSO selection on the knn-imputed training matrix
lasso_outcome_alpha_feature_knn_dict = feature_selection_lasso(
    train_x=x_train_knn,
    train_y=y_train,
)
# Persistence is currently disabled; uncomment to write the result to data_dir.
# with open(data_dir+'lasso_outcome_alpha_feature_knn_dict.pkl', 'wb') as f:
#     pickle.dump(lasso_outcome_alpha_feature_knn_dict, f)

# LASSO selection on the simple-imputed training matrix
lasso_outcome_alpha_feature_dict = feature_selection_lasso(
    train_x=x_train_simple,
    train_y=y_train,
)
# Persistence is currently disabled; uncomment to write the result to data_dir.
# with open(data_dir+'lasso_outcome_alpha_feature_dict.pkl', 'wb') as f:
#     pickle.dump(lasso_outcome_alpha_feature_dict, f)