# Recursive Feature Elimination

In [1]:
# explore the number of selected features for RFE
from numpy import mean
from numpy import std
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.feature_selection import RFE
from sklearn.feature_selection import RFECV
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from matplotlib import pyplot
%matplotlib widget
import common

In [2]:
# Load the "CleanedData" file through the project helper, drop the "VISCODE"
# and "RID" columns, then call dropna() on what is left.
# NOTE(review): presumably common.loadFile returns a pandas DataFrame, in which
# case dropna() discards every row containing a missing value — confirm.
data = common.loadFile("CleanedData").drop(["VISCODE", "RID"], axis=1).dropna()

In [3]:
# Feature matrix: every column except the "DX" target, cast to integers.
X = data.drop(columns="DX").to_numpy().astype(int)
# Label vector: the "DX" column as a flat one-dimensional integer array.
y = data[["DX"]].to_numpy().astype(int).reshape(-1)
# Exclusive upper bound on the feature counts tried by get_models().
steps = 10
def get_models():
    """Build one RFE + decision-tree pipeline per candidate feature count.

    The feature counts run from 2 up to (but not including) the module-level
    ``steps`` value. Each pipeline first selects that many features with RFE
    driven by a decision tree, then fits a fresh decision tree on them.

    Returns:
        dict mapping the feature count (as a string) to its Pipeline.
    """
    return {
        str(n_features): Pipeline(
            steps=[
                (
                    's',
                    RFE(
                        estimator=DecisionTreeClassifier(),
                        n_features_to_select=n_features,
                    ),
                ),
                ('m', DecisionTreeClassifier()),
            ]
        )
        for n_features in range(2, steps)
    }
 
def evaluate_model(model, X, y):
    """Evaluate a given model with repeated stratified cross-validation.

    Uses 10 splits repeated 3 times with a fixed random state, scores by
    accuracy, runs on all available cores and re-raises any fitting error.

    Args:
        model: estimator or pipeline to evaluate.
        X: feature matrix.
        y: target labels.

    Returns:
        The scores produced by ``cross_val_score``, one per fold and repeat.
    """
    folds = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    return cross_val_score(
        model,
        X,
        y,
        scoring='accuracy',
        cv=folds,
        n_jobs=-1,
        error_score='raise',
    )


# Build every candidate pipeline, keyed by its number of selected features.
models = get_models()

# Cross-validate each pipeline, keeping the scores and labels in matching
# order so they can be plotted side by side afterwards.
results = []
names = []
for name, model in models.items():
    scores = evaluate_model(model, X, y)
    names.append(name)
    results.append(scores)
    # Report: feature count, mean accuracy, (standard deviation).
    print('>%s %.3f (%.3f)' % (name, mean(scores), std(scores)))



>2 0.796 (0.029)
>3 0.789 (0.024)
>4 0.767 (0.031)
>5 0.755 (0.038)
>6 0.752 (0.027)
>7 0.767 (0.027)
>8 0.756 (0.029)
>9 0.755 (0.026)


In [None]:
# plot model performance for comparison
# Close the current figure before drawing a new one.
# NOTE(review): presumably this keeps re-runs of the cell from stacking plots
# under the widget backend — confirm.
pyplot.close()
# One box per evaluated model, labelled with its name; showmeans=True also
# marks the mean score of each box.
pyplot.boxplot(results, labels=names, showmeans=True)
pyplot.show()

In [5]:
# summarize the dataset: print the shapes of the feature matrix X and of the
# label vector y
print(X.shape, y.shape)

(1710, 191) (1710,)


## Automatically Select the Number of Features

In [6]:
# Build a two-stage pipeline: RFECV (driven by a decision tree) chooses the
# features, then a separate decision tree is fitted on the selected columns.
rfe = RFECV(estimator=DecisionTreeClassifier())
model = DecisionTreeClassifier()
pipeline = Pipeline(steps=[('s', rfe), ('m', model)])

# Score the whole pipeline with 10-split stratified CV repeated 3 times.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
n_scores = cross_val_score(
    pipeline,
    X,
    y,
    scoring='accuracy',
    cv=cv,
    n_jobs=-1,
    error_score='raise',
)

# Report the mean accuracy with its standard deviation in parentheses.
print('Accuracy: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))

Accuracy: 0.776 (0.036)


In [13]:
# Hold out the final 20 rows as the test set; all earlier rows are used for
# training. Features and labels are sliced identically so they stay aligned.
X_train, X_test = X[:-20], X[-20:]
y_train, y_test = y[:-20], y[-20:]

def trainModel(steps, estimator):
    """Fit RFECV selectors for a range of minimum feature counts and keep the best.

    For every ``i`` in ``range(1, steps)`` an ``RFECV`` wrapping ``estimator``
    is fitted on the module-level ``X_train`` / ``y_train`` with
    ``min_features_to_select=i`` and the module-level ``cv`` splitter, then
    scored on the module-level ``X_test`` / ``y_test``.

    Args:
        steps: exclusive upper bound on the ``min_features_to_select`` values
            tried (values start at 1).
        estimator: estimator handed to ``RFECV``.

    Returns:
        Tuple ``(best, bestModel)``: the highest test score seen and the
        fitted selector that produced it. If no candidate scores above 0
        (or ``steps`` is 1 or less), ``(0, estimator)`` is returned with the
        estimator exactly as it was passed in.

    NOTE(review): the winner is picked using the held-out test rows, so the
    returned score is an optimistic estimate — confirm this is intended.
    """
    best = 0
    bestModel = estimator
    for min_features in range(1, steps):
        selector = RFECV(
            estimator, step=1, cv=cv, min_features_to_select=min_features
        )
        selector = selector.fit(X_train, y_train)
        # Score once and reuse the value instead of predicting on the test
        # set a second time when a new best is found.
        test_score = selector.score(X_test, y_test)
        if test_score > best:
            best = test_score
            bestModel = selector
    return best, bestModel

# Candidate estimators, each wrapped in RFECV by trainModel().
algorithms = [DecisionTreeClassifier(), LinearDiscriminantAnalysis()]

# Value passed to trainModel() as its `steps` argument for every algorithm.
interactions = 5
# Best score seen so far, the selector that achieved it, and the position of
# its algorithm in `algorithms` (-1 while nothing has beaten the initial 0).
score = 0
model = []
algorithmIdex = -1

for position, i in enumerate(algorithms):
    tempScore, tempModel = trainModel(interactions, i)
    print(tempScore)
    if tempScore > score:
        score, model, algorithmIdex = tempScore, tempModel, position

score

0.75
0.6


0.75

In [None]:
# Display the support_ attribute of the best selector kept in `model`.
# NOTE(review): in scikit-learn, RFECV.support_ is the mask of selected
# features — confirm against the installed version.
# If no algorithm scored above 0, `model` is still the empty list it was
# initialised to and this line raises AttributeError.
model.support_