In [21]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import RFE

In [4]:
# Load the Pima Indians dataset that the rest of the notebook works with.
pima_indian_df = pd.read_csv(filepath_or_buffer='data/PimaIndians.csv')

# Bare last expression so the notebook renders the frame as a rich table.
pima_indian_df

Unnamed: 0,pregnant,glucose,diastolic,triceps,insulin,bmi,family,age,test
0,1,89,66,23,94,28.1,0.167,21,negative
1,0,137,40,35,168,43.1,2.288,33,positive
2,3,78,50,32,88,31.0,0.248,26,positive
3,2,197,70,45,543,30.5,0.158,53,positive
4,1,189,60,23,846,30.1,0.398,59,positive
...,...,...,...,...,...,...,...,...,...
387,0,181,88,44,510,43.3,0.222,26,positive
388,1,128,88,39,110,36.5,1.057,37,positive
389,2,88,58,26,16,28.4,0.766,22,negative
390,10,101,76,48,180,32.9,0.171,63,negative


In [25]:
# Split the frame into predictors (every column except 'test') and the
# target label (the 'test' column).
X = pima_indian_df.drop(columns='test')
y = pima_indian_df['test']

# Hold out 25% of the rows for evaluation and train on the remaining 75%.
# The fixed random_state keeps the split identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
)

In [27]:
# Build a seeded random forest and train it on the training split
# (fit() returns the fitted estimator itself, so the call can be chained).
rf = RandomForestClassifier(random_state=0).fit(X_train, y_train)

# Score the model on the held-out rows.
test_predictions = rf.predict(X_test)
acc = accuracy_score(y_test, test_predictions)

# Show each feature's importance (rounded to 2 decimals), then the accuracy.
importance_by_feature = dict(zip(X.columns, rf.feature_importances_.round(2)))
print(importance_by_feature)
print("{0:.2%} accuracy on test set.".format(acc))

{'pregnant': 0.07, 'glucose': 0.25, 'diastolic': 0.09, 'triceps': 0.09, 'insulin': 0.14, 'bmi': 0.12, 'family': 0.12, 'age': 0.13}
79.59% accuracy on test set.


In [28]:
# Boolean mask that is True for every feature whose random-forest
# importance is strictly above the chosen threshold.
importance_threshold = 0.10
mask = rf.feature_importances_ > importance_threshold

# Keep only the columns selected by the mask and show which ones survived.
reduced_X = X.loc[:, mask]
print(reduced_X.columns)

Index(['glucose', 'insulin', 'bmi', 'family', 'age'], dtype='object')


In [29]:
# Wrapping a Recursive Feature Eliminator around a random forest model to
# remove features step by step until only two remain.
# The forest is seeded so that the eliminated/selected features are the same
# on every run; an unseeded forest can pick different features each time.
rfe = RFE(estimator=RandomForestClassifier(random_state=0),
          n_features_to_select=2, verbose=1)

rfe.fit(X_train, y_train)

# rfe.support_ is a boolean mask flagging the features RFE kept.
mask = rfe.support_
reduced_X = X.loc[:, mask]
print(reduced_X.columns)

Fitting estimator with 8 features.
Fitting estimator with 7 features.
Fitting estimator with 6 features.
Fitting estimator with 5 features.
Fitting estimator with 4 features.
Fitting estimator with 3 features.
Index(['glucose', 'bmi'], dtype='object')
Index(['glucose', 'bmi'], dtype='object')
