In [1]:
import pandas as pd

# Load the diabetes dataset and show which columns it provides.
data_frame = pd.read_csv('../data/pima-indians-diabetes.csv')
print(data_frame.columns)

# Every column except the label is treated as a model input.
target_column = 'Outcome'
feature_names = data_frame.columns.drop(target_column)

# Labels become a plain NumPy array; inputs stay a DataFrame so that the
# column names remain available to later cells.
Y = data_frame[target_column].to_numpy()
X = data_frame.loc[:, feature_names]

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],
      dtype='object')


In [2]:
from sklearn.model_selection import train_test_split

# Fraction of the rows held out for testing.
TEST_SIZE = 0.2
# Seed passed as random_state wherever randomness is involved, for reproducibility.
SEED = 8

X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=TEST_SIZE,
    random_state=SEED,
)

In [3]:
import sklearn.metrics as metrics
from sklearn.ensemble import RandomForestClassifier

# Baseline: fit a 10-tree random forest on every available feature.
model = RandomForestClassifier(random_state=SEED, n_estimators=10)
model.fit(X_train, Y_train)

# Predict labels for the held-out split.
Y_pred = model.predict(X_test)

# Report accuracy, precision, recall, F1 and the confusion matrix, in that order.
print('With all features')
for score_fn in (
    metrics.accuracy_score,
    metrics.precision_score,
    metrics.recall_score,
    metrics.f1_score,
    metrics.confusion_matrix,
):
    print(score_fn(Y_test, Y_pred))

With all features
0.7532467532467533
0.6590909090909091
0.5576923076923077
0.6041666666666667
[[87 15]
 [23 29]]


In [4]:
def evaluate_with_num_features(num_features):
    """Evaluate a random forest trained on the top ``num_features`` chi2-ranked features.

    Fits ``SelectKBest(chi2)`` on the training split, prints the score of every
    feature and the names of the selected ones, then trains a 10-tree
    ``RandomForestClassifier`` on the selected columns and prints accuracy,
    precision, recall, F1 and the confusion matrix for the test split.

    Relies on notebook-level names defined in earlier cells: ``X_train``,
    ``X_test``, ``Y_train``, ``Y_test``, ``SEED``, ``metrics`` and
    ``RandomForestClassifier``.

    Args:
        num_features: Number of highest-scoring features to keep (passed to
            ``SelectKBest`` as ``k``).

    Returns:
        None. All results are printed.
    """
    # select features with highest scores
    from sklearn.feature_selection import SelectKBest, chi2
    selector = SelectKBest(chi2, k=num_features)
    selector.fit(X_train, Y_train)

    # Label scores with the columns the selector was actually fitted on.
    # Using data_frame.columns here would also include the target column and
    # only line up with the scores when the target happens to be the last column.
    fitted_columns = X_train.columns

    # get scores
    feature_scores = list(zip(fitted_columns, selector.scores_))
    print(feature_scores)

    # get selected features with names
    print(fitted_columns[selector.get_support(indices=True)])

    # reduce both splits to the selected columns
    selected_X_train = selector.transform(X_train)
    selected_X_test = selector.transform(X_test)

    # train model with selected features
    model = RandomForestClassifier(n_estimators=10, random_state=SEED)
    model.fit(selected_X_train, Y_train)

    # predict
    Y_pred = model.predict(selected_X_test)

    # check accuracy
    print('With %i features' % (num_features))
    print(metrics.accuracy_score(Y_test, Y_pred))
    print(metrics.precision_score(Y_test, Y_pred))
    print(metrics.recall_score(Y_test, Y_pred))
    print(metrics.f1_score(Y_test, Y_pred))
    print(metrics.confusion_matrix(Y_test, Y_pred))

In [5]:
# Keep only the single highest-scoring feature and evaluate.
evaluate_with_num_features(num_features=1)

[('Pregnancies', 81.5625440912852), ('Glucose', 1100.995801674656), ('BloodPressure', 11.596627661284971), ('SkinThickness', 104.36813534225452), ('Insulin', 1707.9625372686578), ('BMI', 124.65749720367606), ('DiabetesPedigreeFunction', 4.500261588452462), ('Age', 111.3301183186772)]
Index(['Insulin'], dtype='object')
With 1 features
0.6038961038961039
0.3902439024390244
0.3076923076923077
0.34408602150537637
[[77 25]
 [36 16]]
