In [1]:
import pandas as pd
import numpy as np

# Load the diabetes CSV and show which columns it contains.
data_frame = pd.read_csv('../data/pima-indians-diabetes.csv')
print(data_frame.columns)

# Every column except the label column is used as a model input.
target_column = 'Outcome'
feature_names = data_frame.columns.drop(target_column)

# Features stay a DataFrame (column names preserved); labels become an array.
X = data_frame.loc[:, feature_names]
Y = data_frame.loc[:, target_column].values

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],
      dtype='object')


In [2]:
from sklearn.model_selection import train_test_split

# Hold-out configuration.
SEED = 8         # seeds the random number generator so the split is reproducible
TEST_SIZE = 0.2  # fraction of the rows reserved for the test set

X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    random_state=SEED,
    test_size=TEST_SIZE,
)

In [3]:
def mae(y_test, y_pred):
    """Return the mean absolute error between true and predicted values.

    Parameters
    ----------
    y_test : array-like
        Ground-truth values.
    y_pred : array-like
        Predicted values; must be broadcastable against ``y_test``.

    Returns
    -------
    numpy.float64
        Mean of ``|y_test - y_pred|`` over all elements.
    """
    # Cast to float before subtracting: unsigned-integer arrays would wrap
    # around (0 - 1 == 255 for uint8), and boolean arrays or plain Python
    # lists do not support ``-`` at all.
    truth = np.asarray(y_test, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)
    return np.mean(np.abs(truth - predicted))

def init_model(kind='lda'):
    """Create a fresh, unfitted classifier.

    Parameters
    ----------
    kind : str, default 'lda'
        Which estimator to build:

        * ``'lda'`` -- ``LinearDiscriminantAnalysis()`` (the default, so
          existing ``init_model()`` calls behave as before).
        * ``'random_forest'`` -- ``RandomForestClassifier(n_estimators=10,
          random_state=SEED)``; reads the notebook-level ``SEED`` constant.

    Returns
    -------
    An unfitted scikit-learn estimator.

    Raises
    ------
    ValueError
        If ``kind`` is not one of the supported names.
    """
    # Import lazily, and only the estimator that is actually requested.
    if kind == 'lda':
        from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
        return LinearDiscriminantAnalysis()
    if kind == 'random_forest':
        from sklearn.ensemble import RandomForestClassifier
        return RandomForestClassifier(n_estimators=10, random_state=SEED)
    raise ValueError(
        "unknown model kind %r (expected 'lda' or 'random_forest')" % (kind,)
    )

In [4]:
import sklearn.metrics as metrics

# Baseline: fit a fresh model on every available feature.
model = init_model()
model.fit(X_train, Y_train)

# Predict labels for the held-out rows.
Y_pred = model.predict(X_test)

print('With all features')
# Printed in a fixed order: accuracy, precision, recall, F1, confusion matrix.
for score_fn in (
    metrics.accuracy_score,
    metrics.precision_score,
    metrics.recall_score,
    metrics.f1_score,
    metrics.confusion_matrix,
):
    print(score_fn(Y_test, Y_pred))

print("MAE = %0.4f" % mae(Y_test, Y_pred))

With all features
0.7987012987012987
0.7333333333333333
0.6346153846153846
0.6804123711340205
[[90 12]
 [19 33]]
MAE = 0.2013


In [5]:
def evaluate_with_num_features(num_features):
    """Train and score a model on the top chi-squared-ranked features.

    Fits ``SelectKBest(chi2, k=num_features)`` on the notebook-level
    ``X_train`` / ``Y_train``, trains a model from ``init_model()`` on the
    selected columns, predicts on ``X_test`` and prints the results.

    Parameters
    ----------
    num_features : int
        Number of highest-scoring features to keep (the selector's ``k``).

    Returns
    -------
    None
        Everything is reported via ``print``: the per-feature scores, the
        selected feature names, accuracy, precision, recall, F1, the
        confusion matrix and the MAE.
    """
    from sklearn.feature_selection import SelectKBest, chi2

    print('With %i features' % (num_features))

    # select features with highest scores (fitted on the training rows only)
    selector = SelectKBest(chi2, k=num_features)
    selector.fit(X_train, Y_train)

    # Take the names from X_train, the frame the selector was fitted on.
    # data_frame.columns still contains the target column, so using it only
    # lines up with the scores when the target happens to be the last column.
    fitted_columns = X_train.columns

    # get scores
    feature_scores = list(zip(fitted_columns, selector.scores_))
    print(feature_scores)

    # get selected features with names
    print(fitted_columns[selector.get_support(indices=True)])

    selected_X_train = selector.transform(X_train)

    # train model with selected features
    model = init_model()
    model.fit(selected_X_train, Y_train)

    # predict, applying the same column selection to the test rows
    Y_pred = model.predict(selector.transform(X_test))

    # check accuracy
    print(metrics.accuracy_score(Y_test, Y_pred))
    print(metrics.precision_score(Y_test, Y_pred))
    print(metrics.recall_score(Y_test, Y_pred))
    print(metrics.f1_score(Y_test, Y_pred))
    print(metrics.confusion_matrix(Y_test, Y_pred))
    print("MAE = %0.4f" % mae(Y_test, Y_pred))

In [6]:
# Sweep the number of selected features from 1 up to one fewer than the total;
# range() excludes its stop value, so the full feature set is not re-run here.
total_feature_count = X_train.shape[1]
for depth in range(1, total_feature_count):
    evaluate_with_num_features(depth)

With 1 features
[('Pregnancies', 81.5625440912852), ('Glucose', 1100.995801674656), ('BloodPressure', 11.596627661284971), ('SkinThickness', 104.36813534225452), ('Insulin', 1707.9625372686578), ('BMI', 124.65749720367606), ('DiabetesPedigreeFunction', 4.500261588452462), ('Age', 111.3301183186772)]
Index(['Insulin'], dtype='object')
0.6948051948051948
0.8571428571428571
0.11538461538461539
0.20338983050847456
[[101   1]
 [ 46   6]]
MAE = 0.3052
With 2 features
[('Pregnancies', 81.5625440912852), ('Glucose', 1100.995801674656), ('BloodPressure', 11.596627661284971), ('SkinThickness', 104.36813534225452), ('Insulin', 1707.9625372686578), ('BMI', 124.65749720367606), ('DiabetesPedigreeFunction', 4.500261588452462), ('Age', 111.3301183186772)]
Index(['Glucose', 'Insulin'], dtype='object')
0.7727272727272727
0.7741935483870968
0.46153846153846156
0.5783132530120482
[[95  7]
 [28 24]]
MAE = 0.2273
With 3 features
[('Pregnancies', 81.5625440912852), ('Glucose', 1100.995801674656), ('BloodPre