# Picking Features
### Before doing any modeling, I need to determine which features are important; this notebook does that.

In [22]:
import pandas as pd
import numpy as np
import itertools
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.preprocessing import LabelEncoder
from sklearn import metrics
from sklearn.model_selection import GridSearchCV

In [23]:
# Read the raw CSV into `data`, then work on an independent deep copy (`df`)
# so the frame as loaded from disk stays available for comparison.
data = pd.read_csv('diabetes.csv')
df = data.copy(deep=True)
# Preview the first five rows.
df.head(5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [24]:
def get_data_splits(dataframe, valid_fraction=0.1):
    """Split ``dataframe`` by position into train, validation and test parts.

    The rows are NOT shuffled: the last ``valid_fraction`` of the rows becomes
    the test set, the ``valid_fraction`` before that becomes the validation
    set, and everything earlier is the training set.

    Parameters
    ----------
    dataframe : sliceable sequence (e.g. ``pandas.DataFrame``)
        Data to split; only ``len()`` and slice indexing are used.
    valid_fraction : float, default 0.1
        Fraction of rows given to the validation set and, separately, to the
        test set (so ``2 * valid_fraction`` of the rows are held out).

    Returns
    -------
    tuple
        ``(train, valid, test)`` slices of ``dataframe``.
    """
    n_rows = len(dataframe)
    valid_size = int(n_rows * valid_fraction)

    # Work with non-negative cut points. A negative-index slice such as
    # ``dataframe[:-0]`` is empty, which would wipe out the training set
    # whenever ``valid_size`` rounds down to zero.
    valid_start = max(n_rows - 2 * valid_size, 0)
    test_start = max(n_rows - valid_size, 0)

    train = dataframe[:valid_start]
    # valid size == test size, last two sections of the data
    valid = dataframe[valid_start:test_start]
    test = dataframe[test_start:]

    return train, valid, test

In [25]:
def train_model(train, valid, label_col='Outcome'):
    """Train a LightGBM binary classifier and report its validation AUC.

    Parameters
    ----------
    train, valid : pandas.DataFrame
        Training and validation frames. Every column except ``label_col`` is
        used as a feature.
    label_col : str, default 'Outcome'
        Name of the target column. The default matches the capitalised
        ``Outcome`` column of the diabetes dataset loaded in this notebook.

    Returns
    -------
    The booster object returned by ``lgb.train``.
    """
    feature_cols = train.columns.drop(label_col)

    dtrain = lgb.Dataset(train[feature_cols], label=train[label_col])
    dvalid = lgb.Dataset(valid[feature_cols], label=valid[label_col])

    param = {'num_leaves': 64, 'objective': 'binary', 
             'metric': 'auc', 'seed': 7}
    print("Training model!")
    # NOTE(review): `early_stopping_rounds` and `verbose_eval` are keyword
    # arguments of older LightGBM releases; LightGBM 4.x expects the
    # `lgb.early_stopping(...)` / `lgb.log_evaluation(...)` callbacks instead.
    # Confirm the installed version before running.
    bst = lgb.train(param, dtrain, num_boost_round=1000, valid_sets=[dvalid], 
                    early_stopping_rounds=10, verbose_eval=False)

    # Score the held-out validation rows with the trained booster.
    valid_pred = bst.predict(valid[feature_cols])
    valid_score = metrics.roc_auc_score(valid[label_col], valid_pred)
    print(f"Validation AUC score: {valid_score:.4f}")
    return bst

In [26]:
# I have a general idea of which features are good, but I will use this
# univariate feature selector to confirm my suspicions.
from sklearn.feature_selection import SelectKBest, f_classif

feature_cols = df.columns.drop('Outcome')
train, valid, _ = get_data_splits(df)

# Keep the 5 features with the highest f_classif score. The selector is fit
# on the training split only, so validation rows do not influence the choice.
selector = SelectKBest(f_classif, k=5)
selector.fit(train[feature_cols], train['Outcome'])

# Ask the selector directly which columns it kept. get_support() returns a
# boolean mask aligned with feature_cols, which avoids rebuilding a
# zero-padded frame and guessing the dropped columns from their variance
# (that heuristic would also discard a kept column that happens to be constant).
selected_columns = feature_cols[selector.get_support()]

# Get the valid dataset with the selected features.
sfeatures_dataset = valid[selected_columns]
sfeatures_dataset.head()

Unnamed: 0,Pregnancies,Glucose,BMI,DiabetesPedigreeFunction,Age
616,6,117,28.7,0.157,30
617,2,68,20.1,0.257,23
618,9,112,28.2,1.282,50
619,0,119,32.4,0.141,24
620,2,112,38.4,0.246,28


In [27]:
# Report the column names kept by the selector: header line first, then the
# array of names on the following line.
print('The 5 most important features: ', selected_columns.values, sep='\n')

The 5 most important features: 
['Pregnancies' 'Glucose' 'BMI' 'DiabetesPedigreeFunction' 'Age']


## Selecting best features for logistic regression

In [28]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel

train, valid, _ = get_data_splits(df)

# Features are every column except the target.
X, y = train[train.columns.drop("Outcome")], train['Outcome']

# Set the regularization parameter C=1. The L1 penalty can drive coefficients
# to exactly zero, which is what SelectFromModel uses to discard features.
logistic = LogisticRegression(C=1, penalty="l1", solver='liblinear', random_state=7).fit(X, y)
model = SelectFromModel(logistic, prefit=True)

# get_support() is the selector's own boolean mask of kept columns, aligned
# with X.columns. Using it avoids the inverse_transform round-trip and the
# "variance != 0" heuristic, which would also drop a kept constant column.
selected_columns = X.columns[model.get_support()]
print('The most important features: ')
print(selected_columns.values)

The most important features: 
['Pregnancies' 'Glucose' 'BloodPressure' 'SkinThickness' 'Insulin' 'BMI'
 'DiabetesPedigreeFunction' 'Age']


In [31]:
### The L1-based selector kept every column in the dataset, so it did not narrow anything down. Next I will tune the logistic regression hyperparameters and rerun the selection to try to shorten the list of important features.

In [29]:
# Hyperparameter tuning for logistic regression
from sklearn.datasets import make_blobs
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
# define models and parameters
model = LogisticRegression()
c_values = [100, 10, 1.0, 0.1, 0.01]
# define grid search
# A list of grids pairs each solver only with penalties it supports:
# 'newton-cg' and 'lbfgs' handle L2 only, while 'liblinear' handles both L1
# and L2. The previous penalty list contained strings such as '9' and '11'
# that are not valid penalties; those fits failed and were silently scored 0
# through error_score, so L1 was never actually evaluated.
grid = [
    dict(solver=['newton-cg', 'lbfgs'], penalty=['l2'], C=c_values),
    dict(solver=['liblinear'], penalty=['l1', 'l2'], C=c_values),
]
# 10-fold stratified CV repeated 3 times with a fixed seed for reproducibility.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv, scoring='accuracy',error_score=0)
# X, y are the training features/target prepared in the previous cell.
grid_result = grid_search.fit(X, y)
# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
# Per-candidate CV statistics, kept for inspection.
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']

Best: 0.778777 using {'C': 10, 'penalty': 'l2', 'solver': 'lbfgs'}


In [30]:
# Revised features selector: refit logistic regression with the
# hyperparameters chosen by the grid search, then select features from it.
train, valid, _ = get_data_splits(df)

X, y = train[train.columns.drop("Outcome")], train['Outcome']

# Use the tuned settings (C, penalty, solver) from the grid search instead of
# re-typing them by hand, so this cell cannot drift from the reported best.
logistic = LogisticRegression(**grid_result.best_params_, random_state=7).fit(X, y)
# NOTE(review): when the penalty is not L1, SelectFromModel's default
# threshold is the mean absolute coefficient. The features are not
# standardized in this notebook, so coefficient sizes also reflect each
# column's units - consider scaling X before trusting this ranking.
model = SelectFromModel(logistic, prefit=True)

# get_support() is the selector's boolean mask of kept columns, aligned with
# X.columns; it replaces the inverse_transform + "variance != 0" round-trip.
selected_columns = X.columns[model.get_support()]
print('The most important features: ')
print(selected_columns.values)

The most important features: 
['DiabetesPedigreeFunction']
