### Data Preparation and Modeling Pipeline

In [None]:
# load required libraries
import pandas as pd
from sklearn.model_selection import KFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest


In [None]:
# Read the headerless Pima Indians diabetes CSV, supplying column names
# explicitly, then split it into a feature matrix and a target vector.
filename = "data/pima-indians-diabetes.data.csv"
names = ["preg", "plas", "pres", "skin", "test", "mass", "pedi", "age", "class"]
data = pd.read_csv(filename, names=names)
array = data.to_numpy()
# The first eight columns are the predictors; the ninth ("class") is the label.
X, y = array[:, :8], array[:, 8]

In [3]:
# Build a two-step pipeline: standardize the inputs, then fit an LDA classifier.
estimators = [
    ("standardize", StandardScaler()),
    ("lda", LinearDiscriminantAnalysis()),
]
model = Pipeline(steps=estimators)

In [4]:
# Score the pipeline with shuffled 10-fold cross-validation (fixed seed) and
# report the mean and standard deviation of the per-fold scores.
kfold = KFold(n_splits=10, shuffle=True, random_state=7)
results = cross_val_score(estimator=model, X=X, y=y, cv=kfold)
print(f"Accuracy: {results.mean():.3f} ({results.std():.3f})")

Accuracy: 0.767 (0.048)


### Feature Extraction and Modeling Pipeline

In [6]:
# Assemble a FeatureUnion from two transformers: a 3-component PCA and a
# SelectKBest that keeps 6 features.
features = [
    ("pca", PCA(n_components=3)),
    ("select_best", SelectKBest(k=6)),
]
feature_union = FeatureUnion(transformer_list=features)

In [7]:
# Build a two-step pipeline: the feature union feeds a liblinear-solved
# logistic regression classifier.
estimators = [
    ("feature_union", feature_union),
    ("logistic", LogisticRegression(solver="liblinear")),
]
model = Pipeline(steps=estimators)

In [8]:
# Score the pipeline with shuffled 10-fold cross-validation (fixed seed) and
# report the mean and standard deviation of the per-fold scores.
kfold = KFold(n_splits=10, shuffle=True, random_state=7)
results = cross_val_score(estimator=model, X=X, y=y, cv=kfold)
print(f"Accuracy: {results.mean():.3f} ({results.std():.3f})")

Accuracy: 0.772 (0.050)
