# Data preparation and modeling pipeline

In [1]:
from pandas import read_csv
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# Load the dataset, supplying the column names explicitly.
filename = 'pima-indians-diabetes.data.csv'
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = read_csv(filename, names=names)
array = dataframe.values

# Columns 0-7 are the model inputs; column 8 ('class') is the target.
X, Y = array[:, :8], array[:, 8]

# Chain standardization and LDA into one estimator so that cross-validation
# fits and scores both steps together.
estimators = [
    ('standardize', StandardScaler()),
    ('lda', LinearDiscriminantAnalysis()),
]
model = Pipeline(estimators)

# 10 shuffled folds with a fixed seed so the splits are repeatable.
kfold = KFold(n_splits=10, shuffle=True, random_state=7)

results = cross_val_score(model, X, Y, cv=kfold)

# Report the mean of the per-fold scores.
print(results.mean())

0.7669685577580315


# Feature extraction and modeling pipeline

In [3]:
from pandas import read_csv
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest

# Load the dataset, supplying the column names explicitly.
filename = 'pima-indians-diabetes.data.csv'
# 'plas' (previously misspelled 'pas') matches the column naming used for the
# same file in the data-preparation cell.
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = read_csv(filename, names=names)
array = dataframe.values

# Columns 0-7 are the model inputs; column 8 ('class') is the target.
X = array[:, 0:8]
Y = array[:, 8]

# Build the feature set by concatenating 3 PCA components with the 6 columns
# picked by SelectKBest.
features = [
    ('pca', PCA(n_components=3)),
    ('select_best', SelectKBest(k=6)),
]
feature_union = FeatureUnion(features)

# Feed the combined features into logistic regression; wrapping both steps in
# a Pipeline lets cross-validation fit and score them as a single estimator.
estimators = [
    ('feature_union', feature_union),
    ('logistic', LogisticRegression(max_iter=1000)),
]
model = Pipeline(estimators)

# 10 shuffled folds with a fixed seed so the splits are repeatable.
kfold = KFold(n_splits=10, random_state=7, shuffle=True)
results = cross_val_score(model, X, Y, cv=kfold)

# Report the mean of the per-fold scores.
print(results.mean())

0.7721633629528366
