In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier

In [28]:
# Load the headerless crx.data CSV. `na_values="?"` makes read_csv parse the
# raw '?' markers as NaN, so no separate replace('?', NaN) pass is needed.
filename = 'https://raw.githubusercontent.com/PacktWorkshops/The-Data-Science-Workshop/master/Chapter15/Dataset/crx.data'
credData = pd.read_csv(filename, sep=",", header=None, na_values="?")

# Encode the target (column 15): '+' -> 1, '-' -> 0.
credData.loc[credData[15] == '+', 15] = 1
credData.loc[credData[15] == '-', 15] = 0

# Keep only rows without missing values.
newCred = credData.dropna(axis=0)

# Columns 0-14 are the features; column 15 is the label. The label column
# started out as strings, so cast it to int explicitly to hand downstream
# estimators a numeric target (same treatment as the later sections).
X = newCred.loc[:, 0:14]
y = newCred.loc[:, 15].astype('int')

# 70/30 split with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=123)

In [29]:
# Categorical branch: one-hot encode; categories not seen during fit are
# ignored at transform time instead of raising.
catTrans = Pipeline([
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

In [30]:
# Numeric branch: standardise each column with StandardScaler.
numTrans = Pipeline([
    ('scaler', StandardScaler()),
])

In [31]:
# Inspect the per-column dtypes; the following cells split the columns into
# numeric and categorical groups based on them.
X.dtypes

0      object
1     float64
2     float64
3      object
4      object
5      object
6      object
7     float64
8      object
9      object
10      int64
11     object
12     object
13    float64
14      int64
dtype: object

In [32]:
# Labels of the int64/float64 columns; these are routed to the numeric transformer.
numFeatures = X.select_dtypes(include=['int64', 'float64']).columns
numFeatures

Int64Index([1, 2, 7, 10, 13, 14], dtype='int64')

In [33]:
# Labels of the object-dtype columns; these are routed to the categorical transformer.
catFeatures = X.select_dtypes(include=['object']).columns
catFeatures

Int64Index([0, 3, 4, 5, 6, 8, 9, 11, 12], dtype='int64')

In [34]:
# Send each column group through its own sub-pipeline and concatenate the
# results: numeric columns are scaled, object columns are one-hot encoded.
preprocessor = ColumnTransformer(
    transformers=[
        ('numeric', numTrans, numFeatures),
        ('categoric', catTrans, catFeatures),
    ],
)

In [35]:
# Fit the preprocessor on the training split and transform it in one step;
# wrap the result in a DataFrame only to preview the first rows.
X_tran_train = pd.DataFrame(preprocessor.fit_transform(X_train))
X_tran_train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,36,37,38,39,40,41,42,43,44,45
0,0.105658,-0.4449,1.377002,-0.553206,0.570065,-0.174241,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0
1,-1.084238,1.115032,-0.528306,-0.553206,-0.60247,-0.167337,1.0,0.0,0.0,1.0,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
2,-0.416675,-0.080916,0.592889,-0.327276,-0.367963,-0.174241,0.0,1.0,0.0,1.0,...,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0
3,-0.795428,1.418699,-0.189778,-0.553206,-0.485217,0.024974,0.0,1.0,0.0,1.0,...,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0
4,-1.125497,0.439061,-0.636809,-0.553206,-0.25071,-0.174241,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0


In [36]:
# Apply the already-fitted preprocessor to the test split (transform only,
# no refit), then preview the first rows.
X_tran_test = pd.DataFrame(preprocessor.transform(X_test))
X_tran_test.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,36,37,38,39,40,41,42,43,44,45
0,-0.059376,-0.531217,-0.623789,-0.553206,0.687319,-0.174241,0.0,1.0,0.0,1.0,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
1,-1.063609,-0.878562,-0.600642,-0.327276,0.101051,-0.174076,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0
2,0.64862,1.929316,1.847181,0.802371,-0.661097,-0.174241,0.0,1.0,0.0,1.0,...,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0
3,2.203242,3.402933,2.245025,2.383877,-1.071485,0.927028,1.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0
4,-0.451332,-0.644572,-0.612215,-0.553206,-0.485217,-0.174241,0.0,1.0,0.0,1.0,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0


## Adding Dimensionality Reduction to the Feature Extraction Pipeline

In [37]:
# Load the headerless crx.data CSV. `na_values="?"` makes read_csv parse the
# raw '?' markers as NaN, so no separate replace('?', NaN) pass is needed.
filename = 'https://raw.githubusercontent.com/PacktWorkshops/The-Data-Science-Workshop/master/Chapter15/Dataset/crx.data'
credData = pd.read_csv(filename, sep=",", header=None, na_values="?")

# Encode the target (column 15): '+' -> 1, '-' -> 0.
credData.loc[credData[15] == '+', 15] = 1
credData.loc[credData[15] == '-', 15] = 0

# Keep only rows without missing values.
newCred = credData.dropna(axis=0)

# Columns 0-14 are the features; column 15 is the label, cast to int so the
# target is numeric rather than a column of Python objects.
X = newCred.loc[:, 0:14]
y = newCred.loc[:, 15].astype('int')

# 70/30 split with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=123)

# One sub-pipeline per column type.
catTrans = Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))])
numTrans = Pipeline(steps=[('scaler', StandardScaler())])

# Column groups are chosen by dtype: int64/float64 -> numeric, object -> categorical.
numFeatures = X.select_dtypes(include=['int64', 'float64']).columns
catFeatures = X.select_dtypes(include=['object']).columns

# Apply each sub-pipeline to its own column group and concatenate the outputs.
preprocessor = ColumnTransformer(transformers=[
    ('numeric', numTrans, numFeatures),
    ('categoric', catTrans, catFeatures)
])

In [40]:
# Reduce the preprocessed data set to 10 dimensions with PCA.
estimator = Pipeline([
    ('preprocessor', preprocessor),
    ('dimred', PCA(n_components=10)),
])

In [41]:
# Fit preprocessing + PCA on the training split, then reuse the fitted
# pipeline (transform only) on the test split.
X_tran_train = pd.DataFrame(estimator.fit_transform(X_train))
X_tran_test = pd.DataFrame(estimator.transform(X_test))

# Report (rows, columns) of each reduced matrix, train first.
for reduced in (X_tran_train, X_tran_test):
    print(reduced.shape)

(457, 10)
(196, 10)


## Modeling and Predictions Using ML Pipelines

In [48]:
# Load the headerless crx.data CSV. `na_values="?"` makes read_csv parse the
# raw '?' markers as NaN, so no separate replace('?', NaN) pass is needed.
filename = 'https://raw.githubusercontent.com/PacktWorkshops/The-Data-Science-Workshop/master/Chapter15/Dataset/crx.data'
credData = pd.read_csv(filename, sep=",", header=None, na_values="?")

# Encode the target (column 15): '+' -> 1, '-' -> 0.
credData.loc[credData[15] == '+', 15] = 1
credData.loc[credData[15] == '-', 15] = 0

# Keep only rows without missing values.
newCred = credData.dropna(axis=0)

# Columns 0-14 are the features; column 15 is the label, cast to int for the classifier.
X = newCred.loc[:, 0:14]
y = newCred.loc[:, 15].astype('int')

# 70/30 split with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=123)

# One sub-pipeline per column type.
catTrans = Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))])
numTrans = Pipeline(steps=[('scaler', StandardScaler())])

# Column groups are chosen by dtype: int64/float64 -> numeric, object -> categorical.
numFeatures = X.select_dtypes(include=['int64', 'float64']).columns
catFeatures = X.select_dtypes(include=['object']).columns

# Apply each sub-pipeline to its own column group and concatenate the outputs.
preprocessor = ColumnTransformer(transformers=[
    ('numeric', numTrans, numFeatures),
    ('categoric', catTrans, catFeatures)
])

In [49]:
# End-to-end model: preprocessing -> 10-component PCA -> logistic regression
# (seeded for reproducibility).
estimator = Pipeline([
    ('preprocessor', preprocessor),
    ('dimred', PCA(n_components=10)),
    ('clf', LogisticRegression(random_state=123)),
])

In [50]:
# Fit every pipeline step (preprocessor, PCA, classifier) on the training split.
estimator.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('numeric',
                                                  Pipeline(memory=None,
                                                           steps=[('scaler',
                                                                   StandardScaler(copy=True,
                                                                                  with_mean=True,
                                                                                  with_std=True))],
                                                           verbose=False),
                                                  Int64Index([1, 2, 7, 10, 13, 14], dtype='int64')),
                                                 ('categoric',
                     

In [51]:
# Score the fitted pipeline on the held-out test split.
estimator.score(X_test, y_test)

0.8877551020408163

In [52]:
# Predicted labels for the test split, reused by the reports below.
preds = estimator.predict(X_test)

In [54]:
# Confusion matrix of true labels (first argument) against predictions.
print(confusion_matrix(y_test, preds))

[[96 11]
 [11 78]]


In [55]:
# Per-class precision / recall / f1 and support for the test predictions.
print(classification_report(y_test, preds))

              precision    recall  f1-score   support

           0       0.90      0.90      0.90       107
           1       0.88      0.88      0.88        89

    accuracy                           0.89       196
   macro avg       0.89      0.89      0.89       196
weighted avg       0.89      0.89      0.89       196



## Spot-Checking Models Using ML Pipelines

In [57]:
# Load the headerless crx.data CSV. `na_values="?"` makes read_csv parse the
# raw '?' markers as NaN, so no separate replace('?', NaN) pass is needed.
filename = 'https://raw.githubusercontent.com/PacktWorkshops/The-Data-Science-Workshop/master/Chapter15/Dataset/crx.data'
credData = pd.read_csv(filename, sep=",", header=None, na_values="?")

# Encode the target (column 15): '+' -> 1, '-' -> 0.
credData.loc[credData[15] == '+', 15] = 1
credData.loc[credData[15] == '-', 15] = 0

# Keep only rows without missing values.
newCred = credData.dropna(axis=0)

# Columns 0-14 are the features; column 15 is the label, cast to int for the classifiers.
X = newCred.loc[:, 0:14]
y = newCred.loc[:, 15].astype('int')

# 70/30 split with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=123)

# One sub-pipeline per column type.
catTrans = Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))])
numTrans = Pipeline(steps=[('scaler', StandardScaler())])

# Column groups are chosen by dtype: int64/float64 -> numeric, object -> categorical.
numFeatures = X.select_dtypes(include=['int64', 'float64']).columns
catFeatures = X.select_dtypes(include=['object']).columns

# Apply each sub-pipeline to its own column group and concatenate the outputs.
preprocessor = ColumnTransformer(transformers=[
    ('numeric', numTrans, numFeatures),
    ('categoric', catTrans, catFeatures)
])

In [58]:
# Candidate models to spot-check; a fixed seed is passed to each estimator
# that is constructed with random_state.
classifiers = [
    KNeighborsClassifier(n_neighbors=5),
    RandomForestClassifier(random_state=123),
    AdaBoostClassifier(random_state=123),
    LogisticRegression(random_state=123),
]

In [64]:
# Fit the same preprocessing + PCA front end with each candidate classifier
# and print its score on the test split.
for classifier in classifiers:
    # Pipeline.fit returns the fitted pipeline, so build and fit in one expression.
    estimator = Pipeline([
        ('preprocessor', preprocessor),
        ('dimred', PCA(n_components=10)),
        ('classifier', classifier),
    ]).fit(X_train, y_train)

    print(classifier)
    print('Model score: %.2f\n\n\n' % estimator.score(X_test, y_test))

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')
Model score: 0.83



RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=123,
                       verbose=0, warm_start=False)
Model score: 0.86



AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=1.0,
                   n_estimators=50, random_state=123)
Model score: 0.86



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept

## Grid Search and Cross-Validation with ML Pipelines

In [2]:
# Load the headerless crx.data CSV. `na_values="?"` makes read_csv parse the
# raw '?' markers as NaN, so no separate replace('?', NaN) pass is needed.
filename = 'https://raw.githubusercontent.com/PacktWorkshops/The-Data-Science-Workshop/master/Chapter15/Dataset/crx.data'
credData = pd.read_csv(filename, sep=",", header=None, na_values="?")

# Encode the target (column 15): '+' -> 1, '-' -> 0.
credData.loc[credData[15] == '+', 15] = 1
credData.loc[credData[15] == '-', 15] = 0

# Keep only rows without missing values.
newCred = credData.dropna(axis=0)

# Columns 0-14 are the features; column 15 is the label, cast to int for the classifier.
X = newCred.loc[:, 0:14]
y = newCred.loc[:, 15].astype('int')

# 70/30 split with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=123)

# One sub-pipeline per column type.
catTrans = Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))])
numTrans = Pipeline(steps=[('scaler', StandardScaler())])

# Column groups are chosen by dtype: int64/float64 -> numeric, object -> categorical.
numFeatures = X.select_dtypes(include=['int64', 'float64']).columns
catFeatures = X.select_dtypes(include=['object']).columns

# Apply each sub-pipeline to its own column group and concatenate the outputs.
preprocessor = ColumnTransformer(transformers=[
    ('numeric', numTrans, numFeatures),
    ('categoric', catTrans, catFeatures)
])

In [3]:
# Base pipeline to be tuned: preprocessing -> PCA -> AdaBoost (seeded).
# The step names are the prefixes used to address parameters in a grid search.
pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('dimred', PCA(n_components=10)),
    ('classifier', AdaBoostClassifier(random_state=123)),
])

In [5]:
# `__` (double underscore) links a pipeline step name to one of that step's
# parameters, i.e. '<step>__<parameter>'.
param_grid = dict(
    dimred__n_components=[10, 12, 15],
    classifier__n_estimators=[50, 100, 200],
    classifier__learning_rate=[0.7, 0.6, 1.0],
)

In [7]:
# Exhaustive search over param_grid, scoring each combination with 10-fold
# cross-validation.
estimator = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    cv=10,
)

In [8]:
# Run the grid search on the training split only; the test split stays unseen.
estimator.fit(X_train, y_train)

GridSearchCV(cv=10, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('preprocessor',
                                        ColumnTransformer(n_jobs=None,
                                                          remainder='drop',
                                                          sparse_threshold=0.3,
                                                          transformer_weights=None,
                                                          transformers=[('numeric',
                                                                         Pipeline(memory=None,
                                                                                  steps=[('scaler',
                                                                                          StandardScaler(copy=True,
                                                                                                         with_mean=True,
                                   

In [9]:
# Parameter combination selected by the grid search.
estimator.best_params_

{'classifier__learning_rate': 0.7,
 'classifier__n_estimators': 50,
 'dimred__n_components': 15}

In [10]:
# Predicted labels for the test split from the fitted grid search object.
preds = estimator.predict(X_test)

In [11]:
# Confusion matrix of true labels (first argument) against the tuned model's predictions.
print(confusion_matrix(y_test, preds))

[[90 17]
 [14 75]]


In [12]:
# Per-class precision / recall / f1 and support for the tuned model's test predictions.
print(classification_report(y_test, preds))

              precision    recall  f1-score   support

           0       0.87      0.84      0.85       107
           1       0.82      0.84      0.83        89

    accuracy                           0.84       196
   macro avg       0.84      0.84      0.84       196
weighted avg       0.84      0.84      0.84       196

