# Ensemble Methods

In [9]:
import pandas as pd
import os

from skimage.transform import resize
from skimage.io import imread
from skimage.feature import hog
from skimage.io import imread
from skimage.transform import rescale
from skimage.color import rgb2gray
from skimage.feature import match_descriptors, plot_matches, SIFT
from skimage import transform


import skimage

import numpy as np
import matplotlib.pyplot as plt

import joblib
from tqdm import tqdm
from collections import Counter
from PIL import Image 

from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import cross_val_predict
from sklearn.preprocessing import StandardScaler, Normalizer
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

import sys
sys.path.insert(1, '../../scripts/')

from sklearn.pipeline import Pipeline
from sklearn import svm

from svm_functions import resize_all, RGB2GrayTransformer, HogTransformer


%matplotlib inline

## Import data (pkl)

In [10]:
repo_name = 'Respiratory_Disease_Classification'
cwd_path = os.getcwd()
train_dir = cwd_path.split(repo_name, 1)[0]+repo_name+"/data/images/90-10_split/train"
src = os.listdir(train_dir)[1:]

base_name_train = 'spectrogram_train'
base_name_val = 'spectrogram_val'

width = 80
include = src

In [11]:
data_train = joblib.load(f'{base_name_train}_{width}x{width}px.pkl')
data_val = joblib.load(f'{base_name_val}_{width}x{width}px.pkl')
 
print('number of training samples: ', len(data_train['data']))
print('number of validation samples: ', len(data_val['data']))

print('keys: ', list(data_train.keys()))
print('description: ', data_train['description'])
print('image shape: ', data_train['data'][0].shape)
print('labels:', np.unique(data_train['label']))
 
print("Training data",Counter(data_train['label']))
print("Validation data", Counter(data_val['label']))

number of training samples:  2491
number of validation samples:  998
keys:  ['description', 'label', 'filename', 'data']
description:  resized (80x80)animal images in rgb
image shape:  (80, 80, 3)
labels: ['0' '1']
Training data Counter({'0': 1508, '1': 983})
Validation data Counter({'0': 674, '1': 324})


### Define Train / Test Data

In [14]:
# Define Train-Test, use validation data for testing
X_train = np.array(data_train['data'])
y_train = np.array(data_train['label'])
X_test = np.array(data_val['data'])
y_test = np.array(data_val['label'])

# Necessary pre-processing kernels
1. HOG (histogram oriented gradients)
2. SIFT
3. SURF

In [23]:
# Inistantiate
rgb2gray = RGB2GrayTransformer()
hogify = HogTransformer(
        pixels_per_cell=(14, 14), 
        cells_per_block=(2,2), 
        orientations=9, 
        block_norm='L2-Hys'
        )

scaler = StandardScaler()

# Pipeline

In [34]:
HOG_pipeline = Pipeline([
    ('grayify', RGB2GrayTransformer()),
    ('hogify', HogTransformer(
        pixels_per_cell=(14, 14), 
        cells_per_block=(2, 2), 
        orientations=9, 
        block_norm='L2-Hys')
    ),
    ('scalify', StandardScaler()),
    ('classify', SGDClassifier(random_state=42, max_iter=1000, tol=1e-3))
])
 
clf = HOG_pipeline.fit(X_train, y_train)
print('Percentage correct: ', 100*np.sum(clf.predict(X_test) == y_test)/len(y_test))

Percentage correct:  74.14829659318637


## Grid Search

In [35]:
from sklearn.model_selection import GridSearchCV
 
param_grid = [
    {
        'hogify__orientations': [8, 9],
        'hogify__cells_per_block': [(2, 2), (3, 3)],
        'hogify__pixels_per_cell': [(8, 8), (10, 10), (12, 12)]
    },
    {
        'hogify__orientations': [8],
         'hogify__cells_per_block': [(3, 3)],
         'hogify__pixels_per_cell': [(8, 8)],
         'classify': [
             SGDClassifier(random_state=42, max_iter=1000, tol=1e-3),
             svm.SVC(kernel='linear')
         ]
    }
]

In [36]:
grid_search = GridSearchCV(HOG_pipeline, 
                           param_grid, 
                           cv=3,
                           n_jobs=-1,
                           scoring='accuracy',
                           verbose=1,
                           return_train_score=True)
 
grid_res = grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 14 candidates, totalling 42 fits


## Testing

In [25]:
X_test_gray = rgb2gray.transform(X_test)
X_test_hog = hogify.transform(X_test_gray)
X_test_prepared = scaler.transform(X_test_hog)

NotFittedError: This StandardScaler instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

# Ensemble methods

# Hog