In [1]:
import numpy as np
import pandas as pd
import random
import uuid
import time
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
# from sklearn.datasets import fetch_mldata
from sklearn.datasets import fetch_openml
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris, load_digits


from PSForest import PSForest
import os
import pickle
import matplotlib.pyplot as plt
import h5py
import scipy
from PIL import Image
from scipy import ndimage
import memory_profiler

Load dataset:
MNIST and CIFAR10

In [2]:
mnist = fetch_openml("mnist_784")
mnist.data.shape

print('Data: {}, target: {}'.format(mnist.data.shape, mnist.target.shape))

Data: (70000, 784), target: (70000,)


In [3]:
X_train, X_test, y_train, y_test = train_test_split(
    mnist.data,
    mnist.target,
    test_size=1/7,
    random_state=0,
)

X_train = X_train.values.reshape((len(X_train), 784))
X_test = X_test.values.reshape((len(X_test), 784))


#Limit the size of the dataset

X_train = X_train[:1000]
y_train = y_train[:1000]
X_test = X_test[:500]
y_test = y_test[:500]

print('X_train:', X_train.shape, X_train.dtype)
print('y_train:', y_train.shape, y_train.dtype)
print('X_test:', X_test.shape)
print('y_test:', y_test.shape)
# X_train

X_train: (1000, 784) float64
y_train: (1000,) category
X_test: (500, 784)
y_test: (500,)


In [2]:
%matplotlib inline
def load_CIFAR_batch(filename):
    with open(filename, 'rb') as f:
        datadict = pickle.load(f,encoding='latin1')
        X = datadict['data']
        Y = datadict['labels']
        X = X.reshape(10000, 3, 32, 32).transpose(0,2,3,1).astype("float")
        Y = np.array(Y)
    return X, Y
def load_CIFAR10():
    xs = []
    ys = []
    for b in range(1,6):
        f = os.path.join('datasets', 'cifar-10-batches-py', 'data_batch_%d' % (b, ))
        X, Y = load_CIFAR_batch(f)
        xs.append(X)
        ys.append(Y)    
    Xtr = np.concatenate(xs)
    Ytr = np.concatenate(ys)
    del X, Y
    Xte, Yte = load_CIFAR_batch(os.path.join('datasets', 'cifar-10-batches-py', 'test_batch'))
    return Xtr, Ytr, Xte, Yte

In [None]:
X_train, y_train, X_test, y_test = load_CIFAR10()
classes = ['plane', 'car', 'bird', 'cat', 'dear', 'dog', 'frog', 'horse', 'ship', 'truck']
num_classes = len(classes)
num_each_class = 7

for y, cls in enumerate(classes):
    idxs = np.flatnonzero(y_train == y)
    idxs = np.random.choice(idxs, num_each_class, replace=False)
    for i, idx in enumerate(idxs):
        plt_idx = i * num_classes + (y + 1)
        plt.subplot(num_each_class, num_classes, plt_idx)
        plt.imshow(X_train[idx].astype('uint8'))
        plt.axis('off')
        if i == 0:
            plt.title(cls)
plt.show()
X_train.shape

In [None]:
X_train = np.reshape(X_train, (X_train.shape[0], -1))
X_test = np.reshape(X_test, (X_test.shape[0], -1))
# Divide the sub-data set
y_train = y_train[:1000]
y_test = y_test[:1000]
X_train = X_train[:1000]
X_test = X_test[:1000]
X_train.shape

## Using the PSForest

In [8]:
start =time.clock()
before_mem = memory_profiler.memory_usage()
# Create PSForest model
ps_forest = PSForest(
    estimators_config={
        'mgs': [{
            'estimator_class': RandomForestClassifier,
            'estimator_params': {
                'n_estimators': 500,
                'max_features': 1,
                'min_samples_split': 10,
                'n_jobs': -1,
            }
        }, {
            'estimator_class': RandomForestClassifier,
            'estimator_params': {
                'n_estimators': 500,
                'max_features': 1,
                'min_samples_split': 10,
                'n_jobs': -1,
            }
        },{
            'estimator_class': RandomForestClassifier,
            'estimator_params': {
                'n_estimators': 500,
                'min_samples_split': 10,
                'max_features': 'sqrt',
                'n_jobs': -1,
            }
        },{
            'estimator_class': RandomForestClassifier,
            'estimator_params': {
                'n_estimators': 500,
                'min_samples_split': 10,
                'max_features': 'sqrt',
                'n_jobs': -1,
            }
        }],
        'cascade': [{
            'estimator_class': RandomForestClassifier,
            'estimator_params': {
                'n_estimators': 500,
                'min_samples_split': 10,
                'max_features': 1,
                'oob_score':True,
                'n_jobs': -1,
            }
        }, {
            'estimator_class': RandomForestClassifier,
            'estimator_params': {
                'n_estimators': 500,
                'min_samples_split': 10,
                'max_features': 'sqrt',
                'oob_score':True,
                'n_jobs': -1,
            }
        }, {
           'estimator_class': RandomForestClassifier,
            'estimator_params': {
                'n_estimators': 500,
                'min_samples_split': 10,
                'max_features': 1,
                'oob_score':True,
                'n_jobs': -1,
            }
        }, {
            'estimator_class': RandomForestClassifier,
            'estimator_params': {
                'n_estimators': 500,
                'min_samples_split': 10,
                'max_features': 'sqrt',
                'oob_score':True,
                'n_jobs': -1,
            }   
        },{
            'estimator_class': RandomForestClassifier,
            'estimator_params': {
                'n_estimators': 500,
                'min_samples_split': 10,
                'max_features': 1,
                'oob_score':True,
                'n_jobs': -1,
            }
        }, {
            'estimator_class': RandomForestClassifier,
            'estimator_params': {
                'n_estimators': 500,
                'min_samples_split': 10,
                'max_features': 'sqrt',
                'oob_score':True,
                'n_jobs': -1,
            }
        }, {
           'estimator_class': RandomForestClassifier,
            'estimator_params': {
                'n_estimators': 500,
                'min_samples_split': 10,
                'max_features': 1,
                'oob_score':True,
                'n_jobs': -1,
            }
        }, {
            'estimator_class': RandomForestClassifier,
            'estimator_params': {
                'n_estimators': 500,
                'min_samples_split': 10,
                'max_features': 'sqrt',
                'oob_score':True,
                'n_jobs': -1,
            }   
        }]
    },
    stride_ratios=[1/256,1/128,1/64,1/32,1/16,1/8,1/4],
)

# ps_forest.fit(X_train, y_train)   # with Multi-Grained Pooling
ps_forest.fit_c(X_train, y_train)  # without Multi-Grained Pooling
after_mem = memory_profiler.memory_usage()
end = time.clock()
print("Memory (Before): {}Mb".format(before_mem))
print("Memory (After): {}Mb".format(after_mem))
print("Memory consumption: {}Mb".format(after_mem[0] - before_mem[0])) 

<Gate-CascadeForest forests=8> - Cascade fitting for X ((1000, 784)) and y ((1000,)) started
<Gate-CascadeForest forests=8> - Level #1:: X with shape: (1000, 784)
<Gate-CascadeForest forests=8> - Level 1:: got all predictions
<Gate-CascadeForest forests=8> - Level 1:: got accuracy 0.87
<Gate-CascadeForest forests=8> - Level #2:: X with shape: (1000, 844)


[0.876, 0.884, 0.876, 0.881, 0.874, 0.892, 0.864, 0.883]
[4, 0]


<Gate-CascadeForest forests=8> - Level 2:: got all predictions
<Gate-CascadeForest forests=8> - Level 2:: got accuracy 0.88
<Gate-CascadeForest forests=8> - Level #3:: X with shape: (1000, 904)


[0.88, 0.881, 0.886, 0.881, 0.881, 0.883, 0.891, 0.885]
[4, 0, 0, 1]


<Gate-CascadeForest forests=8> - Level 3:: got all predictions
<Gate-CascadeForest forests=8> - Level 3:: got accuracy 0.873


[0.886, 0.88, 0.886, 0.876, 0.887, 0.88, 0.886, 0.882]
[4, 0, 0, 1, 1, 3]
Memory (Before): [1854.83984375]Mb
Memory (After): [2223.68359375]Mb
Memory consumption: 368.84375Mb


In [9]:
y_pred = ps_forest.predict(X_test)  # with Multi-Grained Pooling
# y_pred = ps_forest.predict_c(X_test)   # without Multi-Grained Pooling
print('Prediction shape:', y_pred.shape)
print(
    'Accuracy:', accuracy_score(y_test, y_pred),
    'F1 score:', f1_score(y_test, y_pred, average='weighted')
)

print('Running time: %s Seconds'%(end-start))


<MultiGrainedScanner stride_ratio=0.00390625> - Scanning and fitting for X ((500, 784)) and y (None) started
<MultiGrainedScanner stride_ratio=0.00390625> - Window shape: [3] Total windows: 8


(500, 8)


NotFittedError: This RandomForestClassifier instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

In [7]:
# RandomForest
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
rf_y_pred = rf.predict(X_test)
acc = accuracy_score(y_test,  rf_y_pred)
print('accuracy:', acc)

accuracy: 0.898
