In [None]:
import pandas as pd
import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from operator import itemgetter
import MSCO
import random

In [None]:
# Load the arbitrary classification dataset used throughout this notebook.
#
# The commented recipe below shows how such a dataset can be generated and
# saved with scikit-learn (presumably how the two text files were produced —
# confirm before regenerating). Parameter descriptions:
# https://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_classification.html
#
#   X, y = make_classification(n_samples=300, n_features=15,
#                              n_informative=8, random_state=0,
#                              shuffle=True)
#   np.savetxt('dataset_X.txt', X)
#   np.savetxt('dataset_y.txt', y)

# Features first, then labels; both are read from the working directory.
X, y = (np.loadtxt(path) for path in ('dataset_X.txt', 'dataset_y.txt'))

In [None]:
# Benefits and costs per feature.
#
# Benefit: the feature importance reported by a random forest fit on (X, y).
# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
#
# Cost: to roughly mimic the common situation where the most insightful
# features are also the most expensive to generate, cost is made linear in
# benefit, i.e. rand(100,200)*importance for each feature. In practice,
# consider supplying measured average generation times per feature instead.

# fit() returns the fitted estimator, so construction and fitting can be chained.
clf = RandomForestClassifier(n_estimators=100).fit(X, y)
benefits = clf.feature_importances_

# Formula for drawing fresh costs (left disabled):
#   costs = [random.randrange(100,200)*b for b in benefits]
# The literal values below are used instead — presumably one saved draw of the
# formula above, pinned so that costs do not change between runs.
costs = [
    19.764084307310753,
    6.7254109270248605,
    3.7797435714026864,
    3.724685620315442,
    3.1303084987410643,
    2.550500690125089,
    18.461302864749804,
    4.77635968376175,
    10.574614992863532,
    5.491503520160602,
    40.7509988879539,
    11.186903480202794,
    5.26676291399816,
    21.501672451850364,
    2.563533977411992,
]


In [None]:
# MSCO uses pandas dataframes for storing/manipulating train/test data
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.html
#
# Convert only when X is not already a DataFrame so this cell is safe to
# re-run: on a second execution X would itself be a DataFrame, and passing a
# DataFrame to DataFrame.from_records is deprecated since pandas 2.1.
if not isinstance(X, pd.DataFrame):
    X = pd.DataFrame.from_records(X)

In [None]:
# Single-stage baseline: a frame of reference for the multi-stage heuristics.
# The partition is all zeros with one entry per feature cost (assumes
# partition[i] is the stage index of feature i — confirm against MSCO), and
# the score is averaged over `sstage_runs` iterations.
#
# This cell previously reassigned `sstage_partition` several times, so the
# "single stage" baseline actually ran a multi-stage partition, and one
# further assignment went to a misspelled name and was never used. Those
# experimental partitions are kept here for reference only:
#   [0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 2, 1, 2]
#   [3, 2, 1, 0, 3, 1, 0, 3, 3, 1, 4, 0, 2, 1, 0]
#   [0, 2, 1, 0, 0, 0, 4, 1, 1, 2, 1, 0, 1, 2, 3]   (the one that was in effect)
#   [0, 2, 1, 0, 1, 1, 1, 0, 2, 0, 0, 2, 0, 2, 2]   (assigned to the misspelled name)

sstage_partition = [0] * len(costs)

sstage_runs = 5  # number of repetitions averaged into the baseline score
sstage_performance = 0.0
sstage_clf = RandomForestClassifier(n_estimators=50)
print("single stage")
for i in range(sstage_runs):
    per = MSCO.staged_classify(sstage_clf, X, y, sstage_partition, costs, train_percent=.75)
    # Compute the scalar score once and reuse it for both the sum and the log.
    score = MSCO.pm_euclidean(per)
    sstage_performance += score
    print(score)
    print("iter={} performance={}\n".format(i, per))

sstage_performance /= sstage_runs
print("single stage performance: {}".format(sstage_performance))

In [None]:
# First "increasing cost" heuristic (Jenks-based staging), averaged over 5 runs.
# NOTE(review): the earlier note on this cell mentioned a maximum of five
# stages, but the positional value passed to MSCO.jenks_stages is 3 —
# confirm which one is intended.

jstage_performance = 0.0
jstage_clf = RandomForestClassifier(n_estimators=100)
print("jenks stage heuristic")
for run in range(5):
    candidates = MSCO.jenks_stages(
        jstage_clf, X, y, costs, 3,
        min_max_norm=True,
        prob_thresh=.75,
        train_percent=.75,
    )
    # Keep the candidate whose element at index 1 is largest; that element is
    # what gets accumulated into the average.
    top = max(candidates, key=itemgetter(1))
    jstage_performance += top[1]
    print("iter={} performance={}\n".format(run, top))
jstage_performance = jstage_performance / 5
print("jenks stage performance: {}".format(jstage_performance))

In [None]:
# Second "increasing cost" heuristic (n-stage), averaged over 5 runs.
# Each call's return value is added directly to the running total.

nstage_performance = 0.0
nstage_clf = RandomForestClassifier(n_estimators=100)
print("n-stage stage heuristic")
for run in range(0, 5):
    score = MSCO.n_stages(
        nstage_clf, X, y, costs,
        min_max_norm=True,
        prob_thresh=.75,
        train_percent=.75,
    )
    nstage_performance = nstage_performance + score
    print("iter={} performance={}\n".format(run, score))
nstage_performance = nstage_performance / 5
print("n stage performance: {}".format(nstage_performance))

In [None]:
# Local beam search, called with an all-zero partition of length 15.
# NOTE(review): the earlier note on this cell said 50 solutions for each of 5
# generations, but the call below uses pop_size=70 and max_iter=10 — confirm
# which values are intended.

beam_performance = 0.0  # not read anywhere in this cell
beam_clf = RandomForestClassifier(n_estimators=100)
beam_partition = [0] * 15
print("beam search")
# Left as the cell's final expression so the notebook displays its result.
MSCO.beam(
    beam_clf, X, y, beam_partition, costs,
    pop_size=70,
    max_iter=10,
)

In [None]:
# Stochastic assignment: 5 rounds of 15 draws each. Every draw produces a
# partition via MSCO.stochastic_assn, scores it, and records the
# [partition, score] pair in `sols`. The best pair seen so far (largest
# score) is printed after each round and once more at the end.
stoch_clf = RandomForestClassifier(n_estimators=100)
sols = []
for round_idx in range(0, 5):
    for draw in range(0, 15):
        stoch_part = MSCO.stochastic_assn(15, 3, costs, benefits)

        staged = MSCO.staged_classify(stoch_clf, X, y, stoch_part, costs)
        stoch_perf = MSCO.pm_euclidean(staged)
        sols.append([stoch_part, stoch_perf])
        # Note: the logged "iter" is the draw index within the current round.
        print("iter={} performance={} partition={}\n".format(draw, stoch_perf, stoch_part))

    print(max(sols, key=lambda sol: sol[1]))
print(max(sols, key=lambda sol: sol[1]))

In [None]:
# Deterministic assignment, called with an all-zero list of length 15 plus the
# positional values 3 and 5 (their meaning is defined by MSCO — see its docs).
det_clf = RandomForestClassifier(n_estimators=100)
# Left as the cell's final expression so the notebook displays its result.
MSCO.deterministic_assn(det_clf, X, y, 3, [0] * 15, costs, 5)