In [None]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeRegressor,plot_tree
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
import timeit


In [None]:
# Execute the companion notebook so that the names it defines land in this kernel's namespace.
# NOTE(review): this notebook never imports gurobipy (Model, GRB, quicksum) and never defines
# get_leaf_indices, get_binary_path_to_leaf, add_deviation_bound, choose_dataset, line or the
# misic_* / *_IP_unconstrained solvers it calls below, so they presumably all come from here — confirm.
%run IP_functions.ipynb 

In [None]:
def make_continuous_variable_multi(estimators,m,in_leaf_var,num_total_features,eps,feature_lower_bounds,feature_upper_bounds):
    """Add per-feature split-selection variables and leaf-linking constraints to ``m``.

    The function works in two passes:

    1. For every tree and every leaf it derives, per feature, an upper and a
       lower bound from the thresholds of the nodes on the path to that leaf
       (falling back to ``feature_upper_bounds`` / ``feature_lower_bounds``
       when the path never splits on the feature).
    2. For every feature it creates one binary ``split_var`` per split node on
       that feature (across all trees) plus one binary ``last_var``, forces
       exactly one of them to be selected, and adds constraints that bound
       sums of ``in_leaf_var`` by sums of consecutive (threshold-ordered)
       selection variables.

    Parameters
    ----------
    estimators : sequence of fitted tree estimators; only ``.tree_`` is read
        (``node_count``, ``feature``, ``threshold``).
    m : model object; ``addVar``, ``addConstr`` and ``update`` are called on it.
    in_leaf_var : dict keyed by ``(leaf, tree_index)`` holding the leaf
        indicator variables created by the caller.
    num_total_features : number of features to iterate over.
    eps : tolerance added to a leaf's lower bound in one of the comparisons
        below (see the review note there).
    feature_lower_bounds, feature_upper_bounds : indexable by feature; used as
        the default bound for features not split on along a leaf's path.

    Returns
    -------
    (feature_var, m) : ``feature_var`` is the dict created at the top of the
        function; nothing is ever stored in it, so it is returned empty.
        ``m`` is the same model object, with variables and constraints added.
    """
    
    feature_var={}
    #The variable x_i in the model
    split_var={}
    last_var={}
    # NOTE(review): n_trees is never read after this assignment.
    n_trees=len(estimators)
    
    # Per tree: arrays of shape (num_nodes, num_total_features); only the rows of leaf nodes are filled in below.
    upper_bound_leaf={}
    lower_bound_leaf={}
    
    # ---- Pass 1: per-leaf feature bounds for every tree ----
    for tree_index,tree_in_forest in enumerate(estimators):

        tree=tree_in_forest.tree_
        num_nodes=tree.node_count
        ## leaf nodes
        index_leaves=np.array(get_leaf_indices(tree))
        # NOTE(review): n_leaves is never read after this assignment.
        n_leaves=index_leaves.size

        # 0 for left, 1 for right to all leaves in a dictionary
        # presumably binary_path[leaf] is an array over node ids with -1 for nodes off the path — confirm against get_binary_path_to_leaf
        binary_path=get_binary_path_to_leaf(np.ones(num_nodes)*(-1),tree,0)

        m.update()
        
        # Initialised to 1 everywhere; rows of non-leaf nodes keep that placeholder value.
        upper_bound_leaf[tree_index]=np.ones([num_nodes,num_total_features])
        lower_bound_leaf[tree_index]=np.ones([num_nodes,num_total_features])
        
        for leaf in index_leaves:

            # Nodes on the path to this leaf, separated by the branch direction recorded in binary_path.
            index_right=np.where(binary_path[leaf]==1)[0]
            index_left=np.where(binary_path[leaf]==0)[0]
            # Thresholds of "0" (left) nodes cap the feature from above, those of "1" (right) nodes from below.
            upper_threshold=tree.threshold[index_left]
            lower_threshold=tree.threshold[index_right]
            feature_split_left=tree.feature[index_left]
            feature_split_right=tree.feature[index_right]
       

            for feat in range(num_total_features):
                # Tightest upper bound: smallest threshold among the left-branch splits on this feature.
                if feat in feature_split_left:
                    split_loc=np.where(feature_split_left==feat)[0]
                    upper_bound_leaf[tree_index][leaf,feat]=min(upper_threshold[split_loc])
                else: upper_bound_leaf[tree_index][leaf,feat] = feature_upper_bounds[feat]

                # Tightest lower bound: largest threshold among the right-branch splits on this feature.
                if feat in feature_split_right:
                    split_loc=np.where(feature_split_right==feat)[0]
                    lower_bound_leaf[tree_index][leaf,feat]=max(lower_threshold[split_loc])
                else: lower_bound_leaf[tree_index][leaf,feat] = feature_lower_bounds[feat]
            
    
    # ---- Pass 2: per-feature selection variables and linking constraints ----
    for feat in range(num_total_features):
        # Node ids, tree ids and thresholds of every split on this feature, pooled over all trees.
        feat_indices_all=[]
        tree_indices_all=[]
        feat_thresholds_all=[]

        for tree_index,tree_in_forest in enumerate(estimators):

            tree=tree_in_forest.tree_
            feat_indices=np.where(tree.feature==feat)[0]
            feat_thresholds=tree.threshold[tree.feature==feat]
            order_indices=np.argsort(feat_thresholds)

            # One binary variable per split node on this feature, created in increasing threshold order within the tree.
            for i in range(len(feat_indices)):
                split_var[feat,feat_indices[order_indices][i],tree_index]=m.addVar(vtype=GRB.BINARY,name="split_var"+str(feat)+"-"+str(feat_indices[order_indices][i])+"_tree"+str(tree_index))

            
            # NOTE(review): np.append onto an initially empty list yields float arrays, so the node and
            # tree indices used as split_var keys further down are floats, while the keys were created
            # with integers above — the lookups rely on those comparing/hashing equal; confirm.
            feat_indices_all=np.append(feat_indices_all,feat_indices)
            feat_thresholds_all=np.append(feat_thresholds_all,feat_thresholds)
            tree_indices_all=np.append(tree_indices_all,np.ones(len(feat_indices))*tree_index)
            
        # Extra binary for this feature, selected when none of the split variables is.
        last_var[feat]=m.addVar(vtype=GRB.BINARY,name="last_var"+str(feat))
        
        # Global (across trees) ordering of this feature's splits by threshold.
        indices_to_sort=np.argsort(feat_thresholds_all)
        ordered_thresholds=feat_thresholds_all[indices_to_sort]
        ordered_tree_indices=tree_indices_all[indices_to_sort]
        ordered_indices=feat_indices_all[indices_to_sort]
        
        # Exactly one of the selection variables (splits or last_var) is active for this feature.
        m.addConstr(last_var[feat]+quicksum(split_var[feat,ordered_indices[k],ordered_tree_indices[k]] for k in range(len(indices_to_sort)))==1)
        
        for tree_index,tree_in_forest in enumerate(estimators):
            
            tree=tree_in_forest.tree_
            index_leaves=np.array(get_leaf_indices(tree))
            
            # For every window [i, j] of consecutive ordered thresholds: the leaves of this tree whose
            # [lower, upper] bound on the feature fits inside the window can only be chosen if one of
            # the window's split variables is selected.
            for i in range(len(indices_to_sort)):
                for j in range(len(indices_to_sort)):
                    if i<=j:
                        incl_leaf=[]
                        for leaf in index_leaves:
                            if i==0:
                                # No threshold below the window: only the upper bound has to fit.
                                if ordered_thresholds[j]>=upper_bound_leaf[tree_index][leaf,feat]:
                                    incl_leaf.append(leaf)
                            else:
                                # NOTE(review): eps is applied to the lower-bound test here but not in the
                                # two analogous tests further down — confirm whether that is intentional.
                                if ordered_thresholds[i-1]<=lower_bound_leaf[tree_index][leaf,feat]+eps:
                                    if ordered_thresholds[j]>=upper_bound_leaf[tree_index][leaf,feat]:
                                        incl_leaf.append(leaf)

                        m.addConstr(quicksum(split_var[feat,ordered_indices[k],ordered_tree_indices[k]]for k in np.arange(i,j+1)) 
                           >= quicksum(in_leaf_var[leaf,tree_index] for leaf in incl_leaf))
                
                # last j
                # Window open on the right: splits i..end plus last_var; only the lower bound is tested.
                incl_leaf=[]
                for leaf in index_leaves:
                    if i==0:
                        incl_leaf.append(leaf)
                    else:
                        if ordered_thresholds[i-1]<=lower_bound_leaf[tree_index][leaf,feat]:
                            incl_leaf.append(leaf)
                            
                m.addConstr(last_var[feat]+quicksum(split_var[feat,ordered_indices[k],ordered_tree_indices[k]]for k in np.arange(i,len(indices_to_sort))) 
                   >= quicksum(in_leaf_var[leaf,tree_index] for leaf in incl_leaf))
                
            #last i and j
            # Leaves whose lower bound is at or above the largest threshold require last_var.
            # When the feature has no splits at all, incl_leaf stays empty and the constraint is last_var >= 0.
            incl_leaf=[]
            if len(indices_to_sort)>0:
                for leaf in index_leaves:
                    if ordered_thresholds[len(indices_to_sort)-1]<=lower_bound_leaf[tree_index][leaf,feat]:
                        incl_leaf.append(leaf)
                    
            m.addConstr(last_var[feat] >= quicksum(in_leaf_var[leaf,tree_index] for leaf in incl_leaf))
            
    m.update()
    
    # feature_var is still the empty dict created at the top.
    return feature_var,m


In [None]:

def multilinear_formulation(estimators,num_total_features,feature_upper_bounds,feature_lower_bounds,x,eps,deviation_bound=False,make_relax=False):
    """Build and solve the multilinear forest-maximisation model.

    One continuous ``in_leaf_var`` is created per (leaf, tree) and each tree's
    leaf variables are constrained to sum to 1. ``make_continuous_variable_multi``
    then adds the per-feature selection variables and linking constraints. The
    objective maximises the average, over trees, of the selected leaves'
    ``tree_.value[leaf,0,0]``. The model is also written to
    ``mult_model_file.lp`` before solving.

    Parameters
    ----------
    estimators : sequence of fitted tree estimators (``.tree_`` is read).
    num_total_features : number of input features.
    feature_upper_bounds, feature_lower_bounds : per-feature bounds forwarded
        to ``make_continuous_variable_multi``.
    x : data forwarded to ``add_deviation_bound`` when ``deviation_bound`` is set.
    eps : tolerance forwarded to ``make_continuous_variable_multi``.
    deviation_bound : if truthy, ``add_deviation_bound`` is applied to the model.
    make_relax : if truthy, the relaxation returned by ``m.relax()`` is solved
        instead of ``m`` and no leaf assignment is extracted.

    Returns
    -------
    (feature_var_out, objective_value, leaf_out, runtime)
        ``feature_var_out`` is an array of ones of length ``num_total_features``
        (it is never filled in). ``leaf_out`` holds, per tree, the selected leaf
        index; it stays at its initial value of 1 when ``make_relax`` is set.
        ``runtime`` is the solver runtime of the model that was actually
        optimised (the relaxation when ``make_relax`` is set).
    """

    m = Model("RF")
    m.setParam('TimeLimit', 1800)
    num_trees=len(estimators)

    print("num_total_features",num_total_features)

    # Leaf node ids of every tree, computed once and reused for variables, objective and read-out.
    leaves_per_tree=[np.array(get_leaf_indices(tree_in_forest.tree_)) for tree_in_forest in estimators]

    #initialise variables
    in_leaf_var={}
    for tree_index,index_leaves in enumerate(leaves_per_tree):
        print('tree_index',tree_index)
        for leaf in index_leaves:
            in_leaf_var[leaf,tree_index]=m.addVar(name="in_leaf_var"+str(leaf)+"tree_index"+str(tree_index))

        # Exactly one unit of leaf weight per tree.
        m.addConstr(quicksum(in_leaf_var[leaf,tree_index] for leaf in index_leaves)==1)

    feature_var,m=make_continuous_variable_multi(estimators,m,in_leaf_var,num_total_features,eps,feature_lower_bounds,feature_upper_bounds)
    m.update()

    if deviation_bound:
        m=add_deviation_bound(m,feature_var,x,num_total_features)

    # Summing objective over all trees
    m.setObjective(
        quicksum(
            quicksum(in_leaf_var[leaf,tree_index]*tree_in_forest.tree_.value[leaf,0,0] for leaf in index_leaves)
            for tree_index,(tree_in_forest,index_leaves) in enumerate(zip(estimators,leaves_per_tree))
        )/float(num_trees),
        GRB.MAXIMIZE,
    )
    m.write("mult_model_file.lp")

    leaf_out=np.ones([num_trees])
    feature_var_out=np.ones([num_total_features])

    if make_relax:
        # The relaxation is a separate model object: objective value and runtime
        # must be read from it, not from the never-optimised ``m``.
        solved_model = m.relax()
        solved_model.optimize()
        obj = solved_model.getObjective()
    else:
        solved_model = m
        m.optimize()

        obj = m.getObjective()
        print("mult obj",obj.getValue())

        for tree_index,index_leaves in enumerate(leaves_per_tree):
            for leaf in index_leaves:
                # Solver values are floating point (e.g. 0.9999999), so test against 0.5 rather
                # than for exact equality with 1; the sum-to-one constraint guarantees at most
                # one leaf per tree can exceed 0.5.
                if in_leaf_var[leaf,tree_index].X>0.5:
                    leaf_out[tree_index]=leaf

    print('leaf_out',leaf_out)

    return feature_var_out,obj.getValue(),leaf_out, solved_model.Runtime



In [None]:

# ---- Experiment configuration ----
deviation_bound=False
make_continuous=False
dataset= 'abs_synth_noisy'
eps=0.0001
n_data = 5000
n_rep=1
make_relax=False

n_feature_set=[5]
trees_to_test=[1,2,4,8]
time_taken_proj=np.ones([len(trees_to_test)])
time_taken_big_M=np.ones([len(trees_to_test)])

# One timing array per formulation, indexed [feature setting, forest size, repetition].
time_misic=np.ones((len(n_feature_set),len(trees_to_test),n_rep))
time_proj=np.ones((len(n_feature_set),len(trees_to_test),n_rep))
time_bigm=np.ones((len(n_feature_set),len(trees_to_test),n_rep))
time_mult=np.ones((len(n_feature_set),len(trees_to_test),n_rep))
# Must be allocated here: the loop below assigns into it element-wise and the
# later cells read it, so leaving it undefined raises NameError on a fresh kernel.
time_misic_cuts=np.ones((len(n_feature_set),len(trees_to_test),n_rep))


for j,n_feature in enumerate(n_feature_set):
    for i,tree_size in enumerate(trees_to_test):
        for k in range(n_rep):
            # NOTE(review): choose_dataset's return value rebinds n_data and the loop variable
            # n_feature — presumably it echoes them back unchanged; confirm.
            x,y,feature_upper_bounds,feature_lower_bounds,n_data,n_feature=choose_dataset(dataset,n_data,n_feature)
            

            print("fitting tree")
            clf = RandomForestRegressor(n_estimators=tree_size,max_depth=8,random_state=1)
            clf.fit(x, y)
            print("tree fit \n")

            # Size summary of the fitted forest (leaf count and depth summed over trees).
            num_leaves=[]
            depth=[]
            for tree_index,tree_in_forest in enumerate(clf.estimators_):

                tree=tree_in_forest.tree_
                # Leaves are the nodes without a left child (children_left == -1).
                num_leaves.append(np.sum(tree.children_left== (-1)))
                depth.append(tree.max_depth)

            total_leaves=np.sum(num_leaves)
            total_depth=np.sum(depth)

            print("total_leaves",total_leaves)
            print("total_depth",total_depth)

            # Untimed reference solve; only its leaf assignment is printed.
            print("\n start velibor")
            misic_features,misic_obj,misic_leaf=misic_rf_unconstrained(clf.estimators_,n_feature,feature_upper_bounds,feature_lower_bounds,x,eps,make_continuous,deviation_bound)
            print("finish velibor \n")
            print('misic_leaf',misic_leaf)
            
            print("\n start mlo")
            start_time_mult = timeit.default_timer()
            mlo_features,mlo_obj,mlo_leaf,mlo_time=multilinear_formulation(clf.estimators_,n_feature,feature_upper_bounds,feature_lower_bounds,x, eps,deviation_bound)
            end_time_mult = timeit.default_timer()

            print("\n start proj")
            start_time_proj = timeit.default_timer()
            selected_jury_features_proj,opt_in_sample_scores_IP,proj_leaf,proj_time=projected_extended_random_forest_IP_unconstrained(clf.estimators_,n_feature,feature_upper_bounds,feature_lower_bounds,x, eps,deviation_bound)
            end_time_proj = timeit.default_timer()
            
            print("\n start misic")
            start_time_misic = timeit.default_timer()
            misic_opt(clf.estimators_,n_feature,feature_upper_bounds,feature_lower_bounds,x, eps,make_continuous,deviation_bound,make_relax)
            end_time_misic = timeit.default_timer()
            
            print("\n start bigm")
            start_time_bigm = timeit.default_timer()
            tighter_random_forest_IP_unconstrained(clf.estimators_,n_feature,make_relax)
            end_time_bigm = timeit.default_timer()
            
            print("\n start cuts")
            obj,leaf, time_misic_cuts[j,i,k]=misic_opt_tighter_with_cuts_2(clf.estimators_,n_feature,feature_upper_bounds,feature_lower_bounds,x, eps,make_continuous,deviation_bound,make_relax)


            # NOTE(review): proj/misic/bigm are wall-clock times around the whole call, whereas
            # mlo and cuts use the time value returned by the solver function — confirm the
            # columns are meant to be compared directly.
            time_proj[j,i,k]=end_time_proj-start_time_proj
            time_misic[j,i,k]=end_time_misic-start_time_misic
            time_bigm[j,i,k]=end_time_bigm-start_time_bigm
            time_mult[j,i,k]=mlo_time
       

In [None]:
# Every timing array shares this layout: (feature setting, forest size, repetition).
grid_shape=(len(n_feature_set),len(trees_to_test),n_rep)

# Label arrays aligned with the timing arrays: forest size varies along axis 1,
# feature count along axis 0; both are repeated over the remaining axes.
trees_tested=np.broadcast_to(
    np.asarray(trees_to_test,dtype=float)[np.newaxis,:,np.newaxis],grid_shape
).copy()
n_feature_tested=np.broadcast_to(
    np.asarray(n_feature_set,dtype=float)[:,np.newaxis,np.newaxis],grid_shape
).copy()

# One flat column per formulation plus the two label columns.
result_columns=[
    ('mlo',time_mult),
    ('projected',time_proj),
    ('misic',time_misic),
    ('bigM',time_bigm),
    ('cuts',time_misic_cuts),
    ('num_trees',trees_tested),
    ('n_feature',n_feature_tested),
]
d = {label: values.flatten() for label,values in result_columns}
df = pd.DataFrame(data=d)

df.to_csv("output/synth_sims_plus_cuts_mult.csv")

In [None]:
# One log-log figure per feature setting: mean solve time per formulation against
# forest size, with the standard deviation over repetitions passed as the error band.
for j,n_feature in enumerate(n_feature_set):

    # NOTE(review): the 0:4 slice keeps only the first four forest sizes; it covers all of
    # trees_to_test as configured above, but would silently truncate a longer list — confirm.
    d = {'projected': time_proj[j,0:4,:].flatten(), 'misic': time_misic[j,0:4,:].flatten(), 
         'bigM': time_bigm[j,0:4,:].flatten(),'elbow': time_misic_cuts[j,0:4,:].flatten(),'mlo': time_mult[j,0:4,:].flatten(),'num_trees':trees_tested[j,0:4,:].flatten()}
    df = pd.DataFrame(data=d)

    # Aggregate over repetitions, then reshape to long format (one row per forest size and method).
    by_num_trees=df.groupby(['num_trees'],as_index=False)
    means=by_num_trees.mean()
    stds=by_num_trees.std()
    temp=pd.melt(means, id_vars =['num_trees'])
    # Both melts share the same row order, so the std column lines up with the means.
    temp['std_dev']=pd.melt(stds, id_vars =['num_trees']).value

    fig = line(
        data_frame = temp,
        x = 'num_trees',
        y = 'value',
        error_y = 'std_dev',
        error_y_mode = 'band',
        color = 'variable',
        title = '',
        markers = '.',
    )
    fig.update_yaxes(type="log")
    fig.update_xaxes(type="log")
    # Axis title, figure size and legend placement in a single layout update.
    fig.update_layout(
        yaxis_title="Solve time (s)",
        width=500,
        height=500,
        legend=dict(title_text='',orientation = "h", yanchor="top",y=1.15),
    )
    fig.show()
    fig.write_image("plots/synth_with_cuts_feat_"+str(n_feature)+"mlo.pdf")
    
    