In [3]:
# -*- coding: utf-8 -*-
"""
Created on Sat Jul 18 14:57:57 2020

@author: oxenb
"""
import pandas as pd 
import json
import xgboost as xgb
from sklearn.metrics import roc_auc_score,accuracy_score
import matplotlib.pyplot as plt
import shap


###global



##read mesure

def TranformReslut(measuers_input,metaFeatures):
    """calculate the wining algo for each dataset from measuers_input table and add 2 columns to metaFeatures table.
       The first indicate the model used and the second column indicate if he got the best score with bool value. 
        
        ----------
        measuers_input : {dataFrame}  measures from the CompereAlgo moudle
        
        metaFeatures : {dataFrame}  meta features table 
        
        Returns
        -------

            
    """
    #create table of who won in each dataset
    winAlgo= measuers_input.iloc[measuers_input.groupby(['Dataset_Name'])['AUC'].idxmax()][["Dataset_Name","AlgoName"]]
    winAlgo["win"] = True
    scores = measuers_input[["Dataset_Name","AlgoName"]].drop_duplicates()
    winAlgo = pd.merge(winAlgo, scores, on = ["Dataset_Name","AlgoName"], how = 'outer').fillna(False)
    
    #filtter datasets that are not in metaFeatures table
    metaFeatures = metaFeatures[metaFeatures['Dataset_Name'].isin(  winAlgo['Dataset_Name'].unique())]
    
    winAlgo.to_csv("../data/results/winAlgo.csv")
    
    #create the metadata table, combination of metaFeatures with the win algo as binary
    metadata = pd.merge(winAlgo, metaFeatures, on='Dataset_Name', how='left')
    
    #check mismatched after merge
    if metadata["Dataset_Name"].isnull().values.any():
        raise Exception("Sorry... need to fix mismatched columns")
    
    return metadata




def ReadResults():
    """read results from CompereAlgo moudle and change dataset name column name to 'Dataset_Name'.
       read meta features table
       Fix mismatched columns bettwen meta features table and measuers_input.
        
        ----------
        measuers_input : {dataFrame}  measures from the CompereAlgo moudle
        
        metaFeatures : {dataFrame}  meta features table 
        
        Returns
        -------

            
    """
    #read measuers_input
    measuers_input = pd.read_csv('../data/results/measuers.csv')
    measuers_input["HP_vals"] = measuers_input["HP_vals"].apply(lambda x : json.loads(x.replace("\'", "\"")))
    
    #read metaFeatures
    metaFeatures = pd.read_csv('../data/ClassificationAllMetaFeatures.csv')
    
    #change dataset column name to be the same
    columns = list(metaFeatures.columns)
    columns[0] = 'Dataset_Name'
    metaFeatures.columns = columns
    #fix diff in dataset names on both tables
    metaFeatures.replace("abalone","abalon",inplace = True)
    

    
    return TranformReslut(measuers_input,metaFeatures)
    




##preprocess
def preprocess(metadata):
    """clean null values and columns with same values, activate one hot encoder on the algo name column
        
        ----------
        metadata : {dataFrame} 
                
        Returns
        -------
        metadata : {dataFrame} 
            
    """
    #remove all nan columns
    metadata = metadata.dropna(how = 'all', axis = 1)
    #remove all same columns
    nunique = metadata.apply(pd.Series.nunique)
    colsToDrop = nunique[nunique == 1].index
    metadata = metadata.drop(colsToDrop, axis=1)
    
    df_ohe_features = pd.get_dummies(metadata["AlgoName"],prefix = "AlgoName" ,columns = "AlgoName" )
    
    metadata = pd.concat([metadata,df_ohe_features],axis =1 )
    metadata.drop("AlgoName", inplace=True,axis = 1)
    return metadata


def createDataForModel(data):
    """convert dataframe to array and create the label array
        
        ----------
        data : {dataFrame} 
        Returns
        -------
        X,y : {array like} 
        
    """
    y = data.pop("win").values
    X = data.values
    X = X[:,1:]
    return X,y
    
def calcMeasures(metadata):   
    """calculate the xgboost model over all the datasets with leave on out and get the measures and export to csv
        
        ----------
        metadata : {dataFrame} 
        Returns
        -------
        metadata : {dataFrame} 
        
    """
    allDatasets = metadata["Dataset_Name"].unique()
    measuers = pd.DataFrame(columns = ["dataset_name","ACC","AUC","importance_cover","importance_gain","importance_weight","shap"])
    index = 0
    #leave one out
    for dataset in allDatasets:
        metadataTest = metadata[ metadata["Dataset_Name"] == dataset]
        metadataTrain = metadata[ metadata["Dataset_Name"] != dataset]
        
        X_train,y_train = createDataForModel(metadataTrain)
        X_test,y_test = createDataForModel(metadataTest)
        ##model
        xgb_model = xgb.XGBClassifier()

        xgb_model.fit(X_train,y_train)
        y_pred = xgb_model.predict(X_test)
        y_pred_proba = xgb_model.predict_proba(X_test)
        
        
        ACC = accuracy_score(y_test,y_pred)
        AUC = roc_auc_score(y_test, y_pred_proba[:,1], average = 'macro')
        importance_cover = xgb_model.get_booster().get_score(importance_type= "cover")
        importance_gain = xgb_model.get_booster().get_score(importance_type= "gain")
        importance_weight = xgb_model.get_booster().get_score(importance_type= "weight")
        booster = xgb_model.get_booster()
        shapVal = booster.predict(xgb.DMatrix(X_test), pred_contribs=True)
        measuers.loc[index] = [dataset,ACC,AUC,importance_cover,importance_gain,importance_weight,shapVal]
        index +=1


        measuers.to_csv("../data/results/measuers_meta.csv")
        measuers.to_pickle('../data/results/measuers_meta_pickle.csv')
    
    #claclulate importances
    X,y = createDataForModel(metadata)
    ##model
    xgb_model = xgb.XGBClassifier()
    xgb_model.fit(X,y)
    importance_types = ["cover","gain","weight"]
    #crete plots of importance types amd save them
    for importance in importance_types:
        ax = xgb.plot_importance(xgb_model.get_booster(), importance_type=importance,max_num_features = 10,show_values = False,title = f'importance by {importance}')
        ax.figure.savefig(f'../figures/importances_meta/{importance}.png')
    #shap
    fig = plt.figure()
    mybooster = xgb_model
    explainer = shap.TreeExplainer(mybooster)
    shap_values = explainer.shap_values(X)
    shap.summary_plot(shap_values, X, plot_type="bar")

    
    
def runMetaclassifier():
    """main for this moudle
        
        ----------
                
        Returns
        -------
            
    """
    metadata = ReadResults() 

    metadata = preprocess(metadata)

    
    calcMeasures(metadata)



Unnamed: 0,Dataset_Name,AlgoName,win,f1,f2,f3,f4,f5,f6,f7,...,instances,dimensionality,instances_with_missing_val,ratio_of_discrete_features,ratio_of_numeric_features,nonzero_vals_cnt,missing values,attributes total,attributes categorical,attributes numerical
0,abalon,RF_baseClassfier,True,0.651704,0.946056,0.991593,0.209084,0.838805,0.248976,0.644461,...,4177.0,0.002155,0.0,0.0,1.0,37593.0,0.0,9.0,0.0,9.0
1,acute-inflammation,infiboost,True,-0.438077,0.733125,0.912447,0.075615,0.292888,-0.684230,0.338055,...,120.0,0.058333,0.0,0.0,1.0,840.0,0.0,7.0,0.0,7.0
2,acute-nephritis,infiboost,True,-0.668322,0.911725,0.999976,-0.835612,0.911701,-0.169281,-0.169281,...,120.0,0.066667,0.0,0.0,1.0,960.0,0.0,8.0,0.0,8.0
3,analcatdata_asbestos,infiboost,True,-0.014054,0.853795,0.944905,-0.750590,0.603987,-0.302527,0.476077,...,83.0,0.048193,0.0,0.0,1.0,332.0,0.0,4.0,0.0,4.0
4,analcatdata_boxing1,KTBoost,True,0.138858,0.974413,0.994025,-0.885632,0.922272,-0.162542,0.961512,...,120.0,0.033333,0.0,0.0,1.0,480.0,0.0,4.0,0.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
595,white-clover,RF_baseClassfier,False,-0.910279,0.926851,0.993169,-0.926543,-0.706208,-0.215606,-0.215606,...,63.0,0.507937,0.0,0.0,1.0,2016.0,0.0,32.0,0.0,32.0
596,wine-quality-red,RF_baseClassfier,False,,,,,,,,...,,,,,,,,,,
597,wine,RF_baseClassfier,False,-0.944565,0.969238,0.994326,-0.898982,0.264544,-0.834253,0.857846,...,1599.0,0.007505,0.0,0.0,1.0,19188.0,0.0,12.0,0.0,12.0
598,yeast,RF_baseClassfier,False,-0.859968,0.962247,0.997648,-0.988930,-0.535747,-0.859820,0.944770,...,1484.0,0.006065,0.0,0.0,1.0,13356.0,0.0,9.0,0.0,9.0
