In [None]:
# Step one directory up and show the resulting working directory.
# Presumably this makes the `experiments` package importable and the relative
# `experiments/results*` paths below resolvable -- TODO confirm where the
# notebook lives.
# NOTE(review): `%cd ..` is relative, so re-running this cell without a kernel
# restart climbs one more level each time.
%cd ..
%pwd

In [None]:
from experiments.modeleval_utils import *
from sklearn.model_selection import cross_validate

In [None]:
# Single switch between the two experiment configurations. Both dependent
# values are derived from it, so they cannot get out of sync:
#   False -> no feature selection, results in 'experiments/results'
#   True  -> feature selection,    results in 'experiments/results_FS'
USE_FEATURE_SELECTION = True

# Handed to the data loaders as their second argument.
drop_columns = USE_FEATURE_SELECTION
# Directory the evaluations are written to / read from.
output_dir = 'experiments/results_FS' if USE_FEATURE_SELECTION else 'experiments/results'

## Training all models with best hyperparameters

In [None]:
def _classification_metrics(y_true, y_pred):
    """Recall, precision and F1 for thresholded predictions.

    Returns a dict with one ``{'recall', 'precision', 'f1'}`` entry for the
    positive class (label 1, 'regression'), the negative class (label 0,
    'non-regression') and the weighted and macro averages.
    """
    def scores(**kwargs):
        # zero_division=0 on every metric: when a threshold predicts a single
        # class the undefined ratio is reported as 0 without a warning.
        return {
            'recall': metrics.recall_score(y_true, y_pred, zero_division=0, **kwargs),
            'precision': metrics.precision_score(y_true, y_pred, zero_division=0, **kwargs),
            'f1': metrics.f1_score(y_true, y_pred, zero_division=0, **kwargs),
        }

    return {
        'regression': scores(pos_label=1),
        'non-regression': scores(pos_label=0),
        'avg_weighted': scores(average='weighted'),
        'avg_macro': scores(average='macro'),
    }


def evaluate_pipeline(output_dir, model, target, data, feature_type, scoring, X_train, y_train, X_test, y_test):
    """Cross-validate, fit and score one model on a train/test split.

    The pipeline, its best hyperparameters and best search result come from
    ``get_pipeline`` (presumably loaded from a previous search stored under
    ``output_dir`` -- confirm in ``modeleval_utils``).

    Returns a dict with the keys:
      'model', 'validation' (raw ``cross_validate`` output), 'fitted_pipeline',
      'best_params', 'best_result', and one report per split:
        'train'       -- train data, threshold chosen on train
        'test'        -- test data, threshold chosen on train
        'test_pareto' -- test data, threshold chosen on the test data itself,
                         i.e. an optimistic reference rather than a fair score
    Each report holds 'average_precision', 'roc_auc', 'threshold' and the
    per-class / averaged recall, precision and F1.
    """
    evaluation = {'model': model}

    pipeline, best_params, best_result = get_pipeline(output_dir, data, feature_type, target, scoring, model)

    # Size every validation fold so that it is the same share of the training
    # data as the held-out test set is of the whole data set.
    y_test_proportion = len(y_test) / (len(y_train) + len(y_test))
    tscv = TimeSeriesSplit(n_splits=5, test_size=round(len(y_train) * y_test_proportion))
    evaluation['validation'] = cross_validate(
        pipeline, X_train, y_train,
        scoring=['average_precision', 'roc_auc'], cv=tscv, n_jobs=5)

    pipeline.fit(X_train, y_train)
    evaluation['fitted_pipeline'] = pipeline
    evaluation['best_params'] = best_params
    evaluation['best_result'] = best_result

    threshold_train, f1_train = get_best_f1_threshold(pipeline, X_train, y_train)
    threshold_test, f1_test = get_best_f1_threshold(pipeline, X_test, y_test)
    print(f'{threshold_train=} {f1_train=}, {threshold_test=}, {f1_test=}')

    # Score each data set once; both test reports share the same scores.
    y_score_train = get_y_score(pipeline, X_train)
    y_score_test = get_y_score(pipeline, X_test)

    for split, y_true, y_score, threshold in [
        ('train', y_train, y_score_train, threshold_train),
        ('test', y_test, y_score_test, threshold_train),
        ('test_pareto', y_test, y_score_test, threshold_test),
    ]:
        y_pred = y_score >= threshold

        evaluation[split] = {
            'average_precision': metrics.average_precision_score(y_true, y_score),
            'roc_auc': metrics.roc_auc_score(y_true, y_score),
            'threshold': threshold,
            **_classification_metrics(y_true, y_pred),
        }

    return evaluation

def evaluate_pipelines(output_dir, target, data, feature_type, scoring, X_train, y_train, X_test, y_test):
    """Evaluate every entry of the global ``models`` on the given split.

    Evaluation is best-effort: a model whose evaluation raises is reported
    (model, exception type, message and full traceback) and skipped, so the
    returned list may contain fewer entries than ``models``.

    Returns the list of ``evaluate_pipeline`` results, in ``models`` order.
    """
    import traceback  # local import keeps this cell self-contained

    evaluations = []
    for model in models:
        print(model)
        try:
            evaluation = evaluate_pipeline(output_dir, model, target, data, feature_type, scoring, X_train, y_train, X_test, y_test)
        except Exception as e:
            # Keep going, but make it obvious which model is missing and why.
            print(f'{model} failed with {type(e).__name__}: {e}')
            traceback.print_exc()
        else:
            evaluations.append(evaluation)

    return evaluations


In [None]:
# Evaluate all models for every (dataset, target) pair and feature type.
# Results are keyed '<data>_<target>_<feature_type>'.
dataset_targets = [
    ('fixed_defect_szz', 'performance'),
    ('bugbug_buglevel', 'performance'),
    ('bugbug_buglevel', 'regression'),
]
feature_types = ['traditional', 'bow']
scoring = 'average_precision'

evaluations_by_config = {}
for data, target in dataset_targets:
    for feature_type in feature_types:
        print('\n\n', '--> ', data, target, feature_type, '\n\n')

        load_dataset = data_map[feature_type][data]
        X, y, features = load_dataset(target, drop_columns)
        # shuffle=False: the split is taken without reordering the rows.
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, shuffle=False)

        evaluations = evaluate_pipelines(
            output_dir, target, data, feature_type, scoring,
            X_train, y_train, X_test, y_test)
        config_key = '_'.join((data, target, feature_type))
        evaluations_by_config[config_key] = evaluations

In [None]:
import pickle

# Persist every evaluation (fitted pipelines included) so the table and plot
# cells below can run without retraining.
evaluations_path = os.path.join(output_dir, 'evaluations.pickle')
with open(evaluations_path, 'wb') as f:
    pickle.dump(evaluations_by_config, f)

## Creating Tables with Model Results

In [None]:
# Display the active configuration before loading its stored evaluations.
output_dir, drop_columns

In [None]:
import pickle

# NOTE: unpickling executes code stored in the file -- only load an
# evaluations.pickle that this notebook produced itself.
evaluations_path = os.path.join(output_dir, 'evaluations.pickle')
with open(evaluations_path, 'rb') as f:
    evaluations_by_config = pickle.load(f)

In [None]:
def get_hyperparam_search_stat_df(evaluation):
    """Build a one-row table of ranking metrics for a single model.

    The row is labelled with the model's display name and has two-level
    columns: (train | validation | test) x (Avg. Precision | AUC). Validation
    values are the mean over the cross-validation folds.
    """
    splits = ['train', 'validation', 'test']
    metric_keys = ['average_precision', 'roc_auc']

    row = []
    for split in splits:
        for key in metric_keys:
            if split == 'validation':
                # cross_validate stores per-fold scores under 'test_<metric>'
                row.append(evaluation['validation'][f'test_{key}'].mean())
            else:
                row.append(evaluation[split][key])

    header = [
        np.repeat(splits, len(metric_keys)),
        np.tile(['Avg. Precision', 'AUC'], len(splits)),
    ]
    model_label = model_names[evaluation['model']]
    return pd.DataFrame([row], index=[model_label], columns=header)

In [None]:
def get_classification_report_df(evaluation):
    """Build the per-class recall/precision/F1 table for a single model.

    Rows are (model display name, class) for the 'regression' and
    'non-regression' classes; columns are (split, metric) for the 'train',
    'test' and 'test_pareto' splits. The averaged metrics stored in the
    evaluation ('avg_macro', 'avg_weighted') are not included.
    """
    model_label = model_names[evaluation['model']]
    class_labels = ['regression', 'non-regression']
    row_index = [[model_label] * 2, class_labels]

    per_split = []
    for split in ('train', 'test', 'test_pareto'):
        report = pd.DataFrame(
            [evaluation[split][label] for label in class_labels],
            index=row_index,
        )
        # Relabel positionally: the metric dicts are ordered recall, precision, f1.
        report.columns = [
            np.array([split] * 3),
            np.array(['recall', 'precision', 'F1']),
        ]
        per_split.append(report)

    return pd.concat(per_split, axis=1)

In [None]:
# Available '<data>_<target>_<feature_type>' configurations.
list(evaluations_by_config)

In [None]:
# Configuration whose tables and plots are produced below; it must be one of
# the keys of `evaluations_by_config`.
config = 'bugbug_buglevel_regression_bow'
evaluations = evaluations_by_config[config]

In [None]:
# Recover the parts of the '<data>_<target>_<feature_type>' key. The dataset
# name may itself contain underscores, so split from the right.
data, target, feature_type = config.rsplit('_', 2)
data, target, feature_type

In [None]:
# Render floats with four decimals in displayed and exported tables.
pd.set_option('display.float_format', '{:.4f}'.format)

In [None]:
# One row of ranking metrics per evaluated model.
stat_rows = list(map(get_hyperparam_search_stat_df, evaluations))
stats = pd.concat(stat_rows, axis=0)
stats

In [None]:
# Wrap the highest value of every column in \textbf{...} for the LaTeX table.
# NOTE: the columns hold text afterwards, so run this once on a freshly built
# `stats`; a second run would try to take argmax over mixed text/floats.
for c in stats.columns: #[('test', 'Avg. Precision'), ('test', 'AUC')]:
    amax = stats[c].argmax()
    best = stats[c].iloc[amax]
    # Putting a string into a float64 column is an incompatible-dtype setitem
    # (deprecated in pandas 2.1, slated to raise); make the column object first.
    stats[c] = stats[c].astype(object)
    stats.loc[stats.index[amax], c] = f'\\textbf{{{best:.4f}}}'

In [None]:
# escape=False leaves the \textbf{...} markup in the cells unescaped.
print(stats.to_latex(escape=False))

In [None]:
# Two rows (regression / non-regression) per evaluated model.
report_frames = list(map(get_classification_report_df, evaluations))
classification_reports = pd.concat(report_frames, axis=0)
classification_reports

In [None]:
# For every split, wrap the best F1 of the 'regression' class in \textbf{...}.
# NOTE: the touched columns hold text afterwards, so run this once on a
# freshly built `classification_reports`.
for c in [('train', 'F1'), ('test', 'F1'), ('test_pareto', 'F1')]:
    # Rows of the 'regression' class only. NOTE(review): the code below relies
    # on this selection being indexed by model name alone -- confirm.
    reg = classification_reports.loc[pd.IndexSlice[:,'regression', :]]
    amax = reg[c].argmax()
    v = classification_reports.loc[(reg.index[amax], 'regression'), c]
    # Putting a string into a float64 column is an incompatible-dtype setitem
    # (deprecated in pandas 2.1, slated to raise); make the column object first.
    classification_reports[c] = classification_reports[c].astype(object)
    classification_reports.loc[(reg.index[amax], 'regression'), c] = f'\\textbf{{{v:.4f}}}'

In [None]:
# Post-process the exported table: long model names become two-line
# \multirow cells, and every model after the first is preceded by a double rule.
latex_replacements = [
    ('Dummy Classifier', '\\multirow{2}{2.7cm}{Dummy\\\\Classifier}'),
    ('\nLogistic Regression', '\\hline\\hline\n\\multirow{2}{2.7cm}{Logistic\\\\Regression}'),
    ('\nSupport Vector Machine', '\\hline\\hline\n\\multirow{2}{2.7cm}{Support Vector\\\\Machine}'),
    ('\nMulti-Layer Perceptron', '\\hline\\hline\n\\multirow{2}{2.7cm}{Multi-Layer\\\\Perceptron}'),
    ('\nRandom Forest', '\\hline\\hline\nRandom Forest'),
    ('\nXGBoost ', '\\hline\\hline\nXGBoost'),
    ('\nTPOT ', '\\hline\\hline\nTPOT'),
]

s = classification_reports.to_latex(escape=False)
for old, new in latex_replacements:
    s = s.replace(old, new)
print(s)

In [None]:
# Rebuild the train/test split of the selected configuration (same loader
# arguments and split settings as in the training loop).
load_dataset = data_map[feature_type][data]
X, y, features = load_dataset(target, drop_columns)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, shuffle=False)

In [None]:
# (display name, fitted pipeline) pairs for the plots. The first evaluation is
# skipped as the dummy baseline.
# NOTE(review): this is positional -- it assumes the dummy model was evaluated
# first and did not fail; confirm against `models`.
fitted_pipelines = [
    (model_names[evaluation['model']], evaluation['fitted_pipeline'])
    for evaluation in evaluations[1:]
]

In [None]:
scoring = 'average_precision'
# The 'fixed_defect_szz' plots use a tighter y-axis (0-0.3); all others 0-1.
y_upper = 0.3 if data == 'fixed_defect_szz' else 1.0
plot_roc_auc_rec_prec_for_all_models(
    target, data, feature_type, scoring,
    fitted_pipelines, X_train, X_test, y_train, y_test,
    save=False,
    figsize=(6,4),
    ylim=[0, y_upper],
    output_dir=output_dir,
)