In [1]:
import os, sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
mpl.use("pgf")

from glob import glob

from dataset import *

# Hack: make sure the current working directory ('.') is on sys.path so that
# local modules can be imported from this notebook.
# NOTE(review): if modules from the *parent* directory are needed, the path
# would have to be '..' instead of '.' - confirm the intent.
module_path = os.path.abspath('.')
if module_path not in sys.path:
    sys.path.append(module_path)

# Matplotlib text settings used for all figures produced below.
plt.rcParams["font.family"] = "serif"  # use serif/main font for text elements
plt.rcParams["text.usetex"] = True     # render text (including tick labels) with LaTeX
plt.rcParams["pgf.rcfonts"] = False    # don't setup fonts from rc parameters


In [5]:
# Tiny scratch frame used to try out positional slicing.
dp = pd.DataFrame(dict(a=[1, 2, 3, 4], b=[3, 4, 5, 4]))

In [9]:
# Positional slice: rows 1 and 2 (the end position 3 is exclusive).
dp.iloc[1:3]

Unnamed: 0,a,b
1,2,4
2,3,5


In [40]:
# Registry of dataset factories, keyed by an integer id.
# The insertion order below is the order in which save_plots() walks the
# datasets when no specific key is requested, so it is kept as-is.
datasets = {
    0: deepmatcher_structured_amazon_google,
    4: deepmatcher_structured_walmart_amazon,
    5: deepmatcher_textual_abt_buy,
    1: deepmatcher_structured_dblp_acm,
    2: deepmatcher_structured_dblp_google_scholar,
    # 3: deepmatcher_structured_itunes_amazon,  (currently disabled)
}

def save_plots(models=['.'], specific=None, x='labeled_instances', y='test_f1', experiments=[0,1,3,5,6, 'ml'], include={'max':False}, min_labeled=200, max_labeled=1000, file='results'):
    """
    Main function for saving plots based on test results.

    One plot is produced per dataset by delegating to ``plot_results``.

    params:
    models: list of model folder names whose results should be plotted
    specific: key into ``datasets`` selecting a single dataset; ``None``
        (default) plots every registered dataset in registry order
    x: name of the result column to use on the x-axis
    y: name of the result column to use on the y-axis
    experiments: experiment ids; each id ``nr`` is turned into the tag
        ``exp<nr>`` (e.g. ``0`` -> ``'exp0'``, ``'ml'`` -> ``'expml'``)
    include: mapping ``reference-line name -> bool`` telling whether that
        baseline reference line is drawn (the calls in this notebook use the
        names ``'Max'``, ``'1/2'`` and ``'1/4'``)
    min_labeled: lower bound forwarded to ``plot_results``
    max_labeled: upper bound forwarded to ``plot_results``
    file: substring a result file's path must contain to be used

    raises:
    KeyError: if ``specific`` is not a key of ``datasets``
    """
    # NOTE: the list/dict defaults are shared between calls; that is safe here
    # because they are only read, never mutated.
    exps = [f'exp{nr}' for nr in experiments]

    if specific is not None:
        selected = [datasets[specific]]
    else:
        selected = list(datasets.values())

    # Both the single-dataset and the all-datasets path go through the same
    # call, so max_labeled is always forwarded (it used to be dropped when
    # `specific` was given, which made that path fail with a TypeError).
    for dataset in selected:
        plot_results(models, dataset(), x, y, exps, include, min_labeled, max_labeled, file)

def _experiment_in_name(basename, exp):
    """Return True if the tag `exp` occurs in `basename` and is not directly followed by a digit."""
    start = basename.find(exp)
    while start != -1:
        end = start + len(exp)
        # 'exp5' must not match inside 'exp55': reject hits that continue with a digit.
        if end >= len(basename) or not basename[end].isdigit():
            return True
        start = basename.find(exp, start + 1)
    return False


def _rows_in_range(df, column, lower, upper):
    """Return the rows of `df` whose `column` value lies in the closed interval [lower, upper]."""
    in_range = (df[column] >= lower) & (df[column] <= upper)
    return df.loc[in_range, :]


def plot_results(models, dataset, x, y, experiments, include, min_labeled, max_labeled, file):
    """
    Collect the result CSVs of one dataset and hand them to ``make_plot``.

    For every model, the files ``<model>/out/<dataset.name>/*.csv`` are scanned
    in sorted order. A file is considered when its path contains ``file`` (or
    when the model is ``'ml'``), and it is assigned to an experiment when the
    experiment tag occurs in the file's base name.

    params:
    models: list of model folder names
    dataset: object exposing a ``name`` attribute (used in the glob pattern)
    x, y: column names forwarded to ``make_plot``; ``y`` is also read from the
        baseline file to build the reference values
    experiments: list of experiment tags such as ``'exp0'`` or ``'expml'``
    include: forwarded to ``make_plot``
    min_labeled, max_labeled: inclusive bounds on ``labeled_instances``
        (``orakle_instances`` for ``exp6``); for ``expml`` they are used as a
        positional row slice instead
    file: substring a file path must contain to be used
    """
    scores = {}
    max_val = None  # never computed here; forwarded because make_plot expects the argument

    for model in models:
        for filename in sorted(glob(f'{model}/out/{dataset.name}/*.csv')):
            if not (file in filename or model == 'ml'):
                continue

            # Match experiment tags against the base name only, so directory
            # names (model, dataset) cannot produce accidental matches.
            basename = os.path.basename(filename)

            for exp in experiments:
                tagged = _experiment_in_name(basename, exp)
                key = f'{model}_{exp}'

                if exp == 'exp0' and tagged:
                    df = pd.read_csv(filename)
                    scores[key] = _rows_in_range(df, 'labeled_instances', min_labeled, max_labeled)

                    # Reference values taken from the last rows of the baseline file.
                    # NOTE(review): assumes the last, second-to-last and third-to-last
                    # rows are the runs on all, half and a quarter of the data - confirm.
                    n_rows = len(df)
                    if n_rows >= 1:
                        total_values = df['labeled_instances'].iloc[-1]
                        references = (
                            ('Max', total_values, 1),
                            ('1/2', total_values // 2, 2),
                            ('1/4', total_values // 4, 3),
                        )
                        for ref_name, count, offset in references:
                            # Skip reference rows a short file does not have
                            # instead of failing with an IndexError.
                            if n_rows >= offset:
                                scores[f'{key}_{ref_name}#{count}'] = df[y].iloc[-offset]

                elif exp == 'exp6' and tagged:
                    df = pd.read_csv(filename)
                    scores[key] = _rows_in_range(df, 'orakle_instances', min_labeled, max_labeled)

                elif exp == 'expml' and '_data' in filename:
                    df = pd.read_csv(filename)
                    df = df.rename({'f1': 'test_f1', 'precision': 'test_prec', 'recall': 'test_recall'}, axis=1)
                    # These files have no labeled_instances column; the row index is used as x value.
                    df['labeled_instances'] = df.index.values
                    scores[key] = df.iloc[min_labeled:max_labeled]

                elif tagged:
                    df = pd.read_csv(filename)
                    scores[key] = _rows_in_range(df, 'labeled_instances', min_labeled, max_labeled)

    make_plot(scores, x, y, dataset, models, experiments, max_val, include=include)
    
def make_plot(scores, x, y, dataset, models, experiments, max_val, include):
    """
    Draw all collected curves of one dataset and save the figure as a PDF.

    params:
    scores: dict mapping a key to its data. Keys whose last ``_``-separated
        part contains ``'exp'`` (e.g. ``'<model>_exp3'``) hold a table with
        the columns ``x`` / ``y`` and are drawn as a curve; keys ending in
        ``'<name>#<count>'`` hold a single number and are drawn as a red
        horizontal reference line when ``include[<name>]`` is truthy
    x: column used on the x-axis (``exp6`` curves always use ``orakle_instances``)
    y: column used on the y-axis
    dataset: object exposing ``name``; the part after the last ``/`` is the plot title
    models: list of model names (strings), used in the output directory name
    experiments: list of experiment tags (strings), used in the output directory name
    max_val: unused; kept so existing callers keep working
    include: dict ``reference-line name -> bool``

    The figure is written to ``graps/<models>/<experiments>/<title>_<y>.pdf``.
    """
    title_mapping = {
        'exp0': 'Baseline',
        'exp1': 'Partition-4',
        'exp3': 'Hybrid',
        'exp4': 'Hybrid-Partition-2',
        'exp5': 'Uncertainity',
        'exp6': 'Partition-2',
        'exp55': 'Balanced-Uncertainity',
        'expml': 'ML'
    }
    y_mapping = {
        'test_f1': 'F1-score',
        'iteration_time': 'Iteration time',
        'train_positive_rate': 'Train positive rate',
        'pool_positive_rate': 'Pool positive rate',
    }
    x_mapping = {
        'labeled_instances': 'Labeled examples'
    }

    fig, ax = plt.subplots(figsize=(10, 4), facecolor='w', edgecolor='k', dpi=200)  # set size of plot
    try:
        # Legend entries are tied to the artists they describe instead of
        # relying on a parallel list staying in drawing order.
        handles, labels = [], []

        for key, score in scores.items():
            tail = key.split('_')[-1]
            if 'exp' in tail:
                # exp6 results are indexed by oracle-labeled instances.
                x_column = 'orakle_instances' if tail == 'exp6' else x
                (line,) = ax.plot(score[x_column], score[y])
                handles.append(line)
                labels.append(title_mapping.get(tail, tail))
            else:
                # Reference value: the tail looks like '<name>#<count>'.
                ref_name, _, count = tail.partition('#')
                if include.get(ref_name):
                    handles.append(ax.axhline(y=score, color='r'))
                    labels.append(f"Baseline-{ref_name} ({count})")

        ax.margins(0.01)  # set margins to 0.01
        ax.legend(handles, labels, loc=4)  # 4 = lower right, 2 = upper left
        title = dataset.name.split('/')[-1]
        ax.set_title(title)
        # Unknown columns fall back to their own name; underscores are replaced
        # because text is rendered through LaTeX (text.usetex).
        ax.set_xlabel(x_mapping.get(x, x.replace('_', ' ')))
        ax.set_ylabel(y_mapping.get(y, y.replace('_', ' ')))

        # The directory name 'graps' is kept unchanged so existing output locations stay valid.
        path = os.path.join('graps', "_".join(models), "_".join(experiments))
        # exist_ok only tolerates an already existing directory; real failures
        # (e.g. missing permissions) are no longer silently ignored.
        os.makedirs(path, exist_ok=True)

        fig.savefig(f'{path}/{title}_{y}.pdf', format='pdf', bbox_inches='tight')
    finally:
        # Always release the figure, even if saving failed.
        plt.close(fig)

In [42]:
# Columns that can be used for x / y:
# test_loss, test_f1, test_prec, test_recall, labeled_instances, orakle_instances, iteration_time, train_positive_rate, pool_positive_rate
save_plots(
    models=['roberta-2504', 'ml'],
    x='labeled_instances',
    y='test_f1',
    experiments=[0, 3, 6, 'ml'],
    include={'Max': True, '1/2': False, '1/4': False},
    min_labeled=1,
    max_labeled=1000,
    file='results',
)

In [4]:
# F1 curves from the 'std' result files, without baseline reference lines.
save_plots(
    models=['roberta-2504'],
    x='labeled_instances',
    y='test_f1',
    experiments=[0, 1, 3, 4, 5, 6],
    include={},
    min_labeled=200,
    max_labeled=1000,
    file='std',
)

In [5]:
# Iteration time per experiment.
save_plots(
    models=['roberta-2504'],
    x='labeled_instances',
    y='iteration_time',
    experiments=[0, 1, 3, 4, 5, 6],
    min_labeled=200,
    max_labeled=1000,
    file='results',
)

In [13]:
# Positive rate of the training set.
save_plots(
    models=['roberta-2504'],
    x='labeled_instances',
    y='train_positive_rate',
    experiments=[0, 4, 6],
    specific=None,
    min_labeled=200,
    max_labeled=1000,
    file='results',
)

In [15]:
# Positive rate of the unlabeled pool.
save_plots(
    models=['roberta-2504'],
    x='labeled_instances',
    y='pool_positive_rate',
    experiments=[0, 4, 6],
    specific=None,
    min_labeled=1,
    max_labeled=1000,
    file='results',
)

In [9]:
# distilbert-0305: F1 curves for experiments 0, 55 and 6.
save_plots(
    models=['distilbert-0305'],
    x='labeled_instances',
    y='test_f1',
    experiments=[0, 55, 6],
    specific=None,
    min_labeled=1,
    max_labeled=1000,
    file='results',
)