In [31]:
import os, sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
mpl.use("pgf")

from glob import glob

from dataset import *

# Make sure the current working directory is on the import path.
# NOTE(review): '.' resolves to the current directory (not its parent), and
# this runs after `from dataset import *`, so it cannot help that import.
module_path = os.path.abspath('.')
if module_path not in sys.path:
    sys.path.append(module_path)

# Text rendering options used by every figure produced in this notebook.
plt.rcParams.update(
    {
        "font.family": "serif",  # serif/main font for text elements
        "text.usetex": True,     # inline math for ticks
        "pgf.rcfonts": False,    # do not set up fonts from rc parameters
    }
)


In [38]:
# Dataset loaders keyed by id. Each value is called with no arguments to obtain
# a dataset object. The insertion order below is the order in which datasets
# are processed when no specific key is requested, so it is kept as-is.
datasets = {
    0: deepmatcher_structured_amazon_google,
    4: deepmatcher_structured_walmart_amazon,
    5: deepmatcher_textual_abt_buy,
    1: deepmatcher_structured_dblp_acm,
    2: deepmatcher_structured_dblp_google_scholar,
    # 3: deepmatcher_structured_itunes_amazon,
}

def save_plots(models=['.'], specific=None, x='labeled_instances', y='test_f1',
               experiments=[0,1,3,5,6, 'ml'], include={'max':False},
               min_labeled=200, max_labeled=1000, file='results'):
    """
    Save one plot per dataset based on stored test results.

    params:
    models: list of model folder names whose results should be plotted
    specific: key in `datasets` to plot a single dataset; None plots all of them
    x: column to use on the x-axis
    y: column to use on the y-axis
    experiments: experiment ids; each is turned into the name 'exp<id>'
    include: dict mapping a baseline reference-line label (e.g. 'Max', '1/2')
             to a bool saying whether that line should be drawn
    min_labeled: lower bound of the plotted window (inclusive)
    max_labeled: upper bound of the plotted window
    file: substring a result file name must contain to be used

    The list/dict defaults are only read, never mutated, so sharing them
    between calls is harmless.
    """
    exps = [f'exp{nr}' for nr in experiments]

    # Both paths go through the same call so the argument list cannot drift;
    # previously the single-dataset path forgot to pass `max_labeled`.
    if specific is None:
        loaders = datasets.values()
    else:
        loaders = [datasets[specific]]

    for load_dataset in loaders:
        plot_results(models, load_dataset(), x, y, exps, include, min_labeled, max_labeled, file)

def plot_results(models, dataset, x, y, experiments, include, min_labeled, max_labeled, file):
    """
    Load the result CSVs of one dataset and hand them to `make_plot`.

    For every model folder, files matching '<model>/out/<dataset.name>/*.csv'
    are scanned in sorted order. A file is used when its path contains `file`
    (always for the model named 'ml'). Each experiment whose name occurs in the
    file path contributes one entry '<model>_<exp>' to the score dict:

    * 'exp0': rows windowed on 'labeled_instances', plus scalar reference
      values of `y` taken from the last, second-to-last and third-to-last rows
      of the unfiltered file, stored under '<model>_exp0_Max#<n>',
      '<model>_exp0_1/2#<n//2>' and '<model>_exp0_1/4#<n//4>'.
    * 'exp4', 'exp6', 'exp55': rows windowed on 'orakle_instances'.
    * 'expml' (file path contains '_data'): metric columns are renamed to the
      'test_*' names, the row index becomes 'labeled_instances' and rows
      min_labeled:max_labeled are kept positionally.
    * anything else: rows windowed on 'labeled_instances'.

    NOTE(review): matching is by substring, so 'exp5' also matches a file
    named for 'exp55' - confirm the file naming scheme rules this out.
    NOTE(review): the '1/2' and '1/4' labels presume the last three baseline
    rows were trained on all, half and a quarter of the data - confirm.
    """
    def _window(df, column):
        # Rows whose `column` value lies in [min_labeled, max_labeled].
        in_range = (df[column] >= min_labeled) & (df[column] <= max_labeled)
        return df.loc[in_range, :]

    scores = {}

    for model in models:
        for filename in sorted(glob(f'{model}/out/{dataset.name}/*.csv')):
            if file not in filename and model != 'ml':
                continue

            for exp in experiments:
                key = f'{model}_{exp}'

                if exp == 'exp0' and exp in filename:
                    df = pd.read_csv(filename)
                    scores[key] = _window(df, 'labeled_instances')

                    # Reference lines come from the tail of the full file.
                    # Skip the ones the file is too short to provide instead
                    # of failing with an IndexError.
                    if len(df) > 0:
                        total_values = df['labeled_instances'].iloc[-1]
                        references = (
                            ('Max', total_values, 1),
                            ('1/2', total_values // 2, 2),
                            ('1/4', total_values // 4, 3),
                        )
                        for label, count, rows_from_end in references:
                            if len(df) >= rows_from_end:
                                scores[f'{key}_{label}#{count}'] = df[y].iloc[-rows_from_end]

                elif exp in ['exp4', 'exp6', 'exp55'] and exp in filename:
                    scores[key] = _window(pd.read_csv(filename), 'orakle_instances')

                elif exp == 'expml' and '_data' in filename:
                    df = pd.read_csv(filename)
                    df = df.rename({'f1': 'test_f1', 'precision': 'test_prec', 'recall': 'test_recall'}, axis=1)
                    df['labeled_instances'] = df.index.values
                    scores[key] = df.iloc[min_labeled:max_labeled]

                elif exp in filename:
                    scores[key] = _window(pd.read_csv(filename), 'labeled_instances')

    # `max_val` is not computed anywhere; None is passed to keep make_plot's signature satisfied.
    make_plot(scores, x, y, dataset, models, experiments, None, include=include, file=file)
    
def make_plot(scores, x, y, dataset, models, experiments, max_val, include, file):
    """
    Draw all collected curves for one dataset and save the figure as a PDF.

    params:
    scores: dict keyed '<model>_<exp>' (value: table with the plotted columns)
            or '<model>_exp0_<label>#<count>' (value: scalar reference score)
    x: column used on the x-axis for experiments not plotted against
       'orakle_instances'
    y: column used on the y-axis
    dataset: object whose `name` provides the plot title (last '/' component)
    models: list of model names; when more than one is given, legend entries
            are prefixed with the model name (text before the first '-')
    experiments: list of experiment names, used for the output folder name
    max_val: accepted for signature compatibility; not used here
    include: dict mapping reference-line label to bool (draw it or not)
    file: result-file kind; 'std' appends ' (std)' to the y-axis label

    The figure is written to 'graps/<models>/<experiments>/<title>_<y>.pdf'
    relative to the current working directory.
    """
    plt.figure(num=None, figsize=(10, 4), facecolor='w', edgecolor='k', dpi=200) # set size of plot

    title_mapping = {
        'exp0': 'Baseline',
        'exp1': 'Partition-4',
        'exp3': 'Hybrid',
        'exp4': 'Hybrid-Partition-2',
        'exp5': 'Uncertainty',
        'exp6': 'Partition-2',
        'exp55': 'Balanced-Uncertainty',
        'expml': 'ML-RF'
    }
    y_mapping = {
        'test_f1': 'F1-score',
        'iteration_time': 'Iteration time',
        'test_recall': 'Recall',
        'test_prec': 'Precision',
        'train_positive_rate': 'Train Positive Rate',
        'pool_positive_rate': 'Pool Positive Rate',
    }
    x_mapping = {
        'labeled_instances': 'Labeled examples'
    }

    # Unmapped names fall back to the raw name rather than raising KeyError
    # after the curves have already been drawn.
    y_label = y_mapping.get(y, y)
    if file == 'std':
        y_label = y_label + ' (std)'

    include_model = len(models) > 1
    titles = []

    for key, score in scores.items():
        exp = key.split('_')[-1]
        model = key.split('_')[0].split('-')[0]
        model_str = model + '_' if include_model else ''

        if exp in ['exp4', 'exp6', 'exp55']:
            # These experiments are plotted against the 'orakle_instances' column.
            plt.plot(score['orakle_instances'], score[y])
            titles.append(f'{model_str}{title_mapping.get(exp, exp)}')
        elif 'exp' in exp:
            plt.plot(score[x], score[y])
            titles.append(f'{model_str}{title_mapping.get(exp, exp)}')
        else:
            # Scalar baseline reference: the key ends in '<label>#<count>'.
            for metric, show in include.items():
                if show and metric in key:
                    plt.axhline(y=score, color='r', linestyle='--') # plt.axhline(y=score, color='lightgray', linestyle='--')
                    titles.append(f"{model_str}Baseline-{metric} ({key.split('#')[1]})")

    #plt.xticks(scores[list(scores.keys())[-1]][x])  # set values of points on x axis
    plt.margins(0.01) # set margins to 0.01
    plt.legend(titles, loc=4) # add titles of plots, 4 = lower right, 2 = upper left
    title = dataset.name.split('/')[-1]
    plt.title(title)
    plt.xlabel(x_mapping.get(x, x))
    plt.ylabel(y_label)

    if y == 'pool_positive_rate': # fix y axis to start at 0.
        x1, x2, y1, y2 = plt.axis()
        plt.axis((x1, x2, 0, y2))

    # An already existing folder is fine; any other failure (e.g. permissions)
    # is raised here instead of being swallowed and resurfacing in savefig.
    path = os.path.join('graps', "_".join(models), "_".join(experiments))
    os.makedirs(path, exist_ok=True)

    plt.savefig(f'{path}/{title}_{y}.pdf', format='pdf', bbox_inches='tight')
    plt.close()

In [None]:
# Introduction figure: test F1 for experiments 0, 4 and 6, with the baseline
# 'Max' and '1/2' reference lines enabled.
save_plots(
    models=['roberta-2504'],
    x='labeled_instances',
    y='test_f1',
    experiments=[0, 4, 6],
    include={'Max': True, '1/2': True, '1/4': False},
    min_labeled=200,
    max_labeled=1000,
    file='results',
)

In [45]:
# Extended figure: adds the 'ml' model folder and all three baseline reference
# lines, and widens the window to start at 1.
save_plots(
    models=['roberta-2504', 'ml'],
    x='labeled_instances',
    y='test_f1',
    experiments=[0, 4, 6, 'ml'],
    include={'Max': True, '1/2': True, '1/4': True},
    min_labeled=1,
    max_labeled=1000,
    file='results',
)

In [25]:
# F1 for experiments 0, 1, 3, 4, 5 and 6, with only the 'Max' reference line.
save_plots(
    models=['roberta-2504'],
    x='labeled_instances',
    y='test_f1',
    experiments=[0, 1, 3, 4, 5, 6],
    include={'Max': True, '1/2': False, '1/4': False},
    min_labeled=200,
    max_labeled=1000,
    file='results',
)

In [29]:
# STD: same experiments, read from the 'std' result files; no reference lines.
save_plots(
    models=['roberta-2504'],
    x='labeled_instances',
    y='test_f1',
    experiments=[0, 1, 3, 4, 5, 6],
    include={},
    min_labeled=200,
    max_labeled=1000,
    file='std',
)

In [30]:
# Iteration time per experiment.
save_plots(
    models=['roberta-2504'],
    x='labeled_instances',
    y='iteration_time',
    experiments=[0, 1, 3, 4, 5, 6],
    min_labeled=200,
    max_labeled=1000,
    file='results',
)

In [42]:
# Train positive rate for experiments 0, 4, 6 and 5 on every dataset.
save_plots(
    models=['roberta-2504'],
    x='labeled_instances',
    y='train_positive_rate',
    experiments=[0, 4, 6, 5],
    specific=None,
    min_labeled=200,
    max_labeled=1000,
    file='results',
)

In [37]:
# Pool positive rate for experiments 0, 4 and 6 on every dataset.
save_plots(
    models=['roberta-2504'],
    x='labeled_instances',
    y='pool_positive_rate',
    experiments=[0, 4, 6],
    specific=None,
    min_labeled=200,
    max_labeled=1000,
    file='results',
)

In [165]:
# Recall for experiments 0, 4, 6, 1, 3 and 5 on every dataset.
save_plots(
    models=['roberta-2504'],
    x='labeled_instances',
    y='test_recall',
    experiments=[0, 4, 6, 1, 3, 5],
    specific=None,
    min_labeled=200,
    max_labeled=1000,
    file='results',
)

In [167]:
# Precision for experiments 0, 4, 6, 1, 3 and 5 on every dataset.
save_plots(
    models=['roberta-2504'],
    x='labeled_instances',
    y='test_prec',
    experiments=[0, 4, 6, 1, 3, 5],
    specific=None,
    min_labeled=200,
    max_labeled=1000,
    file='results',
)

In [43]:
# F1 small: experiments 0, 4 and 6 only, with the 'Max' reference line.
save_plots(
    models=['roberta-2504'],
    x='labeled_instances',
    y='test_f1',
    experiments=[0, 4, 6],
    include={'Max': True, '1/2': False, '1/4': False},
    min_labeled=200,
    max_labeled=1000,
    file='results',
)

In [163]:
# DistilBERT vs RoBERTa - F1 for experiments 0 and 6, window starting at 1.
save_plots(
    models=['roberta-2504', 'distilbert-0305'],
    x='labeled_instances',
    y='test_f1',
    experiments=[0, 6],
    specific=None,
    min_labeled=1,
    max_labeled=1000,
    file='results',
)

In [164]:
# DistilBERT vs RoBERTa - iteration time for experiments 0 and 6, window starting at 1.
save_plots(
    models=['roberta-2504', 'distilbert-0305'],
    x='labeled_instances',
    y='iteration_time',
    experiments=[0, 6],
    specific=None,
    min_labeled=1,
    max_labeled=1000,
    file='results',
)

In [190]:
# To print score stats 
def print_stats(models=['.'], x='labeled_instances', y='test_f1', experiments=[0,1,3,5,6, 'ml'],
               min_labeled=200, max_labeled=1000, file='results'):
    """
    Gather score summaries for every entry in `datasets`.

    Experiment ids are turned into names of the form 'exp<id>'; each dataset
    loader is called and handed to `print_results` together with the remaining
    arguments. Returns a list with one result dict per dataset, in the
    iteration order of `datasets`.
    """
    exp_names = ['exp' + str(nr) for nr in experiments]
    return [
        print_results(models, load_dataset(), x, y, exp_names, min_labeled, max_labeled, file)
        for load_dataset in datasets.values()
    ]

def print_results(models, dataset, x, y, experiments, min_labeled, max_labeled, file):
    """
    Collect the first and last in-window value of metric `y` per experiment.

    For every model folder, files matching '<model>/out/<dataset.name>/*.csv'
    whose path contains `file` are scanned in sorted order. For each experiment
    name that occurs in the file path, the rows inside
    [min_labeled, max_labeled] are selected - on 'orakle_instances' for 'exp4'
    and 'exp6', on 'labeled_instances' otherwise - and the first and last
    value of column `y` are stored under '<dataset.name><model>_<exp>'.

    params:
    x: accepted for signature parity with the plotting helpers; not used here
    y: metric column to report (earlier versions always reported 'test_f1',
       ignoring this argument)

    Returns a dict of pandas Series. A window with no rows yields an empty
    Series instead of raising IndexError; a window with one row yields that
    row twice.

    NOTE(review): matching is by substring, so 'exp5' also matches a file
    named for 'exp55' - confirm the file naming scheme rules this out.
    """
    scores = {}

    for model in models:
        print('model', dataset.name)
        for filename in sorted(glob(f'{model}/out/{dataset.name}/*.csv')):
            if file not in filename:
                continue

            for exp in experiments:
                if exp not in filename:
                    continue

                count_col = 'orakle_instances' if exp in ('exp4', 'exp6') else 'labeled_instances'
                df = pd.read_csv(filename)
                in_window = (df[count_col] >= min_labeled) & (df[count_col] <= max_labeled)
                values = df.loc[in_window, y]

                # Positional [0, -1] is out of bounds on an empty selection.
                if len(values) > 0:
                    values = values.iloc[[0, -1]]
                scores[f'{dataset.name}{model}_{exp}'] = values
    return scores


In [None]:
# Print, per dataset, each result key followed by its first/last score pair.
scores = []
scores = print_stats(
    models=['roberta-2504'],
    x='labeled_instances',
    y='test_f1',
    experiments=[0, 1, 3, 4, 5, 6],
    min_labeled=200,
    max_labeled=1000,
    file='results',
)
for dataset in scores:
    print('###\n')
    for k, v in dataset.items():
        print(k, v, sep='\n')

In [192]:
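# toLatex

# Same collection as print_stats, but every in-window row is kept (not only the first and last) so each dataset's scores can be rendered as a LaTeX table.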
def print_stats2(models=['.'], x='labeled_instances', y='test_f1', experiments=[0,1,3,5,6, 'ml'],
               min_labeled=200, max_labeled=1000, file='results'):
    """
    Gather full score series for every entry in `datasets`.

    Experiment ids are turned into names of the form 'exp<id>'; each dataset
    loader is called and handed to `print_results2` together with the
    remaining arguments. Returns a list with one result dict per dataset, in
    the iteration order of `datasets`.
    """
    exp_names = ['exp' + str(nr) for nr in experiments]
    return [
        print_results2(models, load_dataset(), x, y, exp_names, min_labeled, max_labeled, file)
        for load_dataset in datasets.values()
    ]

def print_results2(models, dataset, x, y, experiments, min_labeled, max_labeled, file):
    """
    Collect every in-window value of metric `y` per experiment.

    For every model folder, files matching '<model>/out/<dataset.name>/*.csv'
    whose path contains `file` are scanned in sorted order. For each experiment
    name that occurs in the file path, the rows inside
    [min_labeled, max_labeled] are selected - on 'orakle_instances' for 'exp4'
    and 'exp6', on 'labeled_instances' otherwise - and column `y` of those
    rows is stored under '<dataset.name><model>_<exp>'.

    params:
    x: accepted for signature parity with the plotting helpers; not used here

    Returns a dict of pandas Series that keep the row index of the source file.

    The unused last-row lookup that used to run for 'exp0' was dropped; it
    contributed nothing to the result and failed on a file without data rows.

    NOTE(review): matching is by substring, so 'exp5' also matches a file
    named for 'exp55' - confirm the file naming scheme rules this out.
    """
    scores = {}

    for model in models:
        print('model', dataset.name)
        for filename in sorted(glob(f'{model}/out/{dataset.name}/*.csv')):
            if file not in filename:
                continue

            for exp in experiments:
                if exp not in filename:
                    continue

                count_col = 'orakle_instances' if exp in ('exp4', 'exp6') else 'labeled_instances'
                df = pd.read_csv(filename)
                in_window = (df[count_col] >= min_labeled) & (df[count_col] <= max_labeled)
                scores[f'{dataset.name}{model}_{exp}'] = df.loc[in_window, y]
    return scores


In [193]:
# Print each dataset's score series raw and as a LaTeX table.
scores = []
scores = print_stats2(
    models=['roberta-2504'],
    x='labeled_instances',
    y='test_f1',
    experiments=[0, 1, 3, 4, 5, 6],
    min_labeled=200,
    max_labeled=1000,
    file='results',
)
for dataset in scores:
    print('\n\n\n###\n')
    print(dataset)
    latex_table = pd.DataFrame(dataset).to_latex()
    print(latex_table)

model DeepMatcher/Structured/Amazon-Google
model DeepMatcher/Structured/Walmart-Amazon
model DeepMatcher/Textual/Abt-Buy
model DeepMatcher/Structured/DBLP-ACM
model DeepMatcher/Structured/DBLP-GoogleScholar



###

{'DeepMatcher/Structured/Amazon-Googleroberta-2504_exp0': 10    0.249
11    0.406
12    0.476
13    0.393
14    0.487
15    0.425
16    0.503
17    0.518
18    0.327
19    0.446
20    0.498
21    0.424
22    0.544
23    0.535
24    0.501
25    0.556
26    0.530
27    0.535
28    0.554
29    0.386
30    0.591
Name: test_f1, dtype: float64, 'DeepMatcher/Structured/Amazon-Googleroberta-2504_exp1': 0     0.249
1     0.385
2     0.526
3     0.550
4     0.551
5     0.590
6     0.581
7     0.604
8     0.616
9     0.633
10    0.665
Name: test_f1, dtype: float64, 'DeepMatcher/Structured/Amazon-Googleroberta-2504_exp3': 10    0.528
11    0.560
12    0.579
13    0.577
14    0.588
15    0.578
16    0.583
17    0.567
18    0.599
19    0.603
20    0.604
21    0.645
22    0.639
23    0.662