## Imports

In [1]:
import sys
import json
import pandas as pd
sys.path.append('../')
from lib.evaluation import TextReuseEvaluator
from pathlib import Path

## Configuration

In [2]:
# Show full cell contents in rendered DataFrames instead of truncating long strings.
pd.options.display.max_colwidth = None

In [3]:
# Ground-truth dataset handed to the evaluator for every experiment.
GT_DATASET_PATH = '../data/homeric_repetitions_dataset.tsv'

# JSON files describing the experiments, one file per tool.
PASSIM_EXPERIMENTS_CONFIG_PATH = '../data/experiments_passim.json'
SERIATIM_EXPERIMENTS_CONFIG_PATH = '../data/experiments_seriatim.json'

In [4]:
# Load the passim experiment definitions. The result is iterated below as a
# sequence of dicts with keys such as 'label', 'algorithm' and 'parameters'.
# The file is decoded explicitly as UTF-8 so the result does not depend on the
# platform's default locale encoding.
config = json.loads(
    Path(PASSIM_EXPERIMENTS_CONFIG_PATH).read_text(encoding='utf-8')
)

In [5]:
# Optional step (currently disabled): extend `config` with the experiments
# described in the seriatim configuration file.
# NOTE(review): the evaluation tables further down list seriatim
# configurations, so this line was presumably active when those outputs were
# produced — confirm, and re-enable it if the saved results should be
# reproducible from a fresh kernel.
#config += json.loads(Path(SERIATIM_EXPERIMENTS_CONFIG_PATH).read_text())

## Print commands

In [6]:
# For each experiment, print its label followed by the command line assembled
# from the configuration fields below (joined with single spaces, in order).
COMMAND_FIELDS = ('algorithm', 'parameters', 'input_path', 'output_path')

for experiment in config:
    command = " ".join(experiment[field] for field in COMMAND_FIELDS)
    print(experiment['label'], command, sep="\n")

exp6
passim -n 4 --min-match 1 -a 10 --max-repeat 100 /Users/matteo/Documents/repetitions-homeric-speeches/data/input/homeric_speeches_raw.json /Users/matteo/Documents/repetitions-homeric-speeches/data/passim/exp6/
exp12
seriatim -n 5 --src-overlap 0.8 --dst-overlap 0.8 --min-match 1 -a 10 --minDF 2 --maxDF 100 /Users/matteo/Documents/repetitions-homeric-speeches/data/input/homeric_speeches_raw.json /Users/matteo/Documents/repetitions-homeric-speeches/data/seriatim/exp12/
exp9
seriatim -n 40 --min-match 1 -a 10 --minDF 2 --maxDF 100 --all-pairs /Users/matteo/Documents/repetitions-homeric-speeches/data/input/homeric_speeches_raw.json /Users/matteo/Documents/repetitions-homeric-speeches/data/seriatim/exp9/
exp9a
seriatim -n 40 --min-match 1 -a 10 --minDF 2 --maxDF 100 --all-pairs --floating-ngrams /Users/matteo/Documents/repetitions-homeric-speeches/data/input/homeric_speeches_raw.json /Users/matteo/Documents/repetitions-homeric-speeches/data/seriatim/exp9a/
exp-new
seriatim -n 42 --min-

## Run evaluation

In [7]:
# Debug aid: uncomment to display the loaded experiment configuration.
#config

In [6]:
# Evaluate every configured experiment against the ground-truth dataset and
# collect one result record per experiment in `data`.
data = []

for experiment in config:
    # Banner with the experiment label, then its algorithm and parameters,
    # then a blank separator before the evaluator's own messages.
    print(
        f"\n#######\n{experiment['label']}\n#######\n",
        experiment['algorithm'],
        experiment['parameters'],
        '\n',
        sep='\n',
    )

    # A fresh evaluator per experiment: predictions are read first, then the
    # ground truth, then the comparison is computed.
    evaluator = TextReuseEvaluator()
    evaluator.read_predictions(experiment['json_output_path'])
    evaluator.read_groundtruth(GT_DATASET_PATH)
    results = evaluator.evaluate()
    #evaluator.print_summary()

    # Tag the record so it can be identified once gathered into a DataFrame.
    results['label'] = experiment['label']
    results['algorithm'] = experiment['algorithm']
    data.append(results)


#######
conf-0
#######

passim
-n 1 --min-match 1 -a 5


7993 predicted clusters found
68 groundtruth clusters found

#######
conf-4
#######

passim
-n 2 --min-match 2 --max-repeat 100 -a 10


2078 predicted clusters found
68 groundtruth clusters found

#######
conf-5
#######

passim
-n 3 --min-match 1 --max-repeat 100 -a 10


431 predicted clusters found
68 groundtruth clusters found

#######
conf-6
#######

passim
-n 4 --min-match 1 --max-repeat 100 -a 10


341 predicted clusters found
68 groundtruth clusters found

#######
conf-7
#######

passim
-n 3 --min-match 2 --max-repeat 100 -a 10


431 predicted clusters found
68 groundtruth clusters found

#######
conf-8
#######

seriatim
-n 10 --min-match 1 -a 10 --minDF 2 --maxDF 100


266 predicted clusters found
68 groundtruth clusters found

#######
conf-8a
#######

seriatim
-n 10 --min-match 1 -a 5 --minDF 2 --maxDF 100


2019 predicted clusters found
68 groundtruth clusters found

#######
conf-8b
#######

seriatim
-n 10 --min-match 1

In [10]:
# Result fields shown in the summary table, mapped to their display headers.
# The insertion order of this dict is also the column order of the table.
summary_column_labels = {
    'algorithm': 'Version',
    'pct_matched_clusters': 'Matched clust. (%)',
    'pct_exact_clusters': 'Exact clust. (%)',
    'pct_partial_clusters': 'Partial clust. (%)',
    'pct_spurious_clusters': 'Spurious clust. (%)',
    'precision': 'Precision',
    'recall': 'Recall',
    'f1-score': 'F1-score',
}

# Derived from the mapping so the selection and the renaming cannot drift apart.
filter_columns = list(summary_column_labels)

# `label` becomes the index in set_index(), so it is no longer a column and
# cannot be renamed through `columns=`; the index header is set explicitly
# with rename_axis() instead.
eval_df = (
    pd.DataFrame(data)
    .set_index('label')[filter_columns]
    .rename(columns=summary_column_labels)
    .rename_axis('Configuration')
)

In [11]:
# Name the index "Configuration"; this mutates eval_df's index in place.
eval_df.index.name = 'Configuration'

In [12]:
# Render the summary metrics table (indexed by experiment label).
eval_df

Unnamed: 0_level_0,Version,Matched clust. (%),Exact clust. (%),Partial clust. (%),Spurious clust. (%),Precision,Recall,F1-score
Configuration,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
conf-0,passim,33.82,4.35,26.09,91.3,0.517,0.313,0.39
conf-4,passim,70.59,50.0,6.25,41.67,0.78,0.701,0.738
conf-5,passim,89.71,86.89,4.92,4.92,0.978,0.898,0.936
conf-6,passim,89.71,90.16,4.92,1.64,0.992,0.898,0.943
conf-7,passim,89.71,86.89,4.92,4.92,0.978,0.898,0.936
conf-8,seriatim,75.0,86.27,7.84,5.88,0.972,0.714,0.824
conf-8a,seriatim,39.71,66.67,3.7,33.33,0.695,0.388,0.498
conf-8b,seriatim,79.41,74.07,3.7,20.37,0.866,0.789,0.826
conf-8c,seriatim,77.94,77.36,3.77,16.98,0.884,0.776,0.826
conf-9,seriatim,75.0,86.27,5.88,7.84,0.964,0.728,0.829


In [13]:
# Full evaluation results: every field returned by the evaluator, with the
# percentage and score columns given human-readable headers.
# `label` is turned into the index by set_index(), so a `columns=` rename can
# never reach it; rename_axis() is what actually labels the index
# "Configuration".
(
    pd.DataFrame(data)
    .set_index('label')
    .rename(columns={
        'algorithm': 'Version',
        'pct_matched_clusters': 'Matched clust. (%)',
        'pct_exact_clusters': 'Exact clust. (%)',
        'pct_partial_clusters': 'Partial clust. (%)',
        'pct_spurious_clusters': 'Spurious clust. (%)',
        'precision': 'Precision',
        'recall': 'Recall',
        'f1-score': 'F1-score',
    })
    .rename_axis('Configuration')
)

Unnamed: 0_level_0,n_predicted_clusters,n_matched_clusters,n_unmatched_clusters,n_exact_clusters,n_partial_clusters,n_spurious_clusters,n_gt_passages,n_matched_passages,n_missed_passages,n_spurious_passages,Matched clust. (%),Exact clust. (%),Partial clust. (%),Spurious clust. (%),Precision,Recall,F1-score,Version
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
conf-0,7993,23,45,1,6,21,147,46,101,43,33.82,4.35,26.09,91.3,0.517,0.313,0.39,passim
conf-4,2078,48,20,24,3,20,147,103,44,29,70.59,50.0,6.25,41.67,0.78,0.701,0.738,passim
conf-5,431,61,7,53,3,3,147,132,15,3,89.71,86.89,4.92,4.92,0.978,0.898,0.936,passim
conf-6,341,61,7,55,3,1,147,132,15,1,89.71,90.16,4.92,1.64,0.992,0.898,0.943,passim
conf-7,431,61,7,53,3,3,147,132,15,3,89.71,86.89,4.92,4.92,0.978,0.898,0.936,passim
conf-8,266,51,17,44,4,3,147,105,42,3,75.0,86.27,7.84,5.88,0.972,0.714,0.824,seriatim
conf-8a,2019,27,41,18,1,9,147,57,90,25,39.71,66.67,3.7,33.33,0.695,0.388,0.498,seriatim
conf-8b,387,54,14,40,2,11,147,116,31,18,79.41,74.07,3.7,20.37,0.866,0.789,0.826,seriatim
conf-8c,367,53,15,41,2,9,147,114,33,15,77.94,77.36,3.77,16.98,0.884,0.776,0.826,seriatim
conf-9,303,51,17,44,3,4,147,107,40,4,75.0,86.27,5.88,7.84,0.964,0.728,0.829,seriatim


## Experiment table

In [29]:
# Table of experiment settings: one row per experiment label, keeping only
# the tool name and its command-line parameters under display headers.
configurations_df = (
    pd.DataFrame(config)
    .set_index('label')
    .loc[:, ['algorithm', 'parameters']]
    .rename(columns={'algorithm': 'Version', 'parameters': 'Parameters'})
)

In [32]:
# Name the index "Configuration"; this mutates configurations_df's index in place.
configurations_df.index.name = 'Configuration'

In [33]:
# Render the experiment configuration table (indexed by experiment label).
configurations_df

Unnamed: 0_level_0,Version,Parameters
Configuration,Unnamed: 1_level_1,Unnamed: 2_level_1
conf-0,passim,-n 1 --min-match 1 -a 5
conf-4,passim,-n 2 --min-match 2 --max-repeat 100 -a 10
conf-5,passim,-n 3 --min-match 1 --max-repeat 100 -a 10
conf-6,passim,-n 4 --min-match 1 --max-repeat 100 -a 10
conf-7,passim,-n 3 --min-match 2 --max-repeat 100 -a 10
conf-8,seriatim,-n 10 --min-match 1 -a 10 --minDF 2 --maxDF 100
conf-8a,seriatim,-n 10 --min-match 1 -a 5 --minDF 2 --maxDF 100
conf-8b,seriatim,-n 10 --min-match 1 -a 50 --minDF 2 --maxDF 100
conf-8c,seriatim,-n 10 --min-match 1 -a 60 --minDF 2 --maxDF 100
conf-9,seriatim,-n 40 --min-match 1 -a 10 --minDF 2 --maxDF 100
