# Run Passim

## Imports

In [8]:
import pandas as pd
import os
import sys
sys.path.append('..')
from lib.utils import *
from lib.evaluation import *

## Experiments

| Experiment ID     | Passim parameters | Explanation  | \# of extracted clusters | Evaluation |
| ----------- | ----------- |----------|------------|-------------------|
| `exp0`      | `-n 1 --min-match 1 -a 5 --max-repeat 100 -w 1` | reused passages consist of at least 1 shared n-gram of size 1 (unigram), and the aligned passage should be at least 5 characters long (default is `20`) | 6419 (lemmatised); 7993 (raw) | # of ground-truth cluster: 69, # of matched clusters: 22, # of unmatched clusters: 47, # of partially matched clusters: 2, # of exactly matched clusters: 2, # of clusters with spurious passages: 18|
| `exp4`|`-n 2 --min-match 2 --max-repeat 100 -a 10`|reused passages consist of at least 2 shared n-grams of size 2 (bi-grams), and the aligned passage should be at least 10 characters long (default is `20`)|1909 (lemmatised), 2078 (raw)|# of ground-truth cluster: 69, # of matched clusters: 46, # of unmatched clusters: 23, # of partially matched clusters: 2, # of exactly matched clusters: 20, # of clusters with spurious passages: 24|
| `exp5`|`-n 3 --min-match 1 --max-repeat 100 -a 10`|reused passages consist of at least 1 shared n-grams of size 3 (tri-grams), and the aligned passage should be at least 10 characters long (default is `20`)|350 (lemmatised), 431 (raw)|# of ground-truth cluster: 69, # of matched clusters: 58, # of unmatched clusters: 11, # of partially matched clusters: 3, # of exactly matched clusters: 44, # of clusters with spurious passages: 11|
| `exp6`|`-n 4 --min-match 1 --max-repeat 100 -a 10`|reused passages consist of at least 1 shared n-grams of size 4 (4-grams), and the aligned passage should be at least 10 characters long (default is `20`)|284 (lemmatised), 341 (raw)|# of ground-truth cluster: 69, # of matched clusters: 58, # of unmatched clusters: 11, # of partially matched clusters: 3, # of exactly matched clusters: 47, # of clusters with spurious passages: 8|
| `exp7`|`-n 3 --min-match 2 --max-repeat 100 -a 10`|reused passages consist of at least 2 shared n-grams of size 3 (tri-grams), and the aligned passage should be at least 10 characters long (default is `20`)|350 (lemmatised), 431 (raw)|# of ground-truth cluster: 69, # of matched clusters: 58, # of unmatched clusters: 11, # of partially matched clusters: 3, # of exactly matched clusters: 44, # of clusters with spurious passages: 11|

### Experiment 0 (exp0)

In [23]:
# Experiment 0 paths: the cluster table destination and passim's output folder.
tsv_path = 'data/output/passim_clusters_exp0.csv'
output_path = 'data/passim/exp0/'

In [27]:
# cleaning passim's output folder
!rm -r {output_path}

In [9]:
# Run passim on the lemmatised and filtered speeches.
# -n 1 --min-match 1: passages must share at least one unigram; -a is the
# minimum aligned-passage length (5 here). Results are written to {output_path}.
# NOTE(review): the passim binary path is machine-specific; adjust for your install.
! /Users/mromanel/Documents/passim-1.0.0/bin/passim -n 1 --min-match 1 -a 5 --max-repeat 100 -w 1 data/input/homeric_speeches_lemmatised.json {output_path}

:: loading settings :: url = jar:file:/Applications/spark-3.3.2/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml
Ivy Default Cache set to: /Users/mromanel/.ivy2/cache
The jars for the packages stored in: /Users/mromanel/.ivy2/jars
com.github.scopt#scopt_2.12 added as a dependency
graphframes#graphframes added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-7157a7f0-389d-40b2-a698-ef7976e1438c;1.0
	confs: [default]
	found com.github.scopt#scopt_2.12;3.5.0 in central
	found graphframes#graphframes;0.8.0-spark3.0-s_2.12 in spark-packages
	found org.slf4j#slf4j-api;1.7.16 in central
:: resolution report :: resolve 169ms :: artifacts dl 5ms
	:: modules in use:
	com.github.scopt#scopt_2.12;3.5.0 from central in [default]
	graphframes#graphframes;0.8.0-spark3.0-s_2.12 from spark-packages in [default]
	org.slf4j#slf4j-api;1.7.16 from central in [default]
	---------------------------------------------------------------------
	|                  

In [25]:
# Run passim on the raw text speeches (no lemmatisation), with the same
# parameters as the lemmatised run (-n 1 --min-match 1 -a 5).
# NOTE(review): unlike the later experiments, {output_path} is not cleaned
# between the lemmatised and the raw run here; verify that passim does not
# reuse the previous run's files.
# NOTE(review): the passim binary path is machine-specific; adjust for your install.
! /Users/mromanel/Documents/passim-1.0.0/bin/passim -n 1 --min-match 1 -a 5 --max-repeat 100 -w 1 data/input/homeric_speeches_raw.json {output_path}

:: loading settings :: url = jar:file:/Applications/spark-3.3.2/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml
Ivy Default Cache set to: /Users/mromanel/.ivy2/cache
The jars for the packages stored in: /Users/mromanel/.ivy2/jars
com.github.scopt#scopt_2.12 added as a dependency
graphframes#graphframes added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-f8a030d3-60e7-4b34-8369-12f5c637355f;1.0
	confs: [default]
	found com.github.scopt#scopt_2.12;3.5.0 in central
	found graphframes#graphframes;0.8.0-spark3.0-s_2.12 in spark-packages
	found org.slf4j#slf4j-api;1.7.16 in central
:: resolution report :: resolve 166ms :: artifacts dl 5ms
	:: modules in use:
	com.github.scopt#scopt_2.12;3.5.0 from central in [default]
	graphframes#graphframes;0.8.0-spark3.0-s_2.12 from spark-packages in [default]
	org.slf4j#slf4j-api;1.7.16 from central in [default]
	---------------------------------------------------------------------
	|                  

In [32]:
# passim's clusters live under <output_path>/out.json/; load them into a
# DataFrame restricted to the listed columns.
# tsv_path is handed to the helper (presumably where the table is saved; confirm in lib).
passim_json_output_path = os.path.join(output_path, 'out.json/')
tr_clusters = passim_output_to_dataframe(
    passim_json_output_path, tsv_path,
    columns_to_keep=[
        'cluster',
        'id',
        'label',
        'dices_tags',
        'dices_speech_id',
        'text',
    ],
)

There are 7993 text reuse clusters in data/passim/exp0/out.json/


In [33]:
# Preview the first ten rows of the cluster table.
tr_clusters.head(n=10)

Unnamed: 0,cluster,id,label,dices_tags,dices_speech_id,text
0,0,0,"Homer, Iliad 8.352-8.356",que,223,ὢ πόποι αἰγιόχοιο Διὸς τέκος οὐκέτι νῶϊ ὀλλυμέ...
1,0,1,"Homer, Iliad 22.450-22.459",del,605,"δεῦτε δύω μοι ἕπεσθον, ἴδωμʼ ὅτινʼ ἔργα τέτυκτ..."
2,0,2,"Homer, Iliad 8.358-8.380",del|des|lam,224,καὶ λίην οὗτός γε μένος θυμόν τʼ ὀλέσειε χερσὶ...
3,0,3,"Homer, Iliad 22.477-22.514",lam|ora,606,"ἄρα γεινόμεθʼ αἴσῃ ἀμφότεροι, σὺ μὲν ἐν Τροίῃ ..."
4,0,4,"Homer, Iliad 8.399-8.408",com,225,"βάσκʼ ἴθι Ἶρι ταχεῖα, πάλιν τρέπε μηδʼ ἔα ἄντη..."
5,0,6,"Homer, Iliad 23.6-23.11",del,608,Μυρμιδόνες ταχύπωλοι ἐμοὶ ἐρίηρες ἑταῖροι μὴ δ...
6,0,7,"Homer, Iliad 8.413-8.424",mes|que|war,226,πῇ μέματον; τί σφῶϊν ἐνὶ φρεσὶ μαίνεται ἦτορ; ...
7,0,8,"Homer, Iliad 8.427-8.431",del,227,"ὢ πόποι αἰγιόχοιο Διὸς τέκος, οὐκέτʼ ἔγωγε νῶϊ..."
8,0,9,"Homer, Iliad 23.19-23.23",vow,609,Πάτροκλε καὶ εἰν Ἀΐδαο δόμοισι· πάντα γὰρ ἤδη ...
9,0,10,"Homer, Iliad 8.447-8.456",lau|que|tau,228,οὕτω τετίησθον Ἀθηναίη τε καὶ Ἥρη; οὐ μέν θην ...


### Experiment 4 (exp4)

In [34]:
# Experiment 4 paths: the cluster table destination and passim's output folder.
tsv_path = 'data/output/passim_clusters_exp4.csv'
output_path = 'data/passim/exp4/'

In [35]:
# cleaning passim's output folder
!rm -r {output_path}

In [None]:
# Run passim on the lemmatised and filtered speeches: at least 2 shared
# bigrams (-n 2 --min-match 2), minimum aligned-passage length 10 (-a 10).
# NOTE(review): the passim binary path is machine-specific; adjust for your install.
! /Users/mromanel/Documents/passim-1.0.0/bin/passim -n 2 --min-match 2 --max-repeat 100 -a 10 data/input/homeric_speeches_lemmatised.json {output_path}

:: loading settings :: url = jar:file:/Applications/spark-3.3.2/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml
Ivy Default Cache set to: /Users/mromanel/.ivy2/cache
The jars for the packages stored in: /Users/mromanel/.ivy2/jars
com.github.scopt#scopt_2.12 added as a dependency
graphframes#graphframes added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-7157a7f0-389d-40b2-a698-ef7976e1438c;1.0
	confs: [default]
	found com.github.scopt#scopt_2.12;3.5.0 in central
	found graphframes#graphframes;0.8.0-spark3.0-s_2.12 in spark-packages
	found org.slf4j#slf4j-api;1.7.16 in central
:: resolution report :: resolve 169ms :: artifacts dl 5ms
	:: modules in use:
	com.github.scopt#scopt_2.12;3.5.0 from central in [default]
	graphframes#graphframes;0.8.0-spark3.0-s_2.12 from spark-packages in [default]
	org.slf4j#slf4j-api;1.7.16 from central in [default]
	---------------------------------------------------------------------
	|                  

In [36]:
# Load the clusters of the lemmatised run from <output_path>/out.json/, using
# the helper's default column selection.
passim_json_output_path = os.path.join(output_path, 'out.json/')
tr_clusters = passim_output_to_dataframe(
    passim_json_output_path,
    tsv_path,
)

There are 1909 text reuse clusters in data/passim/exp4/out.json/


In [37]:
# cleaning passim's output folder
!rm -r {output_path}

In [None]:
# Run passim on the raw text speeches (no lemmatisation): at least 2 shared
# bigrams (-n 2 --min-match 2), minimum aligned-passage length 10 (-a 10).
# NOTE(review): the passim binary path is machine-specific; adjust for your install.
! /Users/mromanel/Documents/passim-1.0.0/bin/passim -n 2 --min-match 2 --max-repeat 100 -a 10 data/input/homeric_speeches_raw.json {output_path}

:: loading settings :: url = jar:file:/Applications/spark-3.3.2/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml
Ivy Default Cache set to: /Users/mromanel/.ivy2/cache
The jars for the packages stored in: /Users/mromanel/.ivy2/jars
com.github.scopt#scopt_2.12 added as a dependency
graphframes#graphframes added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-f8a030d3-60e7-4b34-8369-12f5c637355f;1.0
	confs: [default]
	found com.github.scopt#scopt_2.12;3.5.0 in central
	found graphframes#graphframes;0.8.0-spark3.0-s_2.12 in spark-packages
	found org.slf4j#slf4j-api;1.7.16 in central
:: resolution report :: resolve 166ms :: artifacts dl 5ms
	:: modules in use:
	com.github.scopt#scopt_2.12;3.5.0 from central in [default]
	graphframes#graphframes;0.8.0-spark3.0-s_2.12 from spark-packages in [default]
	org.slf4j#slf4j-api;1.7.16 from central in [default]
	---------------------------------------------------------------------
	|                  

In [38]:
# Load the clusters of the raw-text run from <output_path>/out.json/ into a
# DataFrame restricted to the listed columns.
passim_json_output_path = os.path.join(output_path, 'out.json/')
tr_clusters = passim_output_to_dataframe(
    passim_json_output_path, tsv_path,
    columns_to_keep=[
        'cluster',
        'id',
        'label',
        'dices_tags',
        'dices_speech_id',
        'text',
    ],
)

There are 2078 text reuse clusters in data/passim/exp4/out.json/


In [None]:
# Preview the first ten rows of the cluster table.
tr_clusters.head(n=10)

Unnamed: 0,cluster,id,label,dices_tags,dices_speech_id,text
0,0,0,"Homer, Iliad 8.352-8.356",que,223,ὢ πόποι αἰγιόχοιο Διὸς τέκος οὐκέτι νῶϊ ὀλλυμέ...
1,0,1,"Homer, Iliad 22.450-22.459",del,605,"δεῦτε δύω μοι ἕπεσθον, ἴδωμʼ ὅτινʼ ἔργα τέτυκτ..."
2,0,2,"Homer, Iliad 8.358-8.380",del|des|lam,224,καὶ λίην οὗτός γε μένος θυμόν τʼ ὀλέσειε χερσὶ...
3,0,3,"Homer, Iliad 22.477-22.514",lam|ora,606,"ἄρα γεινόμεθʼ αἴσῃ ἀμφότεροι, σὺ μὲν ἐν Τροίῃ ..."
4,0,4,"Homer, Iliad 8.399-8.408",com,225,"βάσκʼ ἴθι Ἶρι ταχεῖα, πάλιν τρέπε μηδʼ ἔα ἄντη..."
5,0,6,"Homer, Iliad 23.6-23.11",del,608,Μυρμιδόνες ταχύπωλοι ἐμοὶ ἐρίηρες ἑταῖροι μὴ δ...
6,0,7,"Homer, Iliad 8.413-8.424",mes|que|war,226,πῇ μέματον; τί σφῶϊν ἐνὶ φρεσὶ μαίνεται ἦτορ; ...
7,0,8,"Homer, Iliad 8.427-8.431",del,227,"ὢ πόποι αἰγιόχοιο Διὸς τέκος, οὐκέτʼ ἔγωγε νῶϊ..."
8,0,9,"Homer, Iliad 23.19-23.23",vow,609,Πάτροκλε καὶ εἰν Ἀΐδαο δόμοισι· πάντα γὰρ ἤδη ...
9,0,10,"Homer, Iliad 8.447-8.456",lau|que|tau,228,οὕτω τετίησθον Ἀθηναίη τε καὶ Ἥρη; οὐ μέν θην ...


### Experiment 5 (exp5)

In [39]:
# Experiment 5 paths: the cluster table destination and passim's output folder.
tsv_path = 'data/output/passim_clusters_exp5.csv'
output_path = 'data/passim/exp5/'

In [40]:
# cleaning passim's output folder
!rm -r {output_path}

In [None]:
# Run passim on the lemmatised and filtered speeches: at least 1 shared
# trigram (-n 3 --min-match 1), minimum aligned-passage length 10 (-a 10).
# NOTE(review): the passim binary path is machine-specific; adjust for your install.
! /Users/mromanel/Documents/passim-1.0.0/bin/passim -n 3 --min-match 1 --max-repeat 100 -a 10 data/input/homeric_speeches_lemmatised.json {output_path}

In [41]:
# Load the clusters of the lemmatised run from <output_path>/out.json/, using
# the helper's default column selection.
passim_json_output_path = os.path.join(output_path, 'out.json/')
tr_clusters = passim_output_to_dataframe(
    passim_json_output_path,
    tsv_path,
)

There are 350 text reuse clusters in data/passim/exp5/out.json/


In [42]:
# cleaning passim's output folder
!rm -r {output_path}

In [None]:
# Run passim on the raw text speeches (no lemmatisation): at least 1 shared
# trigram (-n 3 --min-match 1), minimum aligned-passage length 10 (-a 10).
# NOTE(review): the passim binary path is machine-specific; adjust for your install.
! /Users/mromanel/Documents/passim-1.0.0/bin/passim -n 3 --min-match 1 --max-repeat 100 -a 10 data/input/homeric_speeches_raw.json {output_path}

In [43]:
# Load the clusters of the raw-text run from <output_path>/out.json/ into a
# DataFrame restricted to the listed columns.
passim_json_output_path = os.path.join(output_path, 'out.json/')
tr_clusters = passim_output_to_dataframe(
    passim_json_output_path, tsv_path,
    columns_to_keep=[
        'cluster',
        'id',
        'label',
        'dices_tags',
        'dices_speech_id',
        'text',
    ],
)

There are 431 text reuse clusters in data/passim/exp5/out.json/


### Experiment 6 (exp6)

In [3]:
# Experiment 6 paths: the cluster table destination and passim's output folder.
# Both are given relative to the parent directory ('../').
tsv_path = '../data/output/passim_clusters_exp6.csv'
output_path = '../data/passim/exp6/'

In [10]:
# cleaning passim's output folder
!rm -r {output_path}

In [None]:
# run passim on lemmatised and filtered speeches
! /Users/mromanel/Documents/passim-1.0.0/bin/passim -n 4 --min-match 1 --max-repeat 100 -a 10 data/input/homeric_speeches_lemmatised.json {output_path}

In [9]:
# Load the clusters of the lemmatised run from <output_path>/out.json/, using
# the helper's default column selection.
# NOTE(review): the saved output of this cell shows
# KeyError "['raw_text'] not in index"; presumably the folder held output that
# lacks a 'raw_text' column when the cell was last run. Re-run after the
# lemmatised passim run and confirm.
passim_json_output_path = os.path.join(output_path, 'out.json/')
tr_clusters = passim_output_to_dataframe(
    passim_json_output_path,
    tsv_path,
)

There are 431 text reuse clusters in ../data/passim/exp6/out.json/


KeyError: "['raw_text'] not in index"

In [47]:
# cleaning passim's output folder
!rm -r {output_path}

In [None]:
# run passim on raw text speeches (no lemmatisation)
! /Users/mromanel/Documents/passim-1.0.0/bin/passim -n 4 --min-match 1 --max-repeat 100 -a 10 data/input/homeric_speeches_raw.json {output_path}

In [11]:
# Load the clusters of the raw-text run from <output_path>/out.json/ into a
# DataFrame restricted to the listed columns (here including speaker/addressee).
passim_json_output_path = os.path.join(output_path, 'out.json/')
tr_clusters = passim_output_to_dataframe(
    passim_json_output_path, tsv_path,
    columns_to_keep=[
        'cluster',
        'id',
        'label',
        'dices_tags',
        'dices_speech_id',
        'text',
        'speaker',
        'addressee',
    ],
)

There are 341 text reuse clusters in ../data/passim/exp6/out.json/


### Experiment 7 (exp7)

In [49]:
# Experiment 7 paths: the cluster table destination and passim's output folder.
tsv_path = 'data/output/passim_clusters_exp7.csv'
output_path = 'data/passim/exp7/'

In [50]:
# cleaning passim's output folder
!rm -r {output_path}

In [None]:
# Run passim on the lemmatised and filtered speeches: at least 2 shared
# trigrams (-n 3 --min-match 2), minimum aligned-passage length 10 (-a 10).
# NOTE(review): the passim binary path is machine-specific; adjust for your install.
! /Users/mromanel/Documents/passim-1.0.0/bin/passim -n 3 --min-match 2 --max-repeat 100 -a 10 data/input/homeric_speeches_lemmatised.json {output_path}

In [51]:
# Load the clusters of the lemmatised run from <output_path>/out.json/, using
# the helper's default column selection.
passim_json_output_path = os.path.join(output_path, 'out.json/')
tr_clusters = passim_output_to_dataframe(
    passim_json_output_path,
    tsv_path,
)

There are 350 text reuse clusters in data/passim/exp7/out.json/


In [52]:
# cleaning passim's output folder
!rm -r {output_path}

In [None]:
# Run passim on the raw text speeches (no lemmatisation): at least 2 shared
# trigrams (-n 3 --min-match 2), minimum aligned-passage length 10 (-a 10).
# NOTE(review): the passim binary path is machine-specific; adjust for your install.
! /Users/mromanel/Documents/passim-1.0.0/bin/passim -n 3 --min-match 2 --max-repeat 100 -a 10 data/input/homeric_speeches_raw.json {output_path}

In [53]:
# Load the clusters of the raw-text run from <output_path>/out.json/ into a
# DataFrame restricted to the listed columns.
passim_json_output_path = os.path.join(output_path, 'out.json/')
tr_clusters = passim_output_to_dataframe(
    passim_json_output_path, tsv_path,
    columns_to_keep=[
        'cluster',
        'id',
        'label',
        'dices_tags',
        'dices_speech_id',
        'text',
    ],
)

There are 431 text reuse clusters in data/passim/exp7/out.json/


## Experiments (`seriatim`)

### Experiment 8 (exp8)

In [1]:
# Show seriatim's command-line options (also confirms the CLI is on the PATH).
!seriatim --help

https://repos.spark-packages.org/ added as a remote repository with the name: repo-1
:: loading settings :: url = jar:file:/Users/matteo/.pyenv/versions/3.10.0/envs/homeric-repetitions/lib/python3.10/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml
Ivy Default Cache set to: /Users/matteo/.ivy2/cache
The jars for the packages stored in: /Users/matteo/.ivy2/jars
graphframes#graphframes added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-62255efc-621b-44b1-85d2-5d33a3e384f9;1.0
	confs: [default]
	found graphframes#graphframes;0.8.0-spark3.0-s_2.12 in spark-packages
	found org.slf4j#slf4j-api;1.7.16 in spark-list
:: resolution report :: resolve 183ms :: artifacts dl 7ms
	:: modules in use:
	graphframes#graphframes;0.8.0-spark3.0-s_2.12 from spark-packages in [default]
	org.slf4j#slf4j-api;1.7.16 from spark-list in [default]
	---------------------------------------------------------------------
	|                  |  

In [3]:
# Experiment 8 paths: the cluster table destination and seriatim's output folder.
tsv_path = 'data/output/seriatim_clusters_exp8.csv'
output_path = 'data/seriatim/exp8/'

In [4]:
# cleaning passim's output folder
!rm -r {output_path}

rm: data/seriatim/exp8/: No such file or directory


In [None]:
# Run seriatim on the raw text speeches (no lemmatisation); results are written
# to {output_path}.
!seriatim -n 10 --min-match 1 -a 10 --minDF 2 --maxDF 100 data/input/homeric_speeches_raw.json {output_path}

In [None]:
# Load the clusters produced by the seriatim run from <output_path>/out.json/
# into a DataFrame restricted to the listed columns.
# NOTE(review): assumes seriatim writes the same out.json/ layout as passim; confirm.
passim_json_output_path = os.path.join(output_path, 'out.json/')
tr_clusters = passim_output_to_dataframe(
    passim_json_output_path, tsv_path,
    columns_to_keep=[
        'cluster',
        'id',
        'label',
        'dices_tags',
        'dices_speech_id',
        'text',
    ],
)

There are 431 text reuse clusters in data/passim/exp7/out.json/
