In [62]:
# Root directory of the magma project; module, dataset and output paths below are built from it.
# NOTE(review): hard-coded, user-specific absolute path — adjust before running on another machine.
magma_dir = '/home/marco/epfl/magma/'

### **Config**

In [63]:
import os
import sys

sys.path.insert(0, magma_dir)
import config

In [64]:
MODEL = 'bart'                  # selects the tokenizer: a name containing
                                # 'pegasus', 'bart' or 't5'

RE_SPLITTER = '\n'              # do we split into sentences or paragraphs?
                                # use '\.(?!\d)|\n' or '\n', respectively

TOKEN_MAX_LEN = 99              # max length of a word
PARA_MIN_LENGTH = 2             # minimum length for a sentence or
                                # a paragraph, in tokens

# Output path
OUTPUT_PATH = magma_dir+'pipeline/karger_books_para_extraction/'+MODEL+'/'
# exist_ok=True replaces the check-then-create pattern: no race between the
# existence test and the creation, and the cell stays safely re-runnable.
os.makedirs(OUTPUT_PATH, exist_ok=True)

In [65]:
# Topic modeling specific configurations

# NOTE(review): REDUCTION_MAX_LEN is not referenced by any cell visible in this notebook.
REDUCTION_MAX_LEN = 1024        # maximum length of the LDA/LSI/TextRank reduction

# Extra stop words on top of the NLTK English list.
# NOTE(review): not referenced by any cell visible in this notebook — confirm
# whether the stop-word removal cell was meant to use it.
STOPWORDS_EXTENSION =\
    ['may', 'might',
     'also', 'with',
     'without', 'use',
     'uses', 'used', 'using']

STEMMER = 'snowball'            # name of the stemmer, might use 'porter'

N_GRAM = 2                      # the length of n-gram we want to create
N_GRAM_MIN_COUNT = 2            # there should be at least N_GRAM_MIN_COUNT
                                # repetitions in the text
N_GRAM_THRESHOLD = 20           # see gensim.Phrases documentation

# Passed to gensim Dictionary.filter_extremes as no_below / no_above
DIC_NO_BELOW = 3                # keep tokens present in DIC_NO_BELOW+ sentences/paragraphs
DIC_NO_ABOVE = 1                # fraction of total corpus size (default: 1)

TOP_N = 30                      # number of words to keep for each topic

### **Init**

In [66]:
import matplotlib.pyplot as plt
import numpy as np
import torch
import re
import pandas as pd
import nltk
import gensim
from textwrap import fill
from tqdm import tqdm
tqdm.pandas()

# Load the tokenizer that matches the model named in MODEL; only the
# tokenizer class that is actually needed gets imported.
if 'pegasus' in MODEL:
    from transformers import PegasusTokenizer
    tokenizer =\
        PegasusTokenizer.from_pretrained('google/pegasus-large')
elif 'bart' in MODEL:
    from transformers import BartTokenizer
    tokenizer =\
        BartTokenizer.from_pretrained('facebook/bart-large-cnn')
elif 't5' in MODEL:
    from transformers import T5Tokenizer
    tokenizer =\
        T5Tokenizer.from_pretrained('t5-large')
else:
    # Fail here instead of with a NameError on `tokenizer` many cells later
    raise ValueError(
        "Unsupported MODEL %r: expected a name containing "
        "'pegasus', 'bart' or 't5'" % MODEL)

### **Karger Books Base Dataset**

In [67]:
# Load the base dataset and index it by (book, chapter, section, subsection)
base_dataset = magma_dir+'datasets/karger_books_base/df.csv'
df = pd.read_csv(base_dataset)
df = df.set_index(['book', 'chapter', 'section', 'subsection'])
# Parse the `bullets` column back from its string form (NaN cells are skipped).
# NOTE(review): eval() executes whatever the CSV contains; acceptable for a
# trusted local file, otherwise consider ast.literal_eval.
df.bullets = df.bullets.map(eval, na_action='ignore')

### **Preprocessing**

#### Preprocessing

* Split based on RE_SPLITTER
* Explode the dataset
* Remove unwanted chars at beginning or end of sentence
* Remove multiple spaces
* Tokenize with gensim `simple_preprocess` (lower-casing, accent removal) and drop long words (> TOKEN_MAX_LEN chars)

In [68]:
# Split in sentences / paragraphs based on RE_SPLITTER
# (empty pieces are discarded, the others are stripped of surrounding whitespace)
df.text =\
    df.text.map(lambda x: [p.strip() for p in re.split(RE_SPLITTER, x) if p!=''],
                na_action='ignore')

# explode to get one row for each paragraph / sentence
df = df.explode('text')
df = df.rename(columns={'text': 'para'})
# NOTE: dropna() removes rows with a NaN in *any* column, not only in `para`
df = df.dropna()

# Remove unwanted chars at beginning or end of sentence
df.para = df.para.map(lambda p: p.lstrip('.,;:-)] \n'))
df.para = df.para.map(lambda p: p.rstrip('.,;:-([ \n'))

# Remove multiple spaces
# Raw string: '\s' in a plain string literal is an invalid escape sequence
# (DeprecationWarning, SyntaxWarning from Python 3.12); the pattern is unchanged.
df.para = df.para.map(lambda p:
    re.sub(r'\s+', ' ', p).strip())

# Tokenize every paragraph, discarding long words (> TOKEN_MAX_LEN chars)
def para2words(para):
    """Return the gensim `simple_preprocess` tokens of `para`.

    Accents are removed (deacc=True) and tokens longer than
    TOKEN_MAX_LEN characters are discarded.
    """
    tokens = gensim.utils.simple_preprocess(
        para, max_len=TOKEN_MAX_LEN, deacc=True)
    return tokens

df['para_proc'] = df['para'].map(para2words)

#### Further Preprocessing

* Remove stop words
* Remove short sentences / paragraphs (< PARA_MIN_LENGTH tokens)

In [69]:
nltk.download('stopwords')

from nltk.corpus import stopwords
# A set gives O(1) membership tests; the plain list was scanned once per token.
# NOTE(review): STOPWORDS_EXTENSION from the config cell is not applied here —
# confirm whether it should be added to this set.
stop_words = set(stopwords.words('english'))

# Drop stop words from every tokenized paragraph
df.para_proc = df.para_proc.map(lambda p:
    [w for w in p if w not in stop_words])

[nltk_data] Downloading package stopwords to /home/marco/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [70]:
# Remove short sentences / paragraphs (< PARA_MIN_LENGTH tokens)
# Step 1: mark the too-short rows by setting their para_proc to NaN
df.loc[df.para_proc.map(len) <\
    PARA_MIN_LENGTH, 'para_proc'] = np.nan

# Step 2: drop them (dropna() removes rows with a NaN in any column)
df = df.dropna()

In [71]:
# Append a final period to every paragraph (trailing '.' was stripped during preprocessing)
df.para = df.para.map(lambda p: p+'.')

### **Topic Modeling Creation**

In [72]:
# Stem
# Pick the stemmer named in STEMMER ('porter' or 'snowball').
if 'port' in STEMMER:
    from nltk.stem.porter import PorterStemmer
    st = PorterStemmer()
elif 'snow' in STEMMER:
    from nltk.stem.snowball import SnowballStemmer
    st = SnowballStemmer('english')
else:
    # Without this branch an unknown name would leave `st` undefined and
    # surface as a NameError inside the map below.
    raise ValueError(
        "Unsupported STEMMER %r: expected 'porter' or 'snowball'" % STEMMER)

# Replace every token by its stem (NaN rows are left untouched)
df.para_proc = df.para_proc.map(lambda p:
    [st.stem(w) for w in p], na_action='ignore')

In [73]:
# Create n-grams (N_GRAM)
data_words = df.para_proc.dropna().values.tolist()

if N_GRAM in (2, 3):
    # The bigram model is needed in both cases: trigrams are detected on top
    # of the bigram-transformed corpus. Previously it was only built in the
    # N_GRAM == 2 branch, so N_GRAM == 3 failed with a NameError on `bigram`.
    bigram = gensim.models.Phrases(
        data_words,
        min_count=N_GRAM_MIN_COUNT,
        threshold=N_GRAM_THRESHOLD)

if N_GRAM == 2:
    # Merge detected bigrams inside every tokenized paragraph
    df.para_proc = df.para_proc.map(lambda p:
        list(bigram[p]), na_action='ignore')

elif N_GRAM == 3:
    trigram = gensim.models.Phrases(
        bigram[data_words],
        min_count=N_GRAM_MIN_COUNT,
        threshold=N_GRAM_THRESHOLD)

    # Apply the bigram model first, then the trigram model on its output
    df.para_proc = df.para_proc.map(lambda p:
        list(trigram[bigram[p]]), na_action='ignore')

#### Dictionary (DIC_NO_BELOW, DIC_NO_ABOVE)

In [74]:
# Create dictionary for topic model (DIC_NO_BELOW, DIC_NO_ABOVE)
# Distinct (book, chapter) pairs present in the index
book_ch_comb = set(zip(
    df.index.get_level_values(0),
    df.index.get_level_values(1)))

# id2word[book][ch] -> gensim Dictionary built from that chapter's paragraphs
id2word = {}
for book, ch in book_ch_comb:
    chapter_docs = df.loc[book, ch].para_proc.dropna().values.tolist()
    chapter_dict = gensim.corpora.Dictionary(chapter_docs)
    # discard tokens that are too rare / too frequent within the chapter
    chapter_dict.filter_extremes(
        no_below=DIC_NO_BELOW, no_above=DIC_NO_ABOVE)
    id2word.setdefault(book, {})[ch] = chapter_dict

  # This is added back by InteractiveShellApp.init_path()


#### LDA

https://radimrehurek.com/gensim_3.8.3/models/ldamodel.html

https://www.di.ens.fr/~fbach/mdhnips2010.pdf

Keep the TOP_N most probable words of the topic.

In [75]:
%%capture
def get_lda_model(df, book, ch):
    """Train a single-topic LDA model on the paragraphs of one chapter.

    The bag-of-words corpus is built from the `para_proc` tokens of chapter
    (`book`, `ch`) with the chapter dictionary id2word[book][ch];
    config.SEED is used as random state.
    """
    dictionary = id2word[book][ch]
    chapter_tokens = df.loc[book, ch].para_proc
    bows = chapter_tokens.map(dictionary.doc2bow, na_action='ignore')
    corpus = bows.dropna().values.tolist()

    lda = gensim.models.ldamodel.LdaModel(
        corpus=corpus,
        id2word=dictionary,
        num_topics=1,
        alpha='auto',
        random_state=config.SEED)
    return lda

# lda_word2prob[book][ch] -> {word: probability} for the TOP_N words of the
# chapter's single LDA topic
lda_word2prob = {}
for book, ch in book_ch_comb:
    chapter_probs = lda_word2prob.setdefault(book, {})
    top_words = get_lda_model(df, book, ch).show_topic(0, TOP_N)
    chapter_probs[ch] = dict(top_words)

### **Paragraph importance**

\begin{equation}
\text{importance}(p) = \frac{\sum_{w \in p} \text{probability}(w)}{\sqrt{\text{length}(p)}}
\end{equation}

Open question: what about normalizing by the logarithm of the length instead, i.e. $\frac{\sum_{w \in p} \text{probability}(w)}{\log(\text{length}(p))}$?

Where $p$ is a paragraph or a sentence, $w$ is a word (token) and $\text{probability}$ is the probability assigned by LDA or LSI model to $w$ (0 if it is not present in the TOP_N words).

In [76]:
def word_importance(model_word2prob, word):
    """Return the probability the topic model assigns to `word` (0 if absent).

    Args:
        model_word2prob: mapping word -> probability.
        word: token to look up.
    """
    return model_word2prob.get(word, 0)

def para_importance(model_word2prob, para):
    """Score a tokenized paragraph: sum of word probabilities / sqrt(length).

    Args:
        model_word2prob: mapping word -> probability.
        para: sequence of tokens.

    Returns:
        The importance score; 0.0 for an empty paragraph.
    """
    # An empty paragraph would otherwise evaluate 0/0 and yield NaN
    # (plus a RuntimeWarning), which silently poisons later argsort calls.
    if len(para) == 0:
        return 0.0
    l_importance = [word_importance(model_word2prob, w) for w in para]
    return np.sum(l_importance) / np.sqrt(len(l_importance))

# Placeholder columns; lda_imp is filled chapter by chapter in the loop below.
# NOTE(review): lda_imp_norm is created here but never assigned in the visible cells.
df['lda_imp'] = np.nan
df['lda_imp_norm'] = np.nan
for book, ch in book_ch_comb:
    # all rows (any section / subsection) of this book and chapter
    idx_slice = pd.IndexSlice[book, ch, :, :]
    # LDA importance of every paragraph, scored with the chapter's own word probabilities
    df.loc[idx_slice, 'lda_imp'] = df.loc[idx_slice, 'para_proc'].map(lambda p:
        para_importance(lda_word2prob[book][ch], p), na_action='ignore')

    # NOTE(review): normalization looks unfinished — `s` is assigned but never used
    s = df.loc[idx_slice, 'lda_imp']

In [77]:
# Per-chapter counts, computed with two chained aggregations:
#   1) per (book, chapter, section): the list of paragraphs and the bullets
#      value of the group's first row;
#   2) per (book, chapter): the number of section groups (num_sec) and the
#      length of the first section's bullets value (num_bul).
df_num_sec_bul = df.groupby(['book', 'chapter', 'section'], sort=False).agg({
    'para': lambda p: list(p),
    'bullets': lambda b: list(b)[0]
}).groupby(['book', 'chapter'], sort=False).agg({
    'para': lambda p: len(list(p)),
    'bullets': lambda b: len(list(b)[0])
}).rename(columns={'para':'num_sec', 'bullets':'num_bul'})

In [78]:
# Collapse to one row per (book, chapter): section and subsection are moved
# from the index to columns, then every column becomes the list of its
# per-paragraph values; `bullets` keeps only the value of the first row.
df_best_para = df.reset_index(level=[2, 3]).groupby(['book', 'chapter'], sort=False).agg({
    'section': lambda s: list(s),
    'subsection': lambda ss: list(ss),
    'para': lambda p: list(p),
    'bullets': lambda b: list(b)[0],
    'para_proc': lambda pp: list(pp),
    'lda_imp': lambda lda: list(lda)
})
# numpy arrays so that the importances can be argsorted / indexed by position
df_best_para.lda_imp = df_best_para.lda_imp.map(np.array)

In [79]:
# Number of sections of every chapter, taken from df_num_sec_bul.
# A single index-aligned assignment replaces the row-by-row .loc writes;
# reindex keeps the row order of df_best_para, and astype(int) fails loudly
# if a chapter is missing from df_num_sec_bul.
df_best_para['num_sec'] =\
    df_num_sec_bul['num_sec'].reindex(df_best_para.index).astype(int)

##### Find Best Paragraph for each Book, Chapter (and Section)

In [80]:
# Positions of the num_sec paragraphs with the highest LDA importance, best first
# (argsort is ascending, hence the reversal before truncating)
df_best_para['best_para'] = df_best_para.apply(lambda row: np.argsort(row.lda_imp)[::-1][:row.num_sec], axis=1)

##### How Many Sections Are We Covering?

In [81]:
def calculate_diversity(r):
    """Return the percentage of a chapter's sections hit by its best paragraphs.

    `r.section` holds the section of every paragraph and `r.best_para` the
    positions of the selected paragraphs.
    """
    distinct_sections = set(r.section)
    covered_sections = {r.section[pos] for pos in r.best_para}
    num_covered = len(covered_sections & distinct_sections)

    return num_covered/len(distinct_sections)*100
    
df_best_para.apply(calculate_diversity, axis=1).describe()

count    453.000000
mean      58.253368
std       16.016781
min       16.666667
25%       50.000000
50%       60.000000
75%       70.000000
max      100.000000
dtype: float64

##### Expanding from Best Paragraph Based on Paragraph Importance

In [82]:
# Token count of every paragraph according to the model tokenizer
# (one numpy array per chapter row, aligned with the `para` list)
df_best_para['para_num_tok'] =\
    df_best_para.para.map(lambda ps: np.array([len(tokenizer.tokenize(p)) for p in ps]))

In [83]:
def expand_based_on_importance(r):
    """Grow a window of contiguous paragraphs around each best paragraph.

    For every position in `r.best_para` the window starts as that single
    paragraph and is extended by one neighbouring paragraph at a time until
    its token count reaches the threshold or it covers the whole chapter.

    Args:
        r: chapter row exposing `para`, `para_num_tok`, `best_para` and
            `lda_imp`.

    Returns:
        A list with one sorted list of paragraph positions per entry of
        `r.best_para`.
    """
    max_length = len(r.para)
    max_idx = max_length-1
    
    extracted_para = []
    
    # Calculate the fraction we need to extract
    # based on total number of tokens in this chp
    # and number of centroids (sections) in this chp
    # do not go over the model max length
    # NOTE(review): the threshold is tested before each extension, so the
    # final window can end up above num_tok_th.
    num_tok_tot = sum(r.para_num_tok)
    num_tok_th = min(
        int(0.8*num_tok_tot / len(r.best_para)),
        0.9*tokenizer.model_max_length)
    
    # NOTE(review): the enumerate index `i` is not used
    for i, best in enumerate(r.best_para):
        merged_para_idx = [best]
        num_tok = r.para_num_tok[best]
        
        while num_tok < num_tok_th:
            # the window already spans every paragraph of the chapter
            if len(merged_para_idx) == max_length : break
            # window touches the first paragraph: can only grow to the right
            elif 0 in merged_para_idx:
                merged_para_idx.append(max(merged_para_idx)+1)
            # window touches the last paragraph: can only grow to the left
            elif max_idx in merged_para_idx:
                merged_para_idx.append(min(merged_para_idx)-1)
            else:
                # grow towards the neighbour with the higher importance
                # (on a tie the left neighbour is taken)
                if (r.lda_imp)[min(merged_para_idx)-1] <\
                    (r.lda_imp)[max(merged_para_idx)+1]:
                    merged_para_idx.append(max(merged_para_idx)+1)
                else:
                    merged_para_idx.append(min(merged_para_idx)-1)
            num_tok = np.sum(r.para_num_tok[merged_para_idx])
                  
        extracted_para.append(sorted(merged_para_idx))
        
    return extracted_para

# One list of paragraph windows per chapter
df_best_para['selected_para'] =\
    df_best_para.progress_apply(expand_based_on_importance, axis=1)

100%|██████████| 453/453 [00:00<00:00, 847.14it/s]


##### Study Overlap and Remove Useless (>90% overlap) Entries

In [84]:
df_remove_overlap = df_best_para.copy(deep=True)

In [85]:
def create_overlap_matrix(r):
    """Build the pairwise token-overlap matrix of a chapter's selections.

    Entry [i, j] is the percentage (rounded to 2 decimals) of the tokens of
    selection i that belong to paragraphs also present in selection j.
    The diagonal is left at 0.
    """
    selections = r.selected_para
    size = len(selections)
    overlap_matrix = np.zeros((size, size))

    for i, sel_i in enumerate(selections):
        # token total of selection i: the same for every j
        num_tok_i = np.sum(r.para_num_tok[sel_i])
        for j, sel_j in enumerate(selections):
            if i == j:
                continue
            shared = list(set(sel_i) & set(sel_j))
            num_tok_shared = np.sum(r.para_num_tok[shared])
            assert num_tok_shared <= num_tok_i

            overlap_matrix[i, j] = round(num_tok_shared/num_tok_i*100, 2)

    return overlap_matrix

def remove_big_overlap(r, threshold):
    """Drop the selections that are largely contained in another selection.

    Every ordered pair (i, j) with overlap_matrix[i, j] >= threshold is
    examined: the selection with the larger overlap percentage is dropped;
    when both directions are equal, i is dropped unless one of the two has
    already been dropped.

    Returns:
        The entries of `r.selected_para` that survive, in their original order.
    """
    om = r.overlap_matrix
    dropped = set()
    for i, j in np.argwhere(om >= threshold):
        forward, backward = om[i, j], om[j, i]
        if forward == backward:
            # symmetric overlap: remove only one of the two selections
            if i not in dropped and j not in dropped:
                dropped.add(i)
        else:
            dropped.add(i if forward > backward else j)
    return [sel for pos, sel in enumerate(r.selected_para) if pos not in dropped]

In [86]:
# Overlap matrix of every chapter, then the distribution of the number of
# ordered selection pairs that overlap by more than 90%.
# NOTE(review): this count uses `> 90` while remove_big_overlap compares with `>= threshold`.
df_remove_overlap['overlap_matrix'] = df_remove_overlap.apply(lambda row: create_overlap_matrix(row), axis=1)
df_remove_overlap.overlap_matrix.map(lambda om: np.sum(om > 90)).describe()

count    453.000000
mean       3.653422
std        3.084570
min        0.000000
25%        2.000000
50%        2.000000
75%        5.000000
max       20.000000
Name: overlap_matrix, dtype: float64

In [87]:
# Drop the selections overlapping another one by at least 90% ...
df_remove_overlap.selected_para = df_remove_overlap.apply(lambda row: remove_big_overlap(row, 90), axis=1)

# ... then recompute the overlap matrices to check that no big overlap is left
df_remove_overlap['overlap_matrix'] = df_remove_overlap.apply(lambda row: create_overlap_matrix(row), axis=1)
df_remove_overlap.overlap_matrix.map(lambda om: np.sum(om > 90)).describe()

count    453.0
mean       0.0
std        0.0
min        0.0
25%        0.0
50%        0.0
75%        0.0
max        0.0
Name: overlap_matrix, dtype: float64

##### Merge Entries Whose Overlap Is at Least 90%

In [146]:
df_merge_overlap = df_best_para.copy()

In [147]:
def find_big_overlap(r, threshold):
    """Greedily pair up the selections whose overlap is at least `threshold`.

    Args:
        r: chapter row exposing `overlap_matrix`.
        threshold: minimum value of an overlap_matrix entry for a pair to
            be considered.

    Returns:
        A set of index tuples to be merged; an index appears in at most
        one tuple.
    """
    om = r.overlap_matrix
    big_overlap_idx = np.argwhere(om >= threshold)
    # frozensets collapse (i, j) and (j, i) into a single unordered pair
    big_overlap_idx = set([frozenset(t) for t in big_overlap_idx])
    merged = set()
    to_be_merged = set()
    # NOTE(review): pairs are visited in set iteration order, which decides
    # which pair wins when an index takes part in several big overlaps.
    for idx in big_overlap_idx:
        idx = tuple(idx)
        i, j = idx[0], idx[1]
        # each selection can be merged at most once per call
        if i not in merged and j not in merged:
            to_be_merged.add(idx)
            merged.add(i)
            merged.add(j)
    return to_be_merged

def merge_para(r):
    """Merge the selection pairs listed in `r.to_be_merged`.

    For every pair (i, j) the union of the two selections replaces
    selection i and selection j is removed; selections that take part in
    no pair are kept as they are.

    Args:
        r: chapter row exposing `selected_para` (list of index sequences)
            and `to_be_merged` (iterable of (i, j) index pairs).

    Returns:
        A list of numpy arrays of paragraph positions. Merged arrays are
        sorted and duplicate-free, consistent with the sorted windows
        produced by expand_based_on_importance (the previous
        list(set(...)) left them in hash order).
    """
    # None -> untouched selection, [] -> absorbed into its partner
    merged_slots = [None] * len(r.selected_para)
    for i, j in r.to_be_merged:
        # np.unique returns the sorted, de-duplicated union
        merged_slots[i] = np.unique(
            np.concatenate((r.selected_para[i], r.selected_para[j])))
        merged_slots[j] = []

    selected_para = []
    for i, slot in enumerate(merged_slots):
        if slot is None:
            selected_para.append(np.array(r.selected_para[i]))
        elif len(slot) == 0:
            continue
        else:
            selected_para.append(slot)
    return selected_para

In [148]:
# Repeat "compute overlaps -> pick pairs -> merge" until no pair of
# selections overlaps by at least 90% any more.
while True:
    df_merge_overlap['overlap_matrix'] = df_merge_overlap.apply(create_overlap_matrix, axis=1)

    df_merge_overlap['to_be_merged'] = df_merge_overlap.apply(lambda row: find_big_overlap(row, 90), axis=1)

    # total number of pairs still to merge, over all chapters
    num_to_be_merged = df_merge_overlap.to_be_merged.map(len).sum()
    print('Para to be merged: %d'%num_to_be_merged)
    if (num_to_be_merged <= 0) : break

    df_merge_overlap.selected_para = df_merge_overlap.apply(merge_para, axis=1)
    
# make sure every row holds a plain list of selections
df_merge_overlap.selected_para = df_merge_overlap.selected_para.map(list)

Para to be merged: 643
Para to be merged: 110
Para to be merged: 4
Para to be merged: 0


### Finalize Results Remove Overlap

In [130]:
# One row per surviving selection, without the intermediate helper columns
df_remove_overlap_tobesaved = df_remove_overlap.explode('selected_para')
df_remove_overlap_tobesaved = df_remove_overlap_tobesaved.drop(
    columns=['best_para', 'lda_imp', 'overlap_matrix'])

# Text of the paragraphs whose position belongs to the selection
df_remove_overlap_tobesaved['selected_para_lda'] = df_remove_overlap_tobesaved.apply(lambda row:\
    [p for i, p in enumerate(row.para) if i in row.selected_para], axis=1)

# para_num_tok is overwritten: it now holds the token counts of the selected paragraphs only
df_remove_overlap_tobesaved['para_num_tok'] = df_remove_overlap_tobesaved.apply(lambda row:\
    [p for i, p in enumerate(row.para_num_tok) if i in row.selected_para], axis=1)

In [131]:
df_remove_overlap_tobesaved.para_num_tok.map(sum).describe()

count    1887.000000
mean      461.736089
std       207.686768
min       112.000000
25%       309.000000
50%       412.000000
75%       565.000000
max      1255.000000
Name: para_num_tok, dtype: float64

In [133]:
df_remove_overlap_tobesaved.to_csv(OUTPUT_PATH+'df_lda_remove.csv')

##### Compare to Para Wordembed ST Dataset

In [134]:
def precision(pred, ref):
    """Percentage of predicted items that are also in the reference.

    Args:
        pred: set of predicted items.
        ref: set of reference items.

    Returns:
        The precision in [0, 100], rounded to 2 decimals; 0 when `pred` is
        empty (instead of raising ZeroDivisionError), mirroring fmeasure.
    """
    if len(pred) == 0 : return 0
    return round(100*len(pred.intersection(ref)) / len(pred), 2)
    
def recall(pred, ref):
    """Percentage of reference items that were predicted.

    Args:
        pred: set of predicted items.
        ref: set of reference items.

    Returns:
        The recall in [0, 100], rounded to 2 decimals; 0 when `ref` is
        empty (instead of raising ZeroDivisionError), mirroring fmeasure.
    """
    if len(ref) == 0 : return 0
    return round(100*len(pred.intersection(ref)) / len(ref), 2)

def fmeasure(prec, rec):
    """Harmonic mean of precision and recall, rounded to 2 decimals.

    Returns 0 when precision and recall sum to zero.
    """
    total = prec + rec
    if total == 0:
        return 0
    return round(2*prec*rec/total, 2)

def flatten(t):
    """Concatenate the items of every sub-iterable of `t` into one flat list."""
    flat = []
    for sublist in t:
        flat.extend(sublist)
    return flat

In [136]:
# Unsupervised (LDA) selection: it does not depend on `t`, so it is loaded and
# parsed once instead of re-reading the CSV on every loop iteration.
df_remove_overlap_tobesaved = pd.read_csv(OUTPUT_PATH+'df_lda_remove.csv').set_index(['book', 'chapter'])
# NOTE(review): eval() executes whatever the CSV contains; fine for files
# written by this notebook, otherwise consider ast.literal_eval.
df_remove_overlap_tobesaved.selected_para = df_remove_overlap_tobesaved.selected_para.map(eval)
# one row per chapter holding the list of its selections
df_remove_overlap_tobesaved = df_remove_overlap_tobesaved.groupby(['book', 'chapter'], sort=False).agg({
    'selected_para': lambda p: list(p)
})
# set of all paragraph positions selected in the chapter
selected_para = df_remove_overlap_tobesaved.selected_para.map(lambda pp: set(flatten(pp)))

for t in ['base', 'th']:
    # Supervised reference selection for this variant
    df_para_wordembed_st =\
        pd.read_csv(magma_dir+'datasets/karger_books_para_wordembed/'+MODEL+'/st/df_'+t+'_selected_para.csv')\
        .set_index(['book', 'chapter'])
    df_para_wordembed_st.para = df_para_wordembed_st.para.map(eval)
    df_para_wordembed_st.best_match = df_para_wordembed_st.best_match.map(eval)

    num_para = df_para_wordembed_st.para.map(len)
    best_match = df_para_wordembed_st.best_match.map(set)

    df_comparison = pd.concat([num_para, selected_para, best_match], axis=1).rename(
        columns={'para': 'num_para', 'selected_para': 'unsup_selected', 'best_match': 'sup_selected'})
    # share of the chapter's paragraphs picked by each method, in percent
    df_comparison['unsup_coverage'] = 100*df_comparison.unsup_selected.map(len) / df_comparison.num_para
    df_comparison['sup_coverage'] = 100*df_comparison.sup_selected.map(len) / df_comparison.num_para

    # share of the chapter's paragraphs picked by both methods, in percent
    df_comparison['intersection'] = 100*df_comparison.apply(lambda r:
        len(r.unsup_selected.intersection(r.sup_selected)) / r.num_para, axis=1)

    df_comparison['precision'] = df_comparison.apply(lambda r:
        precision(r.unsup_selected, r.sup_selected), axis=1)
    df_comparison['recall'] = df_comparison.apply(lambda r:
        recall(r.unsup_selected, r.sup_selected), axis=1)

    # num_para was only needed as a denominator
    df_comparison = df_comparison.drop(columns='num_para')

    print(t+'\n')
    print(df_comparison.describe())
    print('\n')

base

       unsup_coverage  sup_coverage  intersection   precision      recall
count      453.000000    453.000000    453.000000  453.000000  453.000000
mean        49.705078     29.231305     17.228806   34.619536   59.719360
std         13.343276     15.454056     11.213288   20.162173   24.551033
min         11.250000      2.564103      0.000000    0.000000    0.000000
25%         40.000000     18.000000      9.090909   19.050000   42.860000
50%         50.000000     25.714286     15.000000   31.030000   58.820000
75%         57.894737     38.095238     23.076923   47.620000   77.780000
max         88.888889    100.000000     75.000000  100.000000  100.000000


th

       unsup_coverage  sup_coverage  intersection   precision      recall
count      453.000000    453.000000    453.000000  453.000000  453.000000
mean        49.705078     63.226644     35.402266   71.756380   56.502958
std         13.343276     15.442000     13.306320   20.265302   17.773207
min         11.250000     

### Finalize Results Merge Overlap

In [165]:
# One row per merged selection; each selection is turned into a plain list
# and the intermediate helper columns are dropped
df_merge_overlap_tobesaved = df_merge_overlap.explode('selected_para')
df_merge_overlap_tobesaved.selected_para = df_merge_overlap_tobesaved.selected_para.map(list)
df_merge_overlap_tobesaved = df_merge_overlap_tobesaved.drop(
    columns=['best_para', 'lda_imp', 'overlap_matrix', 'to_be_merged'])

# Text of the paragraphs whose position belongs to the selection
df_merge_overlap_tobesaved['selected_para_lda'] = df_merge_overlap_tobesaved.apply(lambda row:\
    [p for i, p in enumerate(row.para) if i in row.selected_para], axis=1)

# para_num_tok is overwritten: it now holds the token counts of the selected paragraphs only
df_merge_overlap_tobesaved['para_num_tok'] = df_merge_overlap_tobesaved.apply(lambda row:\
    [p for i, p in enumerate(row.para_num_tok) if i in row.selected_para], axis=1)

In [167]:
df_merge_overlap_tobesaved.para_num_tok.map(sum).describe()

count    1887.000000
mean      462.031797
std       208.060483
min       112.000000
25%       309.000000
50%       412.000000
75%       566.000000
max      1255.000000
Name: para_num_tok, dtype: float64

In [168]:
df_merge_overlap_tobesaved.to_csv(OUTPUT_PATH+'df_lda_merge.csv')

##### Compare to Para Wordembed ST Dataset

In [169]:
# Unsupervised (LDA) selection: it does not depend on `t`, so it is loaded and
# parsed once instead of re-reading the CSV on every loop iteration.
df_merge_overlap_tobesaved = pd.read_csv(OUTPUT_PATH+'df_lda_merge.csv').set_index(['book', 'chapter'])
# NOTE(review): eval() executes whatever the CSV contains; fine for files
# written by this notebook, otherwise consider ast.literal_eval.
df_merge_overlap_tobesaved.selected_para = df_merge_overlap_tobesaved.selected_para.map(eval)
# one row per chapter holding the list of its selections
df_merge_overlap_tobesaved = df_merge_overlap_tobesaved.groupby(['book', 'chapter'], sort=False).agg({
    'selected_para': lambda p: list(p)
})
# set of all paragraph positions selected in the chapter
selected_para = df_merge_overlap_tobesaved.selected_para.map(lambda pp: set(flatten(pp)))

for t in ['base', 'th']:
    # Supervised reference selection for this variant
    df_para_wordembed_st =\
        pd.read_csv(magma_dir+'datasets/karger_books_para_wordembed/'+MODEL+'/st/df_'+t+'_selected_para.csv')\
        .set_index(['book', 'chapter'])
    df_para_wordembed_st.para = df_para_wordembed_st.para.map(eval)
    df_para_wordembed_st.best_match = df_para_wordembed_st.best_match.map(eval)

    num_para = df_para_wordembed_st.para.map(len)
    best_match = df_para_wordembed_st.best_match.map(set)

    df_comparison = pd.concat([num_para, selected_para, best_match], axis=1).rename(
        columns={'para': 'num_para', 'selected_para': 'unsup_selected', 'best_match': 'sup_selected'})
    # share of the chapter's paragraphs picked by each method, in percent
    df_comparison['unsup_coverage'] = 100*df_comparison.unsup_selected.map(len) / df_comparison.num_para
    df_comparison['sup_coverage'] = 100*df_comparison.sup_selected.map(len) / df_comparison.num_para

    # share of the chapter's paragraphs picked by both methods, in percent
    df_comparison['intersection'] = 100*df_comparison.apply(lambda r:
        len(r.unsup_selected.intersection(r.sup_selected)) / r.num_para, axis=1)

    df_comparison['precision'] = df_comparison.apply(lambda r:
        precision(r.unsup_selected, r.sup_selected), axis=1)
    df_comparison['recall'] = df_comparison.apply(lambda r:
        recall(r.unsup_selected, r.sup_selected), axis=1)

    # num_para was only needed as a denominator
    df_comparison = df_comparison.drop(columns='num_para')

    print(t+'\n')
    print(df_comparison.describe())
    print('\n')

base

       unsup_coverage  sup_coverage  intersection   precision      recall
count      453.000000    453.000000    453.000000  453.000000  453.000000
mean        49.859914     29.231305     17.270216   34.561435   59.819912
std         13.326727     15.454056     11.251706   20.107712   24.554122
min         11.250000      2.564103      0.000000    0.000000    0.000000
25%         40.243902     18.000000      9.090909   19.050000   42.860000
50%         50.000000     25.714286     15.000000   31.030000   60.000000
75%         58.333333     38.095238     23.076923   47.620000   77.780000
max         88.888889    100.000000     75.000000  100.000000  100.000000


th

       unsup_coverage  sup_coverage  intersection   precision      recall
count      453.000000    453.000000    453.000000  453.000000  453.000000
mean        49.859914     63.226644     35.511742   71.746623   56.676777
std         13.326727     15.442000     13.324318   20.271281   17.780393
min         11.250000     