In [1]:
import pandas as pd
from collections import Counter
import itertools
from os.path import join
from itertools import combinations
from tqdm import tqdm
import re
import json

def all_combinations(texts_list, n):
    all_combinations = []
    texts = combinations(texts_list, n)
    for t in texts:
        all_combinations.append(list(t))
    return(all_combinations)
        
def intersection(list1,list2):
    intersection = list(set(list1) & set(list2))
    return intersection

def deduplicate_list(l):
    l.sort()
    return list(l for l,_ in itertools.groupby(l))

def remove_subsets(clusters):
    return [x for x in clusters if not any(set(x)<=set(y) for y in clusters if x is not y)]

def find_text_frequency(text,book_texts):
    freq = 0
    for book in book_texts:
        if text in book_texts[book]:
            freq += 1
    return freq

def find_books_containing_cluster(cluster,book_texts):
    books = []
    for book in book_texts:
        texts = book_texts[book]
        if len(intersection(texts,cluster))==len(cluster):
            books.append(book)
    return books


In [2]:
path = 'texts_in_books.json'
json_file = open(path)
json_data = json.load(json_file)

In [3]:
texts_to_exclude = [ 'G151','G152','G153','G153a','G154','G155a','G155b',
                    'G155c','G155d','G155e','G156','G156b','G157','G158',
                    'G158b' ] 

## Exclude texts without identification like Calendars

## Merge following Heurist IDs

merge_hids = {
    
    544:540,
    547:540,
    549:540,
    550:540,
    135517:540,

    132476:132477,
    135524:132477,
    137964:132477,
    135511:132477
}

In [4]:
titles = dict()
book_texts = dict()

for book in json_data:
    
    book_id = int(book['id'])


    if 'title' in book:
        titles[ book_id ] = book['title']
    elif 'shelfmark' in book:
        titles[ book_id ] = book['shelfmark']
    else:
        titles[ book_id ] = '[Untitled]'
        
    texts = book['texts']
    
    all_texts = []
    for text in texts:
        
        if 'text_id' in text and 'prayer_id' in text:
            if text['prayer_id'] not in texts_to_exclude and len(text['prayer_id'].strip()) > 0:
                text_id = int(text['text_id'])

                
                if text_id in merge_hids:
                    text_id = merge_hids[text_id]

                all_texts.append(text_id)                
                if 'title' in text and re.search(r'\d',str(text_id)):
                    titles[text_id] = f"{text['prayer_id']}: {text['title']}"
            
    if len(all_texts)>0:
        book_texts[book_id] = all_texts
    
## Manual assignment of merged titles
titles[540] = "G004: Prayer of St. Gregory to the Arma Christi"
titles[132477] = "G155_G189c: Long Hours of the Holy Cross with prologue B interwoven with Prayer on Mary's Compassion "

## Most frequent texts

In [5]:
text_freq = Counter()

for book in book_texts:
    texts = book_texts[book]
    text_freq.update(texts)
    
corpus = text_freq.keys()
corpus = [text for text in corpus if text_freq[text]>1 ]

for text,count in text_freq.most_common(10):
    print(f'{titles[text]} ({text}): in {count} books')

G016: Prayer to be recited before the H. Communion (Summe sacerdos et vere pontifex) (168): in 86 books
G004: Prayer of St. Gregory to the Arma Christi (540): in 81 books
P001: Long Hours of the Holy Spirit (125188): in 66 books
G056: Elevation prayer / Prayer to be recited during Mass (127675): in 59 books
G048: Prayer to be recited after the H. Communion or after Mass (125687): in 54 books
G017: Prayer to be recited after the H. Communion (127529): in 52 books
G055: Elevation prayer (127674): in 44 books
G007: Prayer to Mary and John the Evangelist (O intemerata) (127681): in 41 books
P169: Hundred Articles of the Passion (Noord-Nederlandse bewerking - Hundert Betrachtungen und Begehrungen) (130068): in 40 books
G111: Prayer to be recited before (sometimes after) the H. Communion (127464): in 38 books


In [6]:
print(f'{len(book_texts)} books')
print(f'{len(corpus)} texts')

289 books
507 texts


In [7]:
def find_cooccurrences(cluster,book_texts):
    cooccurrences_freq = Counter()
    for book in book_texts:
        texts = book_texts[book]
        if len(intersection(cluster,texts)) == len(cluster):
            cooccurrences = [other_text for other_text in texts if other_text not in cluster]
            cooccurrences_freq.update(cooccurrences)
    return remove_single_cooccurrences(cooccurrences_freq)

def remove_single_cooccurrences(freq):
    new_freq = Counter()
    for i,count in freq.most_common():
        if count>1 and text_freq[i]>1:
            new_freq[i]=count
    return new_freq

In [8]:
# ## make dataframe; of no use at the moment

# rows = []
# for book in book_texts:
#     row = []
#     row.append(book)
#     texts = book_texts[book]
#     for text in corpus:
#         if text in texts:
#             row.append(1)
#         else:
#             row.append(0)
#     rows.append(row)
    
# columns = ['book']
# columns.extend(corpus)
# #print(columns)
# corpus_df = pd.DataFrame(rows,columns=columns)
# corpus_df = corpus_df.set_index('book')

## All clusters of four texts

In [9]:
all_clusters = []

## Clusters of four

for book in tqdm(book_texts):
    texts_list = book_texts[book]
    clusters = all_combinations(texts_list, 4)
    all_clusters.extend(clusters)
    
print(len(all_clusters))
all_clusters = deduplicate_list(all_clusters)
print(len(all_clusters)) 

100%|█████████████████████████████████████████| 289/289 [00:11<00:00, 25.80it/s]


19134745
17046710


## Most common clusters of five texts

In [10]:
new_clusters = []

for selected_text in tqdm(corpus):
#for selected_text in corpus:
    #print(selected_text)

    cluster = [selected_text]
    cooccurrences_counter = find_cooccurrences(cluster,book_texts)
    cooccurrences = [text for text,count in cooccurrences_counter.most_common(30)]

    clusters = all_combinations(cooccurrences,4)
    for new_cluster in clusters:
        new_cluster.extend(cluster)
        new_clusters.append(new_cluster)
        

100%|█████████████████████████████████████████| 507/507 [00:16<00:00, 29.90it/s]


In [11]:
new_clusters = deduplicate_list(new_clusters)

for new_cluster in tqdm(new_clusters):
        
    books = find_books_containing_cluster(new_cluster,book_texts)
    nr_books = len(books)
    ## Clusters need to occur in at least two books
    if nr_books >= 2:        
        new_cluster = sorted(new_cluster)
        all_clusters.append(new_cluster)

print(f"{len(new_clusters)} clusters were found.")

100%|███████████████████████████████| 4900312/4900312 [31:54<00:00, 2559.25it/s]

4900312 clusters were found.





## Clusters of more than five texts

In [None]:
for selected_text in tqdm(corpus):

    cluster = [selected_text]
    cooccurrences_counter = find_cooccurrences(cluster,book_texts)
    #print(cooccurrences_counter)

    for nr_texts in range(1,len(cooccurrences_counter)):
        cluster = [selected_text]
        for text,count in cooccurrences_counter.most_common(nr_texts):
            cluster.append(text)

        books = find_books_containing_cluster(cluster,book_texts)
        nr_books = len(books)

        if nr_books == 0:
            break

    cluster = sorted(cluster[:-1])
    if cluster not in all_clusters:
        if len(cluster)>1:
            books = find_books_containing_cluster(cluster,book_texts)
            if len(books)>1:
                all_clusters.append(sorted(cluster))
                
all_clusters = deduplicate_list(all_clusters)

 72%|█████████████████████████████▋           | 367/507 [11:11<04:12,  1.81s/it]

In [None]:
print(f'{len(all_clusters)} clusters were found')

In [None]:
rows = []
for i,c in enumerate(all_clusters):
    row = []
    row.append(i)
    row.append(len(c))
    books = find_books_containing_cluster(c,book_texts)
    row.append(len(books))
    rows.append(row)
    
clusters_df = pd.DataFrame(rows,columns=['id','nr_texts','nr_books'])

In [None]:
clusters_df.to_csv('clusters.csv')

with open('clusters_list.py','w') as out:
    out.write(f'all_clusters = {all_clusters}')

In [None]:
clusters_df = pd.read_csv('clusters.csv')
clusters_df.query('nr_texts < 15')
all_clusters[13]

## Test


    G004
    G091
    G108
    G153a_G189c

Ik heb inmiddels twee voorbeelden gevonden waar je hopelijk iets mee kunt. De combinatie van één van de vormen van G004, samen met G091, G108 en G153a_G189c komt voor in 12079, Inv. 291 en Douce 243, maar die cluster van 4 teksten zie ik niet in de lijst. In 12079 en Douce 245 komt G005 bijvoorbeeld ook nog voor.

In boeken:

    12079 (h_id 125129)
    Inv. 291 (h_id 137852 )
    Douce 243

Een vorm van G155a-e_G189c, samen met een vorm van G004 en bijvoorbeeld G072 en P001 komt voor in Cgm 76 en in BC MS 7, maar die cluster van 4 teksten zie ik ook niet. Het is wat ingewikkeld om de nrs. G155a-e_G189c handmatig te doorzoeken, maar misschien kun je wel achterhalen waarom dergelijke clusters er niet uit komen.

    G153a_G189c
    G005

G153a_G189c nu helemaal is verdwenen in de lijst. Dat is een tekst die niet samengevoegd hoeft te worden met de G155-nummers_G189c, maar staat op zichzelf. G005 komt nu bijvoorbeeld ook helemaal niet voor, terwijl die wel in een cluster zou moeten zitten.


In [None]:
cluster = [127443,127462,134925,540] 
for text in sorted(cluster):
    print(titles[text])
    
print(sorted(cluster) in all_clusters)

In [None]:
books = find_books_containing_cluster(cluster,book_texts)

for book in books:
    print(book)
    print(titles[book])
    print(sorted(book_texts[book]))

In [None]:
cluster = [127844,125188,540,132477]
for text in sorted(cluster):
    print(titles[text])
    
print(sorted(cluster) in all_clusters)

In [None]:
books = find_books_containing_cluster(cluster,book_texts)

for book in books:
    print(titles[book])
    print(sorted(book_texts[book]))