In [1]:
import pandas as pd
from collections import Counter
import itertools
from itertools import combinations
from tqdm import tqdm
import re
import json

def all_combinations(texts_list, n):
    all_combinations = []
    texts = combinations(texts_list, n)
    for t in texts:
        all_combinations.append(list(t))
    return(all_combinations)
        

def intersection(list1,list2):
    intersection = list(set(list1) & set(list2))
    return intersection

In [2]:
df = pd.read_csv('book_text_matrix.csv')
df = df.set_index('book_id')

titles = dict()

path = 'texts_in_books.json'
json_file = open(path)
json_data = json.load(json_file)

for book in json_data:
    titles[ book['id'] ] = book['title']
    texts = book['texts']
    for text in texts:
        titles[text['id']] = f"{text['text_id']} {text['title']}"


## Create a dictionary with books as key and list of texts as value

In [3]:
book_texts = dict()


for book,row in df.iterrows():
    texts_list = []
    for column in df.columns:
        if row[column] > 0:
            texts_list.append(column)

    book_texts[book] = texts_list


## Find all possible combinations of texts

In [4]:
found_clusters = Counter()
all_clusters = []

for book in tqdm(book_texts):

    nr_texts = len(book_texts[book])

    while nr_texts<20 and nr_texts>1:

        clusters = all_combinations(book_texts[book],nr_texts)
        all_clusters.extend(clusters)
        #print(nr_texts)
        nr_texts = nr_texts-1

all_clusters.sort()
all_clusters = list(all_clusters for all_clusters,_ in itertools.groupby(all_clusters))
print(f'{len(all_clusters)} clusters were found in total')

100%|█████████████████████████████████████████| 285/285 [00:04<00:00, 61.24it/s]


5106894 clusters were found in total


## Count frequencies of clusters

In [5]:
found_clusters = Counter()

for cluster in tqdm(all_clusters):
    for book in book_texts:
        if len(intersection(cluster,book_texts[book])) == len(cluster):
            found_clusters.update(['-'.join(cluster)])

100%|███████████████████████████████| 5106894/5106894 [28:45<00:00, 2960.31it/s]


In [6]:
found_clusters = Counter({k: c for k, c in found_clusters.items() if c > 1})

In [7]:
rows = []

for cluster,count in found_clusters.most_common():

    row = []
    nr_texts = cluster.count('-')+1
    row.append(cluster)
    row.append(nr_texts)
    row.append(count)
    rows.append(row)
    
df_clusters = pd.DataFrame(rows,columns=['cluster','nr_texts','count'])

In [8]:
largest_cluster = df_clusters['nr_texts'].max()
print(f'Largest repeated number of texts: {largest_cluster}')
df_clusters.query( f'nr_texts == {largest_cluster}')

for i,row in df_clusters.query( f'nr_texts == {largest_cluster}').iterrows():
    cluster = row['cluster']
    texts = re.split(r'-',cluster)
    for text in texts:
        print(titles[int(text)])
        
    print(f"Repeated {row['count']} times.\n\n")

Largest repeated number of texts: 14
G161 Prayer for a deceased Priest/Bishop
G162 Prayer for a deceased Man
G163 Prayer for a deceased Woman
G164 Prayer for a deceased Person, on the Anniversary of their Death (or Funeral)
G166 Prayer for deceased Benefactors
G165 Prayer for a deceased Person, on several special days after their demise or funeral (especially the 3rd, 7th or 30th day)
G168 Prayer for all deceased Faithful
G167 Prayer for the deceased fellow Brothers (Sisters), Kinsmen and Benefactors
G262 Travel Prayer
G263 Prayer for Peace
P012 Prayer for Troubled People
P013 Prayer for Good Weather
P022 Prayer for Deceased Priests
P023 Prayer for Deceased Women
Repeated 2 times.




# Most frequent clusters

In [9]:
for i, row in df_clusters.sort_values(by='count',ascending=False).iterrows():
    if row['count']>15:
        cluster = row['cluster']
        texts = re.split(r'-',cluster)
        for text in texts:
            print(titles[int(text)])
        
        print(f"Repeated {row['count']} times.\n\n")

G016 Prayer to be recited before the H. Communion (Summe sacerdos et vere pontifex)
G017 Prayer to be recited after the H. Communion
Repeated 23 times.


G017 Prayer to be recited after the H. Communion
G018 Prayer to Jesus to be recited after the H. Communion, usually positioned after G17
Repeated 18 times.


G048 Prayer to be recited after the H. Communion or after Mass
G072 Prayer to be recited before the H. Communion
Repeated 17 times.


G016 Prayer to be recited before the H. Communion (Summe sacerdos et vere pontifex)
P001 Long Hours of the Holy Spirit
Repeated 17 times.


G013a Prayer to be recited before the H. Communion
G017 Prayer to be recited after the H. Communion
Repeated 17 times.


G048 Prayer to be recited after the H. Communion or after Mass
G019 Prayer to be recited before the H. Communion
Repeated 16 times.




## Clusters of four texts

In [10]:
for i,row in df_clusters.query( f'nr_texts == 4').sort_values(by='count',ascending=False).iterrows():
    cluster = row['cluster']
    texts = re.split(r'-',cluster)
    for text in texts:
        try:
            print(titles[int(text)])
        except:
            print('Issue')
        
    print(f"Repeated {row['count']} times.\n\n")

G161 Prayer for a deceased Priest/Bishop
G162 Prayer for a deceased Man
G163 Prayer for a deceased Woman
G164 Prayer for a deceased Person, on the Anniversary of their Death (or Funeral)
Repeated 9 times.


G013a Prayer to be recited before the H. Communion
G013b Prayer to be recited before the H. Communion
G013c Prayer to be recited before the H. Communion
G017 Prayer to be recited after the H. Communion
Repeated 8 times.


G161 Prayer for a deceased Priest/Bishop
G162 Prayer for a deceased Man
G163 Prayer for a deceased Woman
G166 Prayer for deceased Benefactors
Repeated 8 times.


G016 Prayer to be recited before the H. Communion (Summe sacerdos et vere pontifex)
G013a Prayer to be recited before the H. Communion
G017 Prayer to be recited after the H. Communion
G018 Prayer to Jesus to be recited after the H. Communion, usually positioned after G17
Repeated 8 times.


G161 Prayer for a deceased Priest/Bishop
G163 Prayer for a deceased Woman
G164 Prayer for a deceased Person, on the A

P138b Prayer to Guardian Angel with versicle and collect
Repeated 3 times.


G016 Prayer to be recited before the H. Communion (Summe sacerdos et vere pontifex)
G229 Prayer to be recited after the H. Communion
G223 Prayer to be recited before the H. Communion
G068 Prayer to be recited before the H. Communion / Prayer to be recited during the Elevation of the Eucharist
Repeated 3 times.


G016 Prayer to be recited before the H. Communion (Summe sacerdos et vere pontifex)
G212 Prayer to be recited during the H. Communion / Prayer to be recited after the H. Communion
G223 Prayer to be recited before the H. Communion
G068 Prayer to be recited before the H. Communion / Prayer to be recited during the Elevation of the Eucharist
Repeated 3 times.


G016 Prayer to be recited before the H. Communion (Summe sacerdos et vere pontifex)
G212 Prayer to be recited during the H. Communion / Prayer to be recited after the H. Communion
G229 Prayer to be recited after the H. Communion
G068 Prayer to be r

P013 Prayer for Good Weather
Repeated 3 times.


G161 Prayer for a deceased Priest/Bishop
G168 Prayer for all deceased Faithful
G167 Prayer for the deceased fellow Brothers (Sisters), Kinsmen and Benefactors
G263 Prayer for Peace
Repeated 3 times.


G161 Prayer for a deceased Priest/Bishop
G168 Prayer for all deceased Faithful
G167 Prayer for the deceased fellow Brothers (Sisters), Kinsmen and Benefactors
P013 Prayer for Good Weather
Repeated 3 times.


G161 Prayer for a deceased Priest/Bishop
G168 Prayer for all deceased Faithful
G262 Travel Prayer
G263 Prayer for Peace
Repeated 3 times.


G016 Prayer to be recited before the H. Communion (Summe sacerdos et vere pontifex)
G111 Prayer to be recited before (sometimes after) the H. Communion
P445 Morning prayer by St. Bernard
P446 Evening prayer by St. Bernard
Repeated 2 times.


G016 Prayer to be recited before the H. Communion (Summe sacerdos et vere pontifex)
G119 Prayer to be recited after the H. Communion
G019 Prayer to be recited b

P064 Prayer to be recited before the H. Communion
P067 Prayer to be recited during the H. Communion
Repeated 2 times.


G196 Prayer to be recited during Mass
G006 Prayer to Mary (Obsecro te, in prose)
P065 Prayer to be recited before the H. Communion
P066 Prayer to be recited during the H. Communion
Repeated 2 times.


G196 Prayer to be recited during Mass
G006 Prayer to Mary (Obsecro te, in prose)
P065 Prayer to be recited before the H. Communion
P067 Prayer to be recited during the H. Communion
Repeated 2 times.


G196 Prayer to be recited during Mass
G006 Prayer to Mary (Obsecro te, in prose)
P066 Prayer to be recited during the H. Communion
P067 Prayer to be recited during the H. Communion
Repeated 2 times.


G196 Prayer to be recited during Mass
G005 Prayer to Mary (Ave Maria ancilla sancte trinitatis)
P052 Prayer to Guardian Angel
P064 Prayer to be recited before the H. Communion
Repeated 2 times.


G196 Prayer to be recited during Mass
G005 Prayer to Mary (Ave Maria ancilla sanc

G017 Prayer to be recited after the H. Communion
G018 Prayer to Jesus to be recited after the H. Communion, usually positioned after G17
Repeated 2 times.


G064 Prayer to be recited during the Elevation of the Host or during Mass / Passion Prayer
G111 Prayer to be recited before (sometimes after) the H. Communion
G040 Prayer to be recited when entering a graveyard
G062 Elevation prayer / Prayer to be recited during Mass
Repeated 2 times.


G064 Prayer to be recited during the Elevation of the Host or during Mass / Passion Prayer
G111 Prayer to be recited before (sometimes after) the H. Communion
G040 Prayer to be recited when entering a graveyard
G074 Prayer on Christ's Seven Last Words on the Cross
Repeated 2 times.


G064 Prayer to be recited during the Elevation of the Host or during Mass / Passion Prayer
G111 Prayer to be recited before (sometimes after) the H. Communion
G062 Elevation prayer / Prayer to be recited during Mass
G066 Eight Psalm Verses the Devil taught St Bernard
Re

G229 Prayer to be recited after the H. Communion
G072 Prayer to be recited before the H. Communion
G011 Prayer to be recited after (sometimes before) the H. Communion
Repeated 2 times.


G013a Prayer to be recited before the H. Communion
G013b Prayer to be recited before the H. Communion
G013c Prayer to be recited before the H. Communion
P626 Prayer to St. Matthias with versicle and collect
Repeated 2 times.


G013a Prayer to be recited before the H. Communion
G223 Prayer to be recited before the H. Communion
G068 Prayer to be recited before the H. Communion / Prayer to be recited during the Elevation of the Eucharist
G072 Prayer to be recited before the H. Communion
Repeated 2 times.


G013a Prayer to be recited before the H. Communion
G013b Prayer to be recited before the H. Communion
G018 Prayer to Jesus to be recited after the H. Communion, usually positioned after G17
G007 Prayer to Mary and John the Evangelist (O intemerata)
Repeated 2 times.


G013a Prayer to be recited before t

G164 Prayer for a deceased Person, on the Anniversary of their Death (or Funeral)
P013 Prayer for Good Weather
P022 Prayer for Deceased Priests
Repeated 2 times.


G161 Prayer for a deceased Priest/Bishop
G164 Prayer for a deceased Person, on the Anniversary of their Death (or Funeral)
P013 Prayer for Good Weather
P023 Prayer for Deceased Women
Repeated 2 times.


G161 Prayer for a deceased Priest/Bishop
G165 Prayer for a deceased Person, on several special days after their demise or funeral (especially the 3rd, 7th or 30th day)
G167 Prayer for the deceased fellow Brothers (Sisters), Kinsmen and Benefactors
G262 Travel Prayer
Repeated 2 times.


G161 Prayer for a deceased Priest/Bishop
G165 Prayer for a deceased Person, on several special days after their demise or funeral (especially the 3rd, 7th or 30th day)
G167 Prayer for the deceased fellow Brothers (Sisters), Kinsmen and Benefactors
G263 Prayer for Peace
Repeated 2 times.


G161 Prayer for a deceased Priest/Bishop
G165 Prayer for

G164 Prayer for a deceased Person, on the Anniversary of their Death (or Funeral)
G165 Prayer for a deceased Person, on several special days after their demise or funeral (especially the 3rd, 7th or 30th day)
G056 Elevation prayer / Prayer to be recited during Mass
G072 Prayer to be recited before the H. Communion
Repeated 2 times.


G164 Prayer for a deceased Person, on the Anniversary of their Death (or Funeral)
G165 Prayer for a deceased Person, on several special days after their demise or funeral (especially the 3rd, 7th or 30th day)
P012 Prayer for Troubled People
P013 Prayer for Good Weather
Repeated 2 times.


G164 Prayer for a deceased Person, on the Anniversary of their Death (or Funeral)
G166 Prayer for deceased Benefactors
G263 Prayer for Peace
P012 Prayer for Troubled People
Repeated 2 times.


G164 Prayer for a deceased Person, on the Anniversary of their Death (or Funeral)
G166 Prayer for deceased Benefactors
G263 Prayer for Peace
P023 Prayer for Deceased Women
Repeated 

G164 Prayer for a deceased Person, on the Anniversary of their Death (or Funeral)
P012 Prayer for Troubled People
P022 Prayer for Deceased Priests
Repeated 2 times.


G162 Prayer for a deceased Man
G165 Prayer for a deceased Person, on several special days after their demise or funeral (especially the 3rd, 7th or 30th day)
G262 Travel Prayer
P012 Prayer for Troubled People
Repeated 2 times.


G163 Prayer for a deceased Woman
G164 Prayer for a deceased Person, on the Anniversary of their Death (or Funeral)
P012 Prayer for Troubled People
P023 Prayer for Deceased Women
Repeated 2 times.


G163 Prayer for a deceased Woman
G165 Prayer for a deceased Person, on several special days after their demise or funeral (especially the 3rd, 7th or 30th day)
G262 Travel Prayer
P012 Prayer for Troubled People
Repeated 2 times.


G163 Prayer for a deceased Woman
G165 Prayer for a deceased Person, on several special days after their demise or funeral (especially the 3rd, 7th or 30th day)
G262 Travel Pra

## Clusters of three texts

In [11]:
for i,row in df_clusters.query( f'nr_texts == 3').sort_values(by='count',ascending=False).iterrows():
    cluster = row['cluster']
    texts = re.split(r'-',cluster)
    for text in texts:
        print(titles[int(text)])
        
    print(f"Repeated {row['count']} times.\n\n")

G048 Prayer to be recited after the H. Communion or after Mass
G019 Prayer to be recited before the H. Communion
G072 Prayer to be recited before the H. Communion
Repeated 13 times.


G013a Prayer to be recited before the H. Communion
G017 Prayer to be recited after the H. Communion
G018 Prayer to Jesus to be recited after the H. Communion, usually positioned after G17
Repeated 12 times.


G016 Prayer to be recited before the H. Communion (Summe sacerdos et vere pontifex)
G017 Prayer to be recited after the H. Communion
G018 Prayer to Jesus to be recited after the H. Communion, usually positioned after G17
Repeated 12 times.


G161 Prayer for a deceased Priest/Bishop
G162 Prayer for a deceased Man
G163 Prayer for a deceased Woman
Repeated 11 times.


G162 Prayer for a deceased Man
G163 Prayer for a deceased Woman
G164 Prayer for a deceased Person, on the Anniversary of their Death (or Funeral)
Repeated 11 times.


G016 Prayer to be recited before the H. Communion (Summe sacerdos et ver

G016 Prayer to be recited before the H. Communion (Summe sacerdos et vere pontifex)
G105 Prayer to Mary in the Sun with Indulgence Pope Sixtus IV
G228 Prayer to Mary for a Good Death
Repeated 3 times.


P032b Prayer to Guardian Angel
G055 Elevation prayer
P160 Prayer to Any Apostle (name can be filled in)
Repeated 3 times.


P032b Prayer to Guardian Angel
G055 Elevation prayer
P110 Prayer to Eleven Thousand Virgins
Repeated 3 times.


G016 Prayer to be recited before the H. Communion (Summe sacerdos et vere pontifex)
G105 Prayer to Mary in the Sun with Indulgence Pope Sixtus IV
G111 Prayer to be recited before (sometimes after) the H. Communion
Repeated 3 times.


G016 Prayer to be recited before the H. Communion (Summe sacerdos et vere pontifex)
G212 Prayer to be recited during the H. Communion / Prayer to be recited after the H. Communion
G223 Prayer to be recited before the H. Communion
Repeated 3 times.


G016 Prayer to be recited before the H. Communion (Summe sacerdos et vere pon

G054 Hail Mary
G056 Elevation prayer / Prayer to be recited during Mass
Repeated 3 times.


G064bis Short prayer to Christ's wounds (collect, with reference to the Holy Sacrament)
G018 Prayer to Jesus to be recited after the H. Communion, usually positioned after G17
G056 Elevation prayer / Prayer to be recited during Mass
Repeated 3 times.


G064bis Short prayer to Christ's wounds (collect, with reference to the Holy Sacrament)
G017 Prayer to be recited after the H. Communion
G223 Prayer to be recited before the H. Communion
Repeated 3 times.


G064bis Short prayer to Christ's wounds (collect, with reference to the Holy Sacrament)
G165 Prayer for a deceased Person, on several special days after their demise or funeral (especially the 3rd, 7th or 30th day)
G056 Elevation prayer / Prayer to be recited during Mass
Repeated 3 times.


G064bis Short prayer to Christ's wounds (collect, with reference to the Holy Sacrament)
G163 Prayer for a deceased Woman
G056 Elevation prayer / Prayer to b

ValueError: invalid literal for int() with base 10: '134925.1'

## All clusters

In [15]:
titles

{125586: 'Germ. Oct. 7',
 164: 'G151 Hours of the Virgin',
 695: 'G154 Hours of the Eternal Wisdom',
 675: 'G152 Hours of the Holy Spirit',
 583: 'G155a Long Hours of the Holy Cross with prologue A',
 466: 'G156 Penitential Psalms version A',
 521: 'G158 Long Office of the Dead',
 517: 'G157 Litany of All Saints',
 125567: 'ANTWERPEN, MPM : M. 14.19',
 125656: 'G155b Long Hours of the Holy Cross with prologue B',
 125188: 'P001 Long Hours of the Holy Spirit',
 168: 'G016 Prayer to be recited before the H. Communion (Summe sacerdos et vere pontifex)',
 125393: 'HOORN, WA : Boekerij der gemeente I 45 Oct.',
 125147: 'None Computus table',
 127844: 'G072 Prayer to be recited before the H. Communion',
 127550: 'G019 Prayer to be recited before the H. Communion',
 125687: 'G048 Prayer to be recited after the H. Communion or after Mass',
 127471: 'G119 Prayer to be recited after the H. Communion',
 130068: 'P169 Hundred Articles of the Passion (Noord-Nederlandse bewerking - Hundert Betrachtu

In [30]:
out = open('clusters.txt','w',encoding='utf-8')

for nr in tqdm(range(8,1,-1)):
    
    out.write(f'Clusters of {nr} texts\n')
    
    df_filtered = df_clusters.query( f'nr_texts == {nr}')
    df_filtered = df_filtered.query( f'count > 2')
    out.write(f'{df_filtered.shape[0]} in total\n\n')

    line = 0
    for i,row in df_filtered.sort_values(by='count',ascending=False).iterrows():
        line += 1
        cluster = row['cluster']
        texts = re.split(r'-',cluster)
        out.write(f'-- Cluster of {nr} texts: number {line}/{df_filtered.shape[0]} --\n')
        for text in texts:
            out.write('\n'+titles[int(float(text))])

        out.write(f"\nRepeated {row['count']} times.\n\n")



100%|█████████████████████████████████████████████| 7/7 [00:00<00:00, 43.25it/s]
