In [1]:
## Choose 'print' or 'manuscript'.
## The value selects the input file f'{book_type}_books.json' and is also
## embedded in the name of the output file f'all_clusters_{book_type}.csv'.
book_type = 'manuscript'

In [2]:
import csv
import itertools
import json
import math
from collections import Counter
from itertools import combinations

import pandas as pd
from tqdm import tqdm

from prayer_leiden import *


In [3]:
## All books
# Read the corpus description for the selected book type. The context manager
# guarantees the file handle is closed, and the explicit encoding makes the
# decoding independent of the platform's default locale.
with open(f'{book_type}_books.json', encoding='utf-8') as json_file:
    json_data = json.load(json_file)

titles = json_data['titles']
# book_texts is iterated by book and indexed as book_texts[book] to obtain the
# texts of that book in the cells below.
book_texts = json_data['book_texts']

## Frequencies of texts

In [4]:
# Placeholder; the actual value is assigned once the corpus has been selected.
total_nr_texts = 0

# Tally how often each text appears across all books.
text_freq = Counter()
for texts_in_book in book_texts.values():
    text_freq.update(texts_in_book)

print(f"{len(book_texts)} books")
print(f"{len(text_freq)} unique texts")

272 books
1459 unique texts


In [5]:
print(f"{sum(text_freq.values())} witnesses/expressions in total")

# The corpus consists of the texts attested more than once, ordered from the
# most to the least frequent.
corpus = [text for text, count in text_freq.most_common() if count > 1]

total_nr_texts = len(corpus)
print(f"{total_nr_texts} of texts which have a frequency of 2 or more")

4085 witnesses/expressions in total
512 of texts which have a frequency of 2 or more


In [6]:
# Every text that did not make it into the corpus has a frequency of exactly 1.
nr_singletons = len(text_freq) - total_nr_texts
print(f'{nr_singletons} texts occur only once.')

947 texts occur only once.


## Co-occurrence matrix

In [7]:

# Build a square table, indexed by corpus text on both axes, holding for every
# pair of texts the number of books that contain both, divided by
# total_nr_texts.
# NOTE(review): the denominator is the number of corpus texts, not the number
# of books -- confirm that this normalisation is intended.

# Sets give constant-time membership tests; the texts are already used as
# Counter keys, so they are hashable.
book_text_sets = [set(texts) for texts in book_texts.values()]

rows = []

for text in tqdm(corpus):
    # Only books containing `text` can contribute to this row, so select them
    # once instead of re-testing every book for every text2.
    books_with_text = [book_set for book_set in book_text_sets if text in book_set]

    row = [text]
    for text2 in corpus:
        nr_cooccurrences = sum(1 for book_set in books_with_text if text2 in book_set)
        row.append(nr_cooccurrences/total_nr_texts)
    rows.append(row)

columns = ['text']
columns.extend(corpus)

joint_probability = pd.DataFrame(rows, columns=columns)
joint_probability = joint_probability.set_index('text')


100%|█████████████████████████████████████████| 512/512 [00:22<00:00, 22.94it/s]


## Text probabilities

In [8]:
# Marginal score of each corpus text: its frequency divided by total_nr_texts.
probability = {text: text_freq[text]/total_nr_texts for text in corpus}

## Pointwise Mutual Information

In [9]:
# Association score for every pair of corpus texts.
# The joint probability is raised to the power 1.7, divided by the product of
# the two marginal probabilities, and the base-2 logarithm of that ratio is
# stored. Pairs whose ratio is 0 (the logarithm is undefined there) get 0.
rows = []

for text1 in tqdm(corpus):
    row = [text1]

    # Look the row up once per text1 rather than once per pair.
    jp_row = joint_probability.loc[text1]
    p_text1 = probability[text1]

    for text2 in corpus:
        jp_texts = math.pow(jp_row[text2], 1.7)
        ratio = jp_texts/(p_text1*probability[text2])

        # Fix: previously `pmi` was unconditionally overwritten with the raw
        # ratio right after this branch, which made the logarithm dead code and
        # left pmi_df holding ratios (positive for any co-occurring pair)
        # instead of PMI values.
        pmi = math.log2(ratio) if ratio > 0 else 0

        row.append(pmi)
    rows.append(row)

columns = ['text']
columns.extend(corpus)

pmi_df = pd.DataFrame(rows,columns=columns)
pmi_df = pmi_df.set_index('text')
        

100%|█████████████████████████████████████████| 512/512 [00:11<00:00, 43.89it/s]


In [None]:
# Upper bound on the number of associated texts considered per corpus text.
max_cluster_length = 15

all_clusters = []

for text in tqdm(corpus):
    # Scores of every other text relative to `text`, strongest first.
    ranked_scores = pmi_df.loc[text].sort_values(ascending=False).to_dict()

    # Keep the texts with a strictly positive score, capped at the maximum
    # cluster length (slicing a shorter list leaves it unchanged).
    texts = [candidate for candidate, score in ranked_scores.items() if score > 0]
    texts = texts[:max_cluster_length]

    # Collect the combinations of every size from len(texts) down to 3,
    # de-duplicating the accumulated list after each size.
    # (A remove_subsets pass after each step is currently disabled.)
    for nr_texts in range(len(texts), 2, -1):
        clusters = all_combinations(texts, nr_texts)
        all_clusters.extend(clusters)
        all_clusters = deduplicate_list(all_clusters)


 41%|████████████████▏                      | 212/512 [31:20<1:29:23, 17.88s/it]

In [None]:
# Display the number of candidate clusters collected above.
len(all_clusters)

In [None]:
#all_clusters2 = remove_subsets(all_clusters)

In [None]:
# Write every cluster that is found in at least two books to a CSV file with
# the columns cluster, length and frequency.
# csv.writer quotes fields when necessary, so identifiers that contain commas
# or quotes can no longer corrupt the column layout. newline='' is what the csv
# module expects for files it writes; lineterminator='\n' keeps the rows
# terminated the way the hand-written output was.
with open(f'all_clusters_{book_type}.csv', 'w', encoding='utf-8', newline='') as out:
    writer = csv.writer(out, lineterminator='\n')
    writer.writerow(['cluster', 'length', 'frequency'])

    for cluster in tqdm(all_clusters):
        books = find_books_containing_cluster(cluster, book_texts)
        frequency = len(books)
        if frequency >= 2:
            # Members of a cluster are joined with single spaces in one field.
            cluster_label = ' '.join(str(x) for x in cluster)
            writer.writerow([cluster_label, len(cluster), frequency])
