In [1]:
import seqsim
import pandas as pd
import json
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
import re
from prayer_leiden import *

In [2]:
# Load the book collection for the selected book type from
# '<book_type>_books.json' in the current working directory.
book_type = 'manuscript'

# Use a context manager so the file handle is closed as soon as the JSON has
# been parsed (the previous bare open() left it open for the kernel's lifetime).
with open(f'{book_type}_books.json') as json_file:
    json_data = json.load(json_file)

# The two top-level entries of the JSON document used by the rest of the notebook.
titles = json_data['titles']
book_texts = json_data['book_texts']

# Identifiers of all books, in the order in which they appear in `book_texts`.
books = list(book_texts)

Create a DataFrame with the normalised Levenshtein edit distance between every pair of books

In [3]:
# Header: a leading 'title' column (holds the row's book id) followed by one
# column per book id.
columns = ['title', *books]

# One row per book: [book id, distance to book 1, distance to book 2, ...].
rows = []
for book in tqdm(book_texts):
    own_texts = book_texts[book]
    row = [book]
    for other_book in book_texts:
        other_texts = book_texts[other_book]
        if len(own_texts) <= 5 or len(other_texts) <= 5:
            # At least one of the two books has 5 items or fewer: no distance
            # is computed and the pair gets the fixed value 1. This also
            # applies when a short book is compared with itself.
            distance = 1
        else:
            distance = seqsim.edit.levenshtein_dist(own_texts, other_texts, normal=True)
        row.append(distance)
    rows.append(row)

# Square table indexed by book id ('title') with one column per book id.
df = pd.DataFrame(rows, columns=columns).set_index('title')

100%|█████████████████████████████████████████| 272/272 [00:09<00:00, 28.85it/s]


In [None]:

# For every book, list the other books whose distance is below 0.5 and print
# the texts the two books have in common.
for book in books:
    # Distances from `book` to every book, smallest first.
    ranked = df.loc[book].sort_values().to_dict()
    for other_book, distance in ranked.items():
        # Skip the book itself, pairs that are not below 0.5, and books with
        # 4 items or fewer.
        if book == other_book or not distance < 0.5 or len(book_texts[book]) <= 4:
            continue
        print(distance)
        print([book,other_book])
        # `intersection` is not defined in this notebook; presumably it comes
        # from the `prayer_leiden` star import and returns the items shared by
        # both sequences -- TODO confirm.
        cooccurring = intersection(book_texts[book],book_texts[other_book])
        print(f'"{titles[book]}" and "{titles[other_book]}" share {len(cooccurring)} texts.')
        for text in cooccurring:
            print(f'- {text}: {titles[str(text)]}')
            
    

In [None]:
import plotly.express as px

# Interactive heatmap of the full distance table `df`. The colour scale runs
# from '#7e0327' at the low end to '#f2f5f3' at the high end. Tick labels are
# hidden on both axes and the figure is drawn at 1500 x 1500.
fig = (
    px.imshow(df, color_continuous_scale=['#7e0327', '#f2f5f3'])
    .update_xaxes(showticklabels=False)
    .update_yaxes(showticklabels=False)
    .update_layout(width=1500, height=1500)
)
fig.show()


In [None]:
# Collect every ordered pair of distinct books whose distance is at most 0.5.
# Keys are (column label, index label) tuples taken from `df`; values are the
# corresponding distances.
# NOTE(review): this cell uses `<= 0.5`, while the cell that prints similar
# books uses `< 0.5` -- confirm which cut-off is intended.
edges = {
    (book, other_book): distance
    for book in df.columns
    for other_book, distance in df[book].items()
    if distance <= 0.5 and other_book != book
}
            

In [None]:
# Build a square table restricted to the books that take part in at least one
# edge (see `edges`).
#
# NOTE: this rebinds the notebook-level name `books` to that subset, replacing
# the list of all book ids assigned when the JSON file was loaded.
# Sorting gives the rows/columns a stable order; a bare list(set(...)) can come
# out in a different order on every kernel start.
books = sorted({book for edge in edges for book in edge})

rows = []
for book in books:
    row = [book]
    for book2 in books:
        if book == book2:
            # Diagonal: a book compared with itself.
            # (Previously this compared `book` with the whole `books` list,
            # which is never true, so the diagonal was filled with 1.)
            row.append(0)
        else:
            # Pairs without an edge get the fallback value 1.
            row.append(edges.get((book, book2), 1))
    rows.append(row)

# Header: a leading 'book' column (row label) followed by one column per book.
columns = ['book', *books]

df_mf = pd.DataFrame(rows, columns=columns).set_index('book')

## Heatmap of the most similar books

In [None]:
# Static heatmap of `df_mf` on a 40 x 40 inch canvas, using the reversed
# 'Blues' colour map.
fig, ax = plt.subplots(figsize=(40, 40))
sns.heatmap(df_mf, cmap='Blues_r', ax=ax)
