In [None]:
import sys
if "../" not in sys.path:
    sys.path.append("../")

import glob
import re

import matplotlib.pyplot as plt
import pandas as pd
from numpy import arange

In [None]:
# Source workbooks per catalogue system: Aleph exports are .xlsm files,
# IAMS exports are .xlsx files carrying a "_v2" suffix.
xls_source = {
    "aleph": [
        f"{stem}.xlsm"
        for stem in (
            "epBooksPre1700",
            "epBooks1700s",
            "epBooks1800s_1",
            "epBooks1800s_2",
            "epBooks1800s_3",
            "epBooks1800s_4",
            "Maps",
            "Music",
        )
    ],
    "iams": [
        f"{stem}_v2.xlsx"
        for stem in (
            "India Office",
            "Map Collections",
            "Music Collections",
            "Oriental Manuscripts",
            "Philatelic Collections",
            "Printed Collections",
            "Qatar",
            "Sound Archive",
            "Visual Arts",
            "Western Manuscripts",
        )
    ],
}

### Input stats

In [None]:
# Count the rows of each interim Aleph CSV (named after its source workbook),
# loading only the first column to keep the read cheap.
shapes = {"aleph": {}, "iams": {}}
for f in xls_source["aleph"]:
    stem = f[:-5]  # drop the five-character ".xlsm" extension
    # Forward slashes resolve on Windows as well as POSIX; the previous
    # backslash-separated path could only be opened on Windows.
    df = pd.read_csv(f"../data/interim/{stem}.csv", usecols=[0], encoding="utf8")
    shapes["aleph"][stem] = len(df)

In [None]:
# Fold the four epBooks1800s_N parts into a single "epBooks1800s" entry.
# The guard makes re-running this cell a no-op: previously a second run raised
# KeyError because the part entries had already been deleted.
parts_1800s = [f"epBooks1800s_{x}" for x in range(1, 5)]
if all(part in shapes["aleph"] for part in parts_1800s):
    # pop() reads and removes each part in one step; the total is stored last,
    # so it ends up at the end of the dict exactly as before.
    shapes["aleph"]["epBooks1800s"] = sum(shapes["aleph"].pop(part) for part in parts_1800s)

In [None]:
# Show the per-file record counts collected so far.
shapes

In [None]:
# Count the rows of each interim IAMS CSV, loading only the first column.
for f in xls_source["iams"]:
    stem = f[:-5]  # drop the five-character ".xlsx" extension
    # Forward slashes resolve on Windows as well as POSIX; the previous
    # backslash-separated path could only be opened on Windows.
    df = pd.read_csv(f"../data/interim/{stem}.csv", usecols=[0], encoding="utf8")
    shapes["iams"][stem] = len(df)

In [None]:
# One row per set with its record count, tagged with the originating system.
aleph_shapes = pd.DataFrame(shapes["aleph"], index=["n_records"]).T.assign(source="aleph")
iams_shapes = pd.DataFrame(shapes["iams"], index=["n_records"]).T.assign(source="iams")

In [None]:
# Stack the Aleph rows on top of the IAMS rows and display the result.
per_source_frames = [aleph_shapes, iams_shapes]
n_records_df = pd.concat(per_source_frames)
n_records_df

In [None]:
# n_records_df.to_csv("..\\data\\processed\\set_statistics.csv")

### Visualising Results

In [None]:
# Reload the per-set record counts from disk (the same file the commented-out
# to_csv call above would write); the first CSV column holds the set names.
# Forward slashes keep the path usable on POSIX as well as Windows.
n_records_df = pd.read_csv("../data/processed/set_statistics.csv", index_col=0)

In [None]:
# Pair every *_matches.csv path with its bare file name so the filters below
# look at the name only, never at the directory prefix. The pattern uses
# forward slashes (valid on Windows and POSIX); splitting on either separator
# copes with whatever glob returns on the current platform.
match_files = [
    (path, re.split(r"[\\/]", path)[-1])
    for path in glob.glob("../data/processed/*_matches.csv")
]

# Aleph files: no "v2" marker and exactly one underscore in the name.
aleph_matches = [
    path for path, name in match_files
    if "v2" not in name and len(name.split("_")) == 2  # exclude 1800s_[1234]_matches
]
# IAMS files carry the "v2" marker in their name.
iams_matches = [path for path, name in match_files if "v2" in name]

In [None]:
# Show the Aleph match-file paths selected above.
aleph_matches

In [None]:
# Load every Aleph match file and tag its rows with the set name, i.e. the
# file name up to its first underscore.
aleph_csvs = []
for f in aleph_matches:
    df = pd.read_csv(f, encoding="utf8")
    # Split on either path separator: splitting on "\\" alone left the whole
    # directory prefix in the label whenever the path used forward slashes.
    df["set"] = re.split(r"[\\/]", f)[-1].split("_")[0]
    aleph_csvs.append(df)

# NOTE: this rebinds aleph_matches from the list of paths to the combined
# DataFrame, so the globbing cell must be re-run before re-running this one.
aleph_matches = pd.concat(aleph_csvs)

In [None]:
# Show the IAMS match-file paths selected above.
iams_matches

In [None]:
# Load every IAMS match file and tag its rows with the set name: the file
# name up to its first underscore, with "_v2" appended again.
# NOTE(review): presumably this mirrors the "<name>_v2" index labels of
# n_records_df - confirm against set_statistics.csv.
iams_csvs = []
for f in iams_matches:
    df = pd.read_csv(f, encoding="utf8")
    # Split on either path separator: splitting on "\\" alone left the whole
    # directory prefix in the label whenever the path used forward slashes.
    df["set"] = re.split(r"[\\/]", f)[-1].split("_")[0] + "_v2"
    iams_csvs.append(df)

# NOTE: this rebinds iams_matches from the list of paths to the combined
# DataFrame, so the globbing cell must be re-run before re-running this one.
iams_matches = pd.concat(iams_csvs)

In [None]:
# (rows, columns) of the concatenated Aleph matches.
aleph_matches.shape

In [None]:
# (rows, columns) of the concatenated IAMS matches.
iams_matches.shape

In [None]:
# Load the external lexicon CSV. Forward slashes keep the path usable on
# POSIX as well as Windows (the backslash form only resolved on Windows).
lexicon_df = pd.read_csv("../data/external/bl_lexicon_plural.csv")

In [None]:
# Share of all Aleph records held by each set, in ascending order.
aleph_record_counts = n_records_df.query("source == 'aleph'")["n_records"]
norm_aleph_rec = aleph_record_counts.sort_values() / aleph_record_counts.sum()

# Share of all Aleph match rows contributed by each set, in ascending order.
aleph_rows_per_set = aleph_matches.groupby(by="set")["set"].count()
norm_aleph_match = (aleph_rows_per_set.sort_values() / len(aleph_matches)).rename("matches")

# Put both shares side by side, aligned on the set name.
norm_aleph = pd.concat([norm_aleph_rec, norm_aleph_match], axis=1)
norm_aleph.columns = ["Fraction of records", "Fraction of audit results"]

In [None]:
# Grouped bar chart: for every Aleph set, its share of records next to its
# share of audit results.
x = arange(len(norm_aleph))  # the label locations
width = 0.3  # the width of the bars

aleph_num_fig, aleph_num_ax = plt.subplots(layout='constrained')

for position, col in enumerate(norm_aleph.columns):
    # Each column's bars are shifted right by one bar width per column.
    rects = aleph_num_ax.bar(x=x + width * position, height=norm_aleph[col], width=width, label=col)
    aleph_num_ax.bar_label(rects, padding=3, fmt=lambda value: f"{value:.2}")

# Axis text, centred tick labels, legend and a fixed 0-1 y range.
aleph_num_ax.set_ylabel('Normalised fraction')
aleph_num_ax.set_title('Fraction of results and records in each set')
aleph_num_ax.set_xticks(x + width/2, norm_aleph.index)
aleph_num_ax.legend(loc='upper left', ncols=2)
aleph_num_ax.tick_params(axis="x", rotation=45)
aleph_num_ax.set_ylim(0, 1)

In [None]:
# Share of all IAMS records held by each set, in ascending order.
iams_record_counts = n_records_df.query("source == 'iams'")["n_records"]
norm_iams_rec = iams_record_counts.sort_values() / iams_record_counts.sum()

# Share of all IAMS match rows contributed by each set, in ascending order.
iams_rows_per_set = iams_matches.groupby(by="set")["set"].count()
norm_iams_match = (iams_rows_per_set.sort_values() / len(iams_matches)).rename("matches")

# Put both shares side by side, aligned on the set name.
norm_iams = pd.concat([norm_iams_rec, norm_iams_match], axis=1)
norm_iams.columns = ["Fraction of records", "Fraction of audit results"]

In [None]:
# Grouped bar chart: for every IAMS set, its share of records next to its
# share of audit results, on a larger canvas with larger fonts.
iams_x = arange(len(norm_iams))  # the label locations
iams_width = 0.35  # the width of the bars

iams_num_fig, iams_num_ax = plt.subplots(layout='constrained', figsize=(12,8))

for position, col in enumerate(norm_iams.columns):
    # Each column's bars are shifted right by one bar width per column.
    rects = iams_num_ax.bar(x=iams_x + iams_width * position, height=norm_iams[col], width=iams_width, label=col)
    iams_num_ax.bar_label(rects, padding=3, fmt=lambda value: f"{value:.2}")

# Axis text, centred tick labels, legend and a fixed 0-0.5 y range.
iams_num_ax.set_ylabel('Normalised fraction', fontsize="xx-large")
iams_num_ax.set_title('Fraction of results and records in each set', fontsize="xx-large")
iams_num_ax.set_xticks(iams_x + iams_width/2, norm_iams.index, ha="right", fontsize="x-large")
iams_num_ax.legend(loc='upper left', ncols=2, fontsize="x-large")
iams_num_ax.tick_params(axis="x", rotation=45, labelsize="x-large")
iams_num_ax.tick_params(axis="y", labelsize="x-large")
iams_num_ax.set_ylim(0, 0.5)

In [None]:
# iams_num_fig.savefig("..\\reports\\figures\\iams_records_matches.png", dpi=300, bbox_inches="tight")
# aleph_num_fig.savefig("..\\reports\\figures\\aleph_records_matches.png", dpi=300, bbox_inches="tight")

In [None]:
# (Aleph rows, IAMS rows, both combined)
n_aleph_rows = len(aleph_matches)
n_iams_rows = len(iams_matches)
total_matches = (n_aleph_rows, n_iams_rows, n_aleph_rows + n_iams_rows)
total_matches

In [None]:
# total_matches is (aleph, iams, aleph + iams): its last element already is
# the grand total. Summing the whole tuple counted every match twice.
print(f"Total matches: {total_matches[-1]}")

In [None]:
# Number of non-null "Occurences" values for every (set, Term) pair.
aleph_term_counts = aleph_matches.groupby(by=["set", "Term"])["Occurences"].count()
iams_term_counts = iams_matches.groupby(by=["set", "Term"])["Occurences"].count()

In [None]:
# Show the per-set, per-term counts for Aleph.
aleph_term_counts

In [None]:
def plot_bar(group, source):
    """Plot and save a bar chart of the most frequent terms in one set.

    Terms are kept when their count exceeds 1% of the group's total; if fewer
    than five terms qualify, the threshold is relaxed to 0.5%. The x tick
    labels are blanked so the saved figure does not reveal the terms.

    Parameters
    ----------
    group : pandas.Series
        Counts with a two-level index: level 0 is read as the set name (taken
        from the first row) and level 1 supplies the bar positions (the terms).
    source : str
        Prefix of the output folder; the figure is written to
        ``../reports/figures/{source}_term_counts/{set}_redacted.png``.

    Returns
    -------
    None
    """
    import os  # local import: used only to make sure the output folder exists

    ranked = group.sort_values(ascending=False)
    pct = 100
    shown = ranked[ranked > (ranked.sum() / pct)]
    if len(shown) < 5:
        # Too few bars at the 1% cut-off: halve the threshold.
        pct = 200
        shown = ranked[ranked > (ranked.sum() / pct)]

    fig, ax = plt.subplots(figsize=(8, 4))
    ax.bar(x=shown.index.get_level_values(1), height=shown.values, label=f"Terms representing >{100/pct}% of all matches")
    ax.tick_params(axis="x", rotation=45)
    set_name = group.index.get_level_values(0)[0]
    ax.set_title(set_name)
    ax.set_ylabel("Count")
    # Keep the tick positions but replace every label with an empty string.
    xlabs, xticks = ax.get_xticklabels(), ax.get_xticks()
    ax.set_xticks(xticks, ["" for _ in xlabs])
    ax.set_xlabel("Audit Terms (Redacted)")
    ax.legend()

    # savefig does not create missing folders, so make sure the target exists.
    out_dir = f"../reports/figures/{source}_term_counts"
    os.makedirs(out_dir, exist_ok=True)
    fig.savefig(f"{out_dir}/{set_name}_redacted.png", dpi=300, bbox_inches="tight")

In [None]:
# Call plot_bar once per set (index level 0); each call saves one figure
# under the "aleph" term-count folder.
aleph_term_counts.groupby(level=0).apply(plot_bar, source="aleph")

In [None]:
# Call plot_bar once per set (index level 0); each call saves one figure
# under the "iams" term-count folder.
iams_term_counts.groupby(level=0).apply(plot_bar, source="iams")

In [None]:
# Load the matches table. Forward slashes keep the path usable on POSIX as
# well as Windows (the backslash form only resolved on Windows).
results = pd.read_csv("../data/processed/matches.csv")

In [None]:
# Show the table read from matches.csv.
results

In [None]:
# Bar chart of the number of result rows per category.
rows_per_category = results.groupby(by="Category")["Category"].count()
rows_per_category.plot.bar()

In [None]:
# Bar chart of non-null "Category" entries per term, most frequent first.
rows_per_term = results.groupby(by="Term")["Category"].count()
rows_per_term.sort_values(ascending=False).plot.bar()