# Visualize matches

This should probably be split out into its own file, but it lives here for development purposes.

## Setup

In [1]:
import pandas as pd 
import re
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import plotly.express as px
import plotly.io as pio



`pd.set_option('display.max_rows', None)` removes the limit on the number of rows displayed when a pandas DataFrame or Series is printed. When this option is set to an integer instead (as in `pd.set_option('display.max_rows', 10)`, used in the cell below), only the specified number of rows is displayed. This is useful for controlling the output length, especially when working with large DataFrames.

In [2]:
# Cap printed DataFrames/Series at 10 rows to keep cell output short.
# Pass None instead of 10 to display every row.
pd.set_option('display.max_rows', 10)

### Load Terms List

In [3]:
term_list_file = 'terms_all.txt'

# Read one search term per line.
# - The encoding is pinned (matching the CSV load in this notebook) so that
#   non-ASCII terms are decoded the same way on every platform.
# - Blank lines are dropped: an empty term would be compiled into the regex
#   r'\b\b', which matches at every word boundary and floods the results.
with open(term_list_file, 'r', encoding='utf-8') as f:
    terms = [stripped for stripped in (line.strip() for line in f) if stripped]

### Load Finding Aid Data

When running in a separate notebook, you will need to load the data.
In this case, the "matched terms" results from the `match_terms.ipynb` notebook will suffice:

In [4]:
# Load the previously exported match results for the Bentley repository.
# NOTE(review): match_and_visualize() writes to 'matched_results_' + name + '.csv',
# which for name='Bentley' is this same path — confirm that overwriting the
# input file on every run is intended.
eads_df = pd.read_csv('matched_results_Bentley.csv', encoding='utf-8')

### Load Functions

In [5]:
# match term

def match_terms(row, terms, columns):
    """Find whole-word, case-insensitive occurrences of terms in a row's columns.

    Every selected column value is split on newlines into paragraphs; each
    paragraph containing a term produces one result record.

    Parameters
    ----------
    row : mapping-like (e.g. pandas Series or dict)
        Must provide 'ead_id' and 'resource_id'; 'titleproper' is read with
        ``row.get`` and defaults to None when absent.
    terms : iterable of str
        Terms to search for. Empty strings are skipped, because the pattern
        r'\\b\\b' would match at every word boundary.
    columns : iterable
        Keys of ``row`` whose values are searched. Float values (e.g. NaN for
        missing cells) are skipped; other non-string values are converted
        with ``str()`` before searching.

    Returns
    -------
    list of dict
        One dict per (term, column, paragraph) hit with the keys 'ead_id',
        'resource_id', 'titleproper', 'Term', 'Matched_Times', 'Matched_From'
        and 'Matched_Paragraph'.
    """
    results = []
    for term in terms:
        if not term:
            continue
        # Compile once per term instead of rebuilding the regex for every paragraph
        pattern = re.compile(r'\b' + re.escape(term) + r'\b', re.IGNORECASE)
        for col in columns:
            value = row[col]
            if isinstance(value, float):
                continue
            # Non-string cells (e.g. integers) are searched via their string form
            text = value if isinstance(value, str) else str(value)
            # split the column into paragraphs
            for paragraph in text.split('\n'):
                hits = pattern.findall(paragraph)
                if hits:
                    results.append({
                        'ead_id': row['ead_id'],
#                        'source_filename': row['source_filename'],
                        'resource_id': row['resource_id'],
                        'titleproper': row.get('titleproper', None),
                        'Term': term,
                        'Matched_Times': len(hits),
                        'Matched_From': col,
                        'Matched_Paragraph': paragraph
                    })
    return results

In [6]:
def match_and_visualize(df, name):
    """Match the notebook-level ``terms`` against every row of ``df``.

    All columns of ``df`` are searched via ``match_terms``. The combined hits
    are displayed in the notebook, written to 'matched_results_<name>.csv'
    (row index included), and returned as a DataFrame.
    """
    # Gather the per-row match records into one flat list
    records = []
    for _, row in df.iterrows():
        records.extend(match_terms(row, terms, df.columns))
    results_df = pd.DataFrame(records)

    # Show matched results (truncated according to pandas display options)
    print('Matched results for', name)
    display(results_df)

    # Export to CSV
    output_path = 'matched_results_' + name + '.csv'
    results_df.to_csv(output_path, index=True)

    # Hand the DataFrame back for later use
    return results_df

In [10]:
def calculate_term_frequency(df, df_name):
    """Total the 'Matched_Times' counts per 'Term' and label them with ``df_name``.

    Returns a DataFrame with the columns 'Term', 'Total_Frequency' and
    'DataFrame' (holding ``df_name`` on every row), sorted from the most to the
    least frequent term. The table is also printed/displayed in the notebook.
    """
    term_frequency = (
        df.groupby('Term')['Matched_Times']
        .sum()
        .reset_index()
        .rename(columns={'Matched_Times': 'Total_Frequency'})
        .assign(DataFrame=df_name)
        # Most frequent terms first
        .sort_values(by='Total_Frequency', ascending=False)
    )

    # Show frequency table
    print('Term frequency for', df_name)
    display(term_frequency)

    return term_frequency

## Visualization values

Use the below to set global values, such as the names of repositories
under analysis, or fonts and colors for charts. 

In [7]:
# Repository names to visualize; each name is used as a key into
# matched_results and into the colors mapping below.
repo_list = ['Bentley']
# Multi-repository example:
#repo_list = ['Bentley','Clements','SCRC']
# Font family applied to chart layouts
global_font_info = {'font_family':'Georgia'}
# Bar color (hex) per repository name
colors = {'Bentley': '#CFC096', 'Clements': '#A5A508', 'SCRC': '#FFCB05'}

Now, tally up term matches!

In [8]:
# Manifest of (dataframe, repository name) pairs selecting which result lists
# to visualize.
# NOTE(review): eads_df is read from 'matched_results_Bentley.csv', so it
# presumably already holds match results. match_and_visualize searches every
# column of the frame, which means the terms are matched again against the
# earlier 'Term' and 'Matched_Paragraph' columns — confirm this re-match is
# intended rather than using the loaded results directly.
match_groups = [(eads_df, 'Bentley')]
# Maps repository name -> DataFrame returned by match_and_visualize
matched_results = {name: match_and_visualize(df, name) for df, name in match_groups}

Matched results for Bentley


Unnamed: 0,ead_id,resource_id,titleproper,Term,Matched_Times,Matched_From,Matched_Paragraph
0,umich-bhl-0336,1051,"Grant Kohn Goodman papers, 1943-1995",Death,1,Term,Death
1,umich-bhl-0336,1051,"Grant Kohn Goodman papers, 1943-1995",Death,1,Matched_Paragraph,Death
2,umich-bhl-0336,1051,"Grant Kohn Goodman papers, 1943-1995",Death,1,Term,Death
3,umich-bhl-0336,1051,"Grant Kohn Goodman papers, 1943-1995",Death,1,Matched_Paragraph,Goodman worked as a professor of Japanese hist...
4,umich-bhl-0375,2620,Museum of Anthropological Archaeology (Univers...,Native,1,Term,Native
...,...,...,...,...,...,...,...
917,umich-bhl-9940,295,"James J. Blanchard Papers, 1982-2002",Remains,2,Matched_Paragraph,The Personnel Division was charged with filli...
918,umich-bhl-9961,1400,Frank R. Kennedy Papers,Death,1,Term,Death
919,umich-bhl-9961,1400,Frank R. Kennedy Papers,Death,1,Matched_Paragraph,Death
920,umich-bhl-9961,1400,Frank R. Kennedy Papers,Death,1,Term,Death


## Visualize group matches

In the use case at U Michigan, the groups represented finding aids
created by different repositories on campus. This can be useful for
working with subgroups of finding aids, or in comparing usage between
different organizations.

In [11]:
# bar charts for individual groups

def visualize_individual_repository_terms_bar(list_of_dataframes):
    '''
    Draw one vertical bar chart of term frequencies per repository.

    list_of_dataframes is a list of strings naming the dataframes to chart
    (each corresponding to an archival repository). Every name is looked up in
    the notebook-level matched_results dict, and the resulting chart shows how
    many times each requested term appears in that repository's finding aids.
    Each chart is shown inline and saved as 'term_frequency_byrepo_<name>.png'.

    Requires the calculate_term_frequency function, plus the notebook-level
    matched_results, colors and global_font_info values.
    Processes data using pandas and generates charts using plotly express (px) & plotly.io (pio).
    '''
    # Iterate over the names that were passed in rather than indexing the
    # global repo_list, so the function charts exactly what the caller requested.
    for repo_name in list_of_dataframes:
        # calculate term frequency
        term_frequency = calculate_term_frequency(matched_results[repo_name], repo_name)

        # Visualization in Multiple Charts
        fig = px.bar(term_frequency, x='Term', y='Total_Frequency', text='Total_Frequency', 
                        color='DataFrame', color_discrete_map=colors, text_auto=True,
                        labels={'Total_Frequency':'Term Occurence Count','DataFrame':'Repository'})
        fig.update_traces(textposition='outside', insidetextanchor='middle',
                          textfont=dict(family='Arial', size=8))
        fig.update_layout(title_text=f'Term Frequency for { repo_name }', 
                          xaxis_title_standoff=10, 
                          showlegend=False, font_family=global_font_info['font_family'])
        # Angle the term labels so long terms do not overlap
        fig.update_xaxes(tickangle=45, tickfont=dict(family='Arial', color='black', size=10))
        fig.update_yaxes(tickfont=dict(family='Arial', size=10))
        fig.show()
        pio.write_image(fig, f'term_frequency_byrepo_{ repo_name }.png', width=700)

visualize_individual_repository_terms_bar(repo_list)

Term frequency for Bentley


Unnamed: 0,Term,Total_Frequency,DataFrame
5,Death,250,Bentley
19,Philippine Islands,136,Bentley
17,Native,89,Bentley
10,Igorot,88,Bentley
6,Dwellings,71,Bentley
...,...,...,...
24,Slavery,4,Bentley
20,Plantations,4,Bentley
8,Hanging,4,Bentley
16,Moros,4,Bentley


The above shows a basic bar chart, ordered from most frequent
to least frequent term. If looking at multiple groupings, the above
function will create multiple bar charts. 

The following sections demonstrate additional visualizations for a single grouping
of finding aid data. For the case of multiple groups (or, in our case, repositories),
see the additional visualization notebooks.

## Visualization: occurrence in a horizontal bar chart

In [12]:
# bar charts for individual repos - horizontal

def visualize_individual_repository_terms_bar_horiz(list_of_dataframes):
    '''
    Draw one horizontal bar chart of term frequencies per repository.

    list_of_dataframes is a list of strings naming the dataframes to chart
    (each corresponding to an archival repository). Every name is looked up in
    the notebook-level matched_results dict, and the resulting chart shows how
    many times each requested term appears in that repository's finding aids.
    Each chart is shown inline and saved as
    'term_frequency_byrepo_<name>_horizbar.png'.

    Requires the calculate_term_frequency function, plus the notebook-level
    matched_results, colors and global_font_info values.
    Processes data using pandas and generates charts using plotly express (px) & plotly.io (pio).
    '''
    # Iterate over the names that were passed in rather than indexing the
    # global repo_list, so the function charts exactly what the caller requested.
    for repo_name in list_of_dataframes:
        # calculate term frequency, re-sorted ascending before plotting
        term_frequency = calculate_term_frequency(matched_results[repo_name], repo_name).sort_values('Total_Frequency', ascending=True)

        # Visualization in Charts Horizontally
        fig = px.bar(term_frequency, y='Term', x='Total_Frequency', text='Total_Frequency',
                        color='DataFrame', color_discrete_map=colors, text_auto=True,
                        labels={'Total_Frequency':'Term Occurence Count','DataFrame':'Repository'})
        fig.update_traces(textposition='outside', insidetextanchor='middle', 
                          textfont=dict(family='Arial',size=8))
        fig.update_layout(title_text=f'Term Frequency for { repo_name }', 
                          xaxis_title_standoff=10, height=600, 
                          showlegend=False, font_family=global_font_info['font_family'])
        fig.update_yaxes(tickfont=dict(family='Arial', size=10))
        fig.update_xaxes(tickangle=0, tickfont=dict(family='Arial', color='black', size=10))
        fig.show()
        pio.write_image(fig, f'term_frequency_byrepo_{ repo_name }_horizbar.png', scale=4)

visualize_individual_repository_terms_bar_horiz(repo_list)

Term frequency for Bentley


Unnamed: 0,Term,Total_Frequency,DataFrame
5,Death,250,Bentley
19,Philippine Islands,136,Bentley
17,Native,89,Bentley
10,Igorot,88,Bentley
6,Dwellings,71,Bentley
...,...,...,...
24,Slavery,4,Bentley
20,Plantations,4,Bentley
8,Hanging,4,Bentley
16,Moros,4,Bentley


The horizontal bar chart seems better suited to displaying occurrences
of the terms, since the terms are easier to read in this orientation.

## Visualization: Occurrence by EAD tag/element

This section of the script helps identify the EAD (Encoded Archival Description) elements with the highest occurrences of the specified harmful terms.

The function `calculate_element_frequency(df, df_name)` is used to calculate the sum of matched terms for each subsection in a DataFrame. The results are sorted in descending order and returned as a DataFrame with columns `Subsection`, `harmful_terms_frequency`, and `Source`.

This function is applied to each DataFrame in matched_results, and the results are concatenated into a single DataFrame, `all_element_frequencies`.

Finally, a grouped bar chart is created to visualize the frequency of terms across different subsections and sources.

In the chart, the x-axis represents the subsections, the y-axis shows the frequency of harmful terms, and different colors distinguish between sources. The `barmode='group'` setting places the bars side by side for easier comparison between sources.

In [15]:
def calculate_element_frequency(df, df_name):
    """Total the 'Matched_Times' counts for each 'Matched_From' value.

    Returns a freshly indexed DataFrame with the columns 'Subsection',
    'harmful_terms_frequency' and 'Source' (holding ``df_name`` on every row),
    ordered from the highest to the lowest total.
    """
    totals_by_element = (
        df.groupby('Matched_From')['Matched_Times']
        .sum()
        .sort_values(ascending=False)
    )
    df_element_counts = pd.DataFrame({
        'Subsection': totals_by_element.index.tolist(),
        'harmful_terms_frequency': totals_by_element.tolist(),
    })
    # Tag every row with the dataframe it was computed from
    df_element_counts['Source'] = df_name
    return df_element_counts

# Build one element-frequency table per repository in matched_results
element_frequencies_list = [
    calculate_element_frequency(df, name)
    for name, df in matched_results.items()
]

# Stack the per-repository tables into a single frame
all_element_frequencies = pd.concat(element_frequencies_list)

# Show the combined table
print("Element frequencies across all file pools:")
display(all_element_frequencies)

# Repository name -> bar color
colors = {'Bentley': '#CFC096', 'Clements': '#A5A508', 'SCRC': '#FFCB05'}


def _element_frequency_chart(barmode, trace_options, title, image_path):
    """Plot all_element_frequencies as a bar chart, show it, save it, and return it.

    barmode is passed to px.bar ('group' or 'stack'), trace_options holds the
    extra text-placement keywords for update_traces, title is the chart title
    and image_path is the PNG file the figure is written to.
    """
    chart = px.bar(all_element_frequencies, x='Subsection', y='harmful_terms_frequency', color='Source',
                   text='harmful_terms_frequency', barmode=barmode,
                   labels={'Subsection':'EAD Tag','Source':'Repository'},
                   color_discrete_map=colors)
    chart.update_traces(textfont=dict(family='Arial',size=8), **trace_options)
    chart.update_layout(title_text=title, yaxis_title="Term Frequency", height=600,
                        font_family=global_font_info['font_family'])
    chart.update_yaxes(tickfont=dict(family='Arial', size=10))
    chart.update_xaxes(tickangle=0, tickfont=dict(family='Arial', color='black', size=10))
    chart.show()
    pio.write_image(chart, image_path, scale=5)
    return chart


# Visualization: bars side by side per repository
fig = _element_frequency_chart(
    'group',
    dict(textposition='outside'),
    "Term Occurence Frequency by EAD Tag",
    'element_frequency_across_dfs.png',
)

# Visualization Variation: Stacked bars
fig = _element_frequency_chart(
    'stack',
    dict(textposition='inside', insidetextanchor='middle'),
    "Term Occurence Frequency by EAD Tag (Stacked Totals)",
    'element_frequency_across_dfs_stacked.png',
)

Element frequencies across all file pools:


Unnamed: 0,Subsection,harmful_terms_frequency,Source
0,Matched_Paragraph,554,Bentley
1,Term,412,Bentley
