# Data visualizations

To create the data visualizations I had to extract from the tables the values of some variables, especially the count of the values inserted in the different categories, for example archive, library, place, shape, support and type, etc.

In [6]:
import json
import re
import pandas as pd
from collections import Counter

# Path to your file conta/Users/martinapensalfini/Desktop/gadda/mergecolumntipo.jsonining the JSON-like data
file_path = 'mergecolumntipo.json'

# Read the content of the file
with open(file_path, 'r') as file:
    json_data = json.load(file)

# Pattern to match values after "archivio:" and before "/"
pattern = r'archivio:\s(.*?)(?= \/)'

# Extracting values matching the pattern from the "Archival Description" field of each item
matches = []
for item in json_data:
    archival_description = item.get("Archival Description", "")
    match = re.search(pattern, archival_description)
    if match:
        matches.append(match.group(1))

# If no matches were found
if not matches:
    print("No matches found for the pattern 'archivio:' before '/'")
else:
    # Counting occurrences of each value
    value_counts = Counter(matches)

    # Create a DataFrame from the value counts
    df = pd.DataFrame(value_counts.items(), columns=['Value', 'Count'])

    # Display the DataFrame
    print(df)


                                    Value  Count
0                       Archivio Bonsanti     49
1  Archivio Biblioteca Nazionale Centrale     12
2                       Archivio Garzanti     89
3                          Archivio Gelli     47
4                       Archivio Liberati    585
5             Archivio Centro Manoscritti      4
6         Archivio Biblioteca Trivulziana    797


And then each result was saved as an Excel spreadsheet for commodity as this was the format required by *Fluorish* to later create the data visualizations.

In [16]:
df.to_excel("archiviografico.xlsx")

An interesting example was also the fact that aside from execuing a simple count of values, I also counted the count of a certain category (in this case *thematic cards*) per archive.

In [19]:
import json
import re
import pandas as pd
from collections import defaultdict, Counter


file_path = 'mergecolumntipo.json'

# Read the content of the file
with open(file_path, 'r') as file:
    json_data = json.load(file)

# Create a defaultdict to store values for 'schede tematiche' grouped by 'archivio'
archivio_schede = defaultdict(list)

# Pattern to match values after "schede tematiche:" until the new line
pattern = r'schede tematiche:\s(.*?)(?=\n)'

# Extracting values for 'schede tematiche' and grouping them by 'archivio'
for item in json_data:
    archivio = re.search(r'archivio:\s(.*?)(?= \/)', item.get("Archival Description", "")).group(1)
    schede_match = re.findall(pattern, item.get("Internal Description", ""))
    for schede in schede_match:
        values = schede.split(', ')
        archivio_schede[archivio].extend(values)

# Counting occurrences of each value for 'schede tematiche' grouped by 'archivio'
archivio_counts = {archivio: Counter(values) for archivio, values in archivio_schede.items()}

# Create a list to store DataFrame rows
data = []

# Populate the list with rows containing 'Value', 'Archivio', and 'Count'
for archivio, counts in archivio_counts.items():
    for value, count in counts.items():
        data.append({'Value': value, 'Archivio': archivio, 'Count': count})

# Create a DataFrame from the list of rows
df = pd.DataFrame(data)

# Display the DataFrame
print(df)


            Value                         Archivio  Count
0          AppUni                Archivio Bonsanti      9
1         QuaScol                Archivio Bonsanti     12
2         AppGue                 Archivio Bonsanti      1
3      AppLetTed                 Archivio Bonsanti      6
4           Cont                 Archivio Bonsanti      7
..            ...                              ...    ...
244        TestIn  Archivio Biblioteca Trivulziana      5
245  LetTrinnanzi  Archivio Biblioteca Trivulziana      4
246      Opere IV  Archivio Biblioteca Trivulziana      6
247    LetBassani  Archivio Biblioteca Trivulziana      2
248           Let  Archivio Biblioteca Trivulziana     42

[249 rows x 3 columns]


In [20]:
df.to_excel("schedetematiche.xlsx")

In [28]:
import json
import re
import pandas as pd
from collections import defaultdict, Counter


file_path = 'mergecolumntipo.json'

# Read the content of the file
with open(file_path, 'r') as file:
    json_data = json.load(file)

# Create a defaultdict to store values for 'luogo' grouped by 'archivio'
archivio_luogo = defaultdict(list)

# Pattern to match values after "luogo:" until the new line
pattern = r'luogo:\s(.*?)(?=\n)'

# Extracting values for 'luogo' and grouping them by 'archivio'
for item in json_data:
    archivio = re.search(r'archivio:\s(.*?)(?= \/)', item.get("Internal Description", "")).group(1)
    luogo_match = re.findall(pattern, item.get("Internal Description", ""))
    archivio_luogo[archivio].extend(luogo_match)

# Counting occurrences of each value for 'luogo' grouped by 'archivio'
archivio_counts = {archivio: Counter(values) for archivio, values in archivio_luogo.items()}

# Create a list to store DataFrame rows
data = []

# Populate the list with rows containing 'Value', 'Archivio', and 'Count'
for archivio, counts in archivio_counts.items():
    for value, count in counts.items():
        data.append({'Value': value, 'Archivio': archivio, 'Count': count})

# Create a DataFrame from the list of rows
df = pd.DataFrame(data)


print(df)


AttributeError: 'NoneType' object has no attribute 'group'

In [22]:
df.to_excel("luogoarchivio.xlsx")

In [29]:
import json
import re
import pandas as pd
from collections import defaultdict, Counter


file_path = 'mergecolumntipo.json'

# Read the content of the file
with open(file_path, 'r') as file:
    json_data = json.load(file)

# Create a defaultdict to store values for 'schede tematiche' grouped by 'library'
library_scheduled = defaultdict(list)

# Pattern to match values after "library:" until the new line
pattern = r'library:\s(.*?)(?=\n|$)'

# Extracting values for 'schede tematiche' and grouping them by 'library'
for item in json_data:
    library_match = re.search(pattern, item.get("Internal Description", ""))
    if library_match:
        library_values = library_match.group(1).split(', ')
        scheduled_match = re.findall(pattern, item.get("Internal Description", ""))
        for library in library_values:
            for scheduled in scheduled_match:
                values = scheduled.split(', ')
                library_scheduled[library].extend(values)

# Counting occurrences of each value for 'schede tematiche' grouped by 'library'
library_counts = {library: Counter(values) for library, values in library_scheduled.items()}

# Create a list to store DataFrame rows
data = []

# Populate the list with rows containing 'Value', 'Library', and 'Count'
for library, counts in library_counts.items():
    for value, count in counts.items():
        data.append({'Value': value, 'Library': library, 'Count': count})

# Create a DataFrame from the list of rows
df = pd.DataFrame(data)

print(df)


        Value   Library  Count
0           M         M     21
1         RaI         M      1
2           T         M      1
3          GL        GL      8
4         GGP       GGP     15
..        ...       ...    ...
136  Opere I   Opere I       1
137       QPL       QPL      8
138       LiM       LiM      1
139  Opere IV  Opere IV      6
140      NoID      NoID      1

[141 rows x 3 columns]


In [30]:
df.to_excel("archiviobiblioteca.xlsx")

In [33]:
import json
import re
import pandas as pd
from collections import Counter

# Path to your file containing the JSON-like data
file_path = 'mergecolumntipo.json'

# Read the content of the file
with open(file_path, 'r') as file:
    json_data = json.load(file)

# Pattern to match values after "luogo:" within the "Internal Description" field
pattern = r'luogo:\s(.*?)(?:\n|$)'

# Extracting values matching the pattern from the "Internal Description" field of each item
matches = []
for item in json_data:
    internal_description = item.get("Internal Description", "")
    match = re.search(pattern, internal_description)
    if match:
        matches.append(match.group(1))

# If no matches were found
if not matches:
    print("No matches found for the pattern 'luogo:' in the 'Internal Description' field.")
else:
    # Counting occurrences of each value
    value_counts = Counter(matches)

    # Create a list to store DataFrame rows
    data = [{'Value': value, 'Count': count} for value, count in value_counts.items()]

    # Create a DataFrame from the list of rows
    df = pd.DataFrame(data)

    print(df)


                     Value  Count
0                    Celle      4
1                   Milano    100
2                     Roma    101
3                   Genova      7
4             Buenos Aires     11
5                  Firenze     35
6                    Siena      1
7                  Venezia     11
8                    Parma      2
9                    Capri      1
10                 Bergamo      9
11                 Chianti      1
12           Isola di Rodi      1
13                    Zara      1
14                 Tripoli      4
15    S. Margherita Ligure      1
16                 Longone     16
17                  Stresa      1
18            Cavalcaselle      1
19                Sirmione      1
20  S.ta Margherita Ligure      1
21       Cortina d’Ampezzo      1
22                   Pocol      1


In [None]:
df.to_excel("luoghigrafico.xlsx")

In this specific case, I also had to keep separated the values for the thematic cards regarding the themes and the works. And in some cases I actually operated a manual action on them as it was far more precise and useful.

In [9]:
import json
import re
import pandas as pd
from collections import defaultdict, Counter

# Path to your file containing the JSON-like data
file_path = 'mergecolumntipo.json'

# Read the content of the file
with open(file_path, 'r') as file:
    json_data = json.load(file)

# Create a defaultdict to store values for 'schede tematiche' grouped by 'archivio'
archivio_schede = defaultdict(list)

# Pattern to match values after "schede tematiche:" until the new line
pattern = r'schede tematiche:\s(.*?)(?=\n)'

# Extracting values for 'schede tematiche' and grouping them by 'archivio'
for item in json_data:
    archivio = re.search(r'archivio:\s(.*?)(?= \/)', item.get("Archival Description", "")).group(1)
    schede_match = re.findall(pattern, item.get("Internal Description", ""))
    for schede in schede_match:
        values = schede.split(', ')
        archivio_schede[archivio].extend(values)

# Counting occurrences of each value for 'schede tematiche' grouped by 'archivio'
archivio_counts = {archivio: Counter(values) for archivio, values in archivio_schede.items()}

# Create a list to store DataFrame rows for specified values and others
specified_values = ["AG", "AS", "AZ","DM", "DG", "EP67", "EP", "GGP", "A", "GB", "HJ", "LdF","M", "MdS", "SF", "VM", "CdU", "GASP", "PdO", "PLF", "TO", "TE", "L'A", "CR", "CdD", "GG", "MdF", "M", "MM", "VS", "Biz", "MdI", "NS", "NDF", "DT", "P", "QP", "RD", "RAI", "RI", "Opere I", "Opere II", "Opere III", "Opere IV", "SA", "TR", "UI", "FU", "VLC", "VB", "Conf", "LaP", "SD - VERSILIA", "SD", "VLC", "RAI", "L'A", "FU", "QP", "SD","AG", "Bizz", "LiM"]
# Extract all values from archivio_counts
all_values = set()
for counts in archivio_counts.values():
    all_values.update(counts.keys())

# Find values that are not in specified_values
other_values = all_values - set(specified_values)

# Create DataFrames for specified values and other values
specified_data = []
other_data = []

for archivio, counts in archivio_counts.items():
    for value, count in counts.items():
        if value in specified_values:
            specified_data.append({'Value': value, 'Archivio': archivio, 'Count': count})
        elif value in other_values:
            other_data.append({'Value': value, 'Archivio': archivio, 'Count': count})

# Create DataFrame for other values only
df_other = pd.DataFrame(other_data)

# Save DataFrame to an Excel file
other_excel_path = '/Users/martinapensalfini/Desktop/gadda/other_values.xlsx'
df_other.to_excel(other_excel_path, index=False)

print(f"Other Values DataFrame saved to: {other_excel_path}")


Other Values DataFrame saved to: /Users/martinapensalfini/Desktop/gadda/other_values.xlsx


In [14]:
import json
import re
import pandas as pd
from collections import Counter

file_path = 'mergecolumntipo.json'

# Read the content of the file
with open(file_path, 'r') as file:
    json_data = json.load(file)

# Pattern to match values after "tipo:" within the "Internal Description" field
pattern = r'tipo:\s(.*?)(?:\n|$)'

# Extracting values matching the pattern from the "Internal Description" field of each item
matches = []
for item in json_data:
    internal_description = item.get("Internal Description", "")
    match = re.findall(pattern, internal_description)
    if match:
        # Splitting matches by comma and adding individual elements
        matches.extend([m.strip() for m in match[0].split(',')])

# If no matches were found
if not matches:
    print("No matches found for the pattern 'tipo:' in the 'Internal Description' field.")
else:
    # Counting occurrences of each value
    value_counts = Counter(matches)

    # Create a list to store DataFrame rows
    data = [{'Value': value, 'Count': count} for value, count in value_counts.items()]

    # Create a DataFrame from the list of rows
    df = pd.DataFrame(data)

    # Display the DataFrame
    print(df)


              Value  Count
0          Quaderno    152
1          Taccuino      5
2            Diario      6
3            Foglio    430
4         Fotocopia     47
..              ...    ...
85  Carta da pacchi      2
86            Album      2
87        Biglietto      1
88  Bozza di stampa      4
89     Raccomandata      1

[90 rows x 2 columns]


In [15]:
df.to_excel("tipo.xlsx")

In [18]:
import json
import re
import pandas as pd
from collections import Counter


file_path = 'mergecolumntipo.json'

# Read the content of the file
with open(file_path, 'r') as file:
    json_data = json.load(file)

# Pattern to match values after "tipo:" within the "Internal Description" field
pattern = r'forma:\s(.*?)(?:\n|$)'

# Extracting values matching the pattern from the "Internal Description" field of each item
matches = []
for item in json_data:
    internal_description = item.get("External Description", "")
    match = re.findall(pattern, internal_description)
    if match:
        # Splitting matches by comma and adding individual elements
        matches.extend([m.strip() for m in match[0].split(',')])

# If no matches were found
if not matches:
    print("No matches found for the pattern 'tipo:' in the 'Internal Description' field.")
else:
    # Counting occurrences of each value
    value_counts = Counter(matches)

    # Create a list to store DataFrame rows
    data = [{'Value': value, 'Count': count} for value, count in value_counts.items()]

    # Create a DataFrame from the list of rows
    df = pd.DataFrame(data)


    print(df)


             Value  Count
0          Oggetto    435
1      Manoscritto    446
2           Stampa    580
3            Bozza     19
4   Dattiloscritto     49
5             Foto     41
6          Disegno      9
7  Bozza di stampa      3


In [19]:
df.to_excel("forma.xlsx")

In [22]:
import json
import re
import pandas as pd
from collections import Counter

# Path to your file containing the JSON-like data
file_path = 'mergecolumntipo.json'

# Read the content of the file
with open(file_path, 'r') as file:
    json_data = json.load(file)

# Pattern to match values after "tipo:" within the "Internal Description" field
pattern = r'supporto:\s(.*?)(?:\n|$)'

# Extracting values matching the pattern from the "Internal Description" field of each item
matches = []
for item in json_data:
    internal_description = item.get("External Description", "")
    match = re.findall(pattern, internal_description)
    if match:
        # Splitting matches by comma and adding individual elements
        matches.extend([m.strip() for m in match[0].split(',')])

# If no matches were found
if not matches:
    print("No matches found for the pattern 'tipo:' in the 'Internal Description' field.")
else:
    # Counting occurrences of each value
    value_counts = Counter(matches)

    # Create a list to store DataFrame rows
    data = [{'Value': value, 'Count': count} for value, count in value_counts.items()]

    # Create a DataFrame from the list of rows
    df = pd.DataFrame(data)

    print(df)


              Value  Count
0          Quaderno    155
1          Cartella    116
2         Fascicolo     26
3            Foglio    496
4          Stampato    525
5             Busta     72
6      Raccoglitore      3
7           Involto      3
8            Volume     51
9   Oggetto diverso    119
10          Rubrica      3


In [23]:
df.to_excel("support.xlsx")