For the visualization of the people-event network from the CESTA events data
    1. csv of events with descriptions
    2. each event will become a node (JSON)
        event-node
        - title: event title from the csv
        - info: event info from the csv
        - link: also from the csv
        - year: extracted from date on csv
        - color: different color for each year -> https://identity.stanford.edu/design-elements/color/accent-colors/
        - size: smaller than people
    3. NER on the event data
    4. For each event, identify the unique people in the description
    5. each person will become a node
        person-node
        - title: person name
        - info: event titles and links that the person is mentioned in
        - color: same color for all persons, using a compatible color from the palette as the event-nodes
        - size: growing with number of events
    6. Color of the link will be grey
    7. Layout of the graph will be force-directed
    8. We will display people names on nodes but not event names 



In [30]:
import os
import re
import pandas as pd
import csv
import numpy as np
import json

{"nodes":
 {"id": unique_id,
  "labels": ["event"],
  "properties": {
    "name": event_name,
    "date": event_date,
    "location": event_location,
    "description": event_description,
    "url": event_url,
    },
    "value": occurence_count
    },
 {"id": unique_id,
  "labels": ["person"],
  "properties": {
    "name": person_name,
    "events": {
        "title": event_name,
        "link": url
    },
    "value": event_count
    },
    ....
 },
 }

In [1]:
# Run the NER on event descriptions

import stanza
# Download the English model
stanza.download('en')


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json:   0%|   …

2024-05-28 14:47:07 INFO: Downloaded file to /Users/mervetekgurler/stanza_resources/resources.json
2024-05-28 14:47:07 INFO: Downloading default packages for language: en (English) ...


Downloading https://huggingface.co/stanfordnlp/stanza-en/resolve/v1.8.0/models/default.zip:   0%|          | 0…

2024-05-28 14:47:25 INFO: Downloaded file to /Users/mervetekgurler/stanza_resources/en/default.zip
2024-05-28 14:47:31 INFO: Finished downloading models and saved to /Users/mervetekgurler/stanza_resources


In [6]:
nlp = stanza.Pipeline(lang='en', processors='tokenize,ner')

2024-05-28 14:57:31 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json:   0%|   …

2024-05-28 14:57:31 INFO: Downloaded file to /Users/mervetekgurler/stanza_resources/resources.json
2024-05-28 14:57:32 INFO: Loading these models for language: en (English):
| Processor | Package                   |
-----------------------------------------
| tokenize  | combined                  |
| mwt       | combined                  |
| ner       | ontonotes-ww-multi_charlm |

2024-05-28 14:57:32 INFO: Using device: cpu
2024-05-28 14:57:32 INFO: Loading: tokenize
2024-05-28 14:57:33 INFO: Loading: mwt
2024-05-28 14:57:33 INFO: Loading: ner
2024-05-28 14:57:34 INFO: Done loading processors!


In [11]:
def extract_entities(text):
    if not isinstance(text, str):
        return '', ''
    
    doc = nlp(text)
    people = []
    all_entities = []

    for ent in doc.entities:
        all_entities.append(f"{ent.text} ({ent.type})")
        if ent.type == 'PERSON':
            people.append(ent.text)

    return '; '.join(all_entities), ', '.join(people)

In [12]:
# Read the CSV file
input_file_path = '/Users/mervetekgurler/Desktop/PhD/CESTA/cesta-events/events/event_data_edited.csv'
df = pd.read_csv(input_file_path)

# Extract people from Paragraph Content
entities_data = []

for index, row in df.iterrows():
    combined_text = ' '.join([str(row['Title']), str(row['Meta Description']), str(row['H1 Content']), str(row['Paragraph Content'])])
    all_entities, people = extract_entities(combined_text)
    entities_data.append({
        'File Name': row['File Name'],
        'Title': row['Title'],
        'Meta Description': row['Meta Description'],
        'H1 Content': row['H1 Content'],
        'Paragraph Content': row['Paragraph Content'],
        'Date': row['Date'],
        'Location': row['Location'],
        'All Entities': all_entities,
        'People': people
    })


# Create a DataFrame from the entities data
entities_df = pd.DataFrame(entities_data)

# Save to CSV
output_file_path = '/Users/mervetekgurler/Desktop/PhD/CESTA/cesta-events/data/events_stanza_ner.csv'
entities_df.to_csv(output_file_path, index=False)


In [23]:
# now we need to identify the unique people and create another column

# Read the CSV file
input_file_path = '/Users/mervetekgurler/Desktop/PhD/CESTA/cesta-events/data/events_stanza_ner.csv'
df = pd.read_csv(input_file_path)

def clean_name(name):
    # Normalize curly apostrophes to straight apostrophes
    name = name.replace('’', "'")
    # Remove trailing possessive 's and any trailing punctuation
    name = re.sub(r"['’]s$", "", name)  # Remove possessive 's
    name = re.sub(r"[.,’']$", "", name)  # Remove trailing punctuation
    return name.strip()

def consolidate_unique_people(people_str):
    if pd.isna(people_str):
        return ''
    
    # Split the people by comma, clean names, and strip any extra spaces
    people_list = [clean_name(person) for person in people_str.split(',')]
    
    # Create a dictionary to map first names to full names
    name_dict = {}
    for person in people_list:
        names = person.split()
        if len(names) > 1:
            first_name = names[0]
            last_name = names[-1]
            name_dict[first_name] = person
            name_dict[last_name] = person  # This helps in case only last names are mentioned

    # Create a set for unique people
    unique_people_set = set()
    for person in people_list:
        names = person.split()
        if len(names) == 1:
            # If only first or last name, replace with full name if available
            if names[0] in name_dict:
                unique_people_set.add(name_dict[names[0]])
            else:
                unique_people_set.add(person)
        else:
            unique_people_set.add(person)
    
    # Join the unique names back into a string
    unique_people_str = ', '.join(unique_people_set)
    
    return unique_people_str

# Apply the function to the 'People' column to create the 'Unique People' column
df['Unique People'] = df['People'].apply(consolidate_unique_people)

# Save the DataFrame with the new column to a new CSV file
output_file_path = '/Users/mervetekgurler/Desktop/PhD/CESTA/cesta-events/data/events_unique_people.csv'
df.to_csv(output_file_path, index=False)
print(f"Data with unique people has been saved to {output_file_path}")

# Display the first few rows of the updated dataframe
df.head()

Data with unique people has been saved to /Users/mervetekgurler/Desktop/PhD/CESTA/cesta-events/data/events_unique_people.csv


Unnamed: 0,File Name,Title,Meta Description,H1 Content,Paragraph Content,Date,Location,All Entities,People,Unique People
0,1716323825-1.html,Letterpress Workshop | Center for Spatial and ...,Join us for a 2-hour workshop on letterpress p...,Letterpress Workshop,,"Fri May 17th 2024, 12:30 - 2:30pm",Product Realization Lab (PRL) at D-School,2-hour (TIME); Ane Knutsen (PERSON); Universit...,"Ane Knutsen, Claire Battershill, Alice Staveley","Claire Battershill, Ane Knutsen, Alice Staveley"
1,1716323825-10.html,Mellon Sawyer Seminar Series: Data of Enslavem...,Enslavement has been the subject of many big d...,Mellon Sawyer Seminar Series: Data of Enslavem...,Enslavement has been the subject of many big d...,"Thu April 25th 2024, 3:00 - 4:30pm","Wallenberg Hall, Room 433A",Mellon Sawyer Seminar Series: Data of Enslavem...,"Lauren Klein, Alex Borucki, Gregory O'Malley, ...","Matthew K. Gold, Catherine D'Ignazio, Greg O'M..."
2,1716323825-100.html,"Mark Algee-Hewitt: ""Re-Formations: Visualizing...",About this Talk: The eighteenth century was a ...,"Mark Algee-Hewitt: ""Re-Formations: Visualizing...",About this Talk: The eighteenth century was a ...,"Tue April 19th 2022, 12:00 - 1:15pm",Hybrid event: in-person at CESTA and via Zoom.,Mark Algee-Hewitt (PERSON); the Eighteenth Cen...,"Mark Algee-Hewitt, Mark Algee-Hewitt, Mark Alg...",Mark Algee-Hewitt
3,1716323825-101.html,"Felicia Smith, Nicole Coleman, Gabriela Basel,...",Note: this seminar will be a hybrid event. We ...,"Felicia Smith, Nicole Coleman, Gabriela Basel,...",Note: this seminar will be a hybrid event. We ...,"Tue April 12th 2022, 12:00 - 1:15pm",Hybrid event: in-person at CESTA and via Zoom.,Felicia Smith (PERSON); Nicole Coleman (PERSON...,"Felicia Smith, Nicole Coleman, Gabriela Basel,...","Felicia Smith, Mahogany Brown, Nerrise, Jim Cr..."
4,1716323825-102.html,"Grant Parker: ""Curating Enslaved Pasts of the ...",Speaker: Prof. Grant Parker (Stanford) Chair: ...,"Grant Parker: ""Curating Enslaved Pasts of the ...",About this talk: Our project seeks new ways of...,"Thu April 7th 2022, 9:00 - 10:00am",Online via Zoom,Grant Parker (PERSON); Curating Enslaved Pasts...,"Grant Parker, Grant Parker, Laura Stokes, Meis...","Grant Parker, Laura Stokes, Meishu Ai Grant Pa..."


In [25]:
# Load the CSV file from the first pass
df_first_pass = pd.read_csv('/Users/mervetekgurler/Desktop/PhD/CESTA/cesta-events/data/events_unique_people.csv')

def replace_short_names(people_str, name_dict):
    if pd.isna(people_str):
        return ''
    
    # Split the people by comma, clean names, and strip any extra spaces
    people_list = [clean_name(person) for person in people_str.split(',')]
    
    # Replace shorter names with full names
    replaced_people_list = []
    for person in people_list:
        names = person.split()
        first_name = names[0]
        last_name = names[-1]
        if (first_name, last_name) in name_dict:
            replaced_people_list.append(name_dict[(first_name, last_name)])
        else:
            replaced_people_list.append(person)
    
    # Create a set for unique people
    unique_people_set = set(replaced_people_list)
    
    # Join the unique names back into a string
    unique_people_str = ', '.join(unique_people_set)
    
    return unique_people_str

# Create a dictionary to map first and last names to their full forms
name_dict = {}
for people in df_first_pass['Unique People'].dropna():
    for person in people.split(','):
        names = person.split()
        if len(names) > 1:
            first_name = names[0]
            last_name = names[-1]
            full_name = ' '.join(names)
            name_dict[(first_name, last_name)] = full_name

# Apply the second pass function to the 'Unique People' column
df_first_pass['Unique People'] = df_first_pass['Unique People'].apply(lambda x: replace_short_names(x, name_dict))

# Save the DataFrame with the updated column to a new CSV file
output_file_path_second_pass = '/Users/mervetekgurler/Desktop/PhD/CESTA/cesta-events/data/events_unique_people_second_pass.csv'
df_first_pass.to_csv(output_file_path_second_pass, index=False)

print(f"Second pass data with unique people has been saved to {output_file_path_second_pass}")

# Display the first few rows of the updated dataframe
df_first_pass.head()

Second pass data with unique people has been saved to /Users/mervetekgurler/Desktop/PhD/CESTA/cesta-events/data/events_unique_people_second_pass.csv


Unnamed: 0,File Name,Title,Meta Description,H1 Content,Paragraph Content,Date,Location,All Entities,People,Unique People
0,1716323825-1.html,Letterpress Workshop | Center for Spatial and ...,Join us for a 2-hour workshop on letterpress p...,Letterpress Workshop,,"Fri May 17th 2024, 12:30 - 2:30pm",Product Realization Lab (PRL) at D-School,2-hour (TIME); Ane Knutsen (PERSON); Universit...,"Ane Knutsen, Claire Battershill, Alice Staveley","Claire Battershill, Ane Knutsen, Alice Staveley"
1,1716323825-10.html,Mellon Sawyer Seminar Series: Data of Enslavem...,Enslavement has been the subject of many big d...,Mellon Sawyer Seminar Series: Data of Enslavem...,Enslavement has been the subject of many big d...,"Thu April 25th 2024, 3:00 - 4:30pm","Wallenberg Hall, Room 433A",Mellon Sawyer Seminar Series: Data of Enslavem...,"Lauren Klein, Alex Borucki, Gregory O'Malley, ...","Matthew K. Gold, Catherine D'Ignazio, Greg O'M..."
2,1716323825-100.html,"Mark Algee-Hewitt: ""Re-Formations: Visualizing...",About this Talk: The eighteenth century was a ...,"Mark Algee-Hewitt: ""Re-Formations: Visualizing...",About this Talk: The eighteenth century was a ...,"Tue April 19th 2022, 12:00 - 1:15pm",Hybrid event: in-person at CESTA and via Zoom.,Mark Algee-Hewitt (PERSON); the Eighteenth Cen...,"Mark Algee-Hewitt, Mark Algee-Hewitt, Mark Alg...",Mark Algee-Hewitt
3,1716323825-101.html,"Felicia Smith, Nicole Coleman, Gabriela Basel,...",Note: this seminar will be a hybrid event. We ...,"Felicia Smith, Nicole Coleman, Gabriela Basel,...",Note: this seminar will be a hybrid event. We ...,"Tue April 12th 2022, 12:00 - 1:15pm",Hybrid event: in-person at CESTA and via Zoom.,Felicia Smith (PERSON); Nicole Coleman (PERSON...,"Felicia Smith, Nicole Coleman, Gabriela Basel,...","Felicia Smith, Mahogany Brown, Nerrise, Jim Cr..."
4,1716323825-102.html,"Grant Parker: ""Curating Enslaved Pasts of the ...",Speaker: Prof. Grant Parker (Stanford) Chair: ...,"Grant Parker: ""Curating Enslaved Pasts of the ...",About this talk: Our project seeks new ways of...,"Thu April 7th 2022, 9:00 - 10:00am",Online via Zoom,Grant Parker (PERSON); Curating Enslaved Pasts...,"Grant Parker, Grant Parker, Laura Stokes, Meis...","Grant Parker, Laura Stokes, Meishu Ai Grant Pa..."


This is as good as it gets with code. The rest can be consolidated at the network level.

In [31]:
# Load the CSV files
events_df = pd.read_csv('/Users/mervetekgurler/Desktop/PhD/CESTA/cesta-events/data/events_unique_people_second_pass.csv')
urls_df = pd.read_csv('/Users/mervetekgurler/Desktop/PhD/CESTA/cesta-events/events/webscraper_data.csv')

# Remove '.html' extension from filenames in the urls_df for merging
urls_df['filename'] = urls_df['web-scraper-order'].str.replace('.html', '')

# Add a 'filename' column to events_df without '.html' extension for merging
events_df['filename'] = events_df['File Name'].str.replace('.html', '')

# Merge the events data with URLs
events_df = events_df.merge(urls_df, left_on='filename', right_on='filename', how='left')

# Prepare the nodes list
nodes = []

# Process events
for _, row in events_df.iterrows():
    event_node = {
        "id": row['filename'],
        "labels": ["event"],
        "properties": {
            "name": row['Title'],
            "date": row['Date'],
            "location": row['Location'],
            "description": row['Meta Description'],
            "url": row['pagelink2-href'],
        },
        "value": 1  # Assuming occurrence count is 1 for each event
    }
    nodes.append(event_node)

# Process persons
persons_dict = {}

for _, row in events_df.iterrows():
    if pd.notna(row['Unique People']):
        persons = row['Unique People'].split(',')
        for person in persons:
            person = person.strip()
            if person not in persons_dict:
                persons_dict[person] = {
                    "id": person,
                    "labels": ["person"],
                    "properties": {
                        "name": person,
                        "events": []
                    },
                    "value": 0
                }
            persons_dict[person]["properties"]["events"].append({
                "title": row['Title'],
                "link": row['pagelink2-href']
            })
            persons_dict[person]["value"] += 1

# Add person nodes to the nodes list
for person_node in persons_dict.values():
    nodes.append(person_node)

# Combine into final JSON structure
final_json = {
    "nodes": nodes
}

# Save to JSON file
output_path = '/Users/mervetekgurler/Desktop/PhD/CESTA/cesta-events/data/network.json'
with open(output_path, 'w') as json_file:
    json.dump(final_json, json_file, indent=4)
