In [5]:
import os
import json
import spacy
import re
from datetime import datetime

# Load the SpaCy NLP model (small English pipeline) used for entity and sentence extraction
nlp = spacy.load("en_core_web_sm")

# Input and output folders, both resolved against the current working directory
_cwd = os.getcwd()
DATA_DIR = os.path.join(_cwd, 'data/raw')
GRAPH_DIR = os.path.join(_cwd, 'data/graph')

# Only the output folder is created here; the raw-data folder is read as-is below
os.makedirs(GRAPH_DIR, exist_ok=True)

# Print the working directory so path problems are easy to spot
print("Current working directory:", _cwd)

# Load raw news data.
# os.listdir() returns names in arbitrary order; sort them so the position-based
# article ids assigned further down stay the same from one run to the next.
news_files = sorted(f for f in os.listdir(DATA_DIR) if f.endswith('.json'))

# Process news articles: gather the contents of every raw file into one list.
# NOTE(review): extend() assumes each file holds a JSON array of article
# objects - confirm against whatever writes data/raw.
articles = []
for file in news_files:
    # Read as UTF-8 explicitly instead of relying on the platform default encoding.
    with open(os.path.join(DATA_DIR, file), 'r', encoding='utf-8') as f:
        articles.extend(json.load(f))

# Entity labels kept from the SpaCy pipeline output.
_ENTITY_LABELS = frozenset({'PERSON', 'ORG', 'GPE', 'DATE'})

# Verbs that mark a sentence as an event. Matched as whole words so that
# "withheld" or "restarted" are not mistaken for "held" / "started".
_EVENT_KEYWORD_RE = re.compile(r'\b(?:announced|launched|started|reported|held)\b')


# Function to extract entities and events from articles
def extract_entities_and_events(article):
    """Extract named entities and event sentences from one article.

    Args:
        article: Article dict. Its 'content' value is the text to analyse and
            its 'publishedAt' value, when present, is the fallback timestamp
            for events whose sentence mentions no date.

    Returns:
        A tuple ``(entities, events)``:

        * ``entities`` - list of ``(text, label)`` pairs for every PERSON,
          ORG, GPE and DATE entity found in the content.
        * ``events`` - list of dicts with the keys 'event' (the sentence
          text), 'location' (first GPE in the sentence, or None) and
          'timestamp' (first DATE in the sentence, else the article's
          'publishedAt', else None).

        Both lists are empty when the article has no content.
    """
    content = article.get('content')
    if not content:
        # Missing, null or empty content: nothing to analyse, and the
        # pipeline cannot be called on None.
        return [], []

    doc = nlp(content)
    entities = [
        (ent.text, ent.label_)
        for ent in doc.ents
        if ent.label_ in _ENTITY_LABELS
    ]

    events = []
    for sent in doc.sents:
        if not _EVENT_KEYWORD_RE.search(sent.text):
            continue
        dates = [ent.text for ent in sent.ents if ent.label_ == 'DATE']
        places = [ent.text for ent in sent.ents if ent.label_ == 'GPE']
        events.append({
            'event': sent.text,
            'location': places[0] if places else None,
            # Prefer a date stated in the sentence; otherwise fall back to the
            # publication time (None if the article does not carry one).
            'timestamp': dates[0] if dates else article.get('publishedAt'),
        })
    return entities, events

# Function to generate Cypher queries for each article
def generate_cypher_queries_for_article(article, article_id):
    """Write the Cypher MERGE statements for one article to a .cypher file.

    One node is merged per PERSON, ORG and GPE entity (other entity types get
    no node). For every event an event node is merged; when the event has a
    location, a location node and an OCCURRED_AT relationship are merged too.

    Args:
        article: Article dict carrying 'entities' (``(text, label)`` pairs)
            and 'events' (dicts with 'event', 'location' and 'timestamp').
        article_id: Identifier used in the output file name.

    Returns:
        Path of the written file, ``article_<article_id>.cypher`` inside
        GRAPH_DIR.
    """
    def quote(value):
        # Render a value as a single-quoted Cypher string literal. Backslashes
        # are escaped first so the ones added for quotes are not doubled.
        escaped = str(value).replace('\\', '\\\\').replace("'", "\\'")
        return f"'{escaped}'"

    # Node label per entity type (Wikidata-style ids).
    labels = {
        'PERSON': 'Q5',    # Q5 for human
        'ORG': 'Q43229',   # Q43229 for organization
        'GPE': 'Q515',     # Q515 for location
    }

    queries = []
    for entity, entity_type in article['entities']:
        label = labels.get(entity_type)
        if label is not None:
            queries.append(f"MERGE (e:{label} {{name: {quote(entity)}}})")

    for event in article['events']:
        # Keep only word characters and whitespace in the description.
        event_text = re.sub(r'[^\w\s]', '', event['event'])

        properties = [f"description: {quote(event_text)}"]
        if event['timestamp']:
            # Leave the property out when unknown instead of storing a
            # placeholder string.
            properties.append(f"timestamp: {quote(event['timestamp'])}")
        property_map = ', '.join(properties)
        statement = [f"MERGE (ev:Q1656682 {{{property_map}}})"]

        if event['location']:
            # Only link to a real place; a shared placeholder node would tie
            # together every event that has no known location.
            statement.append(f"MERGE (loc:Q515 {{name: {quote(event['location'])}}})")
            statement.append("MERGE (ev)-[:OCCURRED_AT]->(loc)")
        queries.append('\n'.join(statement))

    cypher_file_path = os.path.join(GRAPH_DIR, f'article_{article_id}.cypher')
    # Names may contain non-ASCII characters, so write UTF-8 explicitly.
    with open(cypher_file_path, 'w', encoding='utf-8') as f:
        f.writelines(query + '\n' for query in queries)
    return cypher_file_path

# Function to generate Graphviz DOT files for each article
def generate_graphviz_for_article(article, article_id):
    """Write a Graphviz DOT description of one article's entities and events.

    Every entity becomes a circle node labelled with its text. Every event
    becomes an edge from the event text (punctuation removed) to its location,
    or to "Unknown Location" when the event has none.

    Args:
        article: Article dict carrying 'entities' (``(text, label)`` pairs)
            and 'events' (dicts with 'event' and 'location').
        article_id: Identifier used in the output file name.

    Returns:
        Path of the written file, ``article_<article_id>.dot`` inside
        GRAPH_DIR.
    """
    def quote(value):
        # Render a value as a double-quoted DOT string. Backslashes are
        # doubled first so a trailing one cannot escape the closing quote.
        escaped = str(value).replace('\\', '\\\\').replace('"', '\\"')
        return f'"{escaped}"'

    dot_lines = ['digraph G {', 'rankdir=LR;']
    for entity, _entity_type in article['entities']:
        node = quote(entity)
        dot_lines.append(f'{node} [label={node}, shape=circle];')
    for event in article['events']:
        # Keep only word characters and whitespace in the event text.
        event_text = re.sub(r'[^\w\s]', '', event['event'])
        location = event['location'] if event['location'] else 'Unknown Location'
        dot_lines.append(f'{quote(event_text)} -> {quote(location)} [label="OCCURRED_AT"];')
    dot_lines.append('}')

    dot_file_path = os.path.join(GRAPH_DIR, f'article_{article_id}.dot')
    # Graphviz reads its input as UTF-8 by default, so write it that way.
    with open(dot_file_path, 'w', encoding='utf-8') as f:
        f.write('\n'.join(dot_lines))
    return dot_file_path

# Run the pipeline for every loaded article: extract, then write both graph files.
# The article's position in the list doubles as its id in the output file names.
for article_id, article in enumerate(articles):
    print(f'Processing article {article_id}')

    # Store the extraction results on the article so both writers can read them.
    found_entities, found_events = extract_entities_and_events(article)
    article['entities'] = found_entities
    article['events'] = found_events

    cypher_file = generate_cypher_queries_for_article(article, article_id)
    print(f'Cypher queries saved to {cypher_file}')

    dot_file = generate_graphviz_for_article(article, article_id)
    print(f'Graphviz file saved to {dot_file}')

Current working directory: /Users/michaeloboyle/Documents/GitHub/scrantenna/notebooks
Processing article 0
Cypher queries saved to /Users/michaeloboyle/Documents/GitHub/scrantenna/notebooks/data/graph/article_0.cypher
Graphviz file saved to /Users/michaeloboyle/Documents/GitHub/scrantenna/notebooks/data/graph/article_0.dot
Processing article 1
Cypher queries saved to /Users/michaeloboyle/Documents/GitHub/scrantenna/notebooks/data/graph/article_1.cypher
Graphviz file saved to /Users/michaeloboyle/Documents/GitHub/scrantenna/notebooks/data/graph/article_1.dot
Processing article 2
Cypher queries saved to /Users/michaeloboyle/Documents/GitHub/scrantenna/notebooks/data/graph/article_2.cypher
Graphviz file saved to /Users/michaeloboyle/Documents/GitHub/scrantenna/notebooks/data/graph/article_2.dot
Processing article 3
Cypher queries saved to /Users/michaeloboyle/Documents/GitHub/scrantenna/notebooks/data/graph/article_3.cypher
Graphviz file saved to /Users/michaeloboyle/Documents/GitHub/scra