In [4]:
import spacy
import networkx as nx
import plotly.graph_objects as go
from datetime import datetime
import nltk
import re
from nltk.corpus import reuters

# Fetch every NLTK resource this cell asks for, one download call per name.
for _resource in ('reuters', 'punkt', 'averaged_perceptron_tagger'):
    nltk.download(_resource)

# spaCy pipeline used by extract_events() to parse each document.
nlp = spacy.load('en_core_web_sm')

# Canonical entity name -> surface forms that are rewritten to that name.
coreference_mapping = {
    'oil spill': ['spill', 'oil leak', 'incident'],
    'government': ['authorities', 'government', 'Environmental Agency'],
    'local communities': ['residents', 'local community', 'local communities'],
}


def _compile_coreference_pattern(mapping):
    """Build a single alternation regex for every phrase in ``mapping``.

    Returns ``(pattern, targets)`` where ``targets[i]`` is the canonical
    name belonging to capture group ``i + 1``. ``pattern`` is ``None`` when
    the mapping contains no usable phrase.
    """
    pairs = []
    seen = set()
    for canonical, aliases in mapping.items():
        # The canonical name is itself a candidate, so an existing
        # "oil spill" is consumed whole instead of having its "spill"
        # part rewritten a second time.
        for phrase in [canonical, *aliases]:
            folded = phrase.lower()
            if phrase and folded not in seen:
                seen.add(folded)
                pairs.append((phrase, canonical))
    if not pairs:
        return None, []
    # Regex alternation is ordered: try longer phrases first so a short
    # phrase never shadows a longer one starting at the same position.
    pairs.sort(key=lambda pair: len(pair[0]), reverse=True)
    alternation = '|'.join('(' + re.escape(phrase) + ')' for phrase, _ in pairs)
    pattern = re.compile(r'\b(?:' + alternation + r')\b', flags=re.IGNORECASE)
    return pattern, [canonical for _, canonical in pairs]


def resolve_coreferences(text):
    """Rewrite every known alias in ``text`` to its canonical entity name.

    Matching is case-insensitive and restricted to whole words/phrases.
    All phrases are replaced in one pass over the text, so replacement
    output is never scanned again (the previous alias-by-alias loop turned
    "oil spill" into "oil oil spill").

    Args:
        text: The string to normalise.

    Returns:
        The text with aliases replaced by the keys of
        ``coreference_mapping``.
    """
    # Built per call so later edits to coreference_mapping are honoured.
    pattern, targets = _compile_coreference_pattern(coreference_mapping)
    if pattern is None:
        return text
    # Exactly one capture group participates in each match; its index
    # selects the canonical replacement.
    return pattern.sub(lambda match: targets[match.lastindex - 1], text)

def extract_events(text):
    """Return one event record per verb that the spaCy pipeline finds.

    ``text`` is parsed with the module-level ``nlp`` pipeline. Every token
    tagged ``VERB`` yields a dict with these keys:

    * ``sentence`` - text of the sentence containing the verb
    * ``verb``     - lemma of the verb
    * ``subject``  - text of the last ``nsubj``/``nsubjpass`` child of the
      verb, or ``''`` when it has none
    * ``objects``  - texts of the verb's ``dobj``/``pobj``/``dative``
      children plus the ``pobj`` children of its ``prep`` children

    Args:
        text: Raw document text.

    Returns:
        A list of event dicts in document order.
    """
    subject_deps = ('nsubj', 'nsubjpass')
    direct_object_deps = ('dobj', 'pobj', 'dative')

    doc = nlp(text)
    events = []
    for sent in doc.sents:
        for verb in sent:
            if verb.pos_ != 'VERB':
                continue
            subject = ''
            objects = []
            for child in verb.children:
                if child.dep_ in subject_deps:
                    subject = child.text
                elif child.dep_ in direct_object_deps:
                    objects.append(child.text)
                elif child.dep_ == 'prep':
                    # A prepositional object is attached to its preposition
                    # rather than to the verb, so look one level further
                    # down; checking only the verb's own children misses it.
                    objects.extend(
                        grandchild.text
                        for grandchild in child.children
                        if grandchild.dep_ == 'pobj'
                    )
            events.append({
                'sentence': sent.text,
                'verb': verb.lemma_,
                'subject': subject,
                'objects': objects,
            })
    return events

def process_dataset(entries, date_key='date'):
    """Collect the events of every entry and stamp each with the entry date.

    Args:
        entries: Iterable of dict-like records. The text is read from the
            ``'content'`` key; records whose content is missing or empty
            are skipped.
        date_key: Key under which a record stores its date. Records
            without that key get the placeholder ``'Unknown'``.

    Returns:
        A flat list of the event dicts returned by ``extract_events``,
        each with an added ``'date'`` entry.
    """
    collected = []
    for record in entries:
        body = record.get('content')
        if not body:
            continue
        for item in extract_events(body):
            item['date'] = record.get(date_key, 'Unknown')
            collected.append(item)
    return collected

def load_reuters_data(max_lines=5000):
    """Load Reuters corpus documents up to a total line budget.

    Documents are read in ``reuters.fileids()`` order. A document is kept
    whole while the running line count stays strictly below ``max_lines``;
    the first document that would reach or exceed the budget is cut down
    to the remaining number of lines (re-joined with ``'\\n'``) and ends
    the scan.

    Args:
        max_lines: Maximum total number of lines to load. A value of zero
            or less loads nothing.

    Returns:
        A list of ``{'content': str, 'date': 'Unknown'}`` dicts.
    """
    # Without this guard a zero budget yields an empty-content document and
    # a negative one slices lines off the END of the first document.
    if max_lines <= 0:
        return []

    documents = []
    total_lines = 0

    for file_id in reuters.fileids():
        content = reuters.raw(file_id)
        lines = content.splitlines()
        num_lines = len(lines)

        if total_lines + num_lines < max_lines:
            documents.append({'content': content, 'date': 'Unknown'})
            total_lines += num_lines
        else:
            # Budget reached: keep only what still fits, then stop.
            remaining_lines = max_lines - total_lines
            limited_content = '\n'.join(lines[:remaining_lines])
            documents.append({'content': limited_content, 'date': 'Unknown'})
            break
    return documents

# Load the line-limited Reuters sample and extract one event per verb.
reuters_documents = load_reuters_data()
news_events = process_dataset(reuters_documents)

# Replace each event's date string with a datetime so events can be sorted.
# 'Unknown' and strings that do not match YYYY-MM-DD become datetime.max,
# which places them after every real date.
for event in news_events:
    date_str = event['date']
    try:
        if date_str != 'Unknown':
            event['date'] = datetime.strptime(date_str, '%Y-%m-%d')
        else:
            event['date'] = datetime.max
    except ValueError:
        event['date'] = datetime.max

# sorted() is stable, so events with equal dates keep their extraction order.
all_events_sorted = sorted(news_events, key=lambda x: x['date'])

# Keep only the first max_events events for the graph and the printout.
max_events = 100
all_events_sorted = all_events_sorted[:max_events]

# Normalise entity mentions in the retained events (mutates them in place).
for event in all_events_sorted:
    event['sentence'] = resolve_coreferences(event['sentence'])
    event['subject'] = resolve_coreferences(event['subject'])
    event['objects'] = [resolve_coreferences(obj) for obj in event['objects']]

# Directed graph whose nodes are keyed by the hash of (sentence, date).
G = nx.DiGraph()

# Events that share the same sentence and date produce the same id, so
# add_node is called repeatedly with that id for them.
for event in all_events_sorted:
    event_id = hash((event['sentence'], event['date']))
    G.add_node(event_id, label=event['sentence'], date=event['date'].strftime('%Y-%m-%d'))

# Node id -> (x, y) position used for plotting.
pos = nx.kamada_kawai_layout(G)

# Flatten edge endpoints into coordinate lists; the None after each pair
# breaks the line so separate edges are not joined together.
# No edges are added to G in this cell, so these lists stay empty here.
edge_x, edge_y = [], []
for edge in G.edges():
    x0, y0 = pos[edge[0]]
    x1, y1 = pos[edge[1]]
    edge_x.extend([x0, x1, None])
    edge_y.extend([y0, y1, None])

# Node coordinates plus a "<sentence> (<date>)" label for each node.
node_x, node_y, node_text = [], [], []
for node in G.nodes():
    x, y = pos[node]
    node_x.append(x)
    node_y.append(y)
    node_text.append(f"{G.nodes[node]['label']} ({G.nodes[node]['date']})")

# Edges are drawn as thin grey lines with hover disabled.
edge_trace = go.Scatter(
    x=edge_x, y=edge_y,
    line=dict(width=1, color='#888'),
    hoverinfo='none',
    mode='lines')

# Nodes are drawn as markers; node_text is shown both as the on-plot label
# (mode includes 'text') and as the hover text.
node_trace = go.Scatter(
    x=node_x, y=node_y,
    mode='markers+text',
    hoverinfo='text',
    text=node_text,
    textposition="top center",
    marker=dict(
        color='#FFA07A',
        size=20,
        line_width=2))

# Assemble both traces into one figure and render it.
fig = go.Figure(data=[edge_trace, node_trace],
                layout=go.Layout(
                    title='Causal Graph of Reuters Events',
                    title_x=0.5,
                    showlegend=False,
                    hovermode='closest',
                    margin=dict(b=20, l=5, r=5, t=40)))

fig.show()

# pprint is not among this cell's imports; import it here so the cell does
# not depend on an earlier cell having loaded it.
import pprint

# Dump the retained events (after date normalisation and coreference
# rewriting) in a readable, indented form.
print("Extracted Events:")
pp = pprint.PrettyPrinter(indent=4)
pp.pprint(all_events_sorted)


[nltk_data] Downloading package reuters to /root/nltk_data...
[nltk_data]   Package reuters is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


Extracted Events:
[   {   'date': datetime.datetime(9999, 12, 31, 23, 59, 59, 999999),
        'objects': [],
        'sentence': 'ASIAN EXPORTERS FEAR DAMAGE FROM U.S.-JAPAN RIFT\n'
                    '  Mounting trade friction between the\n'
                    "  U.S. And Japan has raised fears among many of Asia's "
                    'exporting\n'
                    '  nations that the row could inflict far-reaching '
                    'economic\n'
                    '  damage, businessmen and officials said.\n'
                    '      ',
        'subject': 'EXPORTERS',
        'verb': 'fear'},
    {   'date': datetime.datetime(9999, 12, 31, 23, 59, 59, 999999),
        'objects': [],
        'sentence': 'ASIAN EXPORTERS FEAR DAMAGE FROM U.S.-JAPAN RIFT\n'
                    '  Mounting trade friction between the\n'
                    "  U.S. And Japan has raised fears among many of Asia's "
                    'exporting\n'
                    '  nations that the row c

In [6]:
import spacy
import networkx as nx
import plotly.graph_objects as go
from datetime import datetime
import nltk
import re
from nltk.corpus import reuters
from textblob import TextBlob
# Fetch every NLTK resource this cell asks for, one download call per name.
for _resource in ('reuters', 'punkt', 'averaged_perceptron_tagger'):
    nltk.download(_resource)

# spaCy pipeline used by extract_events() to parse each document.
nlp = spacy.load('en_core_web_sm')

# Canonical entity name -> surface forms that are rewritten to that name.
coreference_mapping = {
    'oil spill': ['spill', 'oil leak', 'incident'],
    'government': ['authorities', 'government', 'Environmental Agency'],
    'local communities': ['residents', 'local community', 'local communities'],
}


def _compile_coreference_pattern(mapping):
    """Build a single alternation regex for every phrase in ``mapping``.

    Returns ``(pattern, targets)`` where ``targets[i]`` is the canonical
    name belonging to capture group ``i + 1``. ``pattern`` is ``None`` when
    the mapping contains no usable phrase.
    """
    pairs = []
    seen = set()
    for canonical, aliases in mapping.items():
        # The canonical name is itself a candidate, so an existing
        # "oil spill" is consumed whole instead of having its "spill"
        # part rewritten a second time.
        for phrase in [canonical, *aliases]:
            folded = phrase.lower()
            if phrase and folded not in seen:
                seen.add(folded)
                pairs.append((phrase, canonical))
    if not pairs:
        return None, []
    # Regex alternation is ordered: try longer phrases first so a short
    # phrase never shadows a longer one starting at the same position.
    pairs.sort(key=lambda pair: len(pair[0]), reverse=True)
    alternation = '|'.join('(' + re.escape(phrase) + ')' for phrase, _ in pairs)
    pattern = re.compile(r'\b(?:' + alternation + r')\b', flags=re.IGNORECASE)
    return pattern, [canonical for _, canonical in pairs]


def resolve_coreferences(text):
    """Rewrite every known alias in ``text`` to its canonical entity name.

    Matching is case-insensitive and restricted to whole words/phrases.
    All phrases are replaced in one pass over the text, so replacement
    output is never scanned again (the previous alias-by-alias loop turned
    "oil spill" into "oil oil spill").

    Args:
        text: The string to normalise.

    Returns:
        The text with aliases replaced by the keys of
        ``coreference_mapping``.
    """
    # Built per call so later edits to coreference_mapping are honoured.
    pattern, targets = _compile_coreference_pattern(coreference_mapping)
    if pattern is None:
        return text
    # Exactly one capture group participates in each match; its index
    # selects the canonical replacement.
    return pattern.sub(lambda match: targets[match.lastindex - 1], text)

def extract_events(text):
    """Return one event record per verb that the spaCy pipeline finds.

    ``text`` is parsed with the module-level ``nlp`` pipeline. Every token
    tagged ``VERB`` yields a dict with these keys:

    * ``sentence`` - text of the sentence containing the verb
    * ``verb``     - lemma of the verb
    * ``subject``  - text of the last ``nsubj``/``nsubjpass`` child of the
      verb, or ``''`` when it has none
    * ``objects``  - texts of the verb's ``dobj``/``pobj``/``dative``
      children plus the ``pobj`` children of its ``prep`` children

    Args:
        text: Raw document text.

    Returns:
        A list of event dicts in document order.
    """
    subject_deps = ('nsubj', 'nsubjpass')
    direct_object_deps = ('dobj', 'pobj', 'dative')

    doc = nlp(text)
    events = []
    for sent in doc.sents:
        for verb in sent:
            if verb.pos_ != 'VERB':
                continue
            subject = ''
            objects = []
            for child in verb.children:
                if child.dep_ in subject_deps:
                    subject = child.text
                elif child.dep_ in direct_object_deps:
                    objects.append(child.text)
                elif child.dep_ == 'prep':
                    # A prepositional object is attached to its preposition
                    # rather than to the verb, so look one level further
                    # down; checking only the verb's own children misses it.
                    objects.extend(
                        grandchild.text
                        for grandchild in child.children
                        if grandchild.dep_ == 'pobj'
                    )
            events.append({
                'sentence': sent.text,
                'verb': verb.lemma_,
                'subject': subject,
                'objects': objects,
            })
    return events

def process_dataset(entries, date_key='date'):
    """Collect the events of every entry and stamp each with the entry date.

    Args:
        entries: Iterable of dict-like records. The text is read from the
            ``'content'`` key; records whose content is missing or empty
            are skipped.
        date_key: Key under which a record stores its date. Records
            without that key get the placeholder ``'Unknown'``.

    Returns:
        A flat list of the event dicts returned by ``extract_events``,
        each with an added ``'date'`` entry.
    """
    collected = []
    for record in entries:
        body = record.get('content')
        if not body:
            continue
        for item in extract_events(body):
            item['date'] = record.get(date_key, 'Unknown')
            collected.append(item)
    return collected

def load_reuters_data(max_lines=5000):
    """Load Reuters corpus documents up to a total line budget.

    Documents are read in ``reuters.fileids()`` order. A document is kept
    whole while the running line count stays strictly below ``max_lines``;
    the first document that would reach or exceed the budget is cut down
    to the remaining number of lines (re-joined with ``'\\n'``) and ends
    the scan.

    Args:
        max_lines: Maximum total number of lines to load. A value of zero
            or less loads nothing.

    Returns:
        A list of ``{'content': str, 'date': 'Unknown'}`` dicts.
    """
    # Without this guard a zero budget yields an empty-content document and
    # a negative one slices lines off the END of the first document.
    if max_lines <= 0:
        return []

    documents = []
    total_lines = 0

    for file_id in reuters.fileids():
        content = reuters.raw(file_id)
        lines = content.splitlines()
        num_lines = len(lines)

        if total_lines + num_lines < max_lines:
            documents.append({'content': content, 'date': 'Unknown'})
            total_lines += num_lines
        else:
            # Budget reached: keep only what still fits, then stop.
            remaining_lines = max_lines - total_lines
            limited_content = '\n'.join(lines[:remaining_lines])
            documents.append({'content': limited_content, 'date': 'Unknown'})
            break
    return documents

# Load the line-limited Reuters sample.
reuters_documents = load_reuters_data()

# Extract one event per verb from every loaded document.
news_events = process_dataset(reuters_documents)

# Replace each event's date string with a datetime so events can be sorted.
# 'Unknown' and strings that do not match YYYY-MM-DD become datetime.max,
# which places them after every real date.
for event in news_events:
    date_str = event['date']
    try:
        if date_str != 'Unknown':
            event['date'] = datetime.strptime(date_str, '%Y-%m-%d')
        else:
            event['date'] = datetime.max
    except ValueError:
        event['date'] = datetime.max

# sorted() is stable, so events with equal dates keep their extraction order.
all_events_sorted = sorted(news_events, key=lambda x: x['date'])

# Keep only the first max_events events for the graph and the printout.
max_events = 100
all_events_sorted = all_events_sorted[:max_events]

# Normalise entity mentions in the retained events (mutates them in place).
for event in all_events_sorted:
    event['sentence'] = resolve_coreferences(event['sentence'])
    event['subject'] = resolve_coreferences(event['subject'])
    event['objects'] = [resolve_coreferences(obj) for obj in event['objects']]

def get_sentiment(text):
    """Label ``text`` by the sign of its TextBlob polarity score.

    Returns:
        ``'positive'`` for a polarity above zero, ``'negative'`` for one
        below zero, and ``'neutral'`` otherwise.
    """
    polarity = TextBlob(text).sentiment.polarity
    if polarity > 0:
        return 'positive'
    if polarity < 0:
        return 'negative'
    return 'neutral'

# Tag every retained event with the sentiment label of its sentence.
for event in all_events_sorted:
    event['sentiment'] = get_sentiment(event['sentence'])

# Phrases whose presence in a sentence marks that sentence as an effect.
causal_keywords = ['cause', 'lead to', 'result in', 'due to', 'because']


def find_causal_relations(events):
    """Pair every event with each other event whose sentence is causal.

    An event counts as an effect when its ``'sentence'`` contains at least
    one entry of ``causal_keywords`` (case-sensitive substring test). Every
    other event in ``events`` is then recorded as a cause of it.

    Each (cause, effect) pair is reported once. Appending once per matching
    keyword duplicated pairs, e.g. any sentence containing "because" also
    contains "cause".

    Args:
        events: Sequence of event dicts with a ``'sentence'`` key.

    Returns:
        A list of ``(cause_event, effect_event)`` tuples, ordered by cause
        position and then by effect position.
    """
    # Whether an event is an effect depends only on its own sentence, so
    # decide it once per event instead of once per (cause, effect) pair.
    is_effect = [
        any(keyword in event['sentence'] for keyword in causal_keywords)
        for event in events
    ]
    return [
        (cause, effect)
        for i, cause in enumerate(events)
        for j, effect in enumerate(events)
        if i != j and is_effect[j]
    ]

# All (cause, effect) event pairs among the retained events.
causal_relations = find_causal_relations(all_events_sorted)

# Directed graph whose nodes are keyed by the hash of (sentence, date).
G = nx.DiGraph()

# Events that share the same sentence and date produce the same id, so
# add_node is called repeatedly with that id for them.
for event in all_events_sorted:
    event_id = hash((event['sentence'], event['date']))
    G.add_node(event_id, label=event['sentence'], date=event['date'].strftime('%Y-%m-%d'), sentiment=event['sentiment'])

# One 'causes' edge per relation, using the same id scheme as the nodes.
for source, target in causal_relations:
    source_id = hash((source['sentence'], source['date']))
    target_id = hash((target['sentence'], target['date']))
    G.add_edge(source_id, target_id, label='causes')

# Node id -> (x, y) position used for plotting.
pos = nx.kamada_kawai_layout(G)

# Flatten edge endpoints into coordinate lists; the None after each pair
# breaks the line so separate edges are not joined together.
# NOTE(review): edge_labels is filled here but is not passed to any trace
# below, while edge_trace sets hoverinfo='text' without a text argument;
# confirm whether edge hover labels were intended.
edge_x, edge_y, edge_labels = [], [], []
for edge in G.edges(data=True):
    x0, y0 = pos[edge[0]]
    x1, y1 = pos[edge[1]]
    edge_x.extend([x0, x1, None])
    edge_y.extend([y0, y1, None])
    edge_labels.append(edge[2]['label'])

# Node coordinates, a "<sentence> (Sentiment: <label>)" caption, and a
# colour chosen from the node's sentiment label.
node_x, node_y, node_text, node_colors = [], [], [], []
for node in G.nodes(data=True):
    x, y = pos[node[0]]
    node_x.append(x)
    node_y.append(y)
    node_text.append(f"{node[1]['label']} (Sentiment: {node[1]['sentiment']})")
    if node[1]['sentiment'] == 'positive':
        node_colors.append('#76c7c0')
    elif node[1]['sentiment'] == 'negative':
        node_colors.append('#ff6f61')
    else:
        node_colors.append('#ffa500')

# Edges are drawn as black lines.
edge_trace = go.Scatter(
    x=edge_x, y=edge_y,
    line=dict(width=2, color='#000000'),
    hoverinfo='text',
    mode='lines')

# Nodes are drawn as sentiment-coloured markers; node_text is shown both as
# the on-plot label (mode includes 'text') and as the hover text.
node_trace = go.Scatter(
    x=node_x, y=node_y,
    mode='markers+text',
    hoverinfo='text',
    text=node_text,
    textposition="top center",
    marker=dict(
        color=node_colors,
        size=20,
        line_width=2))

# Assemble both traces into one figure and render it.
fig = go.Figure(data=[edge_trace, node_trace],
                layout=go.Layout(
                    title='Enhanced Causal Graph of Reuters Events',
                    title_x=0.5,
                    showlegend=False,
                    hovermode='closest',
                    margin=dict(b=20, l=5, r=5, t=40)))

fig.show()

# pprint is not among this cell's imports; import it here so the cell does
# not depend on an earlier cell having loaded it.
import pprint

# Dump the retained events (with dates, sentiment labels and coreference
# rewriting applied) in a readable, indented form.
print("Extracted Events:")
pp = pprint.PrettyPrinter(indent=4)
pp.pprint(all_events_sorted)


[nltk_data] Downloading package reuters to /root/nltk_data...
[nltk_data]   Package reuters is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


Extracted Events:
[   {   'date': datetime.datetime(9999, 12, 31, 23, 59, 59, 999999),
        'objects': [],
        'sentence': 'ASIAN EXPORTERS FEAR DAMAGE FROM U.S.-JAPAN RIFT\n'
                    '  Mounting trade friction between the\n'
                    "  U.S. And Japan has raised fears among many of Asia's "
                    'exporting\n'
                    '  nations that the row could inflict far-reaching '
                    'economic\n'
                    '  damage, businessmen and officials said.\n'
                    '      ',
        'sentiment': 'positive',
        'subject': 'EXPORTERS',
        'verb': 'fear'},
    {   'date': datetime.datetime(9999, 12, 31, 23, 59, 59, 999999),
        'objects': [],
        'sentence': 'ASIAN EXPORTERS FEAR DAMAGE FROM U.S.-JAPAN RIFT\n'
                    '  Mounting trade friction between the\n'
                    "  U.S. And Japan has raised fears among many of Asia's "
                    'exporting\n'
            