In [5]:
# Install spaCy and download the English language model
!pip install spacy
!python -m spacy download en_core_web_sm

# Install transformers for NLP models
!pip install transformers

# Install networkx for graph visualization
!pip install networkx

# Install matplotlib for plotting
!pip install matplotlib

# Install Plotly for interactive visualizations
!pip install plotly

# Install nltk for additional NLP tasks
!pip install nltk


Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m72.3 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [6]:
import spacy
from transformers import pipeline
import networkx as nx
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import nltk
import re
import pandas as pd
import numpy as np

# Load spaCy English model
nlp = spacy.load('en_core_web_sm')

# Download NLTK data
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [7]:
# Sample news articles
news_articles = [
    {
        'source': 'News Agency 1',
        'date': '2023-04-03',
        'content': 'An oil spill occurred near the coast due to a pipeline failure. Immediate wildlife deaths have been reported.'
    },
    {
        'source': 'News Agency 2',
        'date': '2023-04-04',
        'content': 'Local communities are protesting against the environmental damage caused by the oil spill.'
    }
]

# Sample social media posts
social_media_posts = [
    {
        'user': '@environmental_activist',
        'date': '2023-04-04',
        'content': 'Heartbreaking to see the oil spill devastate our coastline. #SaveOurOcean'
    },
    {
        'user': '@local_resident',
        'date': '2023-04-05',
        'content': 'Water contamination is affecting our health. When will the authorities act? #OilSpill'
    }
]

# Sample government reports
government_reports = [
    {
        'department': 'Environmental Agency',
        'date': '2023-04-05',
        'content': 'The oil spill has affected a 50-mile radius. Cleanup operations will commence tomorrow.'
    }
]

# Sample weather data
weather_data = [
    {
        'date': '2023-04-03',
        'tide': 'High tide',
        'description': 'Unusual tidal patterns observed, potentially spreading the oil spill further.'
    }
]


In [8]:
def extract_events(text):
    doc = nlp(text)
    events = []
    for sent in doc.sents:
        verbs = [token for token in sent if token.pos_ == 'VERB']
        for verb in verbs:
            subject = ''
            objects = []
            for child in verb.children:
                if child.dep_ in ['nsubj', 'nsubjpass']:
                    subject = child.text
                if child.dep_ in ['dobj', 'pobj', 'dative']:
                    objects.append(child.text)
            event = {
                'sentence': sent.text,
                'verb': verb.lemma_,
                'subject': subject,
                'objects': objects
            }
            events.append(event)
    return events


In [9]:


# Update weather data to include a 'content' key
for entry in weather_data:
    entry['content'] = entry['description']

# Combine all data into one list
all_data = news_articles + social_media_posts + government_reports + weather_data

# Extract events
all_events = []
for entry in all_data:
    # Safely access 'content'
    text = entry.get('content')  # Use .get() to avoid KeyError
    if text:  # Ensure 'content' exists
        events = extract_events(text)
        for event in events:
            event['date'] = entry.get('date', 'Unknown')  # Default to 'Unknown' if 'date' is missing
            all_events.append(event)
    else:
        print(f"Skipping entry due to missing 'content': {entry}")

# Print extracted events for verification
import pprint
pp = pprint.PrettyPrinter(indent=4)
pp.pprint(all_events)



[   {   'date': '2023-04-03',
        'objects': [],
        'sentence': 'An oil spill occurred near the coast due to a pipeline '
                    'failure.',
        'subject': 'spill',
        'verb': 'occur'},
    {   'date': '2023-04-03',
        'objects': [],
        'sentence': 'Immediate wildlife deaths have been reported.',
        'subject': 'deaths',
        'verb': 'report'},
    {   'date': '2023-04-04',
        'objects': [],
        'sentence': 'Local communities are protesting against the '
                    'environmental damage caused by the oil spill.',
        'subject': 'communities',
        'verb': 'protest'},
    {   'date': '2023-04-04',
        'objects': [],
        'sentence': 'Local communities are protesting against the '
                    'environmental damage caused by the oil spill.',
        'subject': '',
        'verb': 'cause'},
    {   'date': '2023-04-04',
        'objects': [],
        'sentence': 'Heartbreaking to see the oil spill devas

In [10]:
import pprint
pp = pprint.PrettyPrinter(indent=4)
pp.pprint(all_events)


[   {   'date': '2023-04-03',
        'objects': [],
        'sentence': 'An oil spill occurred near the coast due to a pipeline '
                    'failure.',
        'subject': 'spill',
        'verb': 'occur'},
    {   'date': '2023-04-03',
        'objects': [],
        'sentence': 'Immediate wildlife deaths have been reported.',
        'subject': 'deaths',
        'verb': 'report'},
    {   'date': '2023-04-04',
        'objects': [],
        'sentence': 'Local communities are protesting against the '
                    'environmental damage caused by the oil spill.',
        'subject': 'communities',
        'verb': 'protest'},
    {   'date': '2023-04-04',
        'objects': [],
        'sentence': 'Local communities are protesting against the '
                    'environmental damage caused by the oil spill.',
        'subject': '',
        'verb': 'cause'},
    {   'date': '2023-04-04',
        'objects': [],
        'sentence': 'Heartbreaking to see the oil spill devas

In [11]:
# Convert date strings to datetime objects for sorting
from datetime import datetime

for event in all_events:
    event['date'] = datetime.strptime(event['date'], '%Y-%m-%d')

# Sort events by date
all_events_sorted = sorted(all_events, key=lambda x: x['date'])


In [12]:
# Define a mapping of coreferences
coreference_mapping = {
    'oil spill': ['spill', 'oil leak', 'incident'],
    'government': ['authorities', 'government', 'Environmental Agency'],
    'local communities': ['residents', 'local community', 'local communities'],
}

def resolve_coreferences(text):
    for key, values in coreference_mapping.items():
        for value in values:
            text = re.sub(r'\b' + value + r'\b', key, text, flags=re.IGNORECASE)
    return text


In [13]:
for event in all_events_sorted:
    event['sentence'] = resolve_coreferences(event['sentence'])
    event['subject'] = resolve_coreferences(event['subject'])
    event['objects'] = [resolve_coreferences(obj) for obj in event['objects']]


In [14]:
pp.pprint(all_events_sorted)


[   {   'date': datetime.datetime(2023, 4, 3, 0, 0),
        'objects': [],
        'sentence': 'An oil oil spill occurred near the coast due to a '
                    'pipeline failure.',
        'subject': 'oil spill',
        'verb': 'occur'},
    {   'date': datetime.datetime(2023, 4, 3, 0, 0),
        'objects': [],
        'sentence': 'Immediate wildlife deaths have been reported.',
        'subject': 'deaths',
        'verb': 'report'},
    {   'date': datetime.datetime(2023, 4, 3, 0, 0),
        'objects': [],
        'sentence': 'Unusual tidal patterns observed, potentially spreading '
                    'the oil oil spill further.',
        'subject': 'patterns',
        'verb': 'observe'},
    {   'date': datetime.datetime(2023, 4, 3, 0, 0),
        'objects': ['oil spill'],
        'sentence': 'Unusual tidal patterns observed, potentially spreading '
                    'the oil oil spill further.',
        'subject': '',
        'verb': 'spread'},
    {   'date': datetim

In [16]:
causal_keywords = ['cause', 'lead to', 'result in', 'due to', 'because', 'prompt', 'affect']


In [17]:
def find_causal_relations(events):
    causal_relations = []
    for i, event_i in enumerate(events):
        for j, event_j in enumerate(events):
            if i != j and event_i['date'] <= event_j['date']:
                text = event_j['sentence']
                for keyword in causal_keywords:
                    pattern = r'\b' + re.escape(keyword) + r'\b'
                    if re.search(pattern, text, re.IGNORECASE):
                        if event_i['verb'] in text or event_i['subject'] in text:
                            relation = {
                                'source_event': event_i,
                                'target_event': event_j,
                                'relation': keyword
                            }
                            causal_relations.append(relation)
    return causal_relations

causal_relations = find_causal_relations(all_events_sorted)


In [18]:
pp.pprint(causal_relations)


[   {   'relation': 'due to',
        'source_event': {   'date': datetime.datetime(2023, 4, 3, 0, 0),
                            'objects': ['oil spill'],
                            'sentence': 'Unusual tidal patterns observed, '
                                        'potentially spreading the oil oil '
                                        'spill further.',
                            'subject': '',
                            'verb': 'spread'},
        'target_event': {   'date': datetime.datetime(2023, 4, 3, 0, 0),
                            'objects': [],
                            'sentence': 'An oil oil spill occurred near the '
                                        'coast due to a pipeline failure.',
                            'subject': 'oil spill',
                            'verb': 'occur'}}]


In [19]:
# Adding context from weather data
contextual_events = []
for event in all_events_sorted:
    # If the event date matches the weather data date, add context
    for weather in weather_data:
        weather_date = datetime.strptime(weather['date'], '%Y-%m-%d')
        if event['date'] == weather_date:
            event['context'] = weather['description']
    contextual_events.append(event)


In [20]:
sentiment_pipeline = pipeline('sentiment-analysis')


No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

In [21]:
subjective_words = ['devastating', 'heartbreaking', 'significant', 'immediate']
neutral_alternatives = {
    'devastating': 'severe',
    'heartbreaking': 'concerning',
    'significant': 'notable',
    'immediate': 'prompt'
}


In [22]:
def detect_bias(text):
    # Sentiment Analysis
    sentiment = sentiment_pipeline(text)[0]
    # Subjective Language Detection
    flagged_words = []
    for word in subjective_words:
        if re.search(r'\b' + word + r'\b', text, re.IGNORECASE):
            flagged_words.append(word)
    return {
        'sentiment': sentiment,
        'flagged_words': flagged_words
    }


In [23]:
for event in contextual_events:
    bias_info = detect_bias(event['sentence'])
    event['bias_info'] = bias_info


In [24]:
pp.pprint(contextual_events)


[   {   'bias_info': {   'flagged_words': [],
                         'sentiment': {   'label': 'NEGATIVE',
                                          'score': 0.9993553757667542}},
        'context': 'Unusual tidal patterns observed, potentially spreading the '
                   'oil spill further.',
        'date': datetime.datetime(2023, 4, 3, 0, 0),
        'objects': [],
        'sentence': 'An oil oil spill occurred near the coast due to a '
                    'pipeline failure.',
        'subject': 'oil spill',
        'verb': 'occur'},
    {   'bias_info': {   'flagged_words': ['immediate'],
                         'sentiment': {   'label': 'NEGATIVE',
                                          'score': 0.9977092742919922}},
        'context': 'Unusual tidal patterns observed, potentially spreading the '
                   'oil spill further.',
        'date': datetime.datetime(2023, 4, 3, 0, 0),
        'objects': [],
        'sentence': 'Immediate wildlife deaths have been 

In [25]:
def suggest_neutral_language(text):
    suggestions = {}
    for word in subjective_words:
        if re.search(r'\b' + word + r'\b', text, re.IGNORECASE):
            suggestions[word] = neutral_alternatives[word]
            text = re.sub(r'\b' + word + r'\b', neutral_alternatives[word], text, flags=re.IGNORECASE)
    return text, suggestions


In [26]:
G = nx.DiGraph()

# Add nodes
for event in contextual_events:
    event_id = id(event)
    G.add_node(event_id, label=event['sentence'], date=event['date'].strftime('%Y-%m-%d'))

# Add edges based on causal relations
for relation in causal_relations:
    source_id = id(relation['source_event'])
    target_id = id(relation['target_event'])
    G.add_edge(source_id, target_id, relation=relation['relation'])


In [27]:
pos = nx.spring_layout(G)


In [28]:
edge_x = []
edge_y = []
for edge in G.edges():
    x0, y0 = pos[edge[0]]
    x1, y1 = pos[edge[1]]
    edge_x.extend([x0, x1, None])
    edge_y.extend([y0, y1, None])

node_x = []
node_y = []
node_text = []
for node in G.nodes():
    x, y = pos[node]
    node_x.append(x)
    node_y.append(y)
    node_text.append(G.nodes[node]['label'])


In [33]:
import plotly.express as px
import plotly.graph_objects as go

edge_trace = go.Scatter(
    x=edge_x, y=edge_y,
    line=dict(width=1, color='#888'),
    hoverinfo='none',
    mode='lines')

node_trace = go.Scatter(
    x=node_x, y=node_y,
    mode='markers+text',
    hoverinfo='text',
    text=node_text,
    textposition="top center",
    marker=dict(
        showscale=False,
        color='#FFA07A',
        size=10,
        line_width=2))

fig = go.Figure(data=[edge_trace, node_trace],
             layout=go.Layout(
                title='Causal Graph of Events',
                showlegend=False,
                hovermode='closest',
                margin=dict(b=20,l=5,r=5,t=40)))

fig.show()


In [30]:
summarizer = pipeline('summarization')


No model was supplied, defaulted to sshleifer/distilbart-cnn-12-6 and revision a4f8f3e (https://huggingface.co/sshleifer/distilbart-cnn-12-6).
Using a pipeline without specifying a model name and revision in production is not recommended.


config.json:   0%|          | 0.00/1.80k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

In [31]:
# Concatenate all event sentences
event_sentences = ' '.join([event['sentence'] for event in contextual_events])


In [30]:
summary = summarizer(event_sentences, max_length=130, min_length=30, do_sample=False)
print("Summary:")
print(summary[0]['summary_text'])


Your max_length is set to 130, but your input_length is only 125. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=62)


Summary:
 An oil oil spill occurred near the coast due to a pipeline failure . Immediate wildlife deaths have been reported . Local communities are protesting against the environmental damage caused by the spill .


In [42]:
print("Normalized Node Texts:", list(normalized_node_to_id.keys()))
print("Normalized Causal Link Sources and Targets:", [source for source, _ in normalized_causal_links])
print("Normalized Causal Link Sources and Targets:", [target for _, target in normalized_causal_links])


Normalized Node Texts: ['an oil oil spill occurred near the coast due to a pipeline failure.', 'immediate wildlife deaths have been reported.', 'local communities are protesting against the environmental damage caused by the oil oil spill.', 'heartbreaking to see the oil oil spill devastate our coastline.', 'water contamination is affecting our health.', 'when will the government act?', 'the oil oil spill has affected a 50-mile radius.', 'cleanup operations will commence tomorrow.']
Normalized Causal Link Sources and Targets: ['an oil spill occurred near the coast due to a pipeline failure.', 'an oil spill occurred near the coast due to a pipeline failure.', 'an oil spill occurred near the coast due to a pipeline failure.', 'water contamination is affecting the health of residents.']
Normalized Causal Link Sources and Targets: ['wildlife deaths have been reported.', 'local communities are protesting against the environmental damage caused by the oil spill.', 'water contamination is aff

In [45]:
# Normalize a string (lowercase and strip extra spaces)
def normalize(text):
    return text.lower().strip()

# Create normalized mappings for nodes
normalized_node_to_id = {normalize(node): idx for idx, node in enumerate(node_text)}

# Use fuzzy matching to map causal links
from fuzzywuzzy import process

updated_causal_links = []
for source, target in causal_links:
    source_match = process.extractOne(normalize(source), normalized_node_to_id.keys())
    target_match = process.extractOne(normalize(target), normalized_node_to_id.keys())

    if source_match and target_match:
        updated_causal_links.append((source_match[0], target_match[0]))

print("Updated Causal Links:", updated_causal_links)

# Generate edge coordinates
edge_x = []
edge_y = []

for source, target in updated_causal_links:
    if source in normalized_node_to_id and target in normalized_node_to_id:
        source_idx = normalized_node_to_id[source]
        target_idx = normalized_node_to_id[target]
        x0, y0 = node_x[source_idx], node_y[source_idx]
        x1, y1 = node_x[target_idx], node_y[target_idx]
        edge_x.extend([x0, x1, None])
        edge_y.extend([y0, y1, None])

print("Edge X coordinates:", edge_x)
print("Edge Y coordinates:", edge_y)

# Create edge trace
edge_trace = go.Scatter(
    x=edge_x, y=edge_y,
    line=dict(width=2, color='blue'),
    hoverinfo='none',
    mode='lines'
)

# Create node trace
node_trace = go.Scatter(
    x=node_x, y=node_y,
    mode='markers+text',
    hoverinfo='text',
    text=node_text,
    textposition="top center",
    marker=dict(
        showscale=False,
        color='#FFA07A',
        size=10,
        line_width=2
    )
)

# Build the graph
fig = go.Figure(data=[edge_trace, node_trace],
             layout=go.Layout(
                title='Causal Graph of Events with Edges',
                showlegend=False,
                hovermode='closest',
                margin=dict(b=20, l=5, r=5, t=40)))

fig.show()






Updated Causal Links: [('an oil oil spill occurred near the coast due to a pipeline failure.', 'immediate wildlife deaths have been reported.'), ('an oil oil spill occurred near the coast due to a pipeline failure.', 'local communities are protesting against the environmental damage caused by the oil oil spill.'), ('an oil oil spill occurred near the coast due to a pipeline failure.', 'water contamination is affecting our health.'), ('water contamination is affecting our health.', 'cleanup operations will commence tomorrow.')]
Edge X coordinates: [-0.016247182426516474, -1.0, None, -0.016247182426516474, -0.5618974972488427, None, -0.016247182426516474, 0.9150370272842635, None, 0.9150370272842635, 0.7785628011726281, None]
Edge Y coordinates: [0.9672316966114425, -0.18699069366525256, None, 0.9672316966114425, 0.8329586718381679, None, 0.9672316966114425, -0.11087037628605895, None, -0.11087037628605895, -0.6526839095811819, None]


In [44]:
!pip install fuzzywuzzy


Collecting fuzzywuzzy
  Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl.metadata (4.9 kB)
Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl (18 kB)
Installing collected packages: fuzzywuzzy
Successfully installed fuzzywuzzy-0.18.0
