In [None]:
# Temporal Information Extraction and Normalization Script

# Import necessary libraries
import spacy  # For natural language processing tasks
from dateparser.search import search_dates  # For parsing date/time expressions
from datetime import datetime  # For handling date/time objects

# Load spaCy's small English pipeline; its NER component supplies the DATE/TIME spans used below.
nlp = spacy.load("en_core_web_sm")

# Sample news narrative containing absolute and relative temporal expressions.
text = """
The Prime Minister announced yesterday that the new policy will take effect from next Monday.
This follows the previous announcement made on September 14, 2023.
Analysts expect that by the end of this year, significant changes will have occurred.
The committee's last meeting was two weeks ago, and the next one is scheduled for November 15th.
"""

# Run the full pipeline once; entities are read from the resulting Doc.
doc = nlp(text)

# Keep only the surface text of entities labelled DATE or TIME, in document order.
date_time_expressions = [
    entity.text for entity in doc.ents if entity.label_ in ("DATE", "TIME")
]

print("Extracted date/time expressions:")
for expression in date_time_expressions:
    print(f"- {expression}")

# Relative expressions ('yesterday', 'two weeks ago') are resolved against the
# moment this cell runs, so the normalized values change from run to run.
reference_date = datetime.now()

# Maps each extracted expression to the ISO 8601 string of its first parsed match.
# Expressions dateparser cannot parse are left out; repeated expressions share one key.
# NOTE(review): in a run recorded with this notebook (base 2024-11-07), 'next Monday'
# normalized to 2024-11-04 and 'the end of this year' to the base timestamp itself,
# so the normalization of such phrases should not be trusted without checking.
normalized_dates = {}
for expression in date_time_expressions:
    matches = search_dates(expression, settings={'RELATIVE_BASE': reference_date})
    if not matches:
        continue
    # Each match is a (matched_text, datetime) pair; only the first datetime is used.
    normalized_dates[expression] = matches[0][1].isoformat()

print("\nNormalized date/time expressions:")
for expression, iso_value in normalized_dates.items():
    print("- " + expression + ": " + iso_value)


Extracted date/time expressions:
- yesterday
- next Monday
- September 14, 2023
- the end of this year
- two weeks ago
- November 15th

Normalized date/time expressions:
- yesterday: 2024-11-06T13:53:56.934251
- next Monday: 2024-11-04T00:00:00
- September 14, 2023: 2023-09-14T00:00:00
- the end of this year: 2024-11-07T13:53:56.934251
- two weeks ago: 2024-10-24T13:53:56.934251
- November 15th: 2024-11-15T00:00:00


In [None]:
!pip install spacy dateparser


Collecting dateparser
  Downloading dateparser-1.2.0-py2.py3-none-any.whl.metadata (28 kB)
Downloading dateparser-1.2.0-py2.py3-none-any.whl (294 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m295.0/295.0 kB[0m [31m20.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: dateparser
Successfully installed dateparser-1.2.0


In [None]:
# Download the small English spaCy pipeline that spacy.load("en_core_web_sm") expects.
# The installer's own output advises restarting the kernel/runtime afterwards so the
# newly installed package can be loaded.
!python -m spacy download en_core_web_sm


Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m61.9 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [5]:
import csv

# pandas is imported here because this cell is the first one to use `pd`; without it
# the cell raises NameError on a fresh kernel (the other pandas imports sit in later cells).
import pandas as pd

# Read the raw Guardian environment-news export.
# - quoting=csv.QUOTE_NONE treats double quotes as ordinary characters, so commas
#   inside quoted fields act as column separators.
# - on_bad_lines='skip' silently drops rows with more fields than the header.
# NOTE(review): the recorded preview shows dates and sentence fragments landing in the
# wrong columns, which looks like a consequence of QUOTE_NONE; confirm against the file
# whether default quoting parses it before relying on 'Article Text'.
df = pd.read_csv(
    '/content/guardian_environment_news 2.csv',
    quoting=csv.QUOTE_NONE,
    on_bad_lines='skip',
)
df


Unnamed: 0,Title,Intro Text,Authors,Article Text,Date Published
0,Liz Truss ‘will approve more oil drilling if ...,"""Tory leadership candidate criticised by campa...",,,
1,Politics live – latest updates,,,,
2,‘This year has been very good’: readers’ UK bu...,"""Readers share their favourite sightings over ...",after news that numbers have risen since last...,,
3,UK butterfly numbers bounce back after last ye...,,,,
4,Economy | Jeremy Hunt will announce £25bn wort...,the Guardian understands. The scale of the me...,,,
...,...,...,...,...,...
25897,Luke Buckmaster is film critic and writer for...,2021-01-20,,,
25898,This article was corrected on 24 May 2017. The...,not the US. And the warning referred to in th...,not the supreme court,,
25899,"""",2018-03-16,,,
25900,World's oldest known spider dies at 43 after ...,"""Female trapdoor spider known as Number 16 was...",,,


In [6]:
# Discard the columns that the later text-processing cells do not read.
df = df.drop(columns=['Date Published', 'Intro Text', 'Authors'])

In [7]:
# Per-column count of missing values in the remaining columns.
df.isna().sum()

Unnamed: 0,0
Title,180
Article Text,23617


In [8]:
# Keep only rows in which every remaining column has a value.
df = df.dropna(axis=0, how='any')

In [9]:
import nltk
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import pandas as pd
import string

# Download the NLTK data used by the tokenizer, POS tagger, lemmatizer and stop-word
# filter below. Both the legacy resource names and the names looked up by newer NLTK
# releases ('punkt_tab', 'averaged_perceptron_tagger_eng') are fetched, so that
# word_tokenize / pos_tag do not raise LookupError after an NLTK upgrade.
for nltk_resource in (
    'punkt',
    'punkt_tab',
    'wordnet',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
    'stopwords',
):
    nltk.download(nltk_resource)


def get_wordnet_pos(tag):
    """Translate a POS tag string into the matching WordNet part-of-speech constant.

    Only the leading letter of ``tag`` is examined: 'J' -> adjective, 'V' -> verb,
    'N' -> noun, 'R' -> adverb. Any other tag (including an empty string) falls
    back to noun.
    """
    prefix_table = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_table:
        if tag.startswith(prefix):
            return wordnet_pos
    # No recognised prefix: treat the word as a noun.
    return wordnet.NOUN

# Function to preprocess text (tokenize, remove punctuation, stopwords, POS tagging, lemmatization)
from functools import lru_cache  # used only by the cached helpers directly below


@lru_cache(maxsize=None)
def _english_stop_words():
    """Return the English stop-word list as a frozenset, loaded once and then reused."""
    return frozenset(stopwords.words('english'))


@lru_cache(maxsize=None)
def _shared_lemmatizer():
    """Return a single WordNetLemmatizer instance shared by all calls."""
    return WordNetLemmatizer()


def preprocess_text(text):
    """Normalize a piece of text into a space-separated string of lemmas.

    Steps, in order: lowercase and tokenize; keep purely alphabetic tokens
    (this removes punctuation and numbers); drop English stop words; POS-tag
    the remaining tokens; lemmatize each token using its POS tag.

    Parameters
    ----------
    text : str
        Raw text to process.

    Returns
    -------
    str
        The lemmatized tokens joined by single spaces ('' when nothing survives
        the filters).
    """
    # The stop-word list and the lemmatizer do not depend on the input, so they are
    # fetched from cached helpers instead of being rebuilt for every row.
    stop_words = _english_stop_words()
    lemmatizer = _shared_lemmatizer()

    # A separate punctuation filter is unnecessary: a token that passes isalpha()
    # consists only of letters and therefore cannot be a punctuation mark.
    tokens = [
        word
        for word in word_tokenize(text.lower())
        if word.isalpha() and word not in stop_words
    ]

    # Tag the filtered tokens, then lemmatize each one with its WordNet POS.
    pos_tags = nltk.pos_tag(tokens)
    lemmatized_tokens = [
        lemmatizer.lemmatize(word, get_wordnet_pos(tag)) for word, tag in pos_tags
    ]

    return ' '.join(lemmatized_tokens)

# Step 4: Lemmatize the article body and the title, storing each result in a new column.
for source_column, lemma_column in (
    ('Article Text', 'lemmatized_article_text'),
    ('Title', 'lemmatized_title'),
):
    df[lemma_column] = df[source_column].apply(preprocess_text)

# Step 5: Show each original column next to its lemmatized counterpart.
preview_columns = ['Article Text', 'lemmatized_article_text', 'Title', 'lemmatized_title']
print(df[preview_columns])

# Optional: persist the frame (without the index) for the later timeline cell.
df.to_csv('lemmatized_dataset.csv', index=False)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


                                            Article Text  \
10      onions... the list goes on. His account of ta...   
29                                 of Fridays for Future   
36      but Queenslanders are less concerned than the...   
45                                           experts say   
68                             not a government inquiry.   
...                                                  ...   
25809   ask later’ tactics of scooter hire companies ...   
25815           smothered roads and destroyed landscapes   
25827   or elsewhere? Have you found an alternative? ...   
25879   burial grounds near the reservation and other...   
25881   have become centers for drugs and sexual viol...   

                                 lemmatized_article_text  \
10     onion list go account take fussy eat joy pictu...   
29                                         friday future   
36              queenslanders less concerned rest nation   
45                                     

In [11]:
!pip install dateparser


Collecting dateparser
  Downloading dateparser-1.2.0-py2.py3-none-any.whl.metadata (28 kB)
Downloading dateparser-1.2.0-py2.py3-none-any.whl (294 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/295.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m295.0/295.0 kB[0m [31m17.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: dateparser
Successfully installed dateparser-1.2.0


In [17]:
# Import necessary libraries
import spacy  # For natural language processing tasks
from dateparser.search import search_dates  # For parsing date/time expressions
from datetime import datetime  # For handling date/time objects
from operator import itemgetter  # For sorting events based on time
import pandas as pd  # For handling CSV data

# Load English tokenizer, POS tagger, parser, NER, and word vectors.
# The pipeline is not applied to the titles below (dateparser does the date search);
# the name is still bound here in case other cells rely on `nlp`.
nlp = spacy.load("en_core_web_sm")

# Define the path to the CSV file
csv_file_path = "/content/lemmatized_dataset.csv"  # Replace with the path to your CSV file

# Load the CSV file into a DataFrame
df = pd.read_csv(csv_file_path)

# Fail early with a clear message if the expected text column is absent.
if 'Title' not in df.columns:
    raise ValueError("CSV file must contain a 'Title' column with narratives.")

# Collected (naive_datetime, title_text) pairs, one per date expression found.
events = []

# Relative expressions (e.g. 'yesterday') are resolved against the time this cell runs.
reference_date = datetime.now()

# Iterate over the titles directly; no other column is needed.
for raw_title in df['Title']:
    # Missing titles come back from read_csv as NaN (a float), which has no .strip().
    if not isinstance(raw_title, str):
        continue
    sentence_text = raw_title.strip()
    if not sentence_text:
        continue

    # NOTE(review): the recorded timeline contains entries derived from phrases such as
    # "More than 500 years ago" and a title mentioning "70%" dated 1970, so the matches
    # returned here include false positives; treat the timeline as a rough draft.
    result = search_dates(sentence_text, settings={'RELATIVE_BASE': reference_date})
    if not result:
        continue

    for _matched_text, match_date in result:
        # Drop timezone info so that aware and naive datetimes can be sorted together.
        events.append((match_date.replace(tzinfo=None), sentence_text))

# Order the events chronologically by their normalized datetime.
events.sort(key=itemgetter(0))

# Construct the timeline
print("Constructed Timeline of Events:")
if events:
    for event_time, event_desc in events:
        # Format the event time for display
        display_time = event_time.strftime("%B %d, %Y, %I:%M %p")
        print(f"{display_time}: {event_desc}")
else:
    print("No date-related events found.")


Constructed Timeline of Events:
November 10, 1524, 03:07 PM: More than 500 years ago
November 10, 1759, 12:00 AM: The connection between public education and recreation was always part of the plan for Derby Arboretum. But even trees planted centuries ago as private collections have since come into the public domain. Princess Augusta’s extensions to Kew Gardens in 1759 are today seen by two million visitors a year. Similarly
November 10, 1950, 12:00 AM: Americans badly underestimate the expert climate consensusNumerous papers have shown that over 90% of climate science experts agree that humans are the main cause of global warming since 1950
November 10, 1964, 03:07 PM: "60 years on since Silent Spring
November 10, 1970, 12:00 AM: Lose the Beltway mindset. It’s not just the Green New Deal that is popular with the broader public. Many of the subsidiary policies – such as Medicare for All and free daycare – are now supported by upwards of 70% of the American public
November 10, 1975, 12:0