## Input news articles into the Stardog DNA repositories

There are 7 articles from different sources on the Florida Supreme Court ruling on banning abortions after 6 weeks (in the directory articles/abortion-articles). There are also 19 articles from different sources on the topic of the Trump-Harris presidential debate held on September 11, 2024 (in the directory articles/debate-articles).

Two repositories are created ("abortion" and "debate") with background data added to each. Then, the articles' sentences are converted to RDF, complying with the DNA Event Ontology, and ingested into the 'dna' Stardog Cloud repository.

To execute this notebook, make sure that: 
* The DNA application's flask server is running (execute 'flask run' from the dna directory)

In [1]:
import json
import requests

In [2]:
# Start from a clean slate: drop both article repositories and the test repository, foo
for stale_repo in ('abortion', 'debate', 'foo'):
    response = requests.delete(f'http://127.0.0.1:5000/dna/v1/repositories?repository={stale_repo}')

# Re-create the two repositories that will hold the news articles
for new_repo in ('abortion', 'debate'):
    response = requests.post(f'http://127.0.0.1:5000/dna/v1/repositories?repository={new_repo}')

# List the repositories to verify that both were created
response = requests.get('http://127.0.0.1:5000/dna/v1/repositories')
print(response.status_code, response.json())

200 [{'created': '2024-10-14T16:00:07', 'repository': 'debate'}, {'created': '2024-10-14T16:00:06', 'repository': 'abortion'}]


In [3]:
# Add background data
# Each entry names an entity, gives its type and lists the alternate names to send with it
abortion_background = {
  "backgroundNames": [
    {"name": "Supreme Court of Florida", "type": "organization", "alsoKnownAs": ["Florida Supreme Court"]},
    {"name": "Supreme Court of the United States", "type": "organization", "alsoKnownAs": []},  # Use Wikidata info
    {"name": "Joe Biden", "type": "person", "alsoKnownAs": ["Joe", "Biden"]},
    {"name": "Donald Trump", "type": "person", "alsoKnownAs": ["Donald", "Trump"]},
    {"name": "Roe v. Wade", "type": "law", "alsoKnownAs": ["Roe", "Roe vs. Wade"]},
    {"name": "Democratic Party", "type": "norp", "alsoKnownAs": ["Democratic", "Democrat"]},
    {"name": "Republican Party", "type": "norp", "alsoKnownAs": ["Republican"]}
  ]
}

debate_background = {
  "backgroundNames": [
    {"name": "Kamala Harris", "type": "person", "alsoKnownAs": ["Kamala", "Harris"]},
    {"name": "Joe Biden", "type": "person", "alsoKnownAs": ["Joe", "Biden"]},
    {"name": "Donald Trump", "type": "person", "alsoKnownAs": ["Donald", "Trump"]},
    {"name": "Tim Walz", "type": "person", "alsoKnownAs": ["Tim", "Walz", "Gov Walz"]},
    {"name": "ABC News", "type": "organization", "alsoKnownAs": ["ABC"]},
    {"name": "Democratic Party", "type": "norp", "alsoKnownAs": ["Democratic", "Democrat"]},
    {"name": "Republican Party", "type": "norp", "alsoKnownAs": ["Republican"]},
  ]
}

# Post each repository's background names and print the status code and response body;
# a blank line separates the two results
background_by_repo = (('abortion', abortion_background), ('debate', debate_background))
for position, (repo_name, background) in enumerate(background_by_repo):
    if position > 0:
        print()
    response = requests.post(
        f'http://127.0.0.1:5000/dna/v1/repositories/background?repository={repo_name}',
        json=background)
    print(response.status_code, response.json())



201 {'processedNames': [{'isCollection': False, 'name': 'Florida Supreme Court', 'type': 'organization'}, {'isCollection': False, 'name': 'U.S. Supreme Court', 'type': 'organization'}, {'isCollection': False, 'name': 'Joe Biden', 'type': 'person'}, {'isCollection': False, 'name': 'Donald Trump', 'type': 'person'}, {'isCollection': False, 'name': 'Roe v. Wade', 'type': 'law'}, {'isCollection': False, 'name': 'Democrat', 'type': 'norp'}, {'isCollection': False, 'name': 'Republican', 'type': 'norp'}], 'repository': 'abortion', 'skippedNames': []}

201 {'processedNames': [{'isCollection': False, 'name': 'Kamala Harris', 'type': 'person'}, {'isCollection': False, 'name': 'Joe Biden', 'type': 'person'}, {'isCollection': False, 'name': 'Donald Trump', 'type': 'person'}, {'isCollection': False, 'name': 'Tim Walz', 'type': 'person'}, {'isCollection': False, 'name': 'J. D. Vance', 'type': 'person'}, {'isCollection': False, 'name': 'ABC News', 'type': 'organization'}, {'isCollection': True, 'name

In [4]:
# Sources grouped by category
international = (
    'Al Jazeera', 'Economist', 'Financial Express', 'Globe and Mail', 'Guardian',
    'Hindustan Times', 'Indo-Asian News Service', 'Irish Times', 'Namibian',
    'Radio France Internationale', 'Toronto Star',
)
# The center/conservative/liberal groupings are for news (not editorials)
center = ('Christian Science Monitor', 'Wall Street Journal')
conservative = ('Breitbart', 'Fox News', 'Washington Times')
liberal = ('Huffington Post', 'New York Times', 'USA Today', 'Washington Post')

# Every source exactly once, in alphabetical order (the union of the four groups above)
file_names = tuple(sorted(international + center + conservative + liberal))

In [5]:
def _parse_article(article_details: str, source: str) -> dict:
    """Build the JSON body of a narrative request from an article file's contents.

    The file is expected to contain the markers 'Title: ', 'Published: ', 'URL: ' and
    'Text: '. The first three values run to the end of their line; the text is
    everything after the first 'Text: ' marker.

    Raises:
        IndexError: if one of the markers is missing from article_details.
    """
    def header(label: str) -> str:
        # Remainder of the line that follows the first occurrence of the label
        return article_details.split(label, 1)[1].split('\n', 1)[0]

    return {
        'title': header('Title: '),
        'source': source,
        'published': header('Published: '),
        'url': header('URL: '),
        # maxsplit=1 keeps the whole body even when the article itself contains 'Text: '
        'text': article_details.split('Text: ', 1)[1],
    }


# Function to fully ingest the first sentences (10 by default) of each news article
# "Fully ingest" means that the text is mapped to the DNA ontology
# All sentences and quotes are processed to some extent
def ingest_articles(repository: str, sentences: int = 10) -> None:
    """Post every available article for a repository to the DNA narratives endpoint.

    For each name in the module-level file_names, the file
    articles/{repository}-articles/{name}.txt is read, split into its title, published,
    URL and text fields, and posted as JSON. The name of each article that could be
    read is printed, followed by a message if the server answers with an error status.

    Args:
        repository: Repository name; selects both the articles directory and the
            'repository' query parameter.
        sentences: Value of the 'sentences' query parameter (the number of sentences
            to fully ingest).

    Raises:
        IndexError: if an article file lacks one of the expected markers.
    """
    for file_name in file_names:
        try:
            # NOTE(review): decoding relies on the platform default encoding - confirm the
            # encoding of the article files and pass encoding= explicitly
            with open(f'articles/{repository}-articles/{file_name}.txt', 'r') as article:
                article_details = article.read()
        except OSError:
            # file_names lists every source, but a given articles directory may hold only
            # some of them; sources without a readable file are skipped
            continue
        print(file_name)
        response = requests.post(
            f'http://127.0.0.1:5000/dna/v1/repositories/narratives?repository={repository}&sentences={sentences}',
            json=_parse_article(article_details, file_name))
        if not response.ok:
            # Surface failed ingests instead of silently discarding the response
            print(f'  Ingest failed for {file_name}: HTTP {response.status_code}')

In [6]:
# Ingest the articles found under articles/abortion-articles into the "abortion" repository
ingest_articles("abortion")

Al Jazeera
Breitbart
Fox News
Huffington Post
New York Times
Wall Street Journal
Washington Times


In [7]:
# Check what articles are in the abortion repository
# (prints the status code followed by the indented JSON listing)
abortion_narratives_url = 'http://127.0.0.1:5000/dna/v1/repositories/narratives?repository=abortion'
response = requests.get(abortion_narratives_url)
abortion_listing = json.dumps(response.json(), indent=1)
print(response.status_code, abortion_listing)

200 {
 "narratives": [
  {
   "narrativeId": "68725ec9",
   "narrativeMetadata": {
    "published": "2024-04-02T00:00:00",
    "source": "Al Jazeera",
    "title": "Biden denounces Florida abortion ruling as \u2018outrageous\u2019 as state vote looms",
    "url": "https://www.aljazeera.com/news/2024/4/2/biden-denounces-florida-abortion-ruling-as-outrageous-as-state-vote-looms"
   },
   "numberIngested": 10,
   "numberOfSentences": 36,
   "numberOfTriples": 610,
   "processed": "2024-10-14T16:03:34"
  },
  {
   "narrativeId": "900d7d01",
   "narrativeMetadata": {
    "published": "2024-03-25T00:00:00",
    "source": "Breitbart",
    "title": "Florida Supreme Court Upholds 15-Week Abortion Limit, But Voters Will Decide in November",
    "url": "https://www.breitbart.com/politics/2024/04/01/florida-supreme-court-upholds-15-week-abortion-limit-but-voters-will-decide-in-november/"
   },
   "numberIngested": 10,
   "numberOfSentences": 36,
   "numberOfTriples": 593,
   "processed": "2024-10-

In [11]:
# Ingest the articles found under articles/debate-articles into the "debate" repository
ingest_articles("debate")

Al Jazeera
Breitbart
Christian Science Monitor
Economist
Financial Express
Fox News
Globe and Mail
Guardian
Hindustan Times
Indo-Asian News Service
Irish Times
Namibian
New York Times
Radio France Internationale
Toronto Star
USA Today
Wall Street Journal
Washington Post
Washington Times


In [12]:
# Check what articles are in the debate repository
# (prints the status code followed by the indented JSON listing)
debate_narratives_url = 'http://127.0.0.1:5000/dna/v1/repositories/narratives?repository=debate'
response = requests.get(debate_narratives_url)
debate_listing = json.dumps(response.json(), indent=1)
print(response.status_code, debate_listing)

200 {
 "narratives": [
  {
   "narrativeId": "e52af15d",
   "narrativeMetadata": {
    "published": "2024-09-11T00:00:00",
    "source": "Al Jazeera",
    "title": "Did Harris win the debate or did Trump lose it?",
    "url": "https://www.aljazeera.com/news/2024/9/11/did-harris-win-the-debate-or-did-trump-lose-it"
   },
   "numberIngested": 10,
   "numberOfSentences": 43,
   "numberOfTriples": 742,
   "processed": "2024-10-14T22:57:02"
  },
  {
   "narrativeId": "d777f769",
   "narrativeMetadata": {
    "published": "2024-09-11T00:00:00",
    "source": "Breitbart",
    "title": "Victor Davis Hanson Slams ABC Moderators\u2019 \u2018Shameless Bias\u2019 During Trump-Harris Debate: \u20183-on-1 Pile-On\u2019",
    "url": "https://www.breitbart.com/politics/2024/09/11/victor-davis-hanson-slams-abc-moderators-shameless-bias-during-trump-harris-debate-3-on-1-pile-on/"
   },
   "numberIngested": 10,
   "numberOfSentences": 16,
   "numberOfTriples": 493,
   "processed": "2024-10-14T23:00:33"
 

In [13]:
# Dump of WSJ "abortion" and one "debate" article
# For each repository -> narrative id pair, fetch the narrative's graph and write two files
# under dumps/: the narrative details as JSON and the triples as a .ttl file
article_dict = {"abortion": "d9c77099", "debate": "3bb0aa67"}
for repo_name, narrative_id in article_dict.items():
    graph_url = ('http://127.0.0.1:5000/dna/v1/repositories/narratives/graphs'
                 f'?repository={repo_name}&narrativeId={narrative_id}')
    response = requests.get(graph_url)
    resp_json = response.json()
    dump_stem = f'dumps/{repo_name}_{narrative_id}'
    with open(f'{dump_stem}_details.json', 'w') as detail_out:
        details_text = json.dumps(dict(resp_json['narrativeDetails']))
        detail_out.write(details_text)
    with open(f'{dump_stem}.ttl', 'w') as triples_out:
        # Triples are written back to back, exactly as returned (no separator is added)
        triples_out.writelines(resp_json['triples'])