## Input several news articles into the Stardog DNA repositories

There are 7 articles from different sources on the Florida Supreme Court ruling on banning abortions after 6 weeks (in the directory articles/abortion-articles). There are also 20 articles from different sources on the topic of the Trump-Harris presidential debate held on September 11, 2024 (in the directory articles/debate-articles).

Two repositories are created ("abortion" and "debate") with background data added to each. Then, the articles' sentences are converted to RDF, complying with the DNA Event Ontology, and ingested into the corresponding Stardog Cloud repository ("abortion" or "debate").

To execute this notebook, make sure that: 
* The DNA application's flask server is running (execute 'flask run' from the dna directory)

In [1]:
import json
import requests

In [2]:
# Start from a clean slate: delete both repositories, then create them again.
# The order is unchanged: every delete is issued before the first create.
repositories_url = 'http://127.0.0.1:5000/dna/v1/repositories'
for http_call in (requests.delete, requests.post):
    for repo_name in ('abortion', 'debate'):
        response = http_call(f'{repositories_url}?repository={repo_name}')
# List the repositories to confirm that both were created
response = requests.get(repositories_url)
print(response.status_code, response.json())

200 [{'created': '2024-10-04T15:21:22', 'repository': 'debate'}, {'created': '2024-10-04T15:21:21', 'repository': 'abortion'}]


In [3]:
# Add background data
# Each payload is a list of {"name", "type", "isCollection"} entries; the entries
# are spelled out as compact tuples and expanded into dicts below.
background_url = 'http://127.0.0.1:5000/dna/v1/repositories/background'

# Background names for the "abortion" repository (none of them is a collection)
abortion_background = {
    "backgroundNames": [
        {"name": entry_name, "type": entry_type, "isCollection": False}
        for entry_name, entry_type in (
            ("Florida Supreme Court", "organization"),
            ("Florida", "place"),
            ("Joe Biden", "person"),
            ("Donald Trump", "person"),
            ("Roe v. Wade", "law"),
            ("Democrat", "norp"),
            ("Republican", "norp"),
        )
    ]
}
response = requests.post(f'{background_url}?repository=abortion', json=abortion_background)
print(response.status_code, response.json())

print()

# Background names for the "debate" repository; only "ABC moderators" is a collection
debate_background = {
    "backgroundNames": [
        {"name": entry_name, "type": entry_type, "isCollection": is_collection}
        for entry_name, entry_type, is_collection in (
            ("Kamala Harris", "person", False),
            ("Joe Biden", "person", False),
            ("Donald Trump", "person", False),
            ("Tim Walz", "person", False),
            ("J. D. Vance", "person", False),
            ("ABC News", "organization", False),
            ("ABC moderators", "person", True),
            ("Democrat", "norp", False),
            ("Republican", "norp", False),
        )
    ]
}
response = requests.post(f'{background_url}?repository=debate', json=debate_background)
print(response.status_code, response.json())



201 {'processedNames': [{'isCollection': False, 'name': 'Florida Supreme Court', 'type': 'organization'}, {'isCollection': False, 'name': 'Florida', 'type': 'place'}, {'isCollection': False, 'name': 'Joe Biden', 'type': 'person'}, {'isCollection': False, 'name': 'Donald Trump', 'type': 'person'}, {'isCollection': False, 'name': 'Roe v. Wade', 'type': 'law'}, {'isCollection': False, 'name': 'Democrat', 'type': 'norp'}, {'isCollection': False, 'name': 'Republican', 'type': 'norp'}], 'repository': 'abortion', 'skippedNames': []}

201 {'processedNames': [{'isCollection': False, 'name': 'Kamala Harris', 'type': 'person'}, {'isCollection': False, 'name': 'Joe Biden', 'type': 'person'}, {'isCollection': False, 'name': 'Donald Trump', 'type': 'person'}, {'isCollection': False, 'name': 'Tim Walz', 'type': 'person'}, {'isCollection': False, 'name': 'J. D. Vance', 'type': 'person'}, {'isCollection': False, 'name': 'ABC News', 'type': 'organization'}, {'isCollection': True, 'name': 'ABC moderators

In [4]:
# Groupings of the news sources. These tuples are not referenced by the cells
# shown here; presumably they classify outlets by origin / editorial leaning
# for later analysis (TODO confirm).
international = (
    'Al Jazeera',
    'Economist',
    'Financial Express',
    'Globe and Mail',
    'Guardian',
    'Hindustan Times',
    'Indo-Asian News Service',
    'Irish Times',
    'Namibian',
    'Press TV',
    'Radio France Internationale',
    'Toronto Star',
)
center = (
    'Christian Science Monitor',
    'Wall Street Journal',
)
conservative = (
    'Breitbart',
    'Fox News',
    'Washington Times',
)
liberal = (
    'New York Times',
    'USA Today',
    'Washington Post',
)

# Every news source, in the order in which its article file is looked up and ingested.
# Each name doubles as the article's file name (without the .txt extension).
file_names = (
    'Al Jazeera',
    'Breitbart',
    'Christian Science Monitor',
    'Economist',
    'Financial Express',
    'Fox News',
    'Globe and Mail',
    'Guardian',
    'Hindustan Times',
    'Indo-Asian News Service',
    'Irish Times',
    'Namibian',
    'New York Times',
    'Press TV',
    'Radio France Internationale',
    'Toronto Star',
    'USA Today',
    'Washington Post',
    'Washington Times',
    'Wall Street Journal',
)

In [5]:
# Function to fully ingest the first 10 sentences of each news article
# "Fully ingest" means that the text is mapped to the DNA ontology
# All sentences and quotes are processed to some extent
def ingest_articles(repository: str) -> None:
    """Post every available article for ``repository`` to the narratives endpoint.

    For each source name in ``file_names``, the file
    ``articles/{repository}-articles/{source}.txt`` is read and its
    ``Title: ``, ``Published: ``, ``URL: `` and ``Text: `` fields are sent as
    JSON to the DNA server with ``sentences=10``. The response object, then its
    status code and JSON body, are printed, followed by a blank line.

    Sources whose file cannot be opened or decoded are skipped silently: not
    every source has an article in every article directory.

    Args:
        repository: Repository name; also selects the article directory.

    Raises:
        IndexError: If an article file lacks one of the expected field labels.
    """
    for file_name in file_names:
        try:
            with open(f'articles/{repository}-articles/{file_name}.txt', 'r') as article:
                article_details = article.read()
        except (OSError, UnicodeDecodeError):
            # Best effort: a missing or unreadable article file is not an error
            continue
        # The single-line fields end at the next newline; the text runs to the end of the file
        req_dict = {
            'title': article_details.split('Title: ')[1].split('\n')[0],
            'source': file_name,
            'published': article_details.split('Published: ')[1].split('\n')[0],
            'url': article_details.split('URL: ')[1].split('\n')[0],
            'text': article_details.split('Text: ')[1],
        }
        response = requests.post(
            f'http://127.0.0.1:5000/dna/v1/repositories/narratives?repository={repository}&sentences=10',
            json=req_dict)
        # Report the outcome of each ingest so that failures are visible per article
        print(response)
        print(response.status_code, response.json())
        print()

In [6]:
# Ingest every available article under articles/abortion-articles into the "abortion" repository
# NOTE(review): each call re-posts all of the articles. The listing printed by the next cell
# shows two "Al Jazeera" narratives with different narrativeIds, so a repeated run appears to
# add duplicates rather than replace them - confirm, and reset the repository before re-running
ingest_articles("abortion")

<Response [201]>
201 {'narrativeDetails': {'narrativeId': '6665da8e', 'narrativeMetadata': {'published': '2024-04-02T00:00:00', 'source': 'Al Jazeera', 'title': 'Biden denounces Florida abortion ruling as ‘outrageous’ as state vote looms', 'url': 'https://www.aljazeera.com/news/2024/4/2/biden-denounces-florida-abortion-ruling-as-outrageous-as-state-vote-looms'}, 'numberIngested': 10, 'numberOfSentences': 36, 'numberOfTriples': 559, 'processed': '2024-10-04T15:25:35'}, 'repository': 'abortion'}

<Response [201]>
201 {'narrativeDetails': {'narrativeId': 'd859a432', 'narrativeMetadata': {'published': '2024-03-25T00:00:00', 'source': 'Breitbart', 'title': 'Florida Supreme Court Upholds 15-Week Abortion Limit, But Voters Will Decide in November', 'url': 'https://www.breitbart.com/politics/2024/04/01/florida-supreme-court-upholds-15-week-abortion-limit-but-voters-will-decide-in-november/'}, 'numberIngested': 10, 'numberOfSentences': 36, 'numberOfTriples': 542, 'processed': '2024-10-04T15:28:

In [7]:
# List the narratives currently stored in the "abortion" repository,
# pretty-printing the JSON body after the HTTP status code
response = requests.get('http://127.0.0.1:5000/dna/v1/repositories/narratives?repository=abortion')
narratives_listing = json.dumps(response.json(), indent=1)
print(response.status_code, narratives_listing)

200 {
 "narratives": [
  {
   "narrativeId": "6665da8e",
   "narrativeMetadata": {
    "published": "2024-04-02T00:00:00",
    "source": "Al Jazeera",
    "title": "Biden denounces Florida abortion ruling as \u2018outrageous\u2019 as state vote looms",
    "url": "https://www.aljazeera.com/news/2024/4/2/biden-denounces-florida-abortion-ruling-as-outrageous-as-state-vote-looms"
   },
   "numberIngested": 10,
   "numberOfSentences": 36,
   "numberOfTriples": 559,
   "processed": "2024-10-04T15:25:35"
  },
  {
   "narrativeId": "fc533f78",
   "narrativeMetadata": {
    "published": "2024-04-02T00:00:00",
    "source": "Al Jazeera",
    "title": "Biden denounces Florida abortion ruling as \u2018outrageous\u2019 as state vote looms",
    "url": "https://www.aljazeera.com/news/2024/4/2/biden-denounces-florida-abortion-ruling-as-outrageous-as-state-vote-looms"
   },
   "numberIngested": 10,
   "numberOfSentences": 36,
   "numberOfTriples": 531,
   "processed": "2024-10-04T15:26:50"
  },
  {
 

In [9]:
# Redo due to error: delete the "debate" repository, create it again,
# then reload its background names (debate_background comes from an earlier cell)
debate_repository_url = 'http://127.0.0.1:5000/dna/v1/repositories?repository=debate'
for http_call in (requests.delete, requests.post):
    response = http_call(debate_repository_url)
response = requests.post('http://127.0.0.1:5000/dna/v1/repositories/background?repository=debate',
                         json=debate_background)

In [10]:
# Ingest every available article under articles/debate-articles into the "debate" repository
# (each article's response is printed as it is processed)
ingest_articles("debate")

<Response [201]>
201 {'narrativeDetails': {'narrativeId': '5f139ebb', 'narrativeMetadata': {'published': '2024-09-11T00:00:00', 'source': 'Al Jazeera', 'title': 'Did Harris win the debate or did Trump lose it?', 'url': 'https://www.aljazeera.com/news/2024/9/11/did-harris-win-the-debate-or-did-trump-lose-it'}, 'numberIngested': 10, 'numberOfSentences': 43, 'numberOfTriples': 699, 'processed': '2024-10-04T15:53:09'}, 'repository': 'debate'}

<Response [201]>
201 {'narrativeDetails': {'narrativeId': '6cb5ee11', 'narrativeMetadata': {'published': '2024-09-11T00:00:00', 'source': 'Breitbart', 'title': 'Victor Davis Hanson Slams ABC Moderators’ ‘Shameless Bias’ During Trump-Harris Debate: ‘3-on-1 Pile-On’', 'url': 'https://www.breitbart.com/politics/2024/09/11/victor-davis-hanson-slams-abc-moderators-shameless-bias-during-trump-harris-debate-3-on-1-pile-on/'}, 'numberIngested': 10, 'numberOfSentences': 16, 'numberOfTriples': 488, 'processed': '2024-10-04T15:55:43'}, 'repository': 'debate'}



In [11]:
# List the narratives currently stored in the "debate" repository,
# pretty-printing the JSON body after the HTTP status code
response = requests.get('http://127.0.0.1:5000/dna/v1/repositories/narratives?repository=debate')
narratives_listing = json.dumps(response.json(), indent=1)
print(response.status_code, narratives_listing)

200 {
 "narratives": [
  {
   "narrativeId": "5f139ebb",
   "narrativeMetadata": {
    "published": "2024-09-11T00:00:00",
    "source": "Al Jazeera",
    "title": "Did Harris win the debate or did Trump lose it?",
    "url": "https://www.aljazeera.com/news/2024/9/11/did-harris-win-the-debate-or-did-trump-lose-it"
   },
   "numberIngested": 10,
   "numberOfSentences": 43,
   "numberOfTriples": 699,
   "processed": "2024-10-04T15:53:09"
  },
  {
   "narrativeId": "6cb5ee11",
   "narrativeMetadata": {
    "published": "2024-09-11T00:00:00",
    "source": "Breitbart",
    "title": "Victor Davis Hanson Slams ABC Moderators\u2019 \u2018Shameless Bias\u2019 During Trump-Harris Debate: \u20183-on-1 Pile-On\u2019",
    "url": "https://www.breitbart.com/politics/2024/09/11/victor-davis-hanson-slams-abc-moderators-shameless-bias-during-trump-harris-debate-3-on-1-pile-on/"
   },
   "numberIngested": 10,
   "numberOfSentences": 16,
   "numberOfTriples": 488,
   "processed": "2024-10-04T15:55:43"
 

In [13]:
# Ingest the individual WSJ article that failed due to mishandling of double quotation marks (corrected)
# The request is built the same way as in ingest_articles, for this one file only
with open('articles/debate-articles/Wall Street Journal.txt', 'r') as article:
    article_details = article.read()
# The single-line fields end at the next newline; the text runs to the end of the file
req_dict = {
    'title': article_details.split('Title: ')[1].split('\n')[0],
    'source': "Wall Street Journal",
    'published': article_details.split('Published: ')[1].split('\n')[0],
    'url': article_details.split('URL: ')[1].split('\n')[0],
    'text': article_details.split('Text: ')[1],
}
narratives_url = 'http://127.0.0.1:5000/dna/v1/repositories/narratives?repository=debate&sentences=10'
response = requests.post(narratives_url, json=req_dict)
print(response)
print(response.status_code, response.json())

<Response [201]>
201 {'narrativeDetails': {'narrativeId': '548b12b3', 'narrativeMetadata': {'published': '2024-09-11T00:00:00', 'source': 'Wall Street Journal', 'title': 'Harris Baits Trump in Fiery Presidential Debate', 'url': 'https://www.wsj.com/politics/elections/trump-and-harris-meet-for-high-stakes-presidential-debate-ae8719e7'}, 'numberIngested': 10, 'numberOfSentences': 65, 'numberOfTriples': 899, 'processed': '2024-10-05T13:00:02'}, 'repository': 'debate'}


In [14]:
# List the "debate" narratives again, after the separate Wall Street Journal ingest,
# pretty-printing the JSON body after the HTTP status code
response = requests.get('http://127.0.0.1:5000/dna/v1/repositories/narratives?repository=debate')
narratives_listing = json.dumps(response.json(), indent=1)
print(response.status_code, narratives_listing)

200 {
 "narratives": [
  {
   "narrativeId": "5f139ebb",
   "narrativeMetadata": {
    "published": "2024-09-11T00:00:00",
    "source": "Al Jazeera",
    "title": "Did Harris win the debate or did Trump lose it?",
    "url": "https://www.aljazeera.com/news/2024/9/11/did-harris-win-the-debate-or-did-trump-lose-it"
   },
   "numberIngested": 10,
   "numberOfSentences": 43,
   "numberOfTriples": 699,
   "processed": "2024-10-04T15:53:09"
  },
  {
   "narrativeId": "6cb5ee11",
   "narrativeMetadata": {
    "published": "2024-09-11T00:00:00",
    "source": "Breitbart",
    "title": "Victor Davis Hanson Slams ABC Moderators\u2019 \u2018Shameless Bias\u2019 During Trump-Harris Debate: \u20183-on-1 Pile-On\u2019",
    "url": "https://www.breitbart.com/politics/2024/09/11/victor-davis-hanson-slams-abc-moderators-shameless-bias-during-trump-harris-debate-3-on-1-pile-on/"
   },
   "numberIngested": 10,
   "numberOfSentences": 16,
   "numberOfTriples": 488,
   "processed": "2024-10-04T15:55:43"
 

In [17]:
# Dump of one "abortion" and one "debate" article
# For each (repository, narrativeId) pair, fetch the narrative's graph and write
# its details to a .json file and its triples to a .ttl file under dumps/
article_dict = {"abortion": "ba31f8e8", "debate": "548b12b3"}
for repo_name, narrative_id in article_dict.items():
    graph_url = ('http://127.0.0.1:5000/dna/v1/repositories/narratives/graphs'
                 f'?repository={repo_name}&narrativeId={narrative_id}')
    response = requests.get(graph_url)
    resp_json = response.json()
    dump_stem = f'dumps/{repo_name}_{narrative_id}'
    with open(f'{dump_stem}_details.json', 'w') as detail_out:
        json.dump(dict(resp_json['narrativeDetails']), detail_out)
    with open(f'{dump_stem}.ttl', 'w') as triples_out:
        # The triples are written back-to-back exactly as received; no separator is added
        triples_out.writelines(resp_json['triples'])