## Input news articles into the Stardog DNA repositories

There are 8 articles from different sources on the 2024 U.S. presidential election results. They are stored in the directory articles/election-articles.

A repository ("election") is created with background data. Then, the articles' sentences are converted to RDF, using the DNA Ontology, and ingested into the 'dna' Stardog Cloud repository.

To execute this notebook, make sure that: 
* The DNA application's flask server is running (execute 'flask run' from the dna directory)

In [1]:
import json
import requests

In [2]:
# Start from a clean slate: remove the 'election' repository left by an earlier run,
# along with the test repository, 'foo'
stale_repository_urls = (
    'http://127.0.0.1:5000/dna/v1/repositories?repository=election',
    'http://127.0.0.1:5000/dna/v1/repositories?repository=foo',
)
for stale_url in stale_repository_urls:
    response = requests.delete(stale_url)

# Create the repository for the news articles
response = requests.post('http://127.0.0.1:5000/dna/v1/repositories?repository=election')

# List the repositories to verify that 'election' was created
response = requests.get('http://127.0.0.1:5000/dna/v1/repositories')
listing = response.json()
print(response.status_code, listing)

200 [{'created': '2024-12-16T03:04:51', 'repository': 'election'}]


In [3]:
# Background names to register with the 'election' repository.
# The entity types used here are person, law, norp and place; an entry may also
# list alternate names under "alsoKnownAs".
background_names = [
    {"name": "Kamala Harris", "type": "person", "isCollection": False},
    {"name": "Joe Biden", "type": "person", "isCollection": False},
    {"name": "Donald Trump", "type": "person", "isCollection": False},
    {"name": "JD Vance", "type": "person", "isCollection": False},
    {"name": "Tim Walz", "type": "person", "isCollection": False},
    {"name": "Roe vs Wade", "type": "law", "alsoKnownAs": ["Roe", "Roe v Wade"], "isCollection": False},
    {"name": "Democratic Party", "type": "norp", "isCollection": False},
    {"name": "Republican Party", "type": "norp", "isCollection": False},
    {"name": "blue States", "type": "place", "isCollection": True},
    {"name": "red States", "type": "place", "isCollection": True},
]
election_background = {"backgroundNames": background_names}

# Post the background names and show the service's reply
background_url = 'http://127.0.0.1:5000/dna/v1/repositories/background?repository=election'
response = requests.post(background_url, json=election_background)
reply = response.json()
print(response.status_code, reply)


201 {'processedNames': [{'isCollection': False, 'name': 'Kamala Harris', 'type': 'person'}, {'isCollection': False, 'name': 'Joe Biden', 'type': 'person'}, {'isCollection': False, 'name': 'Donald Trump', 'type': 'person'}, {'isCollection': False, 'name': 'JD Vance', 'type': 'person'}, {'isCollection': False, 'name': 'Tim Walz', 'type': 'person'}, {'alsoKnownAs': ['Roe', 'Roe v Wade'], 'isCollection': False, 'name': 'Roe vs Wade', 'type': 'law'}, {'isCollection': False, 'name': 'Democratic Party', 'type': 'norp'}, {'isCollection': False, 'name': 'Republican Party', 'type': 'norp'}, {'isCollection': True, 'name': 'blue States', 'type': 'place'}, {'isCollection': True, 'name': 'red States', 'type': 'place'}], 'repository': 'election', 'skippedNames': []}


In [4]:
# Names of the news sources, in alphabetical order; each name doubles as the
# base name of that source's article file (<name>.txt)
file_names = (
    'Al Jazeera',
    'Breitbart',
    'Christian Science Monitor',
    'Economist',
    'Financial Express',
    'Fox News',
    'Globe and Mail',
    'Guardian',
    'Hindustan Times',
    'Huffington Post',
    'Indo-Asian News Service',
    'Irish Times',
    'Namibian',
    'New York Times',
    'Toronto Star',
    'USA Today',
    'Wall Street Journal',
    'Washington Post',
    'Washington Times',
)

# Sources classified as international
international = (
    'Al Jazeera',
    'Economist',
    'Financial Express',
    'Globe and Mail',
    'Guardian',
    'Hindustan Times',
    'Indo-Asian News Service',
    'Irish Times',
    'Namibian',
    'Toronto Star',
)

# Classification for news (not editorials).
# Note that 'Economist' is listed both as international and as center.
center = (
    'Christian Science Monitor',
    'Wall Street Journal',
    'Economist',
)
conservative = (
    'Breitbart',
    'Fox News',
    'Washington Times',
)
liberal = (
    'Huffington Post',
    'New York Times',
    'USA Today',
    'Washington Post',
)

In [5]:
def _after_label(text: str, label: str) -> str:
    """Return everything in ``text`` that follows the first occurrence of ``label``.

    Raises IndexError when ``label`` does not occur in ``text``.
    """
    # maxsplit=1 keeps later occurrences of the label as part of the value
    return text.split(label, 1)[1]


# Function to ingest each news article, up to 100 sentences
def ingest_articles(repository: str) -> None:
    """Post each available article for ``repository`` to the narratives endpoint.

    For every name in ``file_names`` the file
    ``articles/{repository}-articles/{name}.txt`` is read. Names without a file are
    skipped silently; files that exist but cannot be read are skipped with a
    printed notice. A file must contain the labels ``Title: ``, ``Published: ``
    and ``URL: `` (the value is the rest of that line) and ``Text: `` (the value
    is everything after the label).

    Each article is posted with ``sentences=100`` in the query string. The
    source name is printed for every article that is submitted, and a notice
    is printed when the service answers with an error status.

    Args:
        repository: Repository name; selects both the article directory and
            the target repository of the request.

    Raises:
        IndexError: If an article file lacks one of the expected labels.
    """
    for file_name in file_names:
        try:
            with open(f'articles/{repository}-articles/{file_name}.txt', 'r',
                      encoding='utf-8') as article:
                article_details = article.read()
        except FileNotFoundError:
            # file_names lists more sources than have a file for a given repository
            continue
        except (OSError, UnicodeDecodeError) as err:
            # Still best-effort, but no longer silent (and no longer hides Ctrl-C)
            print(f'Skipping {file_name}: {err}')
            continue

        req_dict = dict()
        req_dict['title'] = _after_label(article_details, 'Title: ').split('\n')[0]
        req_dict['source'] = file_name
        print(file_name)
        req_dict['published'] = _after_label(article_details, 'Published: ').split('\n')[0]
        req_dict['url'] = _after_label(article_details, 'URL: ').split('\n')[0]
        # The body may itself contain 'Text: '; keep all of it
        req_dict['text'] = _after_label(article_details, 'Text: ')

        response = requests.post(
            f'http://127.0.0.1:5000/dna/v1/repositories/narratives?repository={repository}&sentences=100',
            json=req_dict)
        if not response.ok:
            print(f'  Ingest failed for {file_name}: HTTP {response.status_code}')

In [6]:
%%time

# Ingest every available article into the "election" repository.
# NOTE: the recorded run of this cell reports a wall time of about 44 minutes.
ingest_articles("election")

Breitbart
Economist
Fox News
Huffington Post
New York Times
USA Today
Wall Street Journal
Washington Times
CPU times: user 124 ms, sys: 55.7 ms, total: 180 ms
Wall time: 44min 24s


In [7]:
# List the narratives (ingested articles) held in the election repository
narratives_url = 'http://127.0.0.1:5000/dna/v1/repositories/narratives?repository=election'
response = requests.get(narratives_url)
# Pretty-print the JSON reply next to the HTTP status code
formatted_listing = json.dumps(response.json(), indent=1)
print(response.status_code, formatted_listing)

200 {
 "narratives": [
  {
   "narrativeId": "372e5b83",
   "narrativeMetadata": {
    "published": "2024-11-05T00:00:00",
    "source": "Breitbart",
    "title": "Donald Trump Wins the Presidency; Greatest Comeback in American History",
    "url": "https://www.breitbart.com/2024-election/2024/11/05/donald-trump-wins-the-presidency-greatest-comeback-in-american-history/"
   },
   "numberIngested": 23,
   "numberOfSentences": 23,
   "numberOfTriples": 962,
   "processed": "2024-12-16T03:05:08"
  },
  {
   "narrativeId": "6c64aa09",
   "narrativeMetadata": {
    "published": "2024-11-06T00:00:00",
    "source": "Economist",
    "title": "Donald Trump wins big and fast",
    "url": "https://www.economist.com/united-states/2024/11/06/donald-trump-wins-big-and-fast"
   },
   "numberIngested": 33,
   "numberOfSentences": 33,
   "numberOfTriples": 1206,
   "processed": "2024-12-16T03:09:11"
  },
  {
   "narrativeId": "d43647ec",
   "narrativeMetadata": {
    "published": "2024-11-06T00:00:00"

In [8]:
# Dump of WSJ "election" article
# Maps repository name -> narrativeId of the article to dump.
# NOTE(review): the ID is presumably copied from the narratives listing - confirm it
# still matches after the repository has been re-created.
article_dict = {"election": "d53da686"}
for key, value in article_dict.items():
    response = requests.get(
        f'http://127.0.0.1:5000/dna/v1/repositories/narratives/graphs?repository={key}&narrativeId={value}')
    # Stop with the HTTP error itself instead of a KeyError on the reply below
    response.raise_for_status()
    resp_json = response.json()
    # Narrative metadata, as JSON
    with open(f'dumps/{key}_{value}_details.json', 'w', encoding='utf-8') as detail_out:
        json.dump(dict(resp_json['narrativeDetails']), detail_out)
    # The triples, written back to back exactly as returned; Turtle files are UTF-8
    with open(f'dumps/{key}_{value}.ttl', 'w', encoding='utf-8') as triples_out:
        triples_out.writelines(resp_json['triples'])