# News Rollup bot

Resources https://newsapi.org/docs/client-libraries/python and https://platform.openai.com/docs/api-reference/chat/create

First, get API keys for NewsAPI and OpenAI, then create a .env file based on example.env.

In [1]:
# Read API credentials from the local .env file into module-level names.
# Requires the python-dotenv package: pip3 install python-dotenv

import os
from dotenv import load_dotenv

# Make the entries of .env visible through the process environment.
load_dotenv()

# Each lookup yields None when the variable is not set.
API = os.environ.get('API')        # NewsAPI key
GPT = os.environ.get('NEWS_GPT')   # OpenAI key
GIT = os.environ.get('GIT')        # GitHub token
GNEWS = os.environ.get('GNEWS')    # GNews key

In [69]:
# Use NewsAPI to get articles and cache the raw response in tmp_data.json

import requests
import json

# Search settings
q = 'data analytics'
# NOTE(review): some entries below contain URL paths and 'Bloomberg' is not a
# domain name -- confirm NewsAPI honours them in the `domains` filter.
domains = 'forbes.com, venturebeat.com, searchbusinessanalytics.techtarget.com, www.informationweek.com/big-data-analytics.asp, www.zdnet.com/topic/big-data-analytics, www.datasciencecentral.com, www.kdnuggets.com, www.analyticsinsight.net, www.datanami.com, Bloomberg' # comma separated
date_from = '2023-04-25'
date_to = '2023-04-26'
language = 'en'
sortBy = 'relevancy' # relevancy, popularity, publishedAt
pageSize = 5 # set max number of articles you'd like to summarize
page = 1

# Make the request. Passing the values through `params` lets requests
# percent-encode them, so spaces, commas or '&' in a value cannot corrupt
# the query string. Remove the 'domains' entry to search every source.
api_url = "https://newsapi.org/v2/everything"
params = {
    'q': q,
    'apiKey': API,
    'domains': domains,
    'from': date_from,
    'to': date_to,
    'language': language,
    'sortBy': sortBy,
    'pageSize': pageSize,
    'page': page,
}
# A timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(api_url, params=params, timeout=30)
data = response.json()
print(data)

# Stop before an error payload overwrites the cached results; the later
# cells index data['articles'] and would fail with a less helpful KeyError.
if data.get('status') != 'ok':
    raise RuntimeError(
        f"NewsAPI request failed ({data.get('code')}): {data.get('message')}"
    )

# Write to tmp file
with open('tmp_data.json', 'w', encoding='utf-8') as f:
    json.dump(data, f, ensure_ascii=False, indent=4)

{'status': 'ok', 'totalResults': 31, 'articles': [{'source': {'id': None, 'name': 'VentureBeat'}, 'author': 'Shubham Sharma', 'title': 'How Prophecy 3.0 enables businesses to build data pipelines without writing SQL code', 'description': 'Prophecy today launched a new version of its core platform to provide enterprises with low-code SQL capabilities for building data pipelines.', 'url': 'https://venturebeat.com/data-infrastructure/how-prophecy-3-0-enables-businesses-to-build-data-pipelines-without-writing-sql-code/', 'urlToImage': 'https://venturebeat.com/wp-content/uploads/2023/04/OIG-1-e1682521927193.jpeg?w=1200&strip=all', 'publishedAt': '2023-04-26T16:24:49Z', 'content': 'Join top executives in San Francisco on July 11-12, to hear how leaders are integrating and optimizing AI investments for success. Learn More\r\nCalifornia-based data engineering company Prophecy has a… [+1174 chars]'}, {'source': {'id': None, 'name': 'Forbes'}, 'author': 'Luis Marinelli, Forbes Councils Member, \

In [72]:
# Testing GNews.io to replace NewsAPI

# https://docs.python.org/3/library/json.html
# This library will be used to parse the JSON data returned by the API.
import json
# https://docs.python.org/3/library/urllib.parse.html
# This library will be used to percent-encode the query parameters.
import urllib.parse
# https://docs.python.org/3/library/urllib.request.html#module-urllib.request
# This library will be used to fetch the API.
import urllib.request

date = '2023-04-27'

# The previous value 'data&analytics' was pasted into the URL unencoded, so
# the '&' ended the q parameter and only q=data was sent. Assuming the intent
# was "both words", use GNews' AND operator and encode the value below.
q = 'data AND analytics'
q_in = 'content'
d_from = date
d_to = date
sortby = 'relevance'

apikey = GNEWS
# urlencode() escapes every value so none of them can break the query string.
query_string = urllib.parse.urlencode({
    'q': q,
    'lang': 'en',
    'country': 'us',
    'max': 10,
    'apikey': apikey,
    'from': d_from,
    'to': d_to,
    'sortby': sortby,
    'in': q_in,
})
url = f'https://gnews.io/api/v4/search?{query_string}'

# A timeout keeps the cell from hanging forever on a stalled connection.
with urllib.request.urlopen(url, timeout=30) as response:
    data = json.loads(response.read().decode("utf-8"))
articles = data["articles"]

for article in articles:
    print(f"Title: {article['title']}")
    print(f"Description: {article['description']}")
    # Any other article property returned by the API can be printed the same
    # way, e.g. print(article['url']).

    # Delete this line to display all the articles returned by the request.
    # Currently only the first article is displayed.
    break

Title: How data brokers get your personal information.
Description: And sell that data to the highest bidder.


In [64]:
# testing json load
# with open('tmp_data.json') as data_tmp_file:
#     data_string = data_tmp_file.read()
# data = json.loads(data_string)

# print(data)

In [13]:
# Summarize every cached article with GPT and store the result alongside it
# pip3 install openai  (this cell uses the module-level openai.ChatCompletion API)

import os
import openai
import json

# Read and parse the cached NewsAPI response. It was written as UTF-8, so it
# is read back as UTF-8 instead of relying on the platform default encoding.
with open('tmp_data.json', encoding='utf-8') as data_tmp_file:
    data = json.load(data_tmp_file)

openai.api_key = GPT

instruction = "Provide a summary of this article in 3 bullet points (format 1, 2, 3). Condense the response and remove fillers like 'the article' and 'the author':"

for article in data['articles']:
    # The chat endpoint only sees the text of the prompt; it does not download
    # the URL. Send the title/description/content snippet returned by the news
    # API along with the link so the summary is based on real article text.
    prompt_lines = [article['url']]
    for field in ('title', 'description', 'content'):
        value = article.get(field)
        if value:  # these fields may be missing or null
            prompt_lines.append(f"{field}: {value}")
    content = instruction + "\n".join(prompt_lines)

    completion = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": content}],
    )

    # Attach the generated summary to the article record
    summary = completion.choices[0].message.content
    article['summary'] = summary

# Write summarized output to file
with open('tmp_summarized_data.json', 'w', encoding='utf-8') as f:
    json.dump(data, f, ensure_ascii=False, indent=4)


In [59]:
# Scratch cell: attach the most recent completion to the first article and
# write the whole structure back out as JSON.

# Show the first article before it is touched.
print(data['articles'][0])

# Text of the latest chat completion.
summary = completion.choices[0].message.content
print(summary)

# Store it on the first article and show the updated record.
data['articles'][0]['summary'] = summary
print(data['articles'][0])

# Persist the modified data.
with open('tmp_summarized_data.json', 'w', encoding='utf-8') as out_file:
    json.dump(data, out_file, ensure_ascii=False, indent=4)


{'source': {'id': None, 'name': 'Forbes'}, 'author': 'Bernd Greifeneder, Forbes Councils Member, \n Bernd Greifeneder, Forbes Councils Member\n https://www.forbes.com/sites/forbestechcouncil/people/berndgreifeneder/', 'title': 'Getting Tool Sprawl Under Control To Enable Data-Driven Business And Cloud-Scale Growth', 'description': 'Supported by a data lakehouse, organizations could unlock the value of the petabytes—and, eventually, yottabytes—of data they have available to access more precise real-time answers to the questions that matter. This can help them not just survive but thrive …', 'url': 'https://www.forbes.com/sites/forbestechcouncil/2023/04/24/getting-tool-sprawl-under-control-to-enable-data-driven-business-and-cloud-scale-growth/', 'urlToImage': 'https://imageio.forbes.com/specials-images/imageserve/6399d93eaccb6518c9a6c001/0x0.jpg?format=jpg&width=1200', 'publishedAt': '2023-04-24T11:00:00Z', 'content': 'Bernd Greifeneder is the CTO and founder of Dynatrace, a software int

Now we have a summary of the latest news articles. Time to write that to a post. 

I currently have a portfolio site that creates blog posts out of Markdown files, so the workflow should be:
    1. Pull relevant information from tmp_summarized_data.json
    2. Add a section for each news article including: Title, Source (linked), Summary
    3. Save that Markdown file to a remote GitHub repo

In [14]:
# Write a markdown file to be used as the daily post

# The summaries were saved as UTF-8, so read them back with that encoding
# rather than the platform default.
with open('tmp_summarized_data.json', encoding='utf-8') as data_tmp_file:
    data = json.load(data_tmp_file)

# Front matter and heading at the top of the post
date = date_from
topic = 'Data Analytics'

top = f"---\ntitle: Rollup for {date}\ndate: {date}\ndescription: GPT generated news rollup for {date}\ntag: {topic}\nauthor: System\n---\n# {topic} news for {date}\n"

# One markdown section per article: heading, linked source credit, summary
sections = []
for article in data['articles']:
    title = article['title']
    source = article['source']['name']
    url = article['url']
    summary = article['summary']

    md_title = '### ' + title
    md_credit = '_[' + source + '](' + url + ')_'
    md_summary = summary

    sections.append(f'{md_title}\n{md_credit}\n{md_summary}\n')

# Join once instead of growing a string inside the loop
post = ''.join(sections)

# Write the markdown file. The context manager closes the file even if the
# write fails, and UTF-8 keeps non-ASCII characters in titles and summaries
# from raising an encoding error on platforms with a different default.
with open('tmp_newpost.mdx', 'w', encoding='utf-8') as f:
    f.write(top + post)


In [26]:
# Copy the generated post into the local checkout of the blog
import os.path

folder = '/Users/mattray/Documents/GitHub/mattray-xyz/pages/posts' #this is the folder hosting my blog
file_name = date + '.mdx'
file_path = os.path.join(folder, file_name)

# Read and write with an explicit UTF-8 encoding so the copy does not depend
# on the platform's default text encoding.
with open('tmp_newpost.mdx', 'r', encoding='utf-8') as f1, \
        open(file_path, 'w', encoding='utf-8') as f2:
    post = f1.read()
    f2.write(post)

In [36]:
# Upload the post through the GitHub contents API (returns 201 when the file is created)
import requests
from base64 import b64encode

repo = 'mattray-xyz'
path = 'pages/posts'
owner = 'mdray'
date = '2023-04-26'  # used as the file name of the uploaded post

# Base64-encode the new post. The file is decoded explicitly as UTF-8 so the
# uploaded bytes do not depend on the platform's default text encoding.
with open('tmp_newpost.mdx', 'r', encoding='utf-8') as file:
    data = file.read()

data_bytes = data.encode('utf8')
data_b64 = b64encode(data_bytes)
b64 = data_b64.decode('utf8')  # the JSON body needs text, not bytes

# Write GitHub API variables
header = {'Authorization': 'Bearer ' + GIT, "Accept": "application/vnd.github+json", "X-GitHub-Api-Version": "2022-11-28"}
url = f'https://api.github.com/repos/{owner}/{repo}/contents/{path}/{date}.mdx'
# No 'sha' is sent, so this request can only create a new file; GitHub
# rejects it when a file already exists at this path.
message = {'message': 'my commit message', 'content': b64}

# A timeout keeps the cell from hanging forever on a stalled connection.
response = requests.put(url=url, headers=header, json=message, timeout=30)
print(response)
if not response.ok:
    # Show GitHub's explanation (e.g. missing sha, bad credentials)
    print(response.text)
# print(response.request.url)
print(response.request.body)
# print(response.request.headers) # Contains passwords

<Response [201]>
b'{"message": "my commit message", "content": "LS0tCnRpdGxlOiBSb2xsdXAgZm9yIDIwMjMtMDQtMjUKZGF0ZTogMjAyMy0wNC0yNQpkZXNjcmlwdGlvbjogR1BUIGdlbmVyYXRlZCBuZXdzIHJvbGx1cCBmb3IgMjAyMy0wNC0yNQp0YWc6IERhdGEgQW5hbHl0aWNzCmF1dGhvcjogU3lzdGVtCi0tLQojIERhdGEgQW5hbHl0aWNzIG5ld3MgZm9yIDIwMjMtMDQtMjUKIyMjIFVubG9ja2luZyBMZW5kaW5nIFdpdGggQ3JlZG9yYeKAmXMgUHJpdmFjeSBQcmVzZXJ2aW5nIENyZWRpdCBTY29yaW5nCl9bRm9yYmVzXShodHRwczovL3d3dy5mb3JiZXMuY29tL3NpdGVzL2RhdmlkcHJvc3Nlci8yMDIzLzA0LzI1L3VubG9ja2luZy1sZW5kaW5nLXdpdGgtY3JlZG9yYXMtcHJpdmFjeS1wcmVzZXJ2aW5nLWNyZWRpdC1zY29yaW5nLylfCjEuIENyZWRvcmEgaXMgYSBmaW50ZWNoIGZpcm0gdGhhdCBvZmZlcnMgcHJpdmFjeS1wcmVzZXJ2aW5nIGNyZWRpdCBzY29yaW5nIGJ5IHVzaW5nIGRhdGEgZW5jcnlwdGlvbi4KMi4gQnkgdXRpbGl6aW5nIGVuY3J5cHRlZCBkYXRhLCBDcmVkb3JhIHByb3ZpZGVzIHVuaXF1ZSBpbnNpZ2h0cyBpbnRvIHRoZSBjcmVkaXR3b3J0aGluZXNzIG9mIGluZGl2aWR1YWxzIHdpdGhvdXQgcHV0dGluZyB0aGVpciBwZXJzb25hbCBkYXRhIGF0IHJpc2suCjMuIFRoZSBmaW50ZWNoIGZpcm0gaXMgcG9pc2VkIHRvIHJldm9sdXRpb25pemUgdGhlIGxlbmRpbmcgaW5kdXN0

In [56]:
# Testing GitHub API
# Get request returns 200
import requests

repo = 'mattray-xyz'
path = 'pages/posts'
owner = 'mdray'

# Send the token with the 'Bearer ' scheme and the same Accept/version
# headers as the upload cell, so the request is authenticated the same way.
header = {'Authorization': 'Bearer ' + GIT, "Accept": "application/vnd.github+json", "X-GitHub-Api-Version": "2022-11-28"}
url_get = f'https://api.github.com/repos/{owner}/{repo}/contents/{path}'
# Sample payload for a contents PUT; it is not sent with the GET below.
message = {'message':'my commit message', 'content':'bXkgbmV3IGZpbGUgY29udGVudHM='}

# print(message)

# A timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(url=url_get, headers=header, timeout=30)
print(response)
# print(response.request.url)
# print(response.request.body)
# print(response.request.headers) # Contains passwords

<Response [200]>


In [34]:
# Scratch cell: walk a string through the str -> bytes -> base64 -> str steps

# Load the generated post (kept in `post`; not used further in this cell).
with open('tmp_newpost.mdx', 'r') as post_file:
    post = post_file.read()
# print(post)


from base64 import b64encode

data = "test"
data_bytes = data.encode('utf8')    # text to raw bytes
data_b64 = b64encode(data_bytes)    # raw bytes to base64 bytes
b64 = data_b64.decode('utf8')       # base64 bytes back to text

# Print each stage on its own line.
print(data_bytes, data_b64, b64, sep='\n')

b'test'
b'dGVzdA=='
dGVzdA==


In [80]:
# Compute yesterday's date.
# Import the module rather than `from datetime import date`: other cells keep
# the post date in a string variable named `date`, and importing the class
# under that name would replace it.
import datetime

day = datetime.date.today() - datetime.timedelta(days=1)
print(day)  # ISO format, e.g. 2023-04-26

2023-04-26
