In [3]:
import json
import logging
import os, sys, getopt
import urllib
import urllib.parse

import requests
from dotenv import find_dotenv, load_dotenv

In [39]:
# load env variables
dotenv_path = find_dotenv()
load_dotenv(dotenv_path)

FEEDLY_USER_ID = os.environ.get("FEEDLY_USER_ID")
FEEDLY_ACCESS_TOKEN = os.environ.get("FEEDLY_ACCESS_TOKEN")

# Fail with an actionable message here; otherwise the string concatenation
# below dies with an opaque TypeError when the variable is missing.
if not FEEDLY_ACCESS_TOKEN:
    raise RuntimeError(
        "FEEDLY_ACCESS_TOKEN is not set; define it in the environment or in a .env file"
    )

# Authorization header passed to every API helper in this notebook.
headers = {'Authorization': 'OAuth ' + FEEDLY_ACCESS_TOKEN}
# Same header with every "1" in its value replaced by "2" -- presumably an
# invalid token for provoking error responses (only differs if the token
# actually contains a "1").
errorHeaders = {"Authorization": headers["Authorization"].replace("1", "2")}

### Subscriptions

In [48]:
def get_subscriptions(headers):
    '''Get all subscriptions for the current user.

    Parameters
    ----------
    headers: dict
        Authentication information for the feedly API

    Returns
    -------
    list of dictionaries
        list of all subscriptions with metadata

    Raises
    ------
    AssertionError
        if the API answers with a status code other than 200
    '''
    # HTTPS so the OAuth token in `headers` is not sent in clear text
    # (get_feed_content already talks to the https endpoint).
    service = "https://cloud.feedly.com/v3/subscriptions/"
    res = requests.get(url=service, headers=headers)

    # Explicit raise instead of `assert`: assert statements are removed when
    # Python runs with -O, which would silently disable this check. The
    # exception type and message are unchanged.
    if res.status_code != 200:
        raise AssertionError("API Call returned bad result (Code %s)" % res.status_code)
    return res.json()

In [53]:
# Fetch the current user's subscriptions and show them as the cell output.
subscriptions = get_subscriptions(headers)
subscriptions

[{'categories': [{'id': 'user/4f464ff9-623e-45a2-8bde-3f1829eae9e9/category/DS Blogs',
    'label': 'DS Blogs'}],
  'contentType': 'longform',
  'id': 'feed/http://www.countbayesie.com/blog?format=RSS',
  'partial': False,
  'subscribers': 1620,
  'title': 'Count Bayesie - A Probability Blog',
  'topics': ['data science', 'statistics', 'math', 'science'],
  'updated': 1548950317488,
  'velocity': 0.2,
  'website': 'http://www.countbayesie.com/'},
 {'categories': [{'id': 'user/4f464ff9-623e-45a2-8bde-3f1829eae9e9/category/DS Blogs',
    'label': 'DS Blogs'}],
  'contentType': 'longform',
  'coverColor': 'C0DEED',
  'iconUrl': 'http://storage.googleapis.com/site-assets/HnIq4cYv6tJeEHJhTSUl9xfp8bciGy9n6sB3rUnBcTk_icon-15414a6fb2b',
  'id': 'feed/http://toddwschneider.com/atom.xml',
  'partial': False,
  'state': 'dormant',
  'subscribers': 488,
  'title': 'Todd W. Schneider',
  'topics': ['data science',
   'data',
   'programming',
   'science',
   'tech',
   'economics'],
  'updated': 1

In [31]:
# Build a deliberately altered token (every "1" replaced by "2") for
# provoking the API's error response. It goes into a separate dict so that
# `headers` stays valid: mutating it here would make every API cell below
# fail when the notebook is run top to bottom.
errorHeaders = {**headers, "Authorization": headers["Authorization"].replace("1", "2")}

In [9]:
# Sample error payload kept as a literal for inspection (pasted by hand, not
# fetched in this cell); it looks like a "token expired" response.
err = {
    "errorCode": 401,
    "errorId": "ap5int-sv2.2019020412.346719",
    "errorMessage": "token expired: 1541548800000 (-7761946)",
}

In [11]:
# Look up the numeric status code in the sample error payload.
err["errorCode"]

401

### Categories

In [12]:
# get all categories
def get_categories(headers):
    '''Get all categories for the current user.

    Parameters
    ----------
    headers: dict
        Authentication information for the feedly API

    Returns
    -------
    list of dictionaries
        list of categories with metadata, decoded from the JSON response.
        The status code is not checked, so whatever JSON body the API sends
        back is returned unchanged.
    '''
    # HTTPS so the OAuth token in `headers` is not sent in clear text.
    service = "https://cloud.feedly.com/v3/categories/"
    res = requests.get(url=service, headers=headers)

    return res.json()

In [13]:
# Fetch the current user's categories and show them as the cell output.
categories = get_categories(headers)
categories

[{'created': 1532164162834,
  'id': 'user/4f464ff9-623e-45a2-8bde-3f1829eae9e9/category/2a8c9142-29af-4dd6-9bc0-8f5861e766de',
  'label': 'Real Estate News'},
 {'created': 1532175354117,
  'id': 'user/4f464ff9-623e-45a2-8bde-3f1829eae9e9/category/7a329076-9761-474f-b587-1c623db67516',
  'label': 'Fun'},
 {'id': 'user/4f464ff9-623e-45a2-8bde-3f1829eae9e9/category/DS Blogs',
  'label': 'DS Blogs'},
 {'id': 'user/4f464ff9-623e-45a2-8bde-3f1829eae9e9/category/News',
  'label': 'News'},
 {'id': 'user/4f464ff9-623e-45a2-8bde-3f1829eae9e9/category/global.must',
  'label': 'Must Read'}]

### Feed Metadata

In [15]:
def get_feed_metadata(feedURI, headers):
    '''Get metadata such as topics, language, number of subscribers etc. on a given feed.

    Parameters
    ----------
    feedURI: str
        Feed identifier (e.g. 'feed/http://abc.ch/xy')
    headers: dict
        Authentication information for the feedly API

    Returns
    -------
    dict
        dictionary containing metadata on the given feed, decoded from the
        JSON response (the status code is not checked)
    '''
    # The feed id is itself a URL, so it is percent-encoded completely
    # (safe="" also escapes '/') before being appended as one path segment.
    feedURI = urllib.parse.quote(feedURI, safe="")
    # HTTPS so the OAuth token in `headers` is not sent in clear text.
    service = "https://cloud.feedly.com/v3/feeds/"
    res = requests.get(url=service + feedURI, headers=headers)
    return res.json()

In [16]:
# Look up the metadata of one example feed and show it as the cell output.
feedID = "feed/http://feeds.feedburner.com/dkriesel-de"
feedMetaData = get_feed_metadata(feedID, headers)
feedMetaData

{'contentType': 'article',
 'feedId': 'feed/http://feeds.feedburner.com/dkriesel-de',
 'iconUrl': 'http://storage.googleapis.com/site-assets/FfO7w7dVd7aWIlbEPQOM1YRKE18F6oqPLuWxonhoupk_sicon-154270f726b',
 'id': 'feed/http://feeds.feedburner.com/dkriesel-de',
 'language': 'de',
 'partial': False,
 'subscribers': 654,
 'title': 'D. Kriesel',
 'topics': ['tech', 'news', 'fun'],
 'updated': 1531033029602,
 'velocity': 0.1,
 'visualUrl': 'http://storage.googleapis.com/site-assets/FfO7w7dVd7aWIlbEPQOM1YRKE18F6oqPLuWxonhoupk_svisual-154270f726b',
 'website': 'http://www.dkriesel.com/'}

### Streams

In [17]:
def get_stream_ids(streamID, headers, count=100, continuation=None):
    '''Get the entry ids of a given stream.

    Parameters
    ----------
    streamID: str
        stream identifier (e.g. 'feed/http://abc.ch/xy')
    headers: dict
        Authentication information for the feedly API
    count: int
        number of ids to request
    continuation: str, optional
        continuation string from a previous call; only sent when given

    Returns
    -------
    dict
        decoded JSON response; a successful call yields a dictionary with
        an 'ids' list and a 'continuation' string (see the example output
        of the next cell)
    '''
    query = {"streamId": streamID, "count": count}
    # The parameter used to be accepted but never forwarded, so callers
    # could not page through a stream.
    if continuation is not None:
        query["continuation"] = continuation

    # HTTPS so the OAuth token in `headers` is not sent in clear text; the
    # query string is appended by requests from `params`.
    service = "https://cloud.feedly.com/v3/streams/ids"

    res = requests.get(url=service, params=query, headers=headers)
    return res.json()

In [18]:
# Request the first 10 entry ids of the example feed.
get_stream_ids("feed/http://feeds.feedburner.com/dkriesel-de", headers, count=10)

{'continuation': '1594c97d95c:ea848d:ebaea9dd',
 'ids': ['FfO7w7dVd7aWIlbEPQOM1YRKE18F6oqPLuWxonhoupk=_16478ae0be2:1e4e43f:b83afde1',
  'FfO7w7dVd7aWIlbEPQOM1YRKE18F6oqPLuWxonhoupk=_1638758c563:11b93bc:abeb7260',
  'FfO7w7dVd7aWIlbEPQOM1YRKE18F6oqPLuWxonhoupk=_15fdf18cfcd:1ab5d4:1cefc59',
  'FfO7w7dVd7aWIlbEPQOM1YRKE18F6oqPLuWxonhoupk=_15eadf98f76:69985:803daed2',
  'FfO7w7dVd7aWIlbEPQOM1YRKE18F6oqPLuWxonhoupk=_15e4d5d45a4:17124e8:81bdf694',
  'FfO7w7dVd7aWIlbEPQOM1YRKE18F6oqPLuWxonhoupk=_15e47f90162:f23c5c:57a3f9c6',
  'FfO7w7dVd7aWIlbEPQOM1YRKE18F6oqPLuWxonhoupk=_15dbc2fa576:182d20:6c0c5382',
  'FfO7w7dVd7aWIlbEPQOM1YRKE18F6oqPLuWxonhoupk=_15b2b2bbcd4:240aaf0:854e2c06',
  'FfO7w7dVd7aWIlbEPQOM1YRKE18F6oqPLuWxonhoupk=_159611c9382:2d06d07:ebaea9dd',
  'FfO7w7dVd7aWIlbEPQOM1YRKE18F6oqPLuWxonhoupk=_1594c97d95c:ea848d:ebaea9dd']}

In [19]:
def get_feed_content(streamID, headers, count=100, continuation=None):
    '''Get the contents of a given feed.

    Parameters
    ----------
    streamID: str
        stream identifier (e.g. 'feed/http://abc.ch/xy')
    headers: dict
        Authentication information for the feedly API
    count: int
        number of results to return
    continuation: str, optional
        continuation string from a previous call; only sent when given

    Returns
    -------
    dict
        decoded JSON response; a successful call yields a dictionary whose
        'items' entry holds the list of articles (the status code is not
        checked)
    '''
    query = {"streamId": streamID, "count": count}
    # Leave the key out entirely when there is no continuation: url-encoding
    # a None value would send the literal text "continuation=None".
    if continuation is not None:
        query["continuation"] = continuation

    service = "https://cloud.feedly.com/v3/streams/contents"

    # Let requests build the query string, as get_stream_ids does.
    res = requests.get(url=service, params=query, headers=headers)
    return res.json()

In [20]:
# Download up to 100 items of the example feed (a single stream response).
feedData = get_feed_content("feed/http://feeds.feedburner.com/dkriesel-de", headers, count=100)

## Explore Feed Data

In [21]:
# Top-level fields of the stream response.
feedData.keys()

dict_keys(['updated', 'alternate', 'continuation', 'direction', 'id', 'items', 'title'])

In [22]:
# Keep just the list of items and show how many were returned.
feedContent = feedData["items"]
len(feedContent)

100

In [23]:
# Fields available on the first item.
feedContent[0].keys()

dict_keys(['canonical', 'tags', 'categories', 'alternate', 'keywords', 'originId', 'engagement', 'summary', 'recrawled', 'title', 'visual', 'fingerprint', 'crawled', 'author', 'id', 'unread', 'published', 'origin'])

In [24]:
# The 'origin' entry of the first item.
feedContent[0]["origin"]

{'htmlUrl': 'http://www.dkriesel.com/',
 'streamId': 'feed/http://feeds.feedburner.com/dkriesel-de',
 'title': 'D. Kriesel'}

In [25]:
# Relevant fields for each piece of content:
# author:      author
# canonical:   direct link to article
# engagement:  popularity metric(?)
# id:          UID
# keywords:    list of keywords
# published:   published date
# summary:     summary dict; the text is in x["summary"]["content"]
# title:       title of article
# visual:      url to main visual

In [26]:
# Summary text of the first item (HTML markup, as the output shows).
feedContent[0]["summary"]["content"]

'<div>\n<p>\n<img alt="" width="140" src="http://www.dkriesel.com/_media/splashpics/laugh2.png?w=140&tok=3ec15d"> Die bahnbrechenden Fortschritte der SPD im Hinblick auf ihr „Projekt 5%“ zwingen mich zur Neuausrichtung einiger Plots meiner Sonntagsfragen-Liveansicht (<a rel="nofollow noopener" href="http://www.dkriesel.com/sonntagsfrage" title="http://www.dkriesel.com/sonntagsfrage" target="_blank">http://www.dkriesel.com/sonntagsfrage</a>). In den Plots, die den Fokus auf die kleinen Parteien legen (eigentlich AfD, Grüne, Linke, FDP) wird die SPD jetzt mit angezeigt, sofern sie in den Prozentbereich der kleinen Parteien vordringt. Diese Konstellation hatte ich mir beim programmieren nicht träumen lassen. Danke für eure Zuschriften. Was ich mache, wenn die SPD das Anschlussvorhaben „Projekt SPD – Sonstige Partei Deutschlands“ in Angriff nimmt und den Bereich der kleinen Parteien wieder nach unten verlässt, überlege ich mir noch. <img alt="8-)" src="http://www.dkriesel.com/lib/images/sm


### Get content for all feeds in category

In [27]:
def get_feedlist(category, headers):
    '''Get all feeds for a given category.

    Parameters
    ----------
    category: str
        label of the category to retrieve
    headers: dict
        Authentication information for the feedly API

    Returns
    -------
    list
        list of feed IDs of the subscriptions that carry the category label,
        in the order the subscriptions were returned
    '''
    subs = get_subscriptions(headers)

    # Check every category of a subscription, not only the first one: a feed
    # may sit in several categories, and one with an empty or missing
    # category list must be skipped rather than raise IndexError/KeyError.
    return [
        s["id"]
        for s in subs
        if any(c.get("label") == category for c in (s.get("categories") or []))
    ]

In [28]:
# Collect the feed ids filed under the "Real Estate News" category.
feedlist = get_feedlist("Real Estate News", headers)
feedlist

['feed/http://www.immonewsfeed.de/rss/newsfeed.php',
 'feed/http://blog.immobilienscout24.de/feed/',
 'feed/http://www.ifma.org/Feeds/ifma-news',
 'feed/http://www.jamesdearsley.co.uk/feed/',
 'feed/http://www.propertymanagementinsider.com/feed',
 'feed/http://realestatetechnews.com/blog?format=RSS',
 'feed/http://grundbuchblog.de/feed/',
 'feed/http://www.jgk.be.ch/jgk/de/index/direktion/organisation/hra/_jcr_content/middlePar/meldungslisteneu_50a.rss']

In [29]:
def download_feeds(feedIDs, headers, count=10):
    '''Download the contents for a list of feeds.

    Parameters
    ----------
    feedIDs: list
        feed IDs to retrieve
    headers: dict
        Authentication information for the feedly API
    count: int
        number of items to retrieve per feed

    Returns
    -------
    dict
        maps each feed ID to the result of get_feed_content for that feed
    '''
    # One get_feed_content call per feed, in the order the ids were given.
    return {
        feed_id: get_feed_content(feed_id, headers, count)
        for feed_id in feedIDs
    }

In [30]:
# Download 10 items for every feed in the category.
# NOTE(review): this rebinds `feedData` -- above it held a single stream
# response, from here on it is a dict keyed by feed id.
feedData = download_feeds(feedlist, headers, count=10)

In [31]:
# Fields of the response stored for one of the downloaded feeds.
feedData['feed/http://www.immonewsfeed.de/rss/newsfeed.php'].keys()

dict_keys(['updated', 'alternate', 'continuation', 'direction', 'id', 'items', 'title'])

In [32]:
# Feed ids for which content was downloaded.
feedData.keys()

dict_keys(['feed/http://www.jamesdearsley.co.uk/feed/', 'feed/http://www.ifma.org/Feeds/ifma-news', 'feed/http://www.propertymanagementinsider.com/feed', 'feed/http://realestatetechnews.com/blog?format=RSS', 'feed/http://www.immonewsfeed.de/rss/newsfeed.php', 'feed/http://www.jgk.be.ch/jgk/de/index/direktion/organisation/hra/_jcr_content/middlePar/meldungslisteneu_50a.rss', 'feed/http://grundbuchblog.de/feed/', 'feed/http://blog.immobilienscout24.de/feed/'])

In [33]:
# NOTE(review): duplicate of the previous cell; it can probably be removed.
feedData.keys()

dict_keys(['feed/http://www.jamesdearsley.co.uk/feed/', 'feed/http://www.ifma.org/Feeds/ifma-news', 'feed/http://www.propertymanagementinsider.com/feed', 'feed/http://realestatetechnews.com/blog?format=RSS', 'feed/http://www.immonewsfeed.de/rss/newsfeed.php', 'feed/http://www.jgk.be.ch/jgk/de/index/direktion/organisation/hra/_jcr_content/middlePar/meldungslisteneu_50a.rss', 'feed/http://grundbuchblog.de/feed/', 'feed/http://blog.immobilienscout24.de/feed/'])

In [34]:
# Print how many items were downloaded for each feed.
for feed_id, feed_response in feedData.items():
    print(len(feed_response["items"]))

10
10
10
10
10
2
10
10


In [35]:
# Scratch experiment with "r+" mode: read every line of test.txt, then write
# one more line through the same handle. The write happens after the whole
# file has been read (see the repeated trailing lines in the output of `a`
# further down).
# NOTE(review): "r+" requires test.txt to exist already, and the file grows
# on every run, so this cell is not idempotent.
with open("test.txt", "r+") as f:
    a = f.readlines()
    f.write("\na asdfasdf asdfasd asdfa")

In [36]:
# Split the third line that was read on single spaces.
a[2].split(" ")

['ddddxxx', 'asdfasdf', 'asdfasdf', 'asdfasdf\n']

In [37]:
# Show all lines that were read from test.txt.
a

['adsölkfjöakdslfadsölkfjöakdslfadsölkfjöakdslfadsölkfjöakdslfadsölkfjöakdslfadsölkfjöakdslfadsölkfjöakdslfadsölkfjöakdslffffffffffddddxxxddddxxx\n',
 'ddddxxx\n',
 'ddddxxx asdfasdf asdfasdf asdfasdf\n',
 ' asdfasdf asdfasd asdfa\n',
 ' asdfasdf asdfasd asdfa\n',
 'a asdfasdf asdfasd asdfa\n',
 'a asdfasdf asdfasd asdfa']

In [38]:
# Split every line that was read into its space-separated parts.
b = [line.split(" ") for line in a]

In [72]:
# Inspect one feed's response; only the last expression (the title) is
# displayed as the cell output.
feedData['feed/http://www.propertymanagementinsider.com/feed'].keys()
feedData['feed/http://www.propertymanagementinsider.com/feed']["title"]

'Property Management Insider'

In [80]:
def restructure_data(feedData):
    '''Restructure the feedly data so that each entry of the result is one
    article, identified by its id. Information stored on feed level (feed id
    and feed title) is added to each article.

    Parameters
    ----------
    feedData: dictionary
        dictionary returned by download_feeds

    Returns
    -------
    dictionary
        maps article id to a copy of the article extended with the keys
        'feedId' and 'feedTitle'; if the same article id occurs more than
        once, the last occurrence wins
    '''
    articles = {}

    for feed in feedData.values():
        for item in feed["items"]:
            # Build a new dict instead of writing into `item`, so the
            # caller's feedData is left untouched (shallow copy: nested
            # values are still shared with the input).
            articles[item["id"]] = {
                **item,
                "feedId": feed["id"],
                "feedTitle": feed["title"],
            }

    return articles



In [111]:
# Flatten the per-feed download into one dict keyed by article id.
feedDataSimple = restructure_data(feedData)

In [112]:
import os

# JSON file the downloaded articles are persisted to. The location can be
# overridden with the FEEDLY_DATA_PATH environment variable so the notebook
# is not tied to one machine; the default is the original hard-coded path.
DATA_PATH   = os.environ.get("FEEDLY_DATA_PATH", "D:/ZHAW/Masterarbeit/Data/feedlyData.json")

In [114]:
# Persist the restructured articles, merging them with the result of earlier
# runs when the data file already exists.
if not os.path.isfile(DATA_PATH):
    print("newfile")
    with open(DATA_PATH, "w") as f:
        json.dump(feedDataSimple, f)
else:
    print("existing file")
    with open(DATA_PATH, "r") as f:
        data = json.load(f)
    print(data.keys())

    # On duplicate article ids the freshly downloaded version wins.
    dataUpdated = {**data, **feedDataSimple}

    # Rewrite the file from scratch. Dumping through the same "r+" handle
    # right after json.load() appended the merged dict behind the old
    # content, leaving two JSON documents in one file that can no longer
    # be loaded.
    with open(DATA_PATH, "w") as f:
        json.dump(dataUpdated, f)
        

existing file
dict_keys(['hu94UCg3vuxifOgio/b16czFOOJs9C7thvt+mklFm78=_161955eecb4:71c9f92:3c0bb656', 'tWH7WySPByo3uihEWohLz5ZTo2J828AaTnFpHiUsvDo=_166411d098e:3cb700f:a84931b0', 'bQKkXy5cmm+X3n4LN5OkkjkiiKJk2/yeic/5hqvN8A8=_1653933e802:61846c9:6f86c10b', 'jC8FXwTDciYGLOAfM8hBAYM2nuXMg9GKL32sbB5vY+o=_164953f8c11:a2eb3d4:4e0f6129', 'bQKkXy5cmm+X3n4LN5OkkjkiiKJk2/yeic/5hqvN8A8=_16631626aa7:24d5b57:f06daea', 'jC8FXwTDciYGLOAfM8hBAYM2nuXMg9GKL32sbB5vY+o=_15d99015c4f:54a9563:9493fdce', 'hu94UCg3vuxifOgio/b16czFOOJs9C7thvt+mklFm78=_161b334e3b9:9cc1993:541f3c40', 'pAll0y5reb9EOfInG4eisUIzkoZc4xCx0HyK02/6FhU=_1664554162f:416ce71:f06daea', 'hu94UCg3vuxifOgio/b16czFOOJs9C7thvt+mklFm78=_161d73f0a00:a84bf:7c9981b6', 'iUJBEZ0t16JNn5gboRic0ZqvaO+Dp0W5X1EhHXrImLg=_161cc5a5ade:bfda50a:7157a206', 'iUJBEZ0t16JNn5gboRic0ZqvaO+Dp0W5X1EhHXrImLg=_161f06002a4:260022d:923c4a60', 'bQKkXy5cmm+X3n4LN5OkkjkiiKJk2/yeic/5hqvN8A8=_1663b022794:33cb3cb:a84931b0', 'pAll0y5reb9EOfInG4eisUIzkoZc4xCx0HyK02/6FhU=_1653e15c6