# Create External Posts from RSS Feeds
Script by [J. Nathan Matias](https://natematias.com) 2018, made available under an [MIT License](https://opensource.org/licenses/MIT).

This script loads data from external RSS feeds, compares it to the files in \_external_posts, and creates new files for any non-blacklisted URLs appearing in the feeds. This allows a Jekyll website to include posts from RSS feeds in the site layout.

In [269]:
import feedparser, frontmatter, yaml, glob, os, re, slugify
from collections import Counter, defaultdict
from dateutil import parser

In [270]:
# Maps a publisher name to the URL of the RSS feed to import from.
# Each key is used as the `publisher` metadata value of the posts
# generated from that feed.
feeds = {
    "MIT": "https://civic.mit.edu/author/natematias/feed",
    "Princeton": "https://freedom-to-tinker.com/author/nmatias/feed/",
    "Medium": "https://medium.com/feed/@natematias",
    "ESN": "https://blog.emergingscholars.org/author/j-nathan-matias/feed/",
    "TheAtlantic": "https://www.theatlantic.com/feed/author/j-nathan-matias/",
    "PBS MediaShift": "http://mediashift.org/author/natematias/feed/",
    "Global Voices": "https://summit2012.globalvoices.org/author/nmatias/feed/"
}

### Load All Markdown Files in \_external_posts
Iterate through \_external_posts and extract the metadata so we can avoid creating duplicates. This code should never overwrite an existing markdown file, since we want to be able to customize individual markdown files after they are initially created.

In [274]:
# Index the posts that already exist on disk:
#   external_posts    -- front-matter metadata of each existing post, keyed by
#                        its `link` value
#   current_filenames -- every path matched inside _external_posts
external_posts = {}
current_filenames = []
for filename in glob.glob(os.path.join("_external_posts", "*")):
    current_filenames.append(filename)
    # "*" also matches subdirectories, which cannot be opened as files.
    if not os.path.isfile(filename):
        continue
    # Read explicitly as UTF-8 instead of the platform default encoding.
    with open(filename, "r", encoding="utf-8") as f:
        contents = f.read()
    if not contents:
        continue
    md = frontmatter.loads(contents)
    # A file without a `link` field (e.g. a hand-written one) cannot be
    # matched against feed entries; skip it rather than raise KeyError.
    link = md.metadata.get('link')
    if link is not None:
        external_posts[link] = md.metadata

### Load and Parse RSS Feeds

In [275]:
def slugify(value):
    """
    Convert a string into a lowercase, hyphen-separated slug.

    Removes every character that is not a word character, whitespace or a
    hyphen, trims surrounding whitespace, lowercases the result, and then
    collapses each run of whitespace and/or hyphens into a single hyphen.
    Modified from the Django codebase; unlike Django's version, no Unicode
    normalization or ASCII folding is applied, so non-ASCII letters are kept.

    Args:
        value: the text to convert.

    Returns:
        The slug as a string (empty if nothing usable remains).
    """
    # Raw strings: "\w" and "\s" are invalid escape sequences in ordinary
    # string literals and trigger a warning on current Python versions.
    value = re.sub(r'[^\w\s-]', '', value).strip().lower()
    return re.sub(r'[-\s]+', '-', value)
    
def create_markdown_from_entry(entry, publisher=None):
    """
    Build a `frontmatter.Post` from a single parsed feed entry.

    The post body is the first sentence of the entry's summary: HTML tags
    are stripped, the text is cut after the first "." or "?", and any
    "Read more" text is removed. If the summary contains neither character,
    the whole tag-stripped summary is used instead.

    Args:
        entry: mapping describing one feed item. `title` and `link` are
            required; `summary` and `published` are used when present.
        publisher: value for the post's `publisher` metadata. When omitted,
            the module-level `publisher_key` variable is used, so existing
            single-argument calls made inside the feed loop keep working.

    Returns:
        A `frontmatter.Post` whose metadata holds `title`, `date`,
        `publisher`, `link` and `filename` (the date plus a slug of the
        first five words of the title, without extension).

    Raises:
        KeyError: if `entry` has no `title` or `link`.
        NameError: if `publisher` is omitted and no global `publisher_key`
            exists.
    """
    if publisher is None:
        # Backward-compatible fallback to the global set by the feed loop.
        publisher = publisher_key

    content = ''
    if 'summary' in entry:
        ## strip HTML tags from the content
        summary = re.sub(r'<[^<]+?>', '', entry['summary'])
        ## keep everything up to and including the first "." or "?"
        first_sentence = re.search(r'(.*?)(\.|\?)', summary)
        # No match means the summary has no "." or "?"; fall back to the
        # whole text instead of failing on None.
        if first_sentence:
            text = first_sentence.group(0)
        else:
            text = summary.strip()
        content = text.replace("Read more", "")

    post_date = ''
    if 'published' in entry:
        post_date = str(parser.parse(entry['published']).date())

    ## TODO: FETCH IMAGE
    post = frontmatter.Post(content=content)
    post.metadata['title']     = entry['title']
    post.metadata['date']      = post_date
    post.metadata['publisher'] = publisher
    post.metadata['link']      = entry['link']
    # Only the first five words of the title go into the filename.
    filename_title = (" ".join(entry['title'].split(" ")[0:5])).lower()
    post.metadata['filename']  = post_date + "-" + slugify(filename_title)
    return post



In [276]:
## GROUP THE ENTRIES OF EVERY FEED BY PUBLISHER,
## EACH ONE CONVERTED INTO A POST READY FOR MARKDOWN OUTPUT
feed_articles = defaultdict(list)

# The loop variable must stay named `publisher_key`:
# create_markdown_from_entry reads it to fill in the `publisher` field.
for publisher_key, feed_url in feeds.items():
    d = feedparser.parse(feed_url)
    for entry in d.entries:
        feed_articles[publisher_key].append(create_markdown_from_entry(entry))

### Output Posts to Markdown Where Posts Don't Already Exist

In [277]:
# Write one markdown file per post that is not already on disk.
# A post is skipped when its link is already known (from an existing file or
# from a post written earlier in this run) or when its target file already
# exists, so a customized markdown file is never overwritten.
files_written = []
files_omitted = []
written_links = set()
for posts in feed_articles.values():
    for post in posts:
        filename = post.metadata['filename']
        link = post.metadata['link']
        # current_filenames holds full paths including the ".md" extension,
        # so compare against the full target path, not the bare filename.
        target_path = os.path.join("_external_posts", filename + ".md")
        already_present = (
            link in external_posts
            or link in written_links
            or target_path in current_filenames
            or os.path.exists(target_path)
        )
        if already_present:
            files_omitted.append(filename)
            continue
        # Write explicitly as UTF-8 instead of the platform default encoding.
        with open(target_path, "w", encoding="utf-8") as f:
            f.write(frontmatter.dumps(post))
        written_links.add(link)
        files_written.append(filename)

print("Wrote {0} new files".format(len(files_written)))
print("Omitted to write {0} existing files".format(len(files_omitted)))

Wrote 20 new files
Omitted to write 59 existing files
