# Working with RSS Feeds Lab

Complete the following set of exercises to solidify your knowledge of parsing RSS feeds and extracting information from them.

In [1]:
import feedparser as fp

### 1. Use feedparser to parse the following RSS feed URL.

In [2]:
url = 'http://feeds.feedburner.com/oreilly/radar/atom'

In [3]:
rss = fp.parse(url)
rss

{'feed': {'title': 'Radar',
  'title_detail': {'type': 'text/plain',
   'language': None,
   'base': 'http://feeds.feedburner.com/oreilly/radar/atom',
   'value': 'Radar'},
  'links': [{'rel': 'alternate',
    'type': 'text/html',
    'href': 'https://www.oreilly.com/radar'},
   {'rel': 'self',
    'type': 'application/rss+xml',
    'href': 'http://feeds.feedburner.com/oreilly/radar/atom'},
   {'rel': 'hub',
    'href': 'http://pubsubhubbub.appspot.com/',
    'type': 'text/html'}],
  'link': 'https://www.oreilly.com/radar',
  'subtitle': 'Now, next, and beyond: Tracking need-to-know trends at the intersection of business and technology',
  'subtitle_detail': {'type': 'text/html',
   'language': None,
   'base': 'http://feeds.feedburner.com/oreilly/radar/atom',
   'value': 'Now, next, and beyond: Tracking need-to-know trends at the intersection of business and technology'},
  'updated': 'Fri, 06 Dec 2019 11:45:13 -0500',
  'updated_parsed': time.struct_time(tm_year=2019, tm_mon=12, tm_m

### 2. Obtain a list of components (keys) that are available for this feed.

In [4]:
list(rss.keys())

['feed',
 'entries',
 'bozo',
 'headers',
 'etag',
 'updated',
 'updated_parsed',
 'href',
 'status',
 'encoding',
 'version',
 'namespaces']

### 3. Obtain a list of components (keys) that are available for the *feed* component of this RSS feed.

In [5]:
list(rss["feed"])

['title',
 'title_detail',
 'links',
 'link',
 'subtitle',
 'subtitle_detail',
 'updated',
 'updated_parsed',
 'language',
 'sy_updateperiod',
 'sy_updatefrequency',
 'generator_detail',
 'generator',
 'feedburner_info',
 'geo_lat',
 'geo_long',
 'feedburner_emailserviceid',
 'feedburner_feedburnerhostname']

### 4. Extract and print the feed title, subtitle, author, and link.

In [6]:
print(rss['feed']['title'])
print(rss['feed']['subtitle'])
# print(rss['feed']['author']) I cannot find the feed author. There are entries authors though. 
print(rss['feed']['link'])

Radar
Now, next, and beyond: Tracking need-to-know trends at the intersection of business and technology
https://www.oreilly.com/radar


### 5. Count the number of entries that are contained in this RSS feed.

In [7]:
print(len(rss["entries"]))

18


### 6. Obtain a list of components (keys) available for an entry.

*Hint: Remember to index first before requesting the keys*

In [8]:
list(rss["entries"][0].keys())

['title',
 'title_detail',
 'links',
 'link',
 'comments',
 'published',
 'published_parsed',
 'authors',
 'author',
 'author_detail',
 'tags',
 'id',
 'guidislink',
 'summary',
 'summary_detail',
 'content',
 'wfw_commentrss',
 'slash_comments',
 'feedburner_origlink']

### 7. Extract a list of entry titles.

In [9]:
x = 0
entry_titles = []
while x < len(rss["entries"]):
    entry_titles.append(rss["entries"][x]["title"])
    x += 1
    
entry_titles


['Four short links: 6 December 2019',
 'Radar trends to watch: December 2019',
 'Four short links: 5 December 2019',
 'Four short links: 4 December 2019',
 'Use your people as competitive advantage',
 'Four short links: 3 December 2019',
 'A 5G future',
 'Four short links: 2 December 2019',
 'Four short links: 29 November 2019',
 'Four short links: 28 November 2019',
 'Four short links: 27 November 2019',
 'Moving AI and ML from research into production',
 'Four short links: 26 November 2019',
 'Four short links: 25 November 2019',
 'Four short links: 22 November 2019',
 'Why you should care about robotic process automation',
 'Unraveling the mystery of code',
 'Four short links: 21 November 2019']

### 8. Calculate the percentage of "Four short links" entry titles.

In [10]:
import re
        
FScount = [1 for i in entry_titles if len(re.findall("Four short links", i)) > 0]
Percentage = round(sum(FScount)/len(rss["entries"]) * 100,2)
print(f"Total percentage of Four Short links is {Percentage}%")

# is there a shorter or efficient way to do this?

Total percentage of Four Short links is 66.67%


### 9. Create a Pandas data frame from the feed's entries.

In [11]:
import pandas as pd

In [12]:
posts = []
for post in rss["entries"]:
    #for element in list(rss["entries"][0].keys()):
    posts.append([post["title"], post["summary"], \
                  post["links"], post["link"], post["author"]])
    
posts

df = pd.DataFrame(posts, columns=["title", "summary", "links", "link", "author"])

df

 

Unnamed: 0,title,summary,links,link,author
0,Four short links: 6 December 2019,Declarative Assembly of Web Applications From ...,"[{'rel': 'alternate', 'type': 'text/html', 'hr...",http://feedproxy.google.com/~r/oreilly/radar/a...,Nat Torkington
1,Radar trends to watch: December 2019,Privacy and security trends DNS over HTTPS is ...,"[{'rel': 'alternate', 'type': 'text/html', 'hr...",http://feedproxy.google.com/~r/oreilly/radar/a...,Mike Loukides
2,Four short links: 5 December 2019,Rediscovered Incomplete Infocom Text Adventure...,"[{'rel': 'alternate', 'type': 'text/html', 'hr...",http://feedproxy.google.com/~r/oreilly/radar/a...,Nat Torkington
3,Four short links: 4 December 2019,The Complexity Explorer &#8212; online courses...,"[{'rel': 'alternate', 'type': 'text/html', 'hr...",http://feedproxy.google.com/~r/oreilly/radar/a...,Nat Torkington
4,Use your people as competitive advantage,"In a fast-paced digital world, it is tempting ...","[{'rel': 'alternate', 'type': 'text/html', 'hr...",http://feedproxy.google.com/~r/oreilly/radar/a...,Pamela Rucker
5,Four short links: 3 December 2019,"Oxide.computer &#8212; a new hardware company,...","[{'rel': 'alternate', 'type': 'text/html', 'hr...",http://feedproxy.google.com/~r/oreilly/radar/a...,Nat Torkington
6,A 5G future,"For the past year, 5G cell technology has gene...","[{'rel': 'alternate', 'type': 'text/html', 'hr...",http://feedproxy.google.com/~r/oreilly/radar/a...,Mike Loukides
7,Four short links: 2 December 2019,Two Years at Dropbox &#8212; a lot of wisdom a...,"[{'rel': 'alternate', 'type': 'text/html', 'hr...",http://feedproxy.google.com/~r/oreilly/radar/a...,Nat Torkington
8,Four short links: 29 November 2019,A Visual Guide to Using BERT for the First Tim...,"[{'rel': 'alternate', 'type': 'text/html', 'hr...",http://feedproxy.google.com/~r/oreilly/radar/a...,Nat Torkington
9,Four short links: 28 November 2019,Raspberry Pi Recovery Kit &#8212; Pi for Prepp...,"[{'rel': 'alternate', 'type': 'text/html', 'hr...",http://feedproxy.google.com/~r/oreilly/radar/a...,Nat Torkington


### 10. Count the number of entries per author and sort them in descending order.

In [13]:
df.groupby("author").count().sort_values("title", ascending=False)

Unnamed: 0_level_0,title,summary,links,link
author,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Nat Torkington,12,12,12,12
Jenn Webb,2,2,2,2
Mike Loukides,2,2,2,2
Pamela Rucker,1,1,1,1
"Sunil Ranka, Roger Magoulas and Steve Swoyer",1,1,1,1


### 11. Add a new column to the data frame that contains the length (number of characters) of each entry title. Return a data frame that contains the title, author, and title length of each entry in descending order (longest title length at the top).

In [14]:
df2 = df[["title", "author"]]
df2["title length"] = df2["title"].str.len()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [15]:
df2.sort_values("title length", ascending=False)

Unnamed: 0,title,author,title length
15,Why you should care about robotic process auto...,"Sunil Ranka, Roger Magoulas and Steve Swoyer",52
11,Moving AI and ML from research into production,Jenn Webb,46
4,Use your people as competitive advantage,Pamela Rucker,40
1,Radar trends to watch: December 2019,Mike Loukides,36
9,Four short links: 28 November 2019,Nat Torkington,34
14,Four short links: 22 November 2019,Nat Torkington,34
13,Four short links: 25 November 2019,Nat Torkington,34
12,Four short links: 26 November 2019,Nat Torkington,34
10,Four short links: 27 November 2019,Nat Torkington,34
17,Four short links: 21 November 2019,Nat Torkington,34


### 12. Create a list of entry titles whose summary includes the phrase "machine learning."

In [16]:
import re
df.loc[df["summary"].str.contains("machine learning", flags=re.I, regex=True)]
tmp = df.loc[df["summary"].str.contains("machine learning", flags=re.I, regex=True)]
ml = list(tmp["title"])
ml
# Im not sure whether this is the best approach. Maybe there is a shorter way to get the same result.

['Four short links: 28 November 2019',
 'Moving AI and ML from research into production']