In [1]:
import os
from urllib.parse import quote

import pandas as pd
import pymysql
import requests
from bs4 import BeautifulSoup
from sqlalchemy import create_engine

### Connection to the Google Cloud database (used to export the news DataFrames)

In [2]:
# Connection settings are read from environment variables so that credentials
# never have to be typed into (or saved with) the notebook. Any variable that is
# not set falls back to an empty string.
driver = 'mysql+pymysql'
ip = os.environ.get('RSS_DB_HOST', '')
username = os.environ.get('RSS_DB_USER', '')
password = os.environ.get('RSS_DB_PASSWORD', '')
db = os.environ.get('RSS_DB_NAME', '')

# Percent-encode the user name and password: characters such as '@', ':' or '/'
# would otherwise be read as URL delimiters and corrupt the connection string.
cs = f'{driver}://{quote(username, safe="")}:{quote(password, safe="")}@{ip}/{db}'

# Engine shared by every cell below to append rows to the 'rss' table.
engine = create_engine(cs)

### Scraping RSS news from media

In [3]:
# List of every RSS feed scraped in this notebook

# 'https://www.lavanguardia.com/mvc/feed/rss/home'
# 'https://www.elperiodico.com/es/rss/rss_portada.xml'
# 'https://www.ara.cat/rss/'
# 'https://e00-elmundo.uecdn.es/elmundo/rss/portada.xml'
# 'http://ep00.epimg.net/rss/tags/ultimas_noticias.xml'

#### Generate series & dataframe with content (La Vanguardia)

In [4]:
url = 'https://www.lavanguardia.com/mvc/feed/rss/home'

print(url, ' start')

# Download the feed. The timeout keeps the cell from hanging forever on a stalled
# connection, and raise_for_status() stops an HTTP error page from being parsed
# as if it were a feed.
response = requests.get(url, timeout=30)
response.raise_for_status()
html = response.content
soup = BeautifulSoup(html, "xml")
items = soup.find_all('item')

# The media name is the label that follows "www." in the feed URL; it is the same
# for every row, so it is computed once instead of once per item.
media_name = url.split('www.')[1].split('.')[0]

# One Series per column, each holding one value per <item> in the feed.
title = pd.Series([item.title.text for item in items])
# <description> and <category> may be absent from an item; store '' for those
# instead of failing the whole feed with an AttributeError.
description = pd.Series([item.description.text if item.description is not None else '' for item in items])
pubdate = pd.Series([item.pubDate.text for item in items])
category = pd.Series([item.category.text if item.category is not None else '' for item in items])
# The link is stored without its "https://" prefix (assumes every link is https).
link = pd.Series([item.link.text.split('https://')[1] for item in items])
media = pd.Series([media_name for item in items])

# Assemble the columns into the DataFrame that is written to the database.
rss = pd.DataFrame({'title': title, 'description': description, 'pubdate': pubdate, 'category': category, 'link': link, 'media': media})

# Append the rows to the 'rss' table through the shared engine.
rss.to_sql(con=engine, name='rss', if_exists='append')

print(url, ' end')

https://www.lavanguardia.com/mvc/feed/rss/home  start
https://www.lavanguardia.com/mvc/feed/rss/home  end


#### Generate series & dataframe with content (El periodico)

In [5]:
url = 'https://www.elperiodico.com/es/rss/rss_portada.xml'

print(url, ' start')

# Download the feed. The timeout keeps the cell from hanging forever on a stalled
# connection, and raise_for_status() stops an HTTP error page from being parsed
# as if it were a feed.
response = requests.get(url, timeout=30)
response.raise_for_status()
html = response.content
soup = BeautifulSoup(html, "xml")
items = soup.find_all('item')

# The media name is the label that follows "www." in the feed URL; it is the same
# for every row, so it is computed once instead of once per item.
media_name = url.split('www.')[1].split('.')[0]

# One Series per column, each holding one value per <item> in the feed.
title = pd.Series([item.title.text for item in items])
# Description and category are not read from this feed; the columns are filled
# with empty strings.
description = pd.Series([''] * len(items))
pubdate = pd.Series([item.pubDate.text for item in items])
category = pd.Series([''] * len(items))
# The link is stored without its "https://" prefix (assumes every link is https).
link = pd.Series([item.link.text.split('https://')[1] for item in items])
media = pd.Series([media_name for item in items])

# Assemble the columns into the DataFrame that is written to the database.
rss = pd.DataFrame({'title': title, 'description': description, 'pubdate': pubdate, 'category': category, 'link': link, 'media': media})

# Append the rows to the 'rss' table through the shared engine.
rss.to_sql(con=engine, name='rss', if_exists='append')

print(url, ' end')

https://www.elperiodico.com/es/rss/rss_portada.xml  start
https://www.elperiodico.com/es/rss/rss_portada.xml  end


#### Generate series & dataframe with content (Diari ARA)

In [6]:
url = 'https://www.ara.cat/rss/'

print(url, ' start')

# Download the feed. The timeout keeps the cell from hanging forever on a stalled
# connection, and raise_for_status() stops an HTTP error page from being parsed
# as if it were a feed.
response = requests.get(url, timeout=30)
response.raise_for_status()
html = response.content
soup = BeautifulSoup(html, "xml")
items = soup.find_all('item')

# The media name is the label that follows "www." in the feed URL; it is the same
# for every row, so it is computed once instead of once per item.
media_name = url.split('www.')[1].split('.')[0]

# One Series per column, each holding one value per <item> in the feed.
title = pd.Series([item.title.text for item in items])
# Description and category are not read from this feed; the columns are filled
# with empty strings.
description = pd.Series([''] * len(items))
pubdate = pd.Series([item.pubDate.text for item in items])
category = pd.Series([''] * len(items))
# The link is stored without its "https://" prefix (assumes every link is https).
link = pd.Series([item.link.text.split('https://')[1] for item in items])
media = pd.Series([media_name for item in items])

# Assemble the columns into the DataFrame that is written to the database.
rss = pd.DataFrame({'title': title, 'description': description, 'pubdate': pubdate, 'category': category, 'link': link, 'media': media})

# Append the rows to the 'rss' table through the shared engine.
rss.to_sql(con=engine, name='rss', if_exists='append')

print(url, ' end')

https://www.ara.cat/rss/  start
https://www.ara.cat/rss/  end


#### Generate series & dataframe with content (El Mundo)

In [7]:
url = 'https://e00-elmundo.uecdn.es/elmundo/rss/portada.xml'

print(url, ' start')

# Download the feed. The timeout keeps the cell from hanging forever on a stalled
# connection, and raise_for_status() stops an HTTP error page from being parsed
# as if it were a feed.
response = requests.get(url, timeout=30)
response.raise_for_status()
html = response.content
soup = BeautifulSoup(html, "xml")
items = soup.find_all('item')

# One Series per column, each holding one value per <item> in the feed.
title = pd.Series([item.title.text for item in items])
# Description and category are not read from this feed; the columns are filled
# with empty strings.
description = pd.Series([''] * len(items))
pubdate = pd.Series([item.pubDate.text for item in items])
category = pd.Series([''] * len(items))
# The link is stored without its "https://" prefix and without any query string
# (assumes every link is https).
link = pd.Series([item.link.text.split('https://')[1].split('?')[0] for item in items])
# The feed URL has no "www." part, so the media name is taken from each item's
# own link instead: the label that follows "www." (assumes links contain "www.").
media = pd.Series([item.link.text.split('www.')[1].split('.')[0] for item in items])

# Assemble the columns into the DataFrame that is written to the database.
rss = pd.DataFrame({'title': title, 'description': description, 'pubdate': pubdate, 'category': category, 'link': link, 'media': media})

# Append the rows to the 'rss' table through the shared engine.
rss.to_sql(con=engine, name='rss', if_exists='append')

print(url, ' end')

https://e00-elmundo.uecdn.es/elmundo/rss/portada.xml  start
https://e00-elmundo.uecdn.es/elmundo/rss/portada.xml  end


#### Generate series & dataframe with content (El Pais)

In [8]:
url = 'http://ep00.epimg.net/rss/tags/ultimas_noticias.xml'

print(url, ' start')

# Download the feed. The timeout keeps the cell from hanging forever on a stalled
# connection, and raise_for_status() stops an HTTP error page from being parsed
# as if it were a feed.
response = requests.get(url, timeout=30)
response.raise_for_status()
html = response.content
soup = BeautifulSoup(html, "xml")
items = soup.find_all('item')

# One Series per column, each holding one value per <item> in the feed.
title = pd.Series([item.title.text for item in items])
# <description> may be absent from an item; store '' for those instead of
# failing the whole feed with an AttributeError.
description = pd.Series([item.description.text if item.description is not None else '' for item in items])
pubdate = pd.Series([item.pubDate.text for item in items])
# Category is not read from this feed; the column is filled with empty strings.
category = pd.Series([''] * len(items))
# The link is stored without its "https://" prefix and cut right after the first
# "html", dropping whatever follows it (assumes every link is https).
link = pd.Series([item.link.text.split('https://')[1].split('html')[0] + 'html' for item in items])
# The media name is the text between "https://" and the first "." of each link.
media = pd.Series([item.link.text.split('https://')[1].split('.')[0] for item in items])

# Assemble the columns into the DataFrame that is written to the database.
rss = pd.DataFrame({'title': title, 'description': description, 'pubdate': pubdate, 'category': category, 'link': link, 'media': media})

# Append the rows to the 'rss' table through the shared engine.
rss.to_sql(con=engine, name='rss', if_exists='append')

print(url, ' end')

http://ep00.epimg.net/rss/tags/ultimas_noticias.xml  start
http://ep00.epimg.net/rss/tags/ultimas_noticias.xml  end
