In [49]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36'
    }

def get_feed_articles_df(feedname,url):
    """
    Get article titles and create a DataFrame.
    
    Args:
        feedname (str): Name of the feed.
        url (str): URL of the XML feed.
    
    Returns:
        pandas.DataFrame: DataFrame containing article titles and feed name.
    """
    try:
        result = requests.get(url, headers=headers)

        soup = BeautifulSoup(result.text, "xml")        
        article_urls = [i.text for i in soup.findAll('link')]

        #The verge has the links in the id tag, if the list is empty with the link tag, try the id tag
        if len([item for item in article_urls if bool(item)])  == 0: 
            article_urls = [i.text for i in soup.findAll('id')]      
        
       
        #Parse it as html to get the links correctly, other wise In some websites, <media:title> is also returned as a link
        soup = BeautifulSoup(result.text, "html.parser")
        article_titles = [i.text for i in soup.findAll('title')]      
        
        df = pd.DataFrame({'Article_title': article_titles, 'Article_URL': article_urls[-len(article_titles):], 'Feedname': feedname})
        
        #Remove homepage from url list and empty url rows
        homepage = url.split('.com')[0] + '.com/'
        df = df[(df['Article_URL'] != homepage) & (df['Article_URL'] != '') ]        
        
        # Drop duplicate URLs
        df = df.drop_duplicates(subset=['Article_URL'], keep='first')

        return df

    except Exception as e:
        print("Error getting feed: ", e)
        return pd.DataFrame()

In [50]:
import sqlite3

db_name = 'RssFeeds.db'

def get_connection():
    """
    Establish a connection to a SQLite database.
    
    Args:
        db_name (str): Name of the SQLite database file.
    
    Returns:
        sqlite3.Connection: Connection object to the SQLite database.
    """
    try:
        con = sqlite3.connect(db_name)
        return con
    except sqlite3.Error as e:
        print("Error connecting to database: ", e)
        return None

def create_db():
    """
    Create a new SQLite database and execute the given query to create tables.
    
    Args:
        db_name (str): Name of the SQLite database file.
        query (str): SQL query to create tables in the database.
    
    Returns:
        sqlite3.Connection: Connection object to the SQLite database.
    """
    
    query = "CREATE TABLE IF NOT EXISTS FEEDS( Feedname, Article_title UNIQUE, Article_URL, Duplicate, Date, Summary)"    

    con = get_connection()
    
    if con is None:
        return None
    
    try:
        cur = con.cursor()
        cur.execute(query)
        con.commit()
        #return con
    except sqlite3.Error as e:
        print("Error creating database: ", e)
        con.close()
        #return None
    
    print("DB created successfully")

def insert_to_db(data, query):
    """
    Insert data into SQLite database using executemany.
    
    Args:
        con (sqlite3.Connection): Connection object to the SQLite database.
        data (list of tuples): Data to be inserted into the database.
        query (str): SQL query for insertion.
    
    Returns:
        None
    """
    con = get_connection()

    if not data:
        print("No data to insert.")
        return
    
    try:
        cur = con.cursor()
        cur.executemany(query, data)
        con.commit()
    except sqlite3.Error as e:
        print("Error inserting data into database: ", e)
        con.rollback()

def insert_to_FEEDS(data):
    con = get_connection()

    if len(data) == 0:
        print("No data to insert.")
        return
    
    try:
        cur = con.cursor()

        query = "INSERT Or REPLACE INTO FEEDS(Article_title,Article_URL,Feedname) VALUES (?, ?, ?)"

        cur.executemany(query, data)
        con.commit()
        con.close()
    except sqlite3.Error as e:
        print("Error inserting data into database: ", e)
        con.rollback()
        con.close()

def delete_from_db(tablename):
    """
    Delete data from SQLite database.
    
    Args:
        con (sqlite3.Connection): Connection object to the SQLite database.
        query (str): SQL query for deletion.
    
    Returns:
        None
    """
    con = get_connection()
    query = "DROP TABLE IF EXISTS " + tablename
    try:
        cur = con.cursor()
        cur.execute(query)
        con.commit()
    except sqlite3.Error as e:
        print("Error deleting data from database: ", e)
        con.rollback()


def query_db( query):
    """
    Execute a SQL query and fetch results from SQLite database.
    
    Args:
        con (sqlite3.Connection): Connection object to the SQLite database.
        query (str): SQL query to be executed.
    
    Returns:
        list of tuples: Result set fetched from the database.
    """
    con = get_connection()

    try:
        cur = con.cursor()
        cur.execute(query)
        return cur.fetchall()
    except sqlite3.Error as e:
        print("Error executing query: ", e)
        return []

In [51]:
feedlist={'Engadget':'https://www.engadget.com/rss.xml', 
          'The Verge':'https://www.theverge.com/rss/index.xml',
          'Techcrunch':'https://techcrunch.com/feed/',
          'Ars Technica':'https://feeds.arstechnica.com/arstechnica/index',
          'Jalopnik':'https://jalopnik.com/rss'}  

def refresh_feeds():
    for feed in feedlist:
        print('Getting and inserting data for ', feed)
        df = get_feed_articles_df(feed,feedlist[feed])  
        insert_to_FEEDS(df.values)

In [52]:
create_db()

DB created successfully


In [53]:
refresh_feeds()

Getting and inserting data for  Engadget


  k = self.parse_starttag(i)


Getting and inserting data for  The Verge
Getting and inserting data for  Techcrunch
Getting and inserting data for  Ars Technica
Getting and inserting data for  Jalopnik


In [54]:
#query_db("Select * from FEEDS")
pd.read_sql("Select * from FEEDS", get_connection())

Unnamed: 0,Feedname,Article_title,Article_URL,Duplicate,Date,Summary
0,Engadget,Marvelâs making an âinteractive storyâ b...,https://www.engadget.com/marvels-making-an-int...,,,
1,Engadget,"Ugh, Max subscription prices might be going up...",https://www.engadget.com/ugh-max-subscription-...,,,
2,Engadget,"Oh no, I think I want an iPad Pro now",https://www.engadget.com/oh-no-i-think-i-want-...,,,
3,Engadget,Nintendo just revealed a NES speedrunning coll...,https://www.engadget.com/nintendo-just-reveale...,,,
4,Engadget,The Google Pixel Watch 2 has never been cheaper,https://www.engadget.com/the-google-pixel-watc...,,,
...,...,...,...,...,...,...
148,Jalopnik,The 2024 BMW F900GS Has No Worlds Left To Conquer,https://jalopnik.com/the-2024-bmw-f900gs-has-n...,,,
149,Jalopnik,"The Incredible Tale Of 1907's 8,000-Mile Race ...",https://jalopnik.com/the-incredible-tale-of-19...,,,
150,Jalopnik,Bear Drags Crash Victim From Wreckage After Ca...,https://jalopnik.com/bear-drags-crash-victim-f...,,,
151,Jalopnik,"At $5,000, Is This 1981 Mercury Marquis A Libe...",https://jalopnik.com/at-5-000-is-this-1981-mer...,,,
