In [19]:
from read_feeds import get_feed_articles

In [244]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

headers = {
            'User-Agent': 'Chrome/124.0.0.0'
        }

def get_feed_articles_df(feedname,url):
    """
    Get article titles and create a DataFrame.
    
    Args:
        feedname (str): Name of the feed.
        url (str): URL of the XML feed.
    
    Returns:
        pandas.DataFrame: DataFrame containing article titles and feed name.
    """
    try:
        result = requests.get(url, headers=headers)

        soup = BeautifulSoup(result.text, "xml")        
        article_urls = [i.text for i in soup.findAll('link')]

        #The verge has the links in the id tag, if the list is empty with the link tag, try the id tag
        if len([item for item in article_urls if bool(item)])  == 0: 
            article_urls = [i.text for i in soup.findAll('id')]      
        
       
        #Parse it as html to get the links correctly, other wise In some websites, <media:title> is also returned as a link
        soup = BeautifulSoup(result.text, "html.parser")
        article_titles = [i.text for i in soup.findAll('title')]      

        print(article_urls)

        df = pd.DataFrame({'article_title': article_titles, 'url': article_urls[-len(article_titles):], 'feedname': feedname})
        
        #Remove homepage from url list and empty url rows
        homepage = url.split('.com')[0] + '.com/'
        df = df[(df['url'] != homepage) & (df['url'] != '') ]        
        
        # Drop duplicate URLs
        df = df.drop_duplicates(subset=['url'], keep='first')

        return df

    except Exception as e:
        print("Error getting feed: ", e)
        return pd.DataFrame()

In [245]:
#get_feed_articles_df('Engadget','https://www.engadget.com/rss.xml')
#get_feed_articles_df('The Verge','https://www.theverge.com/rss/index.xml')
#get_feed_articles_df('Techcrunch','https://techcrunch.com/feed/')
#get_feed_articles_df('Ars Technica','https://feeds.arstechnica.com/arstechnica/index')
get_feed_articles_df('Jalopnik','https://jalopnik.com/rss')

['https://jalopnik.com', '', 'https://jalopnik.com/real-estate-developer-allegedly-drives-car-into-crowd-o-1851462303', 'https://jalopnik.com/lighter-more-powerful-2025-bmw-m4-cs-is-the-kind-of-ov-1851461829', 'https://jalopnik.com/more-americans-watched-the-miami-grand-prix-than-any-f1-1851461255', 'https://jalopnik.com/you-should-at-least-bid-on-this-special-mclaren-senna-w-1851461844', 'https://jalopnik.com/car-seat-foam-could-expose-you-to-carcinogens-by-the-w-1851461709', 'https://jalopnik.com/the-best-hybrid-suvs-for-less-than-45-000-according-to-1851461347', 'https://jalopnik.com/the-faa-is-investigating-boeings-787-now-too-1851461323', 'https://jalopnik.com/tesla-is-staking-its-future-on-this-painfully-slow-robo-1851461288', 'https://jalopnik.com/jennifer-tillys-first-car-was-a-beat-up-1969-ford-musta-1851460707', 'https://jalopnik.com/this-minis-vs-mustangs-race-is-as-good-as-vintage-raci-1851460911', 'https://jalopnik.com/it-takes-11-miles-of-thread-and-2-2-million-stitches-t

  k = self.parse_starttag(i)


Unnamed: 0,article_title,url,feedname
1,Real Estate Developer Allegedly Drives Car Int...,https://jalopnik.com/real-estate-developer-all...,Jalopnik
2,"Lighter, More Powerful 2025 BMW M4 CS Is The K...",https://jalopnik.com/lighter-more-powerful-202...,Jalopnik
3,More Americans Watched The Miami Grand Prix Th...,https://jalopnik.com/more-americans-watched-th...,Jalopnik
4,You Should At Least Bid On This Special McLare...,https://jalopnik.com/you-should-at-least-bid-o...,Jalopnik
5,"Car Seat Foam Could Expose You To Carcinogens,...",https://jalopnik.com/car-seat-foam-could-expos...,Jalopnik
6,"The Best Hybrid SUVs For Less Than $45,000 Acc...",https://jalopnik.com/the-best-hybrid-suvs-for-...,Jalopnik
7,The FAA Is Investigating Boeing's 787 Now Too,https://jalopnik.com/the-faa-is-investigating-...,Jalopnik
8,Tesla Is Staking Its Future On This Painfully ...,https://jalopnik.com/tesla-is-staking-its-futu...,Jalopnik
9,Jennifer Tilly's First Car Was a Beat-Up 1969 ...,https://jalopnik.com/jennifer-tillys-first-car...,Jalopnik
10,This Minis Vs. Mustangs Race Is As Good As Vin...,https://jalopnik.com/this-minis-vs-mustangs-ra...,Jalopnik


Unnamed: 0,article_title,feedname
0,[You Should At Least Bid On This Special McLar...,Jalopnik
1,[Car Seat Foam Could Expose You To Carcinogens...,Jalopnik
2,"[The Best Hybrid SUVs For Less Than $45,000 Ac...",Jalopnik
3,[The FAA Is Investigating Boeing's 787 Now Too],Jalopnik
4,[Tesla Is Staking Its Future On This Painfully...,Jalopnik
5,[Jennifer Tilly's First Car Was a Beat-Up 1969...,Jalopnik
6,[This Minis Vs. Mustangs Race Is As Good As Vi...,Jalopnik
7,[It Takes 11 Miles Of Thread And 2.2 Million S...,Jalopnik
8,[Guy Who Nearly Lost A Finger In Tesla Cybertr...,Jalopnik
9,[Qantas Will Pay $66 Million For Ghosting Over...,Jalopnik


In [17]:
df = pd.DataFrame(columns=['feedname,article_title, duplicate'])
df.head()

Unnamed: 0,"feedname,article_title, duplicate"


In [21]:
rss_url = 'https://www.theverge.com/rss/index.xml'
df['article_title'] = get_feed_articles(rss_url)
df['duplicate'] = 0
df['feedname'] = 'The Verge'

df.head()

Unnamed: 0,"feedname,article_title, duplicate",article_title,duplicate,feedname
0,,"[Compression-mounted laptop RAM is fast, effic...",0,The Verge
1,,[This is the Sonos Roam 2 portable speaker],0,The Verge
2,,[Walmart’s Google-powered streaming box and sm...,0,The Verge
3,,[How to set up eSIM on a new iPhone],0,The Verge
4,,[Amazon adds 50 electric trucks to its deliver...,0,The Verge


In [8]:
import sqlite3

def get_connection(db_name):
    con = sqlite3.connect(db_name)
    
    return con

def create_db(db_name,query):
    con = get_connection(db_name)
    cur = con.cursor()

    try:
        cur.execute(query)   
    except Exception as e:
        con.close()
        raise e

    return con

def insert_to_db(con,data,query):
    #con = get_connection(db_name)
    
    cur = con.cursor()
    for item in data:
        try:
            cur.executemany(query, data)
        except Exception as e:
            con.commit()
            raise e
        
    con.commit()    

def delete_from_db(con,query):
    cur = con.cursor()
    cur.execute(query)

def query_db(con,query):
    cur = con.cursor()
    res = cur.execute(query)
    return res.fetchall()

In [2]:
con = create_db('movies.db','CREATE TABLE MOVIE(Title UNIQUE,year, score)')

In [3]:
query_db(con,'select * from MOVIE')

[]

In [5]:
data = [
    ("Monty Python Live at the Hollywood Bowl", 1982, 7.9),
    ("Monty Python's The Meaning of Life", 1983, 7.5),
    ("Monty Python's Life of Brian", 1979, 8.0),
]

insert_to_db(con,data,'INSERT INTO movie VALUES(?, ?, ?)')

query_db(con,'select * from movie')

IntegrityError: UNIQUE constraint failed: MOVIE.Title

In [10]:
import pandas as pd

df = pd.DataFrame(data=data,columns=['Title','year','score'])
df.head()

Unnamed: 0,Title,year,score
0,Monty Python Live at the Hollywood Bowl,1982,7.9
1,Monty Python's The Meaning of Life,1983,7.5
2,Monty Python's Life of Brian,1979,8.0


In [13]:
df.to_sql('MOVIE',con,if_exists='append', index=False)

3

In [14]:
query_db(con,'select * from movie')

[('Monty Python Live at the Hollywood Bowl', 1982, 7.9),
 ("Monty Python's The Meaning of Life", 1983, 7.5),
 ("Monty Python's Life of Brian", 1979, 8.0)]

In [12]:
delete_from_db(con,"DELETE FROM MOVIE WHERE title like 'Monty%'")
query_db(con,'select * from movie')

[]

In [None]:

con = create_db('feeds.db','CREATE TABLE IF NOT EXISTS feed_articles(feedname,article_title UNIQUE, duplicate)')

query_db(con,'select * from feed_articles')


In [None]:
con.close()

In [125]:
from bs4 import BeautifulSoup

# Your XML content
xml_content = """
<root>
    <title>This is the main title</title>
    <media:title>This is a media title</media:title>
    <link rel="stylesheet" type="text/css" href="styles.css" />
    <title>This is a secondary title</title>
    <link rel="icon" href="favicon.ico" type="image/x-icon" />
</root>
"""

# Parse the XML content
soup = BeautifulSoup(xml_content, 'xml')

# Find all occurrences of <title> and <link> tags
tags = soup.find_all(['title', 'link'])

# Print the content of each <title> and <link> tag
for tag in tags:
    print(tag.text)

This is the main title
This is a media title

This is a secondary title

