In [None]:
import requests
import time
import os
from multiprocessing import Pool, cpu_count
from functools import partial
from bs4 import BeautifulSoup, SoupStrainer
from google.colab import files
from fake_useragent import UserAgent # optional, to prevent failed requests


In [None]:
def fetch_comments(comments_url, timeout=30):
    """
    Fetches all comments for a given article.

    Params:
    comments_url - link to article comments, built by fetch function
    timeout - seconds to wait for the server before giving up (default 30)

    Return:
    list of comment strings in page order. The list is empty when the
    request fails, when the server answers with a non-200 status, or when
    no comment body is found, so callers can always iterate the result.
    """
    try:
        # A random User-Agent is sent to reduce the chance of a rejected request.
        headers = {'User-Agent': UserAgent().random}
        # Without a timeout a stalled connection would block the whole scrape.
        res = requests.get(comments_url, headers=headers, timeout=timeout)
        if res.status_code != 200:
            print(f"Skipping comments for {comments_url}: HTTP {res.status_code}")
            # Same type as every other exit path (this used to be False).
            return []

        soup = BeautifulSoup(res.content, 'html.parser')
        comment_texts = []
        for comment in soup.find_all('div', class_='comment'):
            # Matches only bodies whose class attribute is exactly 'commtext c00'.
            # NOTE(review): presumably this skips faded (downvoted) comments,
            # which carry a different colour class - confirm this is intended.
            comment_body = comment.find('div', class_='commtext c00')
            if not comment_body:
                continue
            # Join text fragments with a space so that consecutive paragraphs
            # of one comment are not glued together ("...end.Start...").
            text = comment_body.get_text(' ', strip=True)
            if text:
                comment_texts.append(text)

        return comment_texts

    except Exception as e:
        # Best effort: a single failing comment page must not abort the scrape.
        print(f"Error fetching comments for {comments_url}: {str(e)}")
        return []

In [None]:

def fetch(page_no, day, verbose=False, year=2025, month=3, timeout=30, comment_delay=1.0):
    """
    Fetches one Hacker News "front" page for a given day and writes the
    article titles, each followed by its comments, to a text file.

    Params:
    page_no - index of the front page to retrieve (1-based, capped at 20)
    day - day of the month to be fetched
    verbose - display progress info (page number and URL) while running
    year - year of the requested day (default 2025)
    month - month of the requested day (default 3, i.e. March)
    timeout - seconds to wait for the front page before giving up
    comment_delay - seconds to pause after each comment-page request, so the
                    server is not hit with a burst of back-to-back requests;
                    pass 0 to disable

    Return:
    path of the written file,
    'HackerNews/NewsPage<page_no>_<MM>_<day>.txt', or False when the page
    could not be retrieved (connection error or non-200 status).

    Raises:
    ValueError - if page_no is not greater than zero

    Each subtext row also carries the score, age and author of the story;
    only the title and the comments are written here.
    """
    if page_no <= 0:
        raise ValueError('Number of Pages must be greater than zero')
    page_no = min(page_no, 20)

    # The site expects zero-padded month and day in the query string.
    month_str = str(month).zfill(2)
    day_str = str(day).zfill(2)
    url = f'https://news.ycombinator.com/front?day={year}-{month_str}-{day_str}&p={page_no}'

    if verbose:
        print('Fetching Page {}...'.format(page_no))
        print(url)

    try:
        headers = {'User-Agent': UserAgent().random}
        res = requests.get(url, headers=headers, timeout=timeout)
        if res.status_code != 200:
            print('Page {} returned HTTP {}'.format(page_no, res.status_code))
            return False
        # Only <td> elements are needed, so skip parsing everything else.
        soup = BeautifulSoup(res.content, 'html.parser', parse_only=SoupStrainer('td'))
    except (requests.ConnectionError, requests.packages.urllib3.exceptions.ConnectionError):
        print('Connection Failed for page {}'.format(page_no))
        return False
    except requests.RequestException as e:
        print("Some ambiguous Request Exception occurred. The exception is " + str(e))
        return False

    tdtitle = soup.find_all('td', attrs={'class': 'title'})
    tdrank = soup.find_all('td', attrs={'class': 'title', 'align': 'right'})
    # Rank cells share the 'title' class; drop them to keep the title cells only.
    tdtitleonly = [t for t in tdtitle if t not in tdrank]
    tdmetrics = soup.find_all('td', attrs={'class': 'subtext'})
    # Rows are paired by position, so stop at the shortest of the three lists.
    num_iter = min(len(tdrank), len(tdtitleonly), len(tdmetrics))

    os.makedirs('HackerNews', exist_ok=True)
    out_path = os.path.join('HackerNews', 'NewsPage{}_{}_{}.txt'.format(page_no, month_str, day))

    with open(out_path, 'w', encoding='utf-8') as f:
        f.write('-' * 80)
        for idx in range(num_iter):
            titl = tdtitleonly[idx].find('a')
            f.write('\nArticle Title: ' + (titl.text if titl else 'Could not get article title'))

            # 'comment' (singular) also matches the "1 comment" link text.
            comments_link = tdmetrics[idx].find('a', string=lambda text: text and 'comment' in text)
            if not comments_link:
                continue

            comments_url = 'https://news.ycombinator.com/' + comments_link['href']
            comments = fetch_comments(comments_url)
            if comment_delay:
                time.sleep(comment_delay)

            if comments:
                f.write('\nComments:\n')
                for comment in comments:
                    f.write(comment + '\n')
            else:
                f.write('\nNo comments found.\n')

    return out_path



In [None]:
def calling_fetch(day1, day2, pages=3, verbose=True):
    """
    Calls the fetch function to get data across multiple days. Recommended to
    do it on a daily basis to prevent requests from crashing.

    Params:
    - day1, day2: days of the month to fetch; the loop is range(day1, day2),
      so day2 itself is NOT fetched
    - pages: number of front pages to fetch per day (capped at 20, default 3)
    - verbose: forwarded to fetch to display info while running

    Return:
    - the value returned by the last successful fetch call, or None when no
      page could be fetched (including an empty day range)
    """
    if pages > 20:
        print('A maximum of only 20 pages can be fetched')
    pages = min(pages, 20)

    last_file = None
    for d in range(day1, day2):
        print(f'Day {d}')
        try:
            for page_no in range(1, pages + 1):
                result = fetch(page_no, d, verbose)
                # Pause between page requests to go easy on the server.
                time.sleep(2)
                if not result:
                    # A failed page means the remaining pages of this day are
                    # unlikely to succeed either; move on to the next day.
                    break
                last_file = result
        except ValueError:
            print('\nInvalid input, probably not a positive integer\n')
            continue
    return last_file

Day 12
Fetching Page 1...
https://news.ycombinator.com/front?day=2025-03-12&p=1
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [403]>
<Response [504]>
<Response [504]>
<Response [504]>
<Response [504]>
<Response [504]>
<Response [504]>
<Response [403]>
<Response [504]>
<Response [504]>
<Response [504]>
<Response [504]>
<Response [504]>
<Response [504]>
<Response [504]>
<Response [504]>
<Response [504]>
<Response [504]>
Fetching Page 2...
https://news.ycombinator.com/front?day=2025-03-12&p=2
<Response [504]>


In [None]:
def process_articles(input_file, output_file):
    """
    Convert the fetched text into desired output of one article per line.

    The input is expected in the layout written by fetch: a line starting
    with 'Article Title:' opens an article, and an optional line starting
    with 'Comments:' is followed by the comment lines of that article.
    Any other line outside a comments section is ignored.

    Params:
    input_file - path of a text file produced by fetch
    output_file - writable text file object; one line per article is
                  written to it (title, then the comments, space-separated)

    Return: number of article lines written

    Raises:
    FileNotFoundError - if input_file does not exist
    """
    title_prefix = 'Article Title:'
    comments_marker = 'Comments:'

    with open(input_file, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    processed_articles = []

    def _emit(title, comments):
        # Space-separate the pieces so neighbouring comments do not run together.
        parts = [part for part in [title] + comments if part]
        if parts:
            processed_articles.append(' '.join(parts) + '\n')

    article_title = None
    article_comments = []
    in_comments = False

    for line in lines:
        if line.startswith(title_prefix):
            if article_title is not None:
                _emit(article_title, article_comments)
            # Strip only the leading label; the title itself may contain it.
            article_title = line[len(title_prefix):].strip()
            article_comments = []
            in_comments = False
        elif in_comments:
            # Everything up to the next title belongs to the current article.
            article_comments.append(line.strip())
        elif article_title is not None and line.startswith(comments_marker):
            in_comments = True

    if article_title is not None:
        _emit(article_title, article_comments)

    output_file.writelines(processed_articles)
    return len(processed_articles)

def process_articles_combined(f, day1, day2, output_file, pages=3, month=3):
    """
    Converts multiple days of individual files into a single one.

    For every day in range(day1, day2) and every page from 1 to `pages`,
    the file 'HackerNews/NewsPage<page>_<MM>_<day>.txt' is processed with
    process_articles and appended to the combined output. Missing files
    are reported and skipped.

    Params:
    f - unused; kept so existing calls that pass the value returned by
        calling_fetch keep working (input files are located by day and page)
    day1, day2 - day range to combine; day2 is exclusive
    output_file - path of the combined text file (overwritten)
    pages - number of pages per day to look for (default 3)
    month - month number used in the input file names (default 3)

    Return: None. output_file holds the data with one article per line.
    """
    month_str = str(month).zfill(2)
    with open(output_file, 'w', encoding='utf-8') as f_output:
        for d in range(day1, day2):
            for page_no in range(1, pages + 1):
                input_file = os.path.join(
                    'HackerNews',
                    'NewsPage{}_{}_{}.txt'.format(page_no, month_str, d),
                )
                print(page_no, input_file)
                try:
                    process_articles(input_file, f_output)
                except FileNotFoundError:
                    # A page that was never fetched is not an error here.
                    print(f'File {input_file} not found, skipping.')
                    continue





1 HackerNews/NewsPage1_03_12.txt
The DuckDB Local UI 1102 ["This looks pretty great. The UI looked fantastic, and the post mentioned that it was open source. However what's open source appears to be the DuckDB extension, which forwards the requests to a remote URL. I've not been able to find the code for the actual UI.Is the actual UI open source, or is that something MotherDuck is allowing to be used by this while remaining proprietary? Right now it doesn't appear like this would work without an internet connection.", 'Yeah, this is really concerning. The handwaving around "keeping the ui up to date" by hosting it on ui.duckdb.org instead of embedding it doesn\'t taste great to me.At least it\'s hosted on duckdb.org and not mother duck, but I really would expect to see that source somewhere. Disappointing unless I\'ve missed it.Breadcrumbs in the extension src: https://github.com/duckdb/duckdb-ui/blob/963e0e4d4c6f84b2536...', "Yes. So confirmation from Jeff Raymakers, a software engin

## Running the Scraper




In [None]:
# Make sure the output directory exists before any page file is written.
os.makedirs(os.path.join(os.getcwd(), 'HackerNews'), exist_ok=True)

# Days of the month to scrape. Both helpers iterate range(day1, day2),
# so day2 itself is excluded.
day1 = 1
day2 = 2

f = calling_fetch(day1, day2)
# NOTE(review): the name says "April" and "day12" although fetch requests
# March pages for day1..day2 - rename if the label matters downstream.
output_file = 'HackerNews/April_day12_pages1_3.txt'
process_articles_combined(f, day1, day2, output_file)
files.download(output_file)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>