In [None]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import time
from datetime import datetime
from dateutil import parser
import re
from collections import Counter
import os

In [None]:
class BitcoinTalkScraper:
    """Scrape one BitcoinTalk board listing page into a pandas DataFrame.

    For every headline cell on the listing page the scraper records the
    headline text, the topic link, the text of the first ``div.post`` on the
    topic page, and a date extracted from the headline itself.

    Results accumulate in ``news_dict`` (keys ``date``, ``headline``,
    ``headline_link``, ``description``), so calling ``scrape_data`` twice on
    the same instance appends to the rows already collected.
    """

    # Seconds to wait for a single HTTP response before giving up; without a
    # timeout a stalled connection would block the crawl forever.
    REQUEST_TIMEOUT = 30
    # Pause (seconds) after each HTTP request to avoid overwhelming the server.
    REQUEST_DELAY = 1
    # Value stored in the "date" column when a headline has no usable date.
    MISSING_DATE = "Null"
    # Value stored in the "description" column when the topic page is unusable.
    MISSING_DESCRIPTION = "Failed to retrieve description"

    # Date formats recognised inside a headline, optionally wrapped in
    # brackets or parentheses: Y-M-D, Y/M/D, D-M-Y, D/M/Y, Y.M.D and D.M.Y.
    # Compiled once at class level instead of on every extract_date() call.
    _DATE_PATTERN = re.compile(
        r"[\[(]?"
        r"(?P<date>"
        r"\d{4}-\d{1,2}-\d{1,2}"
        r"|\d{4}/\d{1,2}/\d{1,2}"
        r"|\d{1,2}-\d{1,2}-\d{4}"
        r"|\d{1,2}/\d{1,2}/\d{4}"
        r"|\d{4}\.\d{1,2}\.\d{1,2}"
        r"|\d{1,2}\.\d{1,2}\.\d{4}"
        r")"
        r"[\])]?"
    )
    # Characters trimmed from the headline fragments left around a removed date.
    _HEADLINE_PADDING = " \t\r\n-:|,"

    def __init__(self, url):
        # URL of the board listing page to scrape
        self.url = url
        # Column-oriented store for the scraped rows
        self.news_dict = {
            "date": [],
            "headline": [],
            "headline_link": [],
            "description": [],
        }
        # Scratch attributes describing the entry currently being processed
        self.headline = ""
        self.description = ""
        self.link = ""
        self.date = None

    def parse_date(self, date_str):
        """
        Parse a date string with dateutil and normalise it to YYYY-MM-DD.

        The string is first parsed year-first; if that fails the parse is
        retried day-first. Returns None when neither attempt succeeds.
        """
        for dayfirst, yearfirst in ((False, True), (True, False)):
            try:
                parsed = parser.parse(date_str, dayfirst=dayfirst, yearfirst=yearfirst)
            except (ValueError, OverflowError):
                # dateutil raises ParserError (a ValueError) for unparseable
                # text and OverflowError for out-of-range numbers.
                continue
            return parsed.strftime("%Y-%m-%d")
        return None

    def extract_date(self):
        """
        Split ``self.headline`` into a date and the remaining headline text.

        When a recognised date is found and parses successfully, ``self.date``
        is set to it in YYYY-MM-DD form and the date (with any surrounding
        brackets) is removed from ``self.headline``; text on either side of
        the date is kept. Otherwise ``self.date`` is set to 'Null' and the
        headline is left unchanged.
        """
        match = self._DATE_PATTERN.search(self.headline)
        parsed = self.parse_date(match.group("date")) if match else None
        if parsed is None:
            self.date = self.MISSING_DATE
            return

        # Keep the text on both sides of the date, trimming the separators
        # (spaces, dashes, colons, ...) that were attached to it.
        fragments = (self.headline[:match.start()], self.headline[match.end():])
        trimmed = [fragment.strip(self._HEADLINE_PADDING) for fragment in fragments]
        self.headline = " ".join(fragment for fragment in trimmed if fragment)
        self.date = parsed

    def _fetch_description(self, link):
        """Return the text of the first ``div.post`` on the page at *link*.

        Falls back to the 'Failed to retrieve description' placeholder when
        the link is missing, the request fails, or the page has no post.
        """
        if not link:
            return self.MISSING_DESCRIPTION
        try:
            topic_res = requests.get(link, timeout=self.REQUEST_TIMEOUT)
            topic_res.raise_for_status()
        except requests.RequestException:
            return self.MISSING_DESCRIPTION
        finally:
            # Pause after every request, successful or not
            time.sleep(self.REQUEST_DELAY)

        post = BeautifulSoup(topic_res.content, 'html.parser').find('div', class_='post')
        if post is None:
            return self.MISSING_DESCRIPTION
        return post.get_text()

    def scrape_data(self):
        """
        Scrape headlines, dates, links and descriptions from ``self.url``.

        Returns a DataFrame built from ``news_dict`` with the columns
        ``date``, ``headline``, ``headline_link`` and ``description``.

        Raises ``requests.RequestException`` if the listing page cannot be
        fetched (timeout, connection error or HTTP error status), and
        ``AttributeError`` if the expected listing containers are missing
        from the page.
        """
        listing_res = requests.get(url=self.url, timeout=self.REQUEST_TIMEOUT)
        time.sleep(self.REQUEST_DELAY)  # Wait to avoid overwhelming the server
        listing_res.raise_for_status()

        # Locate the table cells that hold the headlines
        listing_soup = BeautifulSoup(listing_res.content, 'html.parser')
        bodyarea = listing_soup.find('div', id='bodyarea')
        headlines_table = bodyarea.find('div', class_='tborder').find('table', class_='bordercolor')
        cells = headlines_table.find_all('td', class_='windowbg')

        for cell in cells:
            title_tag = cell.find('span')
            if title_tag is None:
                # Cells without a <span> carry no headline
                continue

            # A cell without a link still yields a row; only its description
            # falls back to the failure placeholder.
            link_tag = cell.find('a')
            self.headline = title_tag.get_text()
            self.link = link_tag.get('href') if link_tag is not None else None
            self.description = self._fetch_description(self.link)

            # Move the date out of the headline text
            self.extract_date()

            # Append the row to the column store
            self.news_dict['date'].append(self.date)
            self.news_dict['headline'].append(self.headline)
            self.news_dict['headline_link'].append(self.link)
            self.news_dict['description'].append(self.description)

            # Reset the scratch attributes for the next headline
            self.headline = ""
            self.description = ""
            self.link = ""
            self.date = None

        # Return everything collected so far as a DataFrame
        return pd.DataFrame(self.news_dict)




In [None]:
def save_to_csv(df, file_name):
    """
    Append a DataFrame to a CSV file, creating the file if necessary.

    Missing parent directories are created first. The header row is written
    only when the file does not exist yet or exists but is empty, so repeated
    calls produce a single header followed by all appended rows. The index is
    never written.
    """
    # Make sure the target directory exists; to_csv does not create it
    parent_dir = os.path.dirname(file_name)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # An existing zero-byte file still needs its header row
    needs_header = not os.path.isfile(file_name) or os.path.getsize(file_name) == 0

    df.to_csv(file_name, mode='a', index=False, header=needs_header)



In [None]:
# Crawl every page of the BitcoinTalk news board, appending each page's
# headlines to a CSV file, until no "next" page link is left.
current_url = "https://bitcointalk.org/index.php?board=77.0"  # First page of the news board
next_url = ""  # Stays empty until the first "next" link has been followed
last_page = False
news_file_path = "data/unlabeled_news.csv"
request_timeout = 30  # Seconds to wait for the pagination request before giving up

while not last_page:
    # Scrape the current page and append its rows to the CSV file
    current_page_scraper = BitcoinTalkScraper(current_url)
    current_page_data = current_page_scraper.scrape_data()
    save_to_csv(current_page_data, news_file_path)

    # Fetch the current page again to read its pagination links
    homepage_response = requests.get(current_url, timeout=request_timeout)
    time.sleep(1)  # Wait to avoid overwhelming the server
    homepage_soup = BeautifulSoup(homepage_response.content, 'html.parser')

    # Previous/next navigation links shown at the top of the listing
    prevnext_links = (homepage_soup.find('div', id='bodyarea')).find('td', id='toppages').find_all('span', class_='prevnext')

    # No navigation links at all means the board has a single page. A single
    # link after at least one "next" has been followed is the "previous"
    # link, i.e. this is the last page. Stop before that link is followed.
    if not prevnext_links or (len(prevnext_links) == 1 and next_url != ""):
        last_page = True
        break

    # The "next" link, when present, is the last navigation entry
    next_link_tag = prevnext_links[-1].find('a', class_='navPages')
    next_href = next_link_tag.get('href') if next_link_tag is not None else None
    if next_href:
        next_url = next_href
        current_url = next_url  # Continue with the next page
    else:
        last_page = True  # No usable "next" link means we've reached the last page



