In [1]:
# Helper Functions

### Libraries Used 

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime
import re
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
import os
import us
import time
import requests
import concurrent.futures

### Configuration and Custom Helper Functions

In [None]:
# Configuration
# Page count for the scrape; presumably the number of listing pages fed to
# get_article_info — TODO confirm against the cell that uses it.
nb_of_pages = 62
pd.set_option('display.max_colwidth', None)  # None lifts the column-width limit so cell text is shown in full

In [None]:
#custom function to get article info
def get_article_info(page_number, timeout=10):
    """Scrape one Refinery29 Money Diaries listing page into a DataFrame.

    Parameters
    ----------
    page_number : int
        Value placed in the ``page`` query parameter of the listing URL.
    timeout : float, optional
        Seconds to wait for the server before the request is abandoned
        (default 10). Without a timeout a stalled connection would block
        the whole scrape indefinitely.

    Returns
    -------
    pandas.DataFrame or None
        A frame with ``url`` and ``title`` columns, or ``None`` when the page
        yields no titles/URLs or when any error occurs (the error is printed,
        not raised).
    """
    # Navigation/promotional entries that share the title selector but are not diaries
    unwanted_titles = {"All Money Diaries", "The Secret Sauce To A Successful Budget"}

    try:
        time.sleep(1)  # Respectful scraping with a pause
        link = f"https://www.refinery29.com/en-us/money-diary?page={page_number}"
        response = requests.get(link, timeout=timeout)
        response.raise_for_status()  # Raises an HTTPError for bad responses

        html = BeautifulSoup(response.content, 'html.parser')

        # Extracting titles and URLs based on HTML structure
        titles = [span.text for span in html.select(".title span")]
        regular_article_urls = [a['href'] for a in html.select(".card a[href]")]
        article_url_hero = [a['href'] for a in html.select(".hero-card-full-width a[href]")]
        # Hero card first, then the regular cards, mirroring the title order used below
        article_urls = article_url_hero + regular_article_urls
        urls = ["https://www.refinery29.com" + url for url in article_urls]

        # Filter out unwanted titles
        filtered_titles = [title for title in titles if title not in unwanted_titles]

        if filtered_titles and urls:
            # A length mismatch between urls and titles raises ValueError here,
            # which is reported by the generic handler below.
            return pd.DataFrame({'url': urls, 'title': filtered_titles})

        print(f"No relevant data found on page {page_number}.")
        return None

    except requests.exceptions.HTTPError as http_err:
        print(f"HTTP error occurred on page {page_number}: {http_err}")
        return None
    except Exception as err:
        # Includes timeouts and connection errors; scraping is best-effort per page
        print(f"An error occurred on page {page_number}: {err}")
        return None

In [None]:
#custom function to get article text 
def get_article_text(url, timeout=10):
    """Download an article and return its ``.section-text`` elements.

    Parameters
    ----------
    url : str
        Address of the article page to fetch.
    timeout : float, optional
        Seconds to wait for the server before the request is abandoned
        (default 10), so a stalled connection cannot hang the scrape.

    Returns
    -------
    The result of ``BeautifulSoup.select(".section-text")`` on the parsed
    page (empty when no element matches).

    Raises
    ------
    requests.exceptions.RequestException
        Propagated from ``requests.get`` (e.g. connection failure or timeout).
    """
    response = requests.get(url, timeout=timeout)
    html = BeautifulSoup(response.content, 'html.parser')
    return html.select(".section-text")

In [None]:
#custom function to extract salary
def extract_salary(title):
    """Parse the dollar amount mentioned in an article title.

    Parameters
    ----------
    title : str
        Article title, e.g. ``"A Week In Austin, TX, On A $65,000 Salary"``.

    Returns
    -------
    float or None
        The parsed amount in dollars, or ``None`` when the title is not a
        string or contains no recognisable amount.

    Notes
    -----
    * ``"$65,000"``, ``"$65000"`` and ``"$65k"`` all yield ``65000.0``.
    * ``"1.5 million"`` yields ``1500000.0``.
    * Titles mentioning "hour" or "month" keep the original behaviour: the
      digits directly after ``$`` (up to the first comma) are multiplied
      by 1000.
    """
    if not isinstance(title, str):
        return None

    title_lower = title.lower()

    if "hour" in title_lower or "month" in title_lower:
        # NOTE(review): behaviour intentionally left unchanged. The x1000 factor
        # does not annualise an hourly or monthly figure -- confirm the intended
        # unit before relying on values from this branch.
        salary = re.search(r"\$\d+(?:\.\d+)?", title_lower)
        if salary:
            return float(salary.group().replace('$', '')) * 1000
        return None

    if "million" in title_lower:
        # Keep the decimal point: "1.5 million" is 1,500,000, not 15,000,000.
        salary = re.search(r"(\d+(?:\.\d+)?)\s*million", title_lower)
        if salary:
            return float(salary.group(1)) * 1000000
        return None

    # Group 1: integer part, with or without thousands separators.
    # Group 2: optional decimal part.  Group 3: optional "k" suffix.
    salary = re.search(r"\$(\d{1,3}(?:,\d{3})+|\d+)(\.\d+)?(k\b)?", title_lower)
    if salary:
        amount = float(salary.group(1).replace(',', '') + (salary.group(2) or ''))
        return amount * 1000 if salary.group(3) else amount

    return None

In [None]:
#custom function to extract state
def extract_state(location):
    """Derive a state abbreviation from a free-text location.

    Parameters
    ----------
    location : str
        Location text such as ``"Austin, Texas"`` or ``"Denver, CO"``.

    Returns
    -------
    str or None
        * The abbreviation of the state whose full name appears in
          ``location``, unless the text contains "New York", "Washington" or
          "Kansas City" (names that are also cities).
        * ``"NY"`` when the text mentions "New York City" or "NYC".
        * Otherwise the first word following ``", "``.
        * ``None`` when nothing matches or ``location`` is not a string.
    """
    if not isinstance(location, str):
        return None

    # Several names can match as substrings ("Virginia" inside "West Virginia");
    # the longest match is the most specific one.
    matching_states = [state for state in us.states.STATES if state.name in location]
    state_name = max(matching_states, key=lambda state: len(state.name), default=None)

    # Names shared with cities are not trusted as a state match.
    ambiguous_names = ("New York", "Washington", "Kansas City")
    if state_name and not any(name in location for name in ambiguous_names):
        return state_name.abbr

    if "New York City" in location or "NYC" in location:
        return "NY"

    # Fallback: take the word after the first ", " (typically "City, ST").
    match = re.search(r"(?<=, )\w+", location)
    return match.group(0) if match else None

In [None]:
#custom function to get monthly expenses 
def get_monthly_expenses(article_text):
    """Collect ``"Category: $amount"`` snippets near "Monthly Expenses" headings.

    Parameters
    ----------
    article_text : str
        HTML markup of the article; it is parsed with ``html.parser``.

    Returns
    -------
    list of str or None
        Every ``"<label>: $<digits>"`` match found in the parent element of a
        text node containing "Monthly Expenses" (commas are removed before
        matching, and only the whole-dollar digits are captured). ``None``
        when nothing is found.
    """
    # Initialize the list to hold expenses
    expenses = []

    # Parse the article text using BeautifulSoup
    soup = BeautifulSoup(article_text, "html.parser")

    # Find the text nodes that contain "Monthly Expenses".
    # `string=` is the current name of this filter; `text=` is deprecated.
    monthly_expenses_section = soup.find_all(string=re.compile(r'Monthly Expenses'))

    for section in monthly_expenses_section:
        # The enclosing element holds the heading together with the expense lines
        parent = section.find_parent()
        if parent:
            text_content = parent.get_text(separator=' ', strip=True)

            # Drop thousands separators so "$1,200" is matched as "$1200"
            cleaned = re.sub(r',', '', text_content)
            # Regex to scrape entries in the form "Category: $amount"
            extracted = re.findall(r"[^\.\(\):\d]*?: \$\d+", cleaned)
            expenses.extend(extracted)

    return expenses if expenses else None

In [None]:
#custom function to get age 
def get_age(article_text):
    """Return the first numeric age stated as ``Age: <number>``.

    Parameters
    ----------
    article_text : str or iterable of str
        Either the whole text (split into lines here) or pre-split lines.

    Returns
    -------
    int or None
        The first ``Age:`` value that is followed by digits; ``None`` when
        there is none or when ``article_text`` is ``None``.
    """
    if article_text is None:
        return None

    # Ensure article_text is a list of strings
    if isinstance(article_text, str):
        article_text = article_text.splitlines()

    for line in article_text:
        if not isinstance(line, str):
            continue
        # Keep scanning when an "Age:" line has no digits (e.g. "Age: unknown"),
        # so a later valid line is not hidden by an earlier unusable one.
        match = re.search(r"Age:\s*(\d+)", line)
        if match:
            return int(match.group(1))

    return None

In [None]:
#custom function to get occupation
def get_occupation(article_text):
    """Extract the value that follows the ``Occupation:`` label.

    Parameters
    ----------
    article_text : str
        Article text containing labelled fields such as ``Occupation:``.

    Returns
    -------
    str or None
        The text after ``Occupation:`` up to the next field label
        (Industry, Age, Location, Salary), the first comma, or the end of the
        text, with surrounding whitespace stripped. ``None`` when the label is
        absent or ``article_text`` is not a string.
    """
    if not isinstance(article_text, str):
        return None

    # The trailing \b makes the stop labels match only as complete words, so
    # "Travel Agent" is not cut at the "Age" inside "Agent". There is no
    # leading \b on purpose: run-together text like "TeacherIndustry:" must
    # still stop at the label.
    occupation = re.search(
        r"Occupation:\s*(.+?)(?=(?:Industry|Age|Location|Salary)\b|,|$)",
        article_text,
        re.DOTALL,
    )

    if occupation:
        occupation_str = occupation.group(1)

        # Clean up any trailing unwanted text that might have been captured
        occupation_str = re.sub(r"\b(Industry|Age|Location|Salary)\b.*", "", occupation_str)

        return occupation_str.strip()

    return None

In [None]:
#custom function to get industry
def get_industry(article_text):
    """Extract the value that follows the ``Industry:`` label.

    Parameters
    ----------
    article_text : str
        Article text containing labelled fields such as ``Industry:``.

    Returns
    -------
    str or None
        The text after ``Industry:`` up to the next field label
        (Age, Location, Salary), the first comma, or the end of the text, with
        surrounding whitespace stripped. ``None`` when the label is absent or
        ``article_text`` is not a string.
    """
    if not isinstance(article_text, str):
        return None

    # The trailing \b makes the stop labels match only as complete words, so
    # "Advertising Agency" is not cut at the "Age" inside "Agency". There is no
    # leading \b on purpose: run-together text like "EducationAge:" must still
    # stop at the label.
    industry = re.search(
        r"Industry:\s*(.+?)(?=(?:Age|Location|Salary)\b|,|$)",
        article_text,
        re.DOTALL,
    )

    if industry:
        industry_str = industry.group(1)

        # Clean up any trailing unwanted text that might have been captured
        industry_str = re.sub(r"\b(Age|Location|Salary)\b.*", "", industry_str)

        return industry_str.strip()

    return None

In [None]:
#custom function to get weekly spend 
def get_weekly_spend(article_text):
    """Sum every ``Daily Total: $<amount>`` figure found in the text.

    Parameters
    ----------
    article_text : str
        Article text containing zero or more ``Daily Total: $...`` entries.

    Returns
    -------
    float
        The sum of all daily totals; ``0.0`` when none are present.
    """
    # One pattern for every shape of amount: "20", "45.5", "1,234", "1,234.56".
    # Thousands separators and decimals are both optional, so "$1,234" is no
    # longer read as 1 and "$45.5" is no longer read as 45.
    amounts = re.findall(r"Daily Total:\s*\$\s*(\d[\d,]*(?:\.\d+)?)", article_text)

    # Sum the amounts after removing commas
    return sum((float(amount.replace(",", "")) for amount in amounts), 0.0)

In [None]:
#custom function to get rent/mortgage
def get_mortgage_rent(expenses):
    """Return the rent and mortgage amounts from a list of expense rows.

    Parameters
    ----------
    expenses : sequence of two-item rows
        Loaded into a DataFrame with columns ``Type`` and ``Amount``.

    Returns
    -------
    list of float
        Amounts of the rows whose ``Type`` contains "Rent" or "Mortgage"
        (case-insensitive) and does not contain "insurance".
    """
    table = pd.DataFrame(expenses, columns=['Type', 'Amount'])

    labels = table['Type']
    is_housing = labels.str.contains("Rent|Mortgage", case=False, na=False)
    is_insurance = labels.str.contains("insurance", case=False, na=False)

    # Housing rows only, leaving out e.g. renters/mortgage insurance
    housing_amounts = table.loc[is_housing & ~is_insurance, 'Amount']
    return housing_amounts.astype(float).tolist()