In [5]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import yfinance as yf

base_url = "https://aflcio.org/paywatch/company-pay-ratios?page={}"

all_rows = []
page_number = 0

while True:
    url = base_url.format(page_number)
    print(f"Scraping page {page_number} -> {url}")
    
    response = requests.get(url)
    if response.status_code != 200:

        print(f"Page {page_number} returned status {response.status_code}. Stopping.")
        break
    
    soup = BeautifulSoup(response.text, "html.parser")

    table = soup.find("table")
    if not table:
        print("No table found on this page. Stopping.")
        break

    headers = []
    thead = table.find("thead")
    if thead:
        headers = [th.get_text(strip=True) for th in thead.find_all("th")]
    
    tbody = table.find("tbody")
    if not tbody:
        print("No table body found. Stopping.")
        break
    
    rows = tbody.find_all("tr")
    if not rows:
        print("No data rows found on this page. Stopping.")
        break
    
    for row in rows:
        cols = row.find_all("td")
        row_data = [col.get_text(strip=True) for col in cols]
        all_rows.append(row_data)

    page_number += 1
df = pd.DataFrame(all_rows, columns=headers if headers else None)
print(f"Total rows scraped: {len(df)}")


Scraping page 0 -> https://aflcio.org/paywatch/company-pay-ratios?page=0
Scraping page 1 -> https://aflcio.org/paywatch/company-pay-ratios?page=1
Scraping page 2 -> https://aflcio.org/paywatch/company-pay-ratios?page=2
Scraping page 3 -> https://aflcio.org/paywatch/company-pay-ratios?page=3
Scraping page 4 -> https://aflcio.org/paywatch/company-pay-ratios?page=4
Scraping page 5 -> https://aflcio.org/paywatch/company-pay-ratios?page=5
Scraping page 6 -> https://aflcio.org/paywatch/company-pay-ratios?page=6
Scraping page 7 -> https://aflcio.org/paywatch/company-pay-ratios?page=7
Scraping page 8 -> https://aflcio.org/paywatch/company-pay-ratios?page=8
Scraping page 9 -> https://aflcio.org/paywatch/company-pay-ratios?page=9
Scraping page 10 -> https://aflcio.org/paywatch/company-pay-ratios?page=10


KeyboardInterrupt: 

In [28]:
tickers = df['Ticker'].to_list()
tickers

['NUS',
 'ANF',
 'COTY',
 'MAT',
 'YUMC',
 'UVV',
 'CNXC',
 'AMC',
 'SKX',
 'GPS',
 'ROST',
 'AEO',
 'UA',
 'KO',
 'LEA',
 'CHTR',
 'APTV',
 'ACN',
 'FLEX',
 'TJX',
 'CMG',
 'MAN',
 'ALGN',
 'ON',
 'ADV',
 'KTB',
 'FN',
 'MCD',
 'YUM',
 'DKS',
 'BBWI',
 'EDR',
 'BKE',
 'JBL',
 'HBI',
 'BURL',
 'FCFS',
 'SANM',
 'SBUX',
 'UFPT',
 'AEIS',
 'ALGM',
 'RCM',
 'WMT',
 'NKE',
 'ULTA',
 'WDC',
 'RCL',
 'STX',
 'EXLS',
 'QSR',
 'THRM',
 'DRVN',
 'PLYA',
 'TGLS',
 'CNK',
 'LULU',
 'FIVE',
 'LYV',
 'CCL',
 'LEVI',
 'ACMR',
 'CG',
 'PSMT',
 'CRI',
 'COMM',
 'APP',
 'PVH',
 'BIG',
 'AYI',
 'ADNT',
 'IIVI',
 'LITE',
 'PANW',
 'FL',
 'KSS',
 'TGT',
 'AMKR',
 'AZO',
 'BOOT',
 'FDP',
 'TPG',
 'KN',
 'MELI',
 'AXP',
 'AAPL',
 'BG',
 'SIX',
 'FICO',
 'PEP',
 'APH',
 'EL',
 'LFUS',
 'MDLZ',
 'NWL',
 'WW',
 'RBC',
 'CASY',
 'ST',
 'BCO',
 'LAUR',
 'GCO',
 'DIS',
 'GOLF',
 'TEL',
 'CBRL',
 'COOP',
 'CHRW',
 'AAP',
 'ARMK',
 'CZR',
 'LOW',
 'CTSH',
 'BH',
 'HLT',
 'MUSA',
 'LVS',
 'SWKS',
 'ATH',
 'ADI',
 'V

In [7]:
import yfinance as yf

In [29]:
last_year_rd_dict = {}

for ticker in tickers:
    try:
        # Pull the annual financials DataFrame
        financials = yf.Ticker(ticker).financials  
        
        # Check if "Research And Development" row exists
        if "Research And Development" in financials.index:
            # The leftmost column is financials.columns[0]
            latest_date_col = financials.columns[0]
            
            # Retrieve the R&D expense for the leftmost column
            last_year_rd = financials.loc["Research And Development", latest_date_col]
        else:
            last_year_rd = None
        
    except Exception as e:
        print(f"Error retrieving data for {ticker}: {e}")
        last_year_rd = None
    
    # Store result in a dictionary
    last_year_rd_dict[ticker] = last_year_rd

# Convert dictionary to a DataFrame
last_year_rd_df = pd.DataFrame(list(last_year_rd_dict.items()), 
                               columns=["Ticker", "Last_Year_RD"])



In [33]:
last_year_rd_df.dropna(inplace=True)
last_year_rd_df.to_csv("R&D.csv")

In [5]:
df.to_csv('pay_ratio.csv')