In [1]:
# import libraries
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
import yfinance as yf

In [2]:
import warnings
# Suppress DeprecationWarning messages emitted after this point.
warnings.filterwarnings("ignore", category=DeprecationWarning)

# 1: ESG dataset from Kaggle

In [3]:
# Load the S&P 500 ESG risk ratings CSV (path is relative to the working directory).
esg = pd.read_csv('data/raw/SP 500 ESG Risk Ratings.csv')

In [4]:
# Preview the first 10 rows of the ESG table.
esg.head(10)

Unnamed: 0,Symbol,Name,Address,Sector,Industry,Full Time Employees,Description,Total ESG Risk score,Environment Risk Score,Governance Risk Score,Social Risk Score,Controversy Level,Controversy Score,ESG Risk Percentile,ESG Risk Level
0,A,Agilent Technologies Inc,"5301 Stevens Creek Boulevard\nSanta Clara, CA ...",Healthcare,Diagnostics & Research,18000,"Agilent Technologies, Inc. provides applicatio...",15.0,0.3,6.3,8.6,Low,1.0,11th percentile,Low
1,AAL,American Airlines Group Inc,"1 Skyview Drive\nFort Worth, TX 76155\nUnited ...",Industrials,Airlines,132500,"American Airlines Group Inc., through its subs...",29.0,12.0,5.0,12.0,Moderate,2.0,62nd percentile,
2,AAP,Advance Auto Parts Inc,"4200 Six Forks Road\nRaleigh, NC 27609\nUnited...",Consumer Cyclical,Specialty Retail,40000,"Advance Auto Parts, Inc. provides automotive r...",12.0,0.0,3.0,8.0,Moderate,2.0,4th percentile,Negligible
3,AAPL,Apple Inc,"One Apple Park Way\nCupertino, CA 95014\nUnite...",Technology,Consumer Electronics,164000,"Apple Inc. designs, manufactures, and markets ...",17.0,0.6,9.2,6.9,Significant,3.0,15th percentile,Low
4,ABBV,Abbvie Inc,"1 North Waukegan Road\nNorth Chicago, IL 60064...",Healthcare,Drug Manufacturers—General,50000,"AbbVie Inc. discovers, develops, manufactures,...",28.0,1.1,9.9,16.8,Significant,3.0,55th percentile,Medium
5,ABC,Amerisourcebergen Corp,"1 West First Avenue\nConshohocken, PA 19428-18...",Healthcare,Medical Distribution,46000,AmerisourceBergen Corporation sources and dist...,12.0,1.3,5.2,5.6,Significant,3.0,5th percentile,Low
6,ABT,Abbott Laboratories,100 Abbott Park Road\nAbbott Park\nNorth Chica...,Healthcare,Medical Devices,115000,"Abbott Laboratories, together with its subsidi...",25.0,3.0,8.4,13.6,Significant,3.0,44th percentile,Medium
7,ACGL,Arch Capital Group Ltd,Waterloo House\nGround Floor 100 Pitts Bay Roa...,Financial Services,Insurance—Diversified,5800,"Arch Capital Group Ltd., together with its sub...",21.0,1.0,12.0,7.0,Moderate,2.0,28th percentile,
8,ACN,Accenture Plc Cl A,1 Grand Canal Square\nGrand Canal Harbour\nDub...,Technology,Information Technology Services,732000,"Accenture plc, a professional services company...",10.0,0.3,4.8,4.6,Moderate,2.0,2nd percentile,Negligible
9,ADBE,Adobe Inc,"345 Park Avenue\nSan Jose, CA 95110-2704\nUnit...",Technology,Software—Infrastructure,29239,"Adobe Inc., together with its subsidiaries, op...",12.0,1.9,4.6,5.9,Low,1.0,5th percentile,Low


# 2: GDP from Wikipedia

In [5]:
# Scrape the first 'wikitable' on the Wikipedia GDP page into `gdp_df`.
# The first three cells of every row are kept as raw, stripped text; no numeric
# conversion or filtering of sub-header/blank rows happens here.
url = "https://en.wikipedia.org/wiki/List_of_U.S._states_and_territories_by_GDP"

# A timeout stops the cell from hanging indefinitely, and raise_for_status()
# surfaces HTTP errors instead of silently parsing an error page.
response = requests.get(url, timeout=30)
response.raise_for_status()
html_content = response.text
soup = BeautifulSoup(html_content, 'html.parser')

table = soup.find('table', {'class': 'wikitable'})
if table is None:
    raise ValueError(f"No table with class 'wikitable' found at {url}")

state_list = []
gdp_2022_list = []
gdp_2023_list = []

# Skip the first <tr> (top header row) and read every remaining row.
for row in table.find_all('tr')[1:]:
    columns = row.find_all(['th', 'td'])
    # Rows with fewer than three cells cannot provide state + two GDP values.
    if len(columns) < 3:
        continue
    state = columns[0].text.strip()
    gdp_2022 = columns[1].text.strip()
    gdp_2023 = columns[2].text.strip()

    state_list.append(state)
    gdp_2022_list.append(gdp_2022)
    gdp_2023_list.append(gdp_2023)


gdp_df = pd.DataFrame({'state':state_list,
                      'gdp_2022':gdp_2022_list,
                      'gdp_2023':gdp_2023_list})

In [6]:
# Row count of the scraped table (every <tr> after the first is kept, so this is not a pure state count).
len(gdp_df)

54

In [7]:
# Inspect the first rows; values are still raw scraped text at this point.
gdp_df.head(10)

Unnamed: 0,state,gdp_2022,gdp_2023
0,2022,2023.0,2022.0
1,,,
2,California *,3598103.0,3755487.0
3,Texas *,2355960.0,2436346.0
4,New York *,2053180.0,2135672.0
5,Florida *,1389070.0,1468015.0
6,Illinois *,1033310.0,1071552.0
7,Pennsylvania *,923089.0,961946.0
8,Ohio *,822670.0,852903.0
9,Georgia *,755698.0,792151.0


In [43]:
# the data has already been stored (the path below points to the "data/raw" folder), so the export is commented out
#gdp_df.to_excel('data/raw/gdp_raw.xlsx', index=False)

# 3: U.S State Name & Abbreviations from Wikipedia

In [8]:
# Scrape state names and abbreviations from the first <table> on the Wikipedia
# abbreviations page into `state_name_df`.
url = "https://en.wikipedia.org/wiki/List_of_U.S._state_and_territory_abbreviations"

# A timeout stops the cell from hanging indefinitely, and raise_for_status()
# surfaces HTTP errors instead of silently parsing an error page.
response = requests.get(url, timeout=30)
response.raise_for_status()
html_content = response.text
soup = BeautifulSoup(html_content, 'html.parser')

tables = soup.find_all('table')
if not tables:
    raise ValueError(f"No <table> element found at {url}")
table = tables[0]

state_list = []
state_abr_list = []

# The first 11 <tr> rows are skipped — presumably header/legend rows of the
# table; TODO confirm against the live page, since this offset is layout-dependent.
for row in table.find_all('tr')[11:]:
    cols = row.find_all(['th', 'td'])
    # Only rows with at least three cells carry both a name (cell 0) and the
    # abbreviation column read here (cell 2).
    if len(cols) >= 3:
        state = cols[0].get_text(strip=True)
        state_abr = cols[2].get_text(strip=True)

        state_list.append(state)
        state_abr_list.append(state_abr)


state_name_df = pd.DataFrame({'state':state_list,
                              'state_abbreviation':state_abr_list})

In [9]:
# Display the scraped name/abbreviation pairs for a visual check.
state_name_df

Unnamed: 0,state,state_abbreviation
0,United States of America,USUSA840
1,Alabama,US-AL
2,Alaska,US-AK
3,Arizona,US-AZ
4,Arkansas,US-AR
...,...,...
73,Nebraska,
74,Northern Mariana Islands,
75,Panama Canal Zone,PZPCZ594
76,Philippine Islands,PHPHL608[9]


In [64]:
# the data has already been stored (the path below points to the "data/raw" folder), so the export is commented out
#state_name_df.to_excel('data/raw/state_name_raw.xlsx', index=False)

# 4: Financial data from Yahoo API

In [35]:
# Ticker symbols to query, in the same order as the rows of the ESG table.
company_symbol_list = esg.loc[:, 'Symbol'].tolist()

In [36]:
# For every symbol, collect market cap, latest revenue and 1-year daily-return
# volatility from Yahoo Finance, printing a message for each value that is missing.
# Caveat: a failed lookup is recorded as 0, so a genuine 0 cannot be told apart
# from "not found"; this gives only a rough picture of what is missing.
market_cap_list = []
stock_volatility_list = []
latest_revenue_list = []

for company_symbol in company_symbol_list:
    company_data = yf.Ticker(company_symbol)

    # market value
    # `except Exception` (not a bare `except:`) so that interrupting the kernel
    # during this long loop is not swallowed and misreported as missing data.
    try:
        market_cap = company_data.info["marketCap"]
    except Exception:
        market_cap = 0
        print(company_symbol,'- market value not found')

    # revenue
    try:
        financials_df = company_data.financials
        # .iloc[0] takes the first column explicitly; plain [0] on a label-indexed
        # Series relies on a deprecated positional fallback.
        latest_revenue = financials_df.loc['Total Revenue'].iloc[0]
    except Exception:
        latest_revenue = 0
        print(company_symbol,'- revenue not found')

    # stock volatility: standard deviation of daily close-to-close returns
    try:
        historical_data = company_data.history(period="1y")
        stock_volatility = historical_data["Close"].pct_change().std()
        # An empty price history yields NaN without raising; report it like the
        # other missing values, but leave the NaN in place so results are unchanged.
        if pd.isna(stock_volatility):
            print(company_symbol,'-stock volatility not found')
    except Exception:
        stock_volatility = 0
        print(company_symbol,'-stock volatility not found')

    market_cap_list.append(market_cap)
    latest_revenue_list.append(latest_revenue)
    stock_volatility_list.append(stock_volatility)

ABC - market value not found
ABC - revenue not found


ABC: No data found, symbol may be delisted
BF.B: No price data found, symbol may be delisted (period=1y)


BF.B - market value not found
BF.B - revenue not found


In [37]:
# Combine the per-symbol lists into one table (one row per ticker) and display it.
fin_df = pd.DataFrame(
    data={
        'symbol': company_symbol_list,
        'market_value': market_cap_list,
        'latest_revenue': latest_revenue_list,
        'stock_volatility': stock_volatility_list,
    }
)
fin_df

Unnamed: 0,symbol,market_value,latest_revenue,stock_volatility
0,A,33349068800,6.848000e+09,0.016883
1,AAL,8103907840,4.897100e+10,0.023060
2,AAP,3153115392,1.115472e+10,0.031097
3,AAPL,2977583333376,3.832850e+11,0.014296
4,ABBV,244191821824,5.805400e+10,0.012325
...,...,...,...,...
498,YUM,35913060352,6.842000e+09,0.010160
499,ZBH,23510362112,6.939900e+09,0.013304
500,ZBRA,11326845952,5.781000e+09,0.025469
501,ZION,5300770816,3.152000e+09,0.039206


In [39]:
# the data has already been stored (the path below points to the "data/raw" folder), so the export is commented out
# fin_df.to_excel('data/raw/fin_raw.xlsx', index=False)