In [1]:
import sqlite3
import yfinance as yf
import pandas as pd
import numpy as np

In [2]:
conn = sqlite3.connect('sp500_stocks.db')
cursor = conn.cursor()

# Sectors, industry and sub-industry

In [3]:
mapping = pd.read_csv('gics_mapping.csv')
mapping.head()

Unnamed: 0,sector_id,sector,industry_id,industry,sub_industry_id,sub_industry
0,1,Energy,101,Energy Equipment & Services,10101,Oil & Gas Drilling
1,1,Energy,101,Energy Equipment & Services,10102,Oil & Gas Equipment & Services
2,1,Energy,102,"Oil, Gas & Consumable Fuels",10201,Integrated Oil & Gas
3,1,Energy,102,"Oil, Gas & Consumable Fuels",10202,Oil & Gas Exploration & Production
4,1,Energy,102,"Oil, Gas & Consumable Fuels",10203,Oil & Gas Refining & Marketing


In [4]:
#Populate sectors
sector_data = mapping[['sector_id', 'sector']].drop_duplicates()
sector_insert_data = []
for i in range(0, sector_data.shape[0]):
    # Ensure sector_id is int and sector is str
    sector_insert_data.append((int(sector_data['sector_id'].iloc[i]), str(sector_data['sector'].iloc[i])))
insert_sectors = """
INSERT OR IGNORE INTO sectors (sector_id, sector) 
VALUES (?, ?)"""

cursor.executemany(insert_sectors, sector_insert_data)

<sqlite3.Cursor at 0x18417e381c0>

In [5]:
type(sector_insert_data[0][0])

int

In [None]:
# Populate industries
industry_data = mapping[['sector_id', 'industry_id', 'industry']].drop_duplicates()
industry_insert_data = []
for i in range(0, industry_data.shape[0]):
    industry_insert_data.append((int(industry_data['sector_id'].iloc[i]), int(industry_data['industry_id'].iloc[i]), str(industry_data['industry'].iloc[i])))
insert_industry = """
INSERT OR IGNORE INTO industries (sector_id, industry_id, industry) 
VALUES (?, ?, ?)"""
cursor.executemany(insert_industry, industry_insert_data)

<sqlite3.Cursor at 0x18417e381c0>

In [None]:
# Populate sub-industries
sub_industry_data = mapping[['sector_id', 'industry_id', 'sub_industry_id', 'sub_industry']].drop_duplicates()
sub_industry_insert_data = []
for i in range(0, sub_industry_data.shape[0]):
    sub_industry_insert_data.append((int(sub_industry_data['sector_id'].iloc[i]), int(sub_industry_data['industry_id'].iloc[i]), int(sub_industry_data['sub_industry_id'].iloc[i]), str(sub_industry_data['sub_industry'].iloc[i])))
insert_sub_industry = """
INSERT OR IGNORE INTO sub_industries (sector_id, industry_id, sub_industry_id, sub_industry) 
VALUES (?, ?, ?, ?)"""
cursor.executemany(insert_sub_industry, sub_industry_insert_data)

<sqlite3.Cursor at 0x18417e381c0>

In [8]:
sub_industry_data.loc[sub_industry_data['sub_industry_id'].duplicated(keep=False),:]

Unnamed: 0,sector_id,industry_id,sub_industry_id,sub_industry


In [10]:
cursor.execute("SELECT * FROM sectors")
sectors = cursor.fetchall() 
sectors_df = pd.DataFrame(sectors, columns=['sector_id', 'sector'])
sectors_df

Unnamed: 0,sector_id,sector
0,1,Energy
1,2,Materials
2,3,Industrials
3,4,Utilities
4,5,Health Care
5,6,Financials
6,7,Consumer Discretionary
7,8,Consumer Staples
8,9,Information Technology
9,10,Communication Services


In [11]:
cursor.execute("SELECT * FROM industries")
industries = cursor.fetchall()
industries_df = pd.DataFrame(industries, columns=['industry_id', 'sector_id', 'industry'])
industries_df

Unnamed: 0,industry_id,sector_id,industry
0,1,101,Energy Equipment & Services
1,1,102,"Oil, Gas & Consumable Fuels"
2,2,201,Chemicals
3,2,202,Construction Materials
4,2,203,Containers & Packaging
...,...,...,...
68,11,1105,Health Care REITs
69,11,1106,Residential REITs
70,11,1107,Retail REITs
71,11,1108,Specialized REITs


In [None]:
cursor.execute("SELECT * FROM sub_industries")
sub_industries = cursor.fetchall()
sub_industries_df = pd.DataFrame(sub_industries, columns=['sub_industry_id', 'sector_id', 'industry_id', 'sub_industry'])
sub_industries_df

Unnamed: 0,sub_industry_id,sector_id,industry_id,sub_industry
0,1,101,10101,Oil & Gas Drilling
1,1,101,10102,Oil & Gas Equipment & Services
2,1,102,10201,Integrated Oil & Gas
3,1,102,10202,Oil & Gas Exploration & Production
4,1,102,10203,Oil & Gas Refining & Marketing
...,...,...,...,...
158,11,1108,110805,Data Center REITs
159,11,1109,110901,Diversified Real Estate Activities
160,11,1109,110902,Real Estate Operating Companies
161,11,1109,110903,Real Estate Development


# Stock Information

In [None]:
sp_comp_tickers = pd.read_html(
    'https://en.wikipedia.org/wiki/List_of_S%26P_500_companies')[0]['Symbol'].to_list()
sp_comp_data = []
for ticker in sp_comp_tickers:
    ticker = ticker.replace('.', '-')  # Replace '.' with '-' for yfinance compatibility
    print(f"Fetching data for {ticker}...")
    stock = yf.Ticker(ticker).info
    sp_comp_data.append((stock['symbol'], stock['shortName'], stock['sector'], stock['industry'], stock['marketCap'], stock['longBusinessSummary']))

Fetching data for MMM...
Fetching data for AOS...
Fetching data for ABT...
Fetching data for ABBV...
Fetching data for ACN...
Fetching data for ADBE...
Fetching data for AMD...
Fetching data for AES...
Fetching data for AFL...
Fetching data for A...
Fetching data for APD...
Fetching data for ABNB...
Fetching data for AKAM...
Fetching data for ALB...
Fetching data for ARE...
Fetching data for ALGN...
Fetching data for ALLE...
Fetching data for LNT...
Fetching data for ALL...
Fetching data for GOOGL...
Fetching data for GOOG...
Fetching data for MO...
Fetching data for AMZN...
Fetching data for AMCR...
Fetching data for AEE...
Fetching data for AEP...
Fetching data for AXP...
Fetching data for AIG...
Fetching data for AMT...
Fetching data for AWK...
Fetching data for AMP...
Fetching data for AME...
Fetching data for AMGN...
Fetching data for APH...
Fetching data for ADI...
Fetching data for AON...
Fetching data for APA...
Fetching data for APO...
Fetching data for AAPL...
Fetching data f

In [None]:
insert_stock_info = """
INSERT OR IGNORE INTO companies (ticker, name, sector, industry, market_cap, description)
VALUES (?, ?, ?, ?, ?, ?)"""
cursor.executemany(insert_stock_info, sp_comp_data)

<sqlite3.Cursor at 0x27bf5e401c0>

## Matching companies sectors and industries to according tables

In [None]:
cursor.execute("SELECT * FROM industries WHERE sector = 8")
sectors = cursor.fetchall()
industries_df = pd.DataFrame(sectors, columns=['industry_id', 'sector_id', 'industry'])
industries_df

Unnamed: 0,industry_id,sector_id,industry
0,801,8,Consumer Staples Distribution & Retail
1,802,8,Beverages
2,803,8,Food Products
3,804,8,Tobacco
4,805,8,Household Products
5,806,8,Personal Products


In [None]:
update_statement = """
    UPDATE companies
    SET sector = CASE sector
        WHEN 'Energy' THEN 1
        WHEN 'Basic Materials' THEN 2
        WHEN 'Industrials' THEN 3
        WHEN 'Utilities' THEN 4
        WHEN 'Healthcare' THEN 5
        WHEN 'Financial Services' THEN 6
        WHEN 'Consumer Cyclical' THEN 7
        WHEN 'Consumer Defensive' THEN 8
        WHEN 'Technology' THEN 9
        WHEN 'Communication Services' THEN 10
        WHEN 'Real Estate' THEN 11
        ELSE sector
    END
"""
cursor.execute(update_statement)

<sqlite3.Cursor at 0x27bf5e401c0>

In [None]:
test_query = """
SELECT * FROM companies
ORDER BY market_cap DESC
"""
cursor.execute(test_query)
top_companies = cursor.fetchall()
top_companies_df = pd.DataFrame(top_companies, columns=['ticker', 'name', 'sector', 'industry', 'market_cap', 'description'])
len(top_companies_df['industry'].unique())

114

In [None]:
# Commit changes and close the connection
conn.commit()
conn.close()

ProgrammingError: Cannot operate on a closed database.