In [6]:
import pandas as pd
import numpy as np
import wrds
from tqdm import tqdm

In [2]:
# Establish connection to WRDS
db = wrds.Connection()

Enter your WRDS username [ec2-user]: ly229
Enter your password: ········


WRDS recommends setting up a .pgpass file.


Create .pgpass file now [y/n]?:  y


Created .pgpass file successfully.
You can create this file yourself at any time with the create_pgpass_file() function.
Loading library list...
Done


## Desrciptive Stat On Funds Holding under S&P500 and Russuell 3000

#### check tables under certain library(e.g Thomson-Reuters, CRSP) in wrds to find the correct name of database used

In [3]:
# List available tables in Thomson Reuters dataset
tables = db.list_tables(library="tfn")
print(tables)

['amend', 'avgreturns', 'company', 'form144', 'header', 'idfhist', 'idfnames', 'rule10b5', 's12', 's12names', 's12type1', 's12type2', 's12type3', 's12type4', 's12type5', 's12type6', 's12type7', 's12type8', 's34', 's34names', 's34type1', 's34type2', 's34type3', 's34type4', 's34type6', 'table1', 'table2']


### TNF Funding Holding Data - master file

In [4]:
# Query to retrieve mutual fund holdings data
query_tnf = """
    SELECT 
    fundno, fundname, 
    ticker, cusip, stkname, rdate, 
    assets/100 AS assets, 
    shares/1000000 AS shares_held, 
    shrout1
    
    FROM tfn.s12
    
    WHERE rdate BETWEEN '2023-01-01' AND '2023-12-31'
    AND country = 'UNITED STATES'
"""
#counts in millions(assets, shares, shrout1)

In [10]:
#huge dataset so inlcudes progress checking while downloading

# Use a temporary query to count total rows for progress tracking
count_query = "SELECT COUNT(shrout1) FROM tfn.s12 WHERE rdate BETWEEN '2023-01-01' AND '2023-12-31' AND country = 'UNITED STATES'"

# Get the total number of rows for progress tracking
total_rows = db.raw_sql(count_query).iloc[0, 0]

# Define chunk size
chunk_size = 10000  # Number of records per chunk

# Initialize an empty DataFrame to store the results
tnf = pd.DataFrame()

In [14]:
with tqdm(total=total_rows, desc='Downloading data', unit='row') as pbar:
    print("Loop Start")
    for chunk in db.raw_sql(count_query, chunksize=chunk_size):
        print("H")
        tnf = pd.concat([tnf, chunk], ignore_index=True)  # Concatenate each chunk into the main DataFrame
        pbar.update(len(chunk))  # Update the progress bar with the number of rows in the chunk


print(f"Total records downloaded: {len(tnf)}")

Downloading data:   0%|          | 0/8549371 [00:00<?, ?row/s]

Loop Start


Downloading data:   0%|          | 0/8549371 [01:06<?, ?row/s]


KeyboardInterrupt: 

In [None]:
# Execute the query and store the result in a DataFrame
tnf = db.raw_sql(query_tnf)

# Display the first few records
print(tnf.head())

In [None]:
dt_path_1 = 'DS_data/tnf.csv' 
tnf.to_csv(dt_path_1, index=False)

### S&P500 and Russell 3000 list from CRSP

In [5]:
# Describe the table to see available columns
db.describe_table(library="crsp", table="msp500list")  # Replace with your table name

Approximately 2047 rows in crsp.msp500list.


Unnamed: 0,name,nullable,type,comment
0,permno,True,INTEGER,CRSP Permanent Number
1,start,True,DATE,Date when the stock included in S&P500 Index
2,ending,True,DATE,Date when the stock excluded from S&P500 Index


In [20]:
db.describe_table(library="crsp", table="msf")

Approximately 5037353 rows in crsp.msf.


Unnamed: 0,name,nullable,type,comment
0,cusip,True,VARCHAR(8),CUSIP Header
1,permno,True,INTEGER,PERMNO
2,permco,True,INTEGER,PERMCO
3,issuno,True,INTEGER,Nasdaq Issue Number
4,hexcd,True,SMALLINT,Exchange Code Header
5,hsiccd,True,INTEGER,Standard Industrial Classification Code
6,date,True,DATE,Date of Observation
7,bidlo,True,"NUMERIC(11, 5)",Bid or Low Price
8,askhi,True,"NUMERIC(11, 5)",Ask or High Price
9,prc,True,"NUMERIC(11, 5)",Price or Bid/Ask Average


In [30]:
sp500_list = db.raw_sql("""
                        SELECT a.*, 
                        b.date, b.cusip,
                        c.comnam, c.ticker
                        
                        FROM 
                        crsp.msp500list AS a

                        JOIN
                        crsp.msf AS b ON a.permno = b.permno
                        
                        JOIN
                        crsp.msenames AS c ON a.permno = c.permno
                        
                        WHERE 
                        b.date >= a.start 
                        AND b.date<= a.ending
                        AND b.date>='01/01/1993'
                        order by b.date;
                        """
                        )

In [31]:
sp500_list

Unnamed: 0,permno,start,ending,date,cusip,comnam,ticker
0,24643,1957-03-01,2023-12-29,1993-01-29,44320110,ALCOA INC,AA
1,45356,1989-07-27,2009-03-16,1993-01-29,G5150210,TYCO INTERNATIONAL LTD NEW,TYC
2,70578,1989-01-12,2023-12-29,1993-01-29,27886510,ECOLAB INC,ECL
3,21207,1969-05-15,2023-12-29,1993-01-29,65163910,NEWMONT GOLDCORP CORP,NEM
4,18411,1944-06-07,2023-12-29,1993-01-29,84258710,SOUTHERN CO,SO
...,...,...,...,...,...,...,...
374145,82303,2017-06-19,2023-12-29,2023-12-29,G3223R10,EVEREST RE GROUP LTD,RE
374146,77129,2006-04-03,2023-12-29,2023-12-29,49446R10,KIMCO REALTY CORP,KIM
374147,44601,1987-12-24,2023-12-29,2023-12-29,05361110,AVERY DENNISON CORP,AVY
374148,60097,1986-10-23,2023-12-29,2023-12-29,G5960L10,MEDTRONIC INC,MDT


In [12]:
# Query to retrieve Russell 3000 companies from CRSP
russell3000_query = """
    SELECT permno, start, ending
    FROM crsp.mrussell3000
"""
russell3000_list = db.raw_sql(russell3000_query)

ProgrammingError: (psycopg2.errors.UndefinedTable) relation "crsp.mrussell3000" does not exist
LINE 3:     FROM crsp.mrussell3000
                 ^

[SQL: 
    SELECT permno, start, ending
    FROM crsp.mrussell3000
]
(Background on this error at: https://sqlalche.me/e/20/f405)