In [1]:
import pandas as pd
import numpy as np
import wrds

In [2]:
# Open the WRDS session that every SQL query in this notebook goes through.
# wrds.Connection() asks for the WRDS username and password interactively.
conn = wrds.Connection()

Enter your WRDS username [ec2-user]: ly229
Enter your password: ········


WRDS recommends setting up a .pgpass file.


Create .pgpass file now [y/n]?:  y


Created .pgpass file successfully.
You can create this file yourself at any time with the create_pgpass_file() function.
Loading library list...
Done


## Market cap from CRSP daily data (downsampled to monthly)

In [None]:
# Daily CRSP security file (crsp.dsf), restricted to May observations dated
# 1999-01-01 or later.
#   mrkcap_CRSP     : |prc * shrout| / 1000 for the individual security
#   tot_mrkcap_CRSP : the same quantity summed over every security that shares
#                     a permco on the same date
# NOTE(review): later cells read the column as `tot_mrkcap_crsp`, so the
# unquoted mixed-case aliases presumably come back lower-cased — confirm.
mktcap_crsp = conn.raw_sql("""
                        SELECT DISTINCT 
                        date AS date, 
                        permno, 
                        permco, 
                        cusip,
                        ABS(prc*shrout)/1000 AS mrkcap_CRSP, 
                        SUM(ABS(prc*shrout)/1000) OVER (PARTITION BY date, permco) AS tot_mrkcap_CRSP
                        
                        FROM crsp.dsf
                        
                        WHERE EXTRACT(MONTH FROM date) IN (5) 
                        AND date >= '1999-01-01' 
                        """)

In [None]:
# Parse the SQL date column into pandas datetimes (needed for resampling).
mktcap_crsp['date'] = mktcap_crsp['date'].pipe(pd.to_datetime)

In [None]:
# Collapse the daily CRSP rows to one row per security and calendar month
# (mean of the numeric columns within each month-end bin).
# set_index() returns a new frame here instead of mutating in place, so
# `mktcap_crsp` keeps its `date` column and this cell can be re-run without
# raising KeyError.
mktcap_crsp_resample = mktcap_crsp.set_index('date')
mktcap_crsp_monthly = (
    mktcap_crsp_resample
    .groupby(['permno', 'permco', 'cusip'])
    .resample('ME', include_groups=False)
    .mean()
    .reset_index()
)

In [None]:
# Persist the monthly CRSP market caps (row index is not written).
file_path_1 = "RR_data/mktcap_crsp_monthly.csv"
mktcap_crsp_monthly.to_csv(path_or_buf=file_path_1, index=False)

## Mktcap from Compustat - daily security info

### 1. aggregate data by gvkey -- (gvkey-iids to gvkey)

### 2. calculate prccd_vw -- sum(mktcap_COMP*prccd)/sum(mktcap_COMP)

In [None]:
# Daily Compustat security data (comp.secd) collapsed to one row per gvkey and
# date via GROUP BY, keeping May/June rows dated 1999-01-01 or later with
# curcdd = 'USD' and tpci = '0'.
#   prccd    : share-weighted average price, SUM(prccd*cshoc) / SUM(cshoc)
#   cshoc    : shares summed across the grouped rows, divided by 1,000,000
#   prccd_vw : value-weighted price, SUM(prccd*cshoc*prccd) / SUM(prccd*cshoc)
# NULLIF(..., 0) turns a zero denominator into NULL instead of a division error.
mktcap_comp = conn.raw_sql("""
                        SELECT DISTINCT 
                        datadate AS date,  
                        gvkey,
                        SUM(prccd*cshoc)/NULLIF(SUM(cshoc), 0) AS prccd,
                        SUM(cshoc)/1000000 AS cshoc,
                        SUM(prccd*cshoc*prccd)/NULLIF(SUM(prccd*cshoc), 0) AS prccd_vw
                        
                        FROM comp.secd
                        
                        WHERE EXTRACT(MONTH FROM datadate) IN (5,6) 
                        AND datadate >= '1999-01-01' 
                        AND curcdd = 'USD'
                        AND tpci in ('0')
                        
                        GROUP BY gvkey, datadate
                        """)

## Resampling to monthly frequency

In [None]:
# Parse the SQL date column into pandas datetimes.
mktcap_comp['date'] = mktcap_comp['date'].pipe(pd.to_datetime)

In [None]:
# Average the daily Compustat values within each gvkey and calendar month, then
# drop the rows that contain a missing value.
# The datetime conversion and set_index() are applied to a derived frame rather
# than in place, so `mktcap_comp` keeps its `date` column and the cell can be
# re-run without raising KeyError.
mktcap_comp_resample = (
    mktcap_comp
    .assign(date=pd.to_datetime(mktcap_comp['date']))
    .set_index('date')
)
mktcap_comp_re_monthly = (
    mktcap_comp_resample
    .groupby('gvkey')
    .resample('ME', include_groups=False)
    .mean()
    .reset_index()
    .dropna()
)

In [None]:
# Give the monthly averages monthly-style column names (…d -> …m).
mktcap_comp_monthly = mktcap_comp_re_monthly.rename(
    columns=dict(prccd='prccm', cshoc='cshom', prccd_vw='prccm_vw')
)

In [None]:
# Compustat market cap: monthly price times monthly shares.
mktcap_comp_monthly['tot_mktcap_comp'] = mktcap_comp_monthly['prccm'].mul(mktcap_comp_monthly['cshom'])

In [None]:
# Persist the monthly Compustat market caps (row index is not written).
file_path_2 = "RR_data/mktcap_comp_monthly.csv"
mktcap_comp_monthly.to_csv(path_or_buf=file_path_2, index=False)

## Quarterly Security Info - cshoq

### Get all share information (including non-traded share classes) from the quarterly financial reports

In [None]:
# Quarterly shares outstanding (cshoq) from comp.fundq, left-joined to the
# monthly security file comp.secm to pick up each issue id (iid) and its ajexm
# value (exposed here as ajexq).
# Rows match when gvkey is equal and both datadates fall in the same calendar
# month: each side is truncated to the month before the same interval is added.
# Rows with a NULL cshoq are excluded. A fundq row that matches several secm
# rows comes back once per match; one with no match keeps NULL iid/ajexq.
cshoq_2 = conn.raw_sql("""
                        SELECT 
                        a.gvkey, a.datadate, a.cshoq, 
                        b.iid, b.ajexm AS ajexq
                        
                        FROM comp.fundq AS a
                        
                        LEFT JOIN comp.secm AS b
                        
                        ON a.gvkey = b.gvkey
                        AND DATE_TRUNC('month', a.datadate) + INTERVAL '1 MONTH - 1 day' = DATE_TRUNC('month', b.datadate) + INTERVAL '1 MONTH - 1 day'

                        WHERE a.cshoq IS NOT NULL
                        """)

In [None]:
# Parse the SQL datadate column into pandas datetimes.
cshoq_2['datadate'] = cshoq_2['datadate'].pipe(pd.to_datetime)

In [None]:
# Select distinct rows.
# The frame returned by the query cell is `cshoq_2`. The name used here before,
# `cshoq_2_drop`, is not defined in any visible cell and raised NameError on a
# fresh kernel.
cshoq_m = cshoq_2.drop_duplicates(subset=['gvkey', 'iid', 'datadate', 'cshoq', 'ajexq'])
cshoq_m

In [None]:
# For every (gvkey, iid, datadate) combination keep the row with the largest cshoq.
cshoq_sort = cshoq_m.sort_values(['gvkey', 'iid', 'datadate']).reset_index(drop=True)
cshoq_group = cshoq_sort.groupby(['gvkey', 'iid', 'datadate'])
# idxmax returns, per group, the row label of the maximum cshoq.
idx_max = cshoq_group['cshoq'].idxmax()
cshoq_max = cshoq_sort.loc[idx_max].reset_index(drop=True)
cshoq_max

In [None]:
# Align the key name with the other frames, which use `date`.
cshoq_max = cshoq_max.rename(columns=dict(datadate='date'))

In [None]:
# Collapse the issue-level rows to one row per gvkey and date by averaging
# cshoq and ajexq across issues.
cshoq_merge = cshoq_max.groupby(['gvkey', 'date'], as_index=False)[['cshoq', 'ajexq']].mean()
cshoq_merge

In [None]:
# Upsample each gvkey's quarterly rows to month-end frequency and forward-fill,
# so every month carries the most recent quarterly values.
# `date` is moved into the index on a derived frame rather than in place, so
# `cshoq_merge` keeps its `date` column and the cell can be re-run without
# raising KeyError.
cshoq_ffill = (
    cshoq_merge
    .set_index('date')
    .groupby('gvkey')
    .resample('ME', include_groups=False)
    .ffill()
    .reset_index()
)

In [None]:
# Derive calendar helper columns from the month-end date.
cshoq_ffill['date'] = cshoq_ffill['date'].pipe(pd.to_datetime)
cshoq_ffill['year'] = cshoq_ffill['date'].dt.year
cshoq_ffill['month_day'] = cshoq_ffill['date'].dt.strftime('%m-%d')

# Keep only the May month-end row of each year.
russell_month_day = '05-31'
cshoq_ffill_may = cshoq_ffill.loc[cshoq_ffill['month_day'].eq(russell_month_day)]

In [None]:
# Persist the May snapshot of forward-filled quarterly shares (row index is not written).
file_path_3 = "RR_data/cshoq_ffill_may.csv"
cshoq_ffill_may.to_csv(path_or_buf=file_path_3, index=False)

## Monthly Security Info

### Add ajexm and cusip (first 6 digits, used as the company id) to mktcap_comp

In [None]:
# Distinct (gvkey, 6-character cusip prefix, date, ajexm) combinations from the
# monthly security file comp.secm, keeping May/June rows dated 1999-01-01 or
# later with curcdm = 'USD' and tpci = '0'.
# NOTE(review): iid is not selected, so a gvkey/date can still appear on more
# than one row when its issues carry different ajexm values — confirm this is
# acceptable for the later merge on ['gvkey', 'date'].
secm_1 = conn.raw_sql("""
                        SELECT DISTINCT
                        gvkey,
                        LEFT(cusip, 6) AS cusip_co,
                        datadate AS date, 
                        ajexm
                        
                        FROM comp.secm
                        
                        WHERE EXTRACT(MONTH FROM datadate) IN (5,6)
                        AND datadate >= '1999-01-01' 
                        AND curcdm = 'USD'
                        AND tpci IN ('0')
                        
                        """)

In [None]:
# Parse the SQL date column into pandas datetimes so it can be merged on.
secm_1['date'] = secm_1['date'].pipe(pd.to_datetime)

In [None]:
# Attach ajexm and cusip_co to the monthly Compustat market caps (left join, so
# every market-cap row is kept).
mktcap_comp_mm = mktcap_comp_monthly.merge(secm_1, how='left', on=['gvkey', 'date'])

### Dealing with Nontraded Shares

In [None]:
# Tag each row with its calendar month.
mktcap_comp_mm['month'] = mktcap_comp_mm['date'].dt.month

# Keep the May rows only; the helper column is dropped from the filtered frame.
mktcap_comp_may = (
    mktcap_comp_mm
    .loc[mktcap_comp_mm['month'].eq(5)]
    .drop(columns=['month'])
)

In [None]:
# Bring in the quarterly share counts so non-traded shares can be identified
# (left join: every May market-cap row is kept).
mktcap_cshoq = mktcap_comp_may.merge(cshoq_ffill_may, how='left', on=['gvkey', 'date'])

In [None]:
# Persist the merged market-cap / quarterly-shares frame (row index is not written).
file_path_4 = "RR_data/mktcap_cshoq.csv"
mktcap_cshoq.to_csv(path_or_buf=file_path_4, index=False)

In [None]:
# The calendar helper columns are no longer needed after the merge.
columns_to_drop = ['year', 'month_day']
mktcap_cshoq_all = mktcap_cshoq.drop(labels=columns_to_drop, axis='columns')

In [None]:
# Fill missing ajexm / ajexq values: ajexm defaults to 1, and a missing ajexq
# falls back to the (already filled) ajexm of the same row.
# The fallback must read ajexm from mktcap_cshoq_all. Reading it from
# mktcap_cshoq would reuse the unfilled column and leave ajexq as NaN wherever
# ajexm was missing.
mktcap_cshoq_all['ajexm'] = mktcap_cshoq_all['ajexm'].fillna(1)
mktcap_cshoq_all['ajexq'] = mktcap_cshoq_all['ajexq'].fillna(mktcap_cshoq_all['ajexm'])

In [None]:
# Rows without a quarterly share count are treated as having zero shares.
mktcap_cshoq_all['cshoq'] = mktcap_cshoq_all['cshoq'].fillna(value=0)

In [None]:
# tot_mktcap_COMP_ALL: value the excess of adjusted quarterly shares
# (cshoq * ajexq) over adjusted monthly shares (cshom * ajexm) at
# prccm_vw / ajexm, and add it to tot_mktcap_comp.
mktcap_cshoq_all['tot_mktcap_COMP_ALL'] = (
    mktcap_cshoq_all['cshoq'].mul(mktcap_cshoq_all['ajexq'])
    .sub(mktcap_cshoq_all['cshom'].mul(mktcap_cshoq_all['ajexm']))
    .mul(mktcap_cshoq_all['prccm_vw'])
    .div(mktcap_cshoq_all['ajexm'])
    .add(mktcap_cshoq_all['tot_mktcap_comp'])
)

# Where the share excess is negative, fall back to tot_mktcap_comp.
condition = (
    mktcap_cshoq_all['cshoq'].mul(mktcap_cshoq_all['ajexq'])
    .sub(mktcap_cshoq_all['cshom'].mul(mktcap_cshoq_all['ajexm']))
    .lt(0)
)
mktcap_cshoq_all.loc[condition, 'tot_mktcap_COMP_ALL'] = mktcap_cshoq_all.loc[condition, 'tot_mktcap_comp']

In [None]:
# Persist the Compustat market caps including non-traded shares (row index is not written).
file_path_5 = "RR_data/mktcap_comp_all.csv"
mktcap_cshoq_all.to_csv(path_or_buf=file_path_5, index=False)

### Merging and Ranking

In [None]:
# Company-level identifier: the first six characters of the CRSP cusip.
mktcap_crsp_monthly['cusip_co'] = mktcap_crsp_monthly['cusip'].astype(str).str.slice(stop=6)

In [None]:
# Match CRSP securities to Compustat companies on cusip prefix and date
# (left join: every CRSP row is kept).
Russell_1 = mktcap_crsp_monthly.merge(mktcap_cshoq_all, how='left', on=['cusip_co', 'date'])

In [None]:
# Work on an independent copy of the merged frame.
Russell_2 = Russell_1.copy()

# 1. Start from the CRSP market cap.
# 2. Where it is missing, fall back to the Compustat market cap.
Russell_2['tot_mktcap_r3'] = Russell_2['tot_mrkcap_crsp'].fillna(Russell_2['tot_mktcap_comp'])

# 3. Use the Compustat total market cap (tot_mktcap_COMP_ALL) where it is
#    strictly higher, e.g. because of OTC or non-tradable shares.
#    This is a vectorised replacement for the former row-wise apply().
#    Comparisons involving NaN are False, so rows missing either value keep
#    their current tot_mktcap_r3, exactly as before.
Russell_2['tot_mktcap_r3'] = Russell_2['tot_mktcap_r3'].mask(
    Russell_2['tot_mktcap_COMP_ALL'] > Russell_2['tot_mktcap_r3'],
    Russell_2['tot_mktcap_COMP_ALL'],
)

In [None]:
# Numeric market value used for filtering; unparseable entries become NaN.
Russell_2['mkt_value'] = pd.to_numeric(Russell_2['tot_mktcap_r3'], errors='coerce')

In [None]:
# Keep rows with a strictly positive market value. The explicit copy makes the
# filtered frame independent of Russell_2, so adding `Rank` below is a plain
# column assignment and does not trigger pandas' SettingWithCopyWarning.
Russell_2_filtered = Russell_2[Russell_2['mkt_value'] > 0].copy()

# Rank within each date in descending order (largest market cap gets rank 1).
Russell_2_filtered['Rank'] = Russell_2_filtered.groupby('date')['tot_mktcap_r3'].rank(ascending=False)

# Final frame with a fresh 0..n-1 index.
Russell_3 = Russell_2_filtered.reset_index(drop=True)

In [None]:
# Persist the ranked universe. The export must target file_path_6: writing to
# file_path_5 overwrote RR_data/mktcap_comp_all.csv and never created the
# ranking file.
file_path_6 = 'RR_data/Russell_3_rank.csv'
Russell_3.to_csv(file_path_6, index=False)