# Generate the Russell Rank Proxy

In [1]:
import pandas as pd
import numpy as np
import wrds

## Conncet to WRds cloud

In [50]:
###################
# Connect to WRDS #
###################
conn=wrds.Connection()

Enter your WRDS username [lu]: ly229
Enter your password: ········


WRDS recommends setting up a .pgpass file.


Create .pgpass file now [y/n]?:  y


Created .pgpass file successfully.
You can create this file yourself at any time with the create_pgpass_file() function.
Loading library list...
Done


### Create Market Captialization Variables using CRSP daily stock data

In [43]:
mktcap_crsp = conn.raw_sql("""
                        SELECT DISTINCT 
                        date AS date, 
                        permno, 
                        permco, 
                        ABS(prc*shrout)/1000 AS mrkcap_CRSP, 
                        SUM(ABS(prc*shrout)/1000) OVER (PARTITION BY date, permco) AS tot_mrkcap_CRSP
                        
                        FROM crsp.dsf
                        
                        WHERE EXTRACT(MONTH FROM date) IN (5,6) 
                        AND date >= '2000-05-15' 
                        AND date <= '2008-06-30'
                        """)

In [44]:
mktcap_crsp['date'] = pd.to_datetime(mktcap_crsp['date'])
mktcap_crsp_resample = mktcap_crsp
mktcap_crsp_resample.set_index('date', inplace=True)
mktcap_crsp_monthly = mktcap_crsp_resample.groupby(['permno', 'permco']).resample('ME', include_groups=False).mean().reset_index()
mktcap_crsp_monthly = mktcap_crsp_monthly.dropna()

In [79]:
#print(mktcap_crsp.head(20))
###################
#Count unique combinations of "date, permco" #
## mktcap_crsp_test = mktcap_crsp[0:100000] #
# combination_counts = mktcap_crsp_test.drop_duplicates(subset=['date', 'permco']).shape[0] #
# print(combination_counts) #
###################

### Market Cap with Compustat daily securities
- out of all types of securities, keep only US listed common/ordinary shares

In [None]:
mktcap_comp = conn.raw_sql("""
                        SELECT DISTINCT 
                        datadate AS date, 
                        gvkey, 
                        SUM(prccd*cshoc)/1000000 AS tot_mrkcap_COMP
                        
                        FROM comp.secd
                        
                        WHERE EXTRACT(MONTH FROM datadate) IN (5,6) 
                        AND datadate >= '1999-01-01' 
                        AND curcdd = 'USD'
                        AND tpci in ('0')
                        
                        GROUP BY gvkey, datadate
                        """)
mktcap_comp['date'] = pd.to_datetime(mktcap_comp['date'])

In [None]:
print(mktcap_comp.loc[mktcap_comp['permco'] == '1221'])

In [49]:
print(mktcap_comp.loc[mktcap_comp['gvkey'] == '133764'])

              date   gvkey  tot_mrkcap_comp
4107    2007-05-16  133764         34.82850
21033   2009-06-11  133764         30.10000
31838   2019-05-14  133764        107.97144
32610   2021-06-08  133764        167.00375
47987   2020-06-12  133764        118.65060
...            ...     ...              ...
429686  2006-06-20  133764         34.91670
432997  2018-06-22  133764        105.37800
447440  2008-05-09  133764         31.80100
450562  2024-06-13  133764        104.78219
458337  2015-05-15  133764         41.97654

[1062 rows x 3 columns]


In [17]:
mktcap_comp_resample = mktcap_comp
mktcap_comp_resample.set_index('date', inplace=True)
mktcap_comp_monthly = mktcap_comp_resample.groupby('gvkey').resample('ME', include_groups=False).mean().reset_index()
mktcap_comp_monthly = mktcap_comp_monthly.dropna()

              date   gvkey tot_mrkcap_comp
0       2020-06-03  013679           $6.31
2789    2023-06-15  013679          $21.27
11126   2024-05-09  013679           $7.01
43312   2015-06-16  013679         $105.07
68066   2021-05-26  013679          $30.16
...            ...     ...             ...
380379  2022-06-24  013679          $32.73
385226  2018-05-17  013679          $55.93
388535  2017-05-04  013679          $62.18
410672  2020-05-27  013679           $5.84
414463  2024-05-16  013679           $5.84

[508 rows x 3 columns]


### Compute MKRT Cap of the non-listed share classes
- find and clean data for traded classes from two dataset - fundq and secm(quarterly macro data and monthly securities data)
- find non-listed ones and merge

In [92]:
cshoq_1 = conn.raw_sql("""
                        SELECT 
                        a.gvkey, a.datadate, a.cshoq, a.fyr, 
                        b.iid, b.ajexm AS ajexq
                        
                        FROM comp.fundq AS a
                        
                        LEFT JOIN comp.secm AS b
                        
                        ON a.gvkey = b.gvkey
                        AND DATE_TRUNC('month', a.datadate) + INTERVAL '1 MONTH - 1 day' = DATE_TRUNC('month', b.datadate) + INTERVAL '1 MONTH - 1 day'
                        
                        WHERE a.cshoq IS NOT NULL
                        """)

cshoq_1['datadate'] = pd.to_datetime(cshoq_1['datadate'])

#select distinct
cshoq_m = cshoq_1.drop_duplicates(subset=['gvkey','iid' , 'datadate', 'cshoq', 'ajexq'])

#keep max cshoq
cshoq_sort = cshoq_m.sort_values(by=['gvkey', 'iid', 'datadate']).reset_index().drop('index', axis=1)
cshoq_group = cshoq_sort.groupby(['gvkey', 'iid', 'datadate'])
idx_max = cshoq_group['cshoq'].idxmax()
cshoq_max = cshoq_sort.loc[idx_max].reset_index(drop=True)

#unique_iids = cshoq_max['iid'].unique()
#print(unique_iids)

In [None]:
#populate forward(fill missing monthly data)
cshoq_max = cshoq_max.sort_values(by=['gvkey', 'iid', 'datadate'])

#cshoq_max['cshoq'] = cshoq_max.groupby(['gvkey', 'iid'])['cshoq'].transform(lambda x: x.ffill(limit=12))

cshoq_max.set_index('datadate', inplace=True)

# Perform forward fill within each group/Upsample to monthly frequency
cshoq_ffill = cshoq_max.groupby(['gvkey', 'iid']).resample('ME', include_groups=False).ffill().reset_index()

print(cshoq_ffill)


In [205]:
secm_1 = conn.raw_sql("""
                        SELECT DISTINCT
                        gvkey,
                        iid,
                        datadate,  
                        prccm,
                        iid,
                        ajexm,
                        cshom
                        
                        FROM comp.secm
                        
                        WHERE datadate >= '1999-01-01' 
                        AND curcdm = 'USD'
                        AND tpci IN ('0')
                        
                        """)

In [249]:
'''
secm_1_test = conn.raw_sql("""
                            SELECT DISTINCT
                            gvkey,
                            iid,
                            datadate,  
                            prccm,
                            ajexm,
                            cshom

                            FROM comp.secm

                            WHERE datadate >= '1999-01-01' 
                            AND curcdm = 'USD'
                            AND tpci IN ('0')
                            AND gvkey IN ('023523','100001')


                            """)
secm_1_test['datadate'] = pd.to_datetime(secm_1_test['datadate'])
df_secm1_test_sort = secm_1_test.sort_values(by=['gvkey', 'datadate']).reset_index().drop('index', axis=1)
'''

'\nsecm_1_test = conn.raw_sql("""\n                            SELECT DISTINCT\n                            gvkey,\n                            iid,\n                            datadate,  \n                            prccm,\n                            iid,\n                            ajexm,\n                            cshom\n\n                            FROM comp.secm\n\n                            WHERE datadate >= \'1999-01-01\' \n                            AND curcdm = \'USD\'\n                            AND tpci IN (\'0\')\n                            AND gvkey IN (\'023523\',\'100001\')\n\n\n                            """)\nsecm_1_test[\'datadate\'] = pd.to_datetime(secm_1_test[\'datadate\'])\ndf_secm1_test_sort = secm_1_test.sort_values(by=[\'gvkey\', \'datadate\']).reset_index().drop(\'index\', axis=1)\n'

In [250]:
#sort value need to apply 'drop index' in order to process the dataframe in correct order

In [251]:
# Define a function to fill NaNs only if preceding and succeeding values are the same
def fill_if_surrounded_by_same(series):
    filled = series.copy()
    for i in range(1, len(series) - 1):
        if pd.isna(series[i]) and series[i-1] == series[i+1]:
            filled[i] = series[i-1]
    return filled

DataFrame with conditional filling:
      gvkey iid   datadate   prccm iid  ajexm       cshom
252  023523  01 2021-05-31  0.3500  01    1.0  46415000.0
253  023523  01 2021-06-30  0.6045  01    1.0  46415000.0
254  023523  01 2021-07-31  0.1550  01    1.0  46415000.0
255  023523  01 2021-08-31  0.1150  01    1.0  46415000.0
      gvkey iid   datadate    prccm iid  ajexm        cshom
290  023523  01 2024-07-31   0.0101  01    1.0   46415000.0
291  100001  01 2003-07-31  37.3115  01    1.0          0.0
292  100001  01 2003-08-31  38.0870  01    1.0          0.0
293  100001  01 2003-09-30  36.6282  01    1.0          0.0
294  100001  01 2003-10-31  37.9303  01    1.0          0.0
295  100001  01 2003-11-30  41.1200  01    1.0          0.0
296  100001  01 2003-12-31  44.1840  01    1.0          0.0
297  100001  01 2004-01-31  46.2500  01    1.0          0.0
298  100001  01 2022-02-28  35.0100  01    1.0  261856000.0
299  100001  01 2022-03-31   0.0000  01    0.0  261856000.0


In [None]:
# Apply the function to the DataFrame column
df_secm1_test_sort['cshom'] = fill_if_surrounded_by_same(df_secm1_test_sort['cshom'])
df_secm1_test_sort.fillna(0, inplace=True)

print (df_secm1_test_sort.iloc[252:256, :])
print (df_secm1_test_sort.iloc[290:300, :])

In [254]:
secm_1['datadate'] = pd.to_datetime(secm_1['datadate'])
secm_1['cshom'] /= 1_000_000
secm_1['mktcap_COMP'] = secm_1['cshom'] * secm_1['prccm']

secm_1['date'] = secm_1['datadate'] + pd.offsets.MonthEnd(0)

#different_rows = secm_1_test[secm_1_test['datadate'] != secm_1_test['date']]
#print(different_rows) returns: none  -- dates transfer might be unnecessary

In [276]:
secm_1_test = secm_1[0:100000]

#dealing NaNs 
df_secm1_test_sort = secm_1_test.sort_values(by=['gvkey', 'datadate']).reset_index().drop('index', axis=1)
df_secm1_test_sort['cshom'] = fill_if_surrounded_by_same(df_secm1_test_sort['cshom'])
df_secm1_test_sort.fillna(0, inplace=True)

df_secm1_groups = df_secm1_test_sort.groupby(['gvkey', 'datadate'])

In [253]:
'''
specific_row = secm_1_test[(secm_1_test['gvkey'] == '011556') & (secm_1_test['datadate'] == '2002-06-30')]
test = ['348892','160709','023523','100001']

filtered = df_secm1_groups.filter(lambda x: x['gvkey'].iloc[0] in test)
specific_row = df_secm1_groups.loc[secm_1_test['gvkey'].isin(test)]
specific_1 = secm_1_test.loc[secm_1_test['gvkey'] == '100001']
na_in_A = secm_1_test[secm_1_test['ajexm'].isna()]

print(filtered)
'''

"\nspecific_row = secm_1_test[(secm_1_test['gvkey'] == '011556') & (secm_1_test['datadate'] == '2002-06-30')]\ntest = ['348892','160709','023523','100001']\n\nfiltered = df_secm1_groups.filter(lambda x: x['gvkey'].iloc[0] in test)\nspecific_row = df_secm1_groups.loc[secm_1_test['gvkey'].isin(test)]\nspecific_1 = secm_1_test.loc[secm_1_test['gvkey'] == '100001']\nna_in_A = secm_1_test[secm_1_test['ajexm'].isna()]\n\nprint(filtered)\n"

In [277]:
df_secm2_out = pd.DataFrame()
for idx, df_group in df_secm1_groups:
    if (len(df_group)>1):
        print (idx)
#        break
        tot_cshom = df_group.cshom.sum()
        tot_mktcap_COMP = df_group.mktcap_COMP.sum()
        if sum(df_group.mktcap_COMP*abs(df_group.prccm)>0):
        #if (sum(df_group.mktcap_COMP*abs(df_group.prccm))>0):
            print(sum(df_group.mktcap_COMP*df_group.prccm), sum(df_group.mktcap_COMP*abs(df_group.prccm)))
            prccm_vw= sum(df_group.mktcap_COMP*df_group.prccm)/sum(df_group.mktcap_COMP*abs(df_group.prccm))

            df_group.loc[:, "prccm_vw"] = prccm_vw
        else:
            #continue
            df_group.loc[:, "prccm_vw"] = 0
        df_group.loc[:, "tot_cshom"] = tot_cshom
        df_group.loc[:, "tot_mktcap_COMP"] = tot_mktcap_COMP

    #out.append([tot_cshom, tot_mktcap_COMP, prccm_vw, idx[0], idx[1]])
    #df_out = pd.DataFrame(data = out, columns = ['tot_cshom', 'tot_mktcap_COMP', 'prccm_vw', 'gvkey', 'date'])
                          #{'tot_cshom':tot_cshom, 'tot_mktcap_COMP': tot_mktcap_COMP, 'prccm_vw': prccm_vw, 'gvkey': idx[0], 
                          #      'date': idx[1]}, index=None) #, 'iid': df_group.iid.unique()[0], 'ajexm': df_group.ajexm.unique()[0]})
    #df_group = df_group.fillna(0)
    
        df_secm2_out = pd.concat([df_secm2_out, df_group])
    #break
df_secm2_out

('001823', Timestamp('2015-08-31 00:00:00'))
58888.865605599996 58888.865605599996
('001823', Timestamp('2019-10-31 00:00:00'))
27393.5784776 27393.5784776
('002150', Timestamp('2008-03-31 00:00:00'))
11557.0935233 11557.0935233
('002710', Timestamp('2011-06-30 00:00:00'))
93052.5733116 93052.5733116
('004842', Timestamp('1999-04-30 00:00:00'))
18340.031089843753 18340.031089843753
('005567', Timestamp('2022-03-31 00:00:00'))
2593679.2159977998 2593679.2159977998
('005567', Timestamp('2023-01-31 00:00:00'))
3060877.6803399 3060877.6803399
('005763', Timestamp('2015-08-31 00:00:00'))
11066.6745719 11066.6745719
('006116', Timestamp('2013-09-30 00:00:00'))
50678.07568749999 50678.07568749999
('006379', Timestamp('2014-01-31 00:00:00'))
19495.5946812 19495.5946812
('006379', Timestamp('2019-10-31 00:00:00'))
22958.608102500002 22958.608102500002
('006710', Timestamp('2015-04-30 00:00:00'))
0.00510598802 0.00510598802
('008434', Timestamp('2000-02-29 00:00:00'))
4746.8509375 4746.8509375
(

Unnamed: 0,gvkey,iid,datadate,prccm,iid.1,ajexm,cshom,mktcap_COMP,date,prccm_vw,tot_cshom,tot_mktcap_COMP
1054,001823,01,2015-08-31,51.6600,01,1.000000,17.271,892.219860,2015-08-31,1.0,22.126,1141.475560
1055,001823,02,2015-08-31,51.3400,02,1.520875,4.855,249.255700,2015-08-31,1.0,22.126,1141.475560
1058,001823,02,2019-10-31,29.0000,02,1.000000,7.909,229.361000,2019-10-31,1.0,32.675,946.089040
1059,001823,01,2019-10-31,28.9400,01,1.000000,24.766,716.728040,2019-10-31,1.0,32.675,946.089040
1548,002150,02,2008-03-31,11.0000,02,1.000000,14.243,156.673000,2008-03-31,1.0,102.260,1087.012690
...,...,...,...,...,...,...,...,...,...,...,...,...
82040,160329,03,2016-01-31,742.9500,03,20.000000,345.504,256692.196800,2016-01-31,1.0,636.832,478494.769600
82268,160549,02,2012-10-31,61.0300,02,1.000000,10.207,622.933210,2012-10-31,1.0,154.695,9296.547850
82269,160549,01,2012-10-31,60.0300,01,1.000000,144.488,8673.614640,2012-10-31,1.0,154.695,9296.547850
89169,176065,01,2009-06-30,1.6699,01,1.000000,13.113,21.897399,2009-06-30,1.0,15.394,25.889149


In [271]:
values = [0, 1]
filtered_rows = df_secm2_out[~df_secm2_out['prccm_vw'].isin(values)]
print(filtered_rows)

Empty DataFrame
Columns: [gvkey, iid, datadate, prccm, iid, ajexm, cshom, mktcap_COMP, date, prccm_vw, tot_cshom, tot_mktcap_COMP]
Index: []


In [269]:
filtered_rows = df_secm2_out.loc[df_secm2_out['prccm_vw'] != 1]
print(filtered_rows)

       gvkey iid   datadate     prccm iid  ajexm   cshom  mktcap_COMP  \
28    001262  01 2018-02-28   12.3238  01    1.0    0.00          0.0   
40    001332  01 2004-12-31    0.0001  01    1.0    0.00          0.0   
114   001950  01 2007-11-30    0.0001  01    1.0    0.00          0.0   
115   001953  01 2005-05-31    0.0001  01    1.0    0.00          0.0   
143   002158  01 2003-12-31    0.0001  01    1.0    0.00          0.0   
...      ...  ..        ...       ...  ..    ...     ...          ...   
9978  316895  01 2017-04-30   24.7590  01    1.0    0.00          0.0   
9979  318434  01 2021-06-30  119.5100  01    1.0    0.00          0.0   
9985  327589  01 2020-10-31    2.7000  01    1.0    0.00          0.0   
9986  327675  01 2022-06-30   12.0522  01    1.0    0.00          0.0   
9997  348892  01 2023-02-28    0.0000  01    0.0  126.53          0.0   

           date  prccm_vw  tot_cshom  tot_mktcap_COMP  
28   2018-02-28       0.0       0.00              0.0  
40   2004-1

In [262]:
#zero_count = len(df_secm2_out[df_secm2_out['prccm_vw'] == 0])
#print(f"Number of zeros in the column: {zero_count}")

Number of zeros in the column: 780
