In [1]:
import pandas as pd
import numpy as np

In [2]:
# Define paths and file names 
target_path = 'Targets/daily_crsp.csv'

firm_charac_path = 'Predictors/CompFirmCharac.csv'
jkp_path = 'Predictors/jkp.csv'
#earnings_path = 'Predictors/earnings_calls.parquet'
#mda_path = 'Predictors/mda_text.parquet'

link_table_path = 'linking_table.csv'

# First date for which we have daily returns
CUTOFF_DATE = '2000-01-03'


In [3]:
def import_sanitize_daily_returns(target_path, nrows=None):
    """
    Import and sanitize the data frame of daily returns.

    Parameters:
     - target_path: the path to the CSV file containing the daily returns 
     - nrows: number of rows to load from the file. None if the whole dataset is to be loaded.
    """
    # Define only the necessary columns to import
    required_columns = ['PERMNO', 'PERMCO', 'DlyCalDt', 'SICCD', 'NAICS', 'DlyRet', 'sprtrn']

    # Load only the required columns
    daily_data = pd.read_csv(target_path, usecols=required_columns, nrows=nrows)

    # Drop rows with missing key data
    daily_data = daily_data.dropna(subset=['PERMNO', 'DlyCalDt', 'DlyRet'])

    # Convert dates
    daily_data['DlyCalDt'] = pd.to_datetime(daily_data['DlyCalDt'], errors='coerce')

    # Rename date column
    daily_data = daily_data.rename(columns={'DlyCalDt': 'date'})

    # Sort and deduplicate
    daily_data = daily_data.sort_values(by=['date', 'PERMCO', 'PERMNO'])
    daily_data = daily_data.drop_duplicates(subset=['PERMNO', 'date'])

    # Drop identifier columns 
    #drop_cols = ['CUSIP', 'HdrCUSIP', 'TradingSymbol', 'Ticker']
    #drop_cols = [col for col in drop_cols if col in daily_data.columns]
    #daily_data = daily_data.drop(columns=drop_cols)

    # Set multi-index
    daily_data = daily_data.set_index(['date', 'PERMCO', 'PERMNO'])

    return daily_data


In [4]:
daily_data = import_sanitize_daily_returns(target_path, nrows=None)

In [5]:
daily_data

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,SICCD,NAICS,DlyRet,sprtrn
date,PERMCO,PERMNO,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2000-01-03,5,15580,6320,0,0.000000,-0.009549
2000-01-03,7,14593,3573,0,0.088754,-0.009549
2000-01-03,25,62770,6711,0,-0.061489,-0.009549
2000-01-03,29,59184,2082,0,-0.012346,-0.009549
2000-01-03,33,59248,2082,0,-0.034524,-0.009549
...,...,...,...,...,...,...
2024-12-31,60118,26023,9999,0,-0.000393,-0.004285
2024-12-31,60120,26027,9999,523910,-0.194924,-0.004285
2024-12-31,60121,26034,9999,0,0.108590,-0.004285
2024-12-31,60122,26035,9999,551112,-0.034979,-0.004285


In [6]:
daily_data.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 46688527 entries, (Timestamp('2000-01-03 00:00:00'), np.int64(5), np.int64(15580)) to (Timestamp('2024-12-31 00:00:00'), np.int64(60123), np.int64(26037))
Data columns (total 4 columns):
 #   Column  Dtype  
---  ------  -----  
 0   SICCD   int64  
 1   NAICS   int64  
 2   DlyRet  float64
 3   sprtrn  float64
dtypes: float64(2), int64(2)
memory usage: 1.7 GB


In [7]:
daily_data.describe()

Unnamed: 0,SICCD,NAICS,DlyRet,sprtrn
count,46688530.0,46688530.0,46688530.0,46688530.0
mean,5778.286,372586.9,0.0004924532,0.000292751
std,2216.071,206537.6,0.04287394,0.01218781
min,0.0,0.0,-1.0,-0.119841
25%,3820.0,311920.0,-0.010952,-0.004835
50%,6360.0,511210.0,0.0,0.000589
75%,6726.0,525990.0,0.010548,0.006002
max,9999.0,999990.0,39.7253,0.1158


In [3]:
import os

def save_sanitized_csv(df, original_path, date_format='%Y-%m-%d'):
    """
    Save the sanitized DataFrame to a new CSV with '_sanitized' appended to the original filename.

    Parameters:
     - df: The sanitized DataFrame (with a datetime index).
     - original_path: The original CSV file path.
     - date_format: Format for datetime values in the output file.
    """
    # Get directory, filename, and extension
    base, ext = os.path.splitext(original_path)
    new_path = f"{base}_sanitized{ext}"

    # Reset index to save multi-index as columns
    df_to_save = df.reset_index()

    # Write to CSV with datetime formatting
    df_to_save.to_csv(new_path, index=False, date_format=date_format)

    print(f"Sanitized data saved to: {new_path}")


In [10]:
save_sanitized_csv(daily_data, target_path)

Sanitized data saved to: Targets/daily_crsp_sanitized.csv


In [4]:
def import_sanitize_firm_charac(
    firm_charac_path, 
    nrows=None, 
    missing_threshold=0.5, 
    required_unique=500, 
    cutoff_date='1925-05-30'
):
    """
    Imports and sanitizes the dataset containing firm characteristics.

    Parameters:
     - firm_charac_path: path to CSV file containing the data
     - nrows: number of rows to load in memory (None if the whole dataset is to be loaded)
     - missing_threshold: if a column contains more than missing_threshold*100% missing values, it is dropped
     - required_unique: if a column has fewer than required_unique unique values, it is dropped
     - cutoff_date: data starts from this date (inclusive)
    """
    # Load data
    comp = pd.read_csv(firm_charac_path, nrows=nrows, low_memory=False)

    # Convert datadate to datetime
    comp['datadate'] = pd.to_datetime(comp['datadate'], errors='coerce')

    # Filter by date
    cutoff = pd.to_datetime(cutoff_date)
    comp = comp[comp['datadate'] >= cutoff]

    # Drop rows where gvkey or datadate is missing
    comp = comp.dropna(subset=['gvkey', 'datadate'])

    # Drop duplicate (gvkey, datadate) pairs
    comp = comp.drop_duplicates(subset=['gvkey', 'datadate'])

    # Rename datadate to date for consistency
    comp = comp.rename(columns={'datadate': 'date'})

    # Sort data frame and set multi-index
    comp = comp.sort_values(by=['date', 'gvkey'])
    comp = comp.set_index(['date', 'gvkey'])

    # Drop identifier columns
    identifiers = ['cusip', 'tic', 'conm', 'exchg', 'cik', 'costat', 'fic']
    comp = comp.drop(columns=[col for col in identifiers if col in comp.columns])

    # Drop columns with too many missing values
    valid_cols = comp.columns[comp.isna().mean() < missing_threshold]
    comp = comp[valid_cols]

    # Drop columns with too few unique values
    comp = comp.loc[:, comp.nunique(dropna=True) >= required_unique]

    return comp


In [5]:
firms = import_sanitize_firm_charac(firm_charac_path, cutoff_date=CUTOFF_DATE)

In [6]:
firms

Unnamed: 0_level_0,Unnamed: 1_level_0,acchgy,aolochy,aqcy,capxy,chechy,cibegniy,cicurry,cidergly,ciothery,cipeny,...,txbcoy,txdcy,txty,txwy,xidocy,xidoy,xinty,xiy,xopry,xsgay
date,gvkey,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2000-01-31,1013,0.0,-4.824,17.963,46.785,21.138,,,,,,...,,,15.300,,0.000,0.000,,0.0,496.900,205.600
2000-01-31,1082,0.0,-1.734,0.000,9.597,-2.956,,,,,,...,,1.061,0.590,,2.975,-2.563,4.084,0.0,123.541,6.468
2000-01-31,1173,0.0,0.882,0.000,0.558,-0.754,,,,,,...,,-0.073,0.222,,0.000,0.000,0.416,0.0,21.847,7.457
2000-01-31,1183,0.0,-5.582,0.000,1.091,22.224,,,,,,...,,0.000,-3.286,,0.741,0.741,,0.0,9.732,
2000-01-31,1189,0.0,-64.102,0.888,17.181,-12.630,,,,,,...,,6.349,12.136,,-0.011,-0.007,8.095,0.0,724.161,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2025-03-31,323463,,,,,,,,,,,...,,,,,,,,,,
2025-03-31,328087,,,,,,,,,,,...,,,,,,,,,,
2025-03-31,330942,,,,,,,,,,,...,,,,,,,,,,
2025-03-31,354003,,,,,,,,,,,...,,,,,,,,,,


In [7]:
firms.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 1159055 entries, (Timestamp('2000-01-31 00:00:00'), np.int64(1013)) to (Timestamp('2025-04-30 00:00:00'), np.int64(35223))
Data columns (total 76 columns):
 #   Column    Non-Null Count   Dtype  
---  ------    --------------   -----  
 0   acchgy    826990 non-null  float64
 1   aolochy   805564 non-null  float64
 2   aqcy      802393 non-null  float64
 3   capxy     817853 non-null  float64
 4   chechy    824157 non-null  float64
 5   cibegniy  625008 non-null  float64
 6   cicurry   619690 non-null  float64
 7   cidergly  619362 non-null  float64
 8   ciothery  624483 non-null  float64
 9   cipeny    621670 non-null  float64
 10  cisecgly  619243 non-null  float64
 11  citotaly  626048 non-null  float64
 12  cogsy     874979 non-null  float64
 13  cshfdy    882323 non-null  float64
 14  cshpry    888269 non-null  float64
 15  cstkey    888560 non-null  float64
 16  dilady    856340 non-null  float64
 17  dilavy    856136 non-null  fl

In [8]:
save_sanitized_csv(firms, firm_charac_path)

Sanitized data saved to: Predictors/CompFirmCharac_sanitized.csv


In [4]:
def import_sanitize_jkp(jkp_path, nrows=None, cutoff_date='1925-05-30'):
    """
    Import and sanitize the factors. This function returns a data frame indexed by date
    containing only the returns of the factors. Columns location, frequency, weighting, 
    direction, n_stocks and n_stocks_min are dropped in the process because they are 
    uninformative. 
    Parameters: 
     - jkp_path: path to csv file containing the jkp factors. 
     - nrows: number of rows to load in memory. All of the data is loaded when nrows=None.
     - cutoff_date: the data starts from this date
    """
    jkp = pd.read_csv(jkp_path, nrows = 100000)
    
    jkp['date'] = pd.to_datetime(jkp['date'])    # Parse date column
    cutoff = pd.to_datetime(cutoff_date)
    jkp = jkp[jkp['date'] >= cutoff]        # Remove observations before 2000 as this is irrelevant for us

    # Pivot to wide format: one column per factor
    jkp_wide = jkp.pivot(index='date', columns='name', values='ret')

    return jkp_wide


In [5]:
jkp = import_sanitize_jkp(jkp_path, cutoff_date = CUTOFF_DATE)

In [6]:
jkp

name,age,aliq_at,aliq_mat,ami_126d,at_be,at_gr1,at_me,at_turnover,be_gr1a,be_me,...,opex_at,pi_nix,ppeinv_gr1a,prc,prc_highprc_252d,qmj,qmj_growth,qmj_prof,qmj_safety,rd5_at
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2000-01-31,0.030254,-0.015875,0.018536,0.058705,0.048212,0.008562,-0.046928,-0.040382,0.011158,-0.037480,...,-0.025649,-0.037852,-0.008723,0.029981,0.004210,-0.040257,-0.035010,-0.057493,-0.000004,0.139700
2000-02-29,0.259166,-0.222644,0.138305,0.126145,0.302350,-0.107640,-0.347456,-0.003528,-0.120481,-0.292134,...,0.038776,-0.058175,-0.108115,-0.078144,0.142095,-0.009524,-0.036718,-0.052376,0.089605,0.399325
2000-03-31,-0.211559,0.113484,-0.059384,-0.097385,-0.161972,0.060418,0.241058,0.068863,0.021303,0.204914,...,-0.006270,0.034848,0.044282,0.088231,-0.158170,0.031144,0.013444,0.126156,-0.039629,-0.217580
2000-04-30,-0.225892,0.130952,-0.035645,-0.057125,-0.124413,0.092468,0.199574,0.070121,0.057028,0.170982,...,-0.001731,0.007230,0.073980,0.034441,0.053232,0.013586,0.014560,0.121457,-0.041360,-0.073432
2000-05-31,-0.136842,0.138348,-0.028966,-0.031415,-0.114599,0.114336,0.151526,-0.020165,0.102710,0.124502,...,-0.029297,0.014042,0.080366,0.017054,0.069763,-0.001198,0.005664,0.060433,-0.032958,-0.093934
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-08-31,-0.009159,0.010798,0.018197,-0.035742,-0.027581,-0.009432,-0.012035,-0.008847,0.004118,-0.020158,...,-0.009840,-0.005098,0.009275,-0.027720,0.035217,0.010192,-0.005674,0.014741,0.012970,0.009868
2024-09-30,0.015388,0.005373,0.007651,-0.009591,-0.007449,0.003045,-0.009871,0.003660,0.008696,-0.019773,...,0.005259,0.006355,0.008296,-0.006779,0.003206,-0.006422,-0.002243,0.002634,-0.018560,-0.017818
2024-10-31,0.030176,-0.017033,-0.009476,0.002699,-0.020468,-0.015150,0.006831,-0.030677,0.002992,-0.004121,...,-0.029799,0.003082,0.002214,0.012406,0.021771,-0.006040,0.002241,-0.015539,0.002140,0.024123
2024-11-30,0.055421,-0.058225,-0.021419,0.017654,-0.017568,-0.029310,0.003207,-0.008216,-0.014642,-0.015056,...,0.004748,0.001879,0.006882,0.007938,0.016876,-0.005569,0.011715,-0.017477,-0.023964,0.019916


In [7]:
save_sanitized_csv(jkp, jkp_path)

Sanitized data saved to: Predictors/jkp_sanitized.csv


In [8]:
def import_sanitize_linking_table(link_table_path):
    ccm = pd.read_csv(link_table_path)

    # Convert start date
    ccm['LINKDT'] = pd.to_datetime(ccm['LINKDT'])

    # Replace 'E' with a placeholder date, then convert and fill missing values
    ccm['LINKENDDT'] = ccm['LINKENDDT'].replace('E', '2099-12-31')
    ccm['LINKENDDT'] = pd.to_datetime(ccm['LINKENDDT'], errors='coerce')
    ccm['LINKENDDT'] = ccm['LINKENDDT'].fillna(pd.to_datetime('2099-12-31'))

    # Rename columns for merge compatibility
    ccm.rename(columns={'GVKEY': 'gvkey', 'LPERMNO': 'PERMNO', 'LPERMCO': 'PERMCO'}, inplace=True)

    ccm = ccm.set_index('gvkey')

    return ccm


In [9]:
links = import_sanitize_linking_table(link_table_path)

In [10]:
links

Unnamed: 0_level_0,LINKTYPE,PERMNO,PERMCO,LINKDT,LINKENDDT
gvkey,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1000,LU,25881,23369,1970-11-13,1978-06-30
1001,LU,10015,6398,1983-09-20,1986-07-31
1002,LC,10023,22159,1972-12-14,1973-06-05
1003,LU,10031,6672,1983-12-07,1989-08-16
1004,LU,54594,20000,1972-04-24,2099-12-31
...,...,...,...,...,...
352262,LC,23773,59507,2023-03-17,2099-12-31
353444,LC,23209,59330,2022-07-22,2099-12-31
355398,LC,25134,59886,2024-05-17,2099-12-31
356128,LC,24704,59765,2024-01-19,2099-12-31


In [11]:
save_sanitized_csv(links, link_table_path)

Sanitized data saved to: linking_table_sanitized.csv
